def test_pickle(track_abundance):
    import pickle
    from io import BytesIO

    e1 = MinHash(n=5, ksize=6, is_protein=False,
                 track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCG'
    e1.add_sequence(seq)
    e1.add_sequence(seq)

    fp = BytesIO()
    pickle.dump(e1, fp)

    fp2 = BytesIO(fp.getvalue())
    e2 = pickle.load(fp2)

    assert e1.get_mins(with_abundance=track_abundance) == \
        e2.get_mins(with_abundance=track_abundance)
    assert e1.num == e2.num
    assert e1.ksize == e2.ksize
    assert e1.is_protein == e2.is_protein
    assert e1.max_hash == e2.max_hash
    assert e1.seed == e2.seed
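# Several of these tests take `track_abundance` as an argument; presumably it
# is supplied by a parametrized pytest fixture along these lines (an assumption
# about the surrounding conftest, which is not shown in these snippets):
import pytest

@pytest.fixture(params=[True, False])
def track_abundance(request):
    # run each test once with abundance tracking on, once with it off
    return request.param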
def __init__(self, query_file, ksize, scaled, catlas_name, debug=True):
    self.filename = query_file
    self.ksize = ksize
    self.kmers = set()
    self.name = None
    mh = MinHash(0, ksize, scaled=scaled)
    self.mh = mh
    self.catlas_name = catlas_name
    self.debug = debug

    notify('----')
    notify('QUERY FILE: {}', self.filename)

    # build hashes for all the query k-mers & create signature
    notify('loading query kmers...', end=' ')
    bf = khmer.Nodetable(ksize, 1, 1)

    for record in screed.open(self.filename):
        if self.name is None:
            self.name = record.name
        if len(record.sequence) >= int(ksize):
            self.kmers.update(bf.get_kmer_hashes(record.sequence))
            mh.add_sequence(record.sequence, True)

    self.sig = sourmash.SourmashSignature(mh, name=self.name,
                                          filename=self.filename)

    notify('got {} k-mers from query', len(self.kmers))

    self.cdbg_match_counts = {}
    self.catlas_match_counts = {}
def test_abund_similarity_zero():
    E1 = MinHash(n=5, ksize=20, track_abundance=True)
    E2 = MinHash(n=5, ksize=20, track_abundance=True)

    for i in [1]:
        E1.add_hash(i)

    assert E1.similarity(E2) == 0.0
def test_common_1(track_abundance):
    E1 = MinHash(n=5, ksize=20, track_abundance=track_abundance)
    E2 = MinHash(n=5, ksize=20, track_abundance=track_abundance)

    for i in [1, 2, 3, 4, 5]:
        E1.add_hash(i)
    for i in [1, 2, 3, 4, 6]:
        E2.add_hash(i)

    assert E1.count_common(E2) == 4
    assert E2.count_common(E1) == 4
def test_diff_seed(track_abundance):
    E1 = MinHash(n=5, ksize=20, track_abundance=track_abundance, seed=1)
    E2 = MinHash(n=5, ksize=20, track_abundance=track_abundance, seed=2)

    for i in [1, 2, 3, 4, 5]:
        E1.add_hash(i)
    for i in [1, 2, 3, 4, 6]:
        E2.add_hash(i)

    with pytest.raises(ValueError):
        E1.count_common(E2)
def test_set_abundance_num_hypothesis(hashes, abundances, sketch_size):
    a = MinHash(sketch_size, 10, track_abundance=True)
    oracle = dict(zip(hashes, abundances))

    a.set_abundances(oracle)

    mins = a.get_mins(with_abundance=True)
    size = min(sum(1 for v in oracle.values() if v > 0), sketch_size)
    assert len(mins) == size

    for k, v in mins.items():
        assert oracle[k] == v
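# A minimal sketch of how `hashes`, `abundances`, and `sketch_size` might be
# supplied via Hypothesis. The strategies below are assumptions for
# illustration, not the project's actual @given decorators.
from hypothesis import given, strategies as st

@given(
    hashes=st.lists(st.integers(min_value=1, max_value=2**64 - 1),
                    unique=True, min_size=1, max_size=100),
    abundances=st.lists(st.integers(min_value=1, max_value=1000),
                        min_size=1, max_size=100),
    sketch_size=st.integers(min_value=1, max_value=100),
)
def test_set_abundance_num_hypothesis_sketch(hashes, abundances, sketch_size):
    # delegate to the property test defined above
    test_set_abundance_num_hypothesis(hashes, abundances, sketch_size)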
def test_jaccard_1(track_abundance):
    E1 = MinHash(n=5, ksize=20, track_abundance=track_abundance)
    E2 = MinHash(n=5, ksize=20, track_abundance=track_abundance)

    for i in [1, 2, 3, 4, 5]:
        E1.add_hash(i)
    for i in [1, 2, 3, 4, 6]:
        E2.add_hash(i)

    # here the union (as seen by the size-5 sketch) is [1, 2, 3, 4, 5]
    # and the intersection is [1, 2, 3, 4] => 4/5.
    assert round(E1.jaccard(E2), 2) == round(4 / 5.0, 2)
    assert round(E2.jaccard(E1), 2) == round(4 / 5.0, 2)
def test_abund_similarity():
    E1 = MinHash(n=5, ksize=20, track_abundance=True)
    E2 = MinHash(n=5, ksize=20, track_abundance=True)

    for i in [1]:
        E1.add_hash(i)
    for i in [1, 2]:
        E2.add_hash(i)

    assert round(E1.similarity(E1)) == 1.0
    assert round(E1.similarity(E2), 2) == 0.5

    assert round(E1.similarity(E1, ignore_abundance=True)) == 1.0
    assert round(E1.similarity(E2, ignore_abundance=True), 2) == 0.5
def test_jaccard_2_difflen(track_abundance):
    E1 = MinHash(n=5, ksize=20, track_abundance=track_abundance)
    E2 = MinHash(n=5, ksize=20, track_abundance=track_abundance)

    for i in [1, 2, 3, 4, 5]:
        E1.add_hash(i)
    for i in [1, 2, 3, 4]:
        E2.add_hash(i)

    print(E1.jaccard(E2))
    assert round(E1.jaccard(E2), 2) == 4 / 5.0
    assert round(E2.jaccard(E1), 2) == 4 / 5.0
def test_bad_construct_2(track_abundance):
    try:
        e1 = MinHash(n=100, is_protein=False,
                     track_abundance=track_abundance)
        assert 0, "require ksize in constructor"
    except TypeError:
        pass
def test_set_abundance_scaled_hypothesis(hashes, abundances, scaled):
    a = MinHash(0, 10, track_abundance=True, scaled=scaled)
    oracle = dict(zip(hashes, abundances))

    a.set_abundances(oracle)

    max_hash = get_max_hash_for_scaled(scaled)
    below_max_hash = sum(1 for (k, v) in oracle.items()
                         if k <= max_hash and v > 0)

    mins = a.get_mins(with_abundance=True)
    assert len(mins) == below_max_hash

    for k, v in mins.items():
        assert oracle[k] == v
        assert k <= max_hash
        assert v > 0
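# Context for get_max_hash_for_scaled(): a "scaled" sketch keeps every hash
# value at or below roughly 2**64 / scaled. The stand-in below captures that
# relationship for illustration; the exact rounding and edge-case handling in
# the library is an assumption here, not a copy of its implementation.
def approx_max_hash_for_scaled(scaled):
    # scaled == 0 means "not a scaled sketch": no max-hash cutoff
    if scaled == 0:
        return 0
    # keep hashes in the bottom 1/scaled fraction of the 64-bit hash space
    return 2**64 // scaled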
def test_protein_mh(track_abundance):
    e1 = MinHash(n=5, ksize=6, is_protein=True,
                 track_abundance=track_abundance)
    e2 = MinHash(n=5, ksize=6, is_protein=True,
                 track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCG'
    e1.add_sequence(seq)

    for i in range(len(seq) - 5):
        kmer = seq[i:i + 6]
        e2.add(kmer)

    assert e1.get_mins() == e2.get_mins()
    assert 901193879228338100 in e1.get_mins()
def _signatures(self):
    "Create a _signatures member dictionary that contains {idx: sigobj}."
    from sourmash import MinHash, SourmashSignature

    is_protein = False
    is_hp = False
    is_dayhoff = False
    if self.moltype == 'protein':
        is_protein = True
    elif self.moltype == 'hp':
        is_hp = True
    elif self.moltype == 'dayhoff':
        is_dayhoff = True

    minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled,
                      is_protein=is_protein, hp=is_hp, dayhoff=is_dayhoff)

    debug('creating signatures for LCA DB...')
    mhd = defaultdict(minhash.copy_and_clear)
    temp_vals = defaultdict(list)

    # invert the hashval_to_idx dictionary
    for (hashval, idlist) in self.hashval_to_idx.items():
        for idx in idlist:
            temp_hashes = temp_vals[idx]
            temp_hashes.append(hashval)

            # 50 is an arbitrary number. If you really want to
            # micro-optimize: lists are resized and grow in this pattern:
            # 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
            # (from https://github.com/python/cpython/blob/b2b4a51f7463a0392456f7772f33223e57fa4ccc/Objects/listobject.c#L57)
            if len(temp_hashes) > 50:
                mhd[idx].add_many(temp_hashes)

                # Sigh, python 2... when it goes away,
                # we can do `temp_hashes.clear()` instead.
                del temp_vals[idx]

    # We loop over temp_vals again to add any remainder hashes
    # (each list of hashes is smaller than 50 items)
    for sig, vals in temp_vals.items():
        mhd[sig].add_many(vals)

    sigd = {}
    for idx, mh in mhd.items():
        ident = self.idx_to_ident[idx]
        name = self.ident_to_name[ident]
        sigd[idx] = SourmashSignature(mh, name=name)

    debug('=> {} signatures!', len(sigd))

    return sigd
def test_build_hashCounter():
    mh1 = MinHash(0, 21, scaled=1, track_abundance=True)
    mh2 = MinHash(0, 21, scaled=1, track_abundance=True)
    mh1.add_many((1, 2, 3, 4))
    mh2.add_many((1, 2, 5))

    true_res = Counter({1: 2, 2: 2, 3: 1, 4: 1, 5: 1})

    ss1 = SourmashSignature(mh1)
    ss2 = SourmashSignature(mh2)
    counts = Counter()
    hc = build_hashCounter([ss1, ss2], counts)
    print("Hash Counter: ", hc)
    assert hc == true_res
def test_drop_below_mincount_threshold():
    mh1 = MinHash(0, 21, scaled=1, track_abundance=True)
    mh2 = MinHash(0, 21, scaled=1, track_abundance=True)
    mh1.add_many((1, 2, 3, 4))
    mh2.add_many((1, 1, 2, 5))

    ss1 = SourmashSignature(mh1)
    ss2 = SourmashSignature(mh2)
    counts = Counter()
    hc = build_hashCounter([ss1, ss2], counts)

    kept_hashes = drop_below_mincount(hc, 3)
    true_kept = Counter({1: 3})
    print("kept hashes: ", kept_hashes)
    assert kept_hashes == true_kept
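# A minimal sketch of the two helpers exercised by the tests above, inferred
# from the expected Counter values; this is an assumption for illustration,
# not the project's actual implementation of build_hashCounter /
# drop_below_mincount.
from collections import Counter

def build_hashCounter_sketch(siglist, counts):
    # sum per-hash abundances across all signatures into one Counter
    for sig in siglist:
        for hashval, abund in sig.minhash.get_mins(with_abundance=True).items():
            counts[hashval] += abund
    return counts

def drop_below_mincount_sketch(counts, min_count):
    # keep only hashes observed at least min_count times in total
    return Counter({h: c for h, c in counts.items() if c >= min_count})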
def sketch(args):
    cwd = os.getcwd()
    db_path = os.path.join(cwd, args.name + '.db')

    # check for the existence of the database and tables
    if os.path.exists(db_path):
        pass
    else:
        print("Database file not found. Please make sure the name is correct "
              "or run mashpit build.")
        exit(0)

    fasta_folder = os.path.join(cwd, 'fasta')
    if os.path.exists(fasta_folder):
        pass
    else:
        print("Fasta folder not found.")
        exit(0)

    sig_file_name = args.name + '.sig'
    all_fasta_path = os.path.join(fasta_folder, "*_skesa.fasta")
    genomes_list = glob.glob(all_fasta_path)

    minhashes = []
    for genome in genomes_list:
        mh = MinHash(n=1000, ksize=31)
        for record in screed.open(genome):
            mh.add_sequence(record.sequence, True)
        minhashes.append(mh)

    siglist = []
    for i in range(len(minhashes)):
        # derive the signature name from the assembly file name
        # (str.strip() removes characters, not substrings, so use replace)
        signame = os.path.basename(genomes_list[i]).replace('_skesa.fasta', '')
        siglist.append(SourmashSignature(minhashes[i], name=signame))

    with open(sig_file_name, 'w') as f:
        save_signatures(siglist, fp=f)
def to_sourmash(self):
    try:
        from sourmash import MinHash
    except ImportError:
        print('Must install python sourmash to convert to sourmash.MinHash',
              file=sys.stderr)
        return None

    sig = MinHash(self.num(), self.ksize(),
                  is_protein=self.is_protein(),
                  dayhoff=self.dayhoff(), hp=self.hp(),
                  track_abundance=self.track_abundance(),
                  seed=self.seed(),
                  mins=self.mins(),
                  max_hash=self.max_hash())
    return sig
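# Hypothetical usage of to_sourmash(), assuming `sketch` is an instance of the
# class above (i.e. an object exposing num(), ksize(), mins(), etc.); the
# variable names here are illustrative only.
sm = sketch.to_sourmash()
if sm is not None:
    print(len(sm), 'hashes transferred to sourmash.MinHash')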
# (script fragment: `counts`, `min_count`, `ksize`, `scaled`, `outhashes`,
# `outsig`, and `sigobjs` are defined earlier in the script)

# remove hashes that occur only once
for hashval, ct in counts.copy().items():
    print(f"{hashval}:{ct}")
    if ct < min_count:
        counts.pop(hashval)

# write out hashes
# let's try building a sig. we will use this sig later to intersect with
# sample-specific sigs
new_mins = set(counts.keys())
print(len(new_mins))

with open(outhashes, "w") as out:
    for hsh in new_mins:
        out.write(str(hsh) + '\n')

if len(new_mins) > 0:
    # scaled=1 so we keep all (though these were previously at some other
    # scaled val)
    minhash = MinHash(n=0, ksize=ksize, scaled=scaled)
    minhash.add_many(set(counts.keys()))

    # write sig to file
    sigobj = sourmash.SourmashSignature(
        minhash,
        name=f"aggregated_hashvals_above_{min_count}",
        filename="generated with drop_unique_hashes.py")
    sigobjs += [sigobj]

# this part only handles one output file -- doesn't take care of case with
# many ksizes/moltypes
with open(outsig, 'wt') as sigout:
    sourmash.save_signatures(sigobjs, sigout)
    #notify('wrote signature to {}', args.output)

# write out hashes to a text file
def main(argv):
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('mh_index_picklefile', help='pickled hashval index')
    p.add_argument('hashval_list', help='file with list of hashvals')
    p.add_argument('output')
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    p.add_argument('--scaled', default=1000, type=float,
                   help="scaled value for contigs minhash output")
    p.add_argument('-v', '--verbose', action='store_true')
    args = p.parse_args(argv)

    # create output directory if it doesn't exist.
    outdir = args.output
    notify('putting output in {}', outdir)
    os.makedirs(os.path.join(outdir, "contigs"), exist_ok=True)

    if not os.path.isdir(outdir):
        error("output '{}' is not a directory and cannot be made", outdir)
        sys.exit(-1)

    # load picklefile
    with open(args.mh_index_picklefile, 'rb') as fp:
        hashval_to_contig_id = pickle.load(fp)
    notify('loaded {} hash value -> cdbg_id mappings from {}',
           len(hashval_to_contig_id), args.mh_index_picklefile)

    # load list of desired hashvals
    hashvals = [int(x.strip()) for x in open(args.hashval_list, 'rt')]
    hashvals = set(hashvals)
    notify('loaded {} search hashvalues from {}',
           len(hashvals), args.hashval_list)

    if not len(hashvals):
        print('No hash values to search!', file=sys.stderr)
        sys.exit(-1)

    # load catlas DAG
    catlas = CAtlas(args.catlas_prefix)
    notify('loaded {} nodes from catlas {}', len(catlas), args.catlas_prefix)
    notify('loaded {} layer 1 catlas nodes', len(catlas.layer1_to_cdbg))

    # find the contigs filename
    contigs_file = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # get a single ksize & scaled
    ksize = int(args.ksize)
    scaled = int(args.scaled)

    # record command line
    with open(os.path.join(outdir, 'command.txt'), 'wt') as fp:
        fp.write(str(sys.argv))
        fp.write("\n")

    # output results.csv in the output directory:
    csvoutfp = open(os.path.join(outdir, 'hashval_results.csv'), 'wt')
    csv_writer = csv.writer(csvoutfp)
    csv_writer.writerow(['hashval', 'bp', 'contigs'])

    # iterate over each query, do the thing.
    n_found = 0
    for hashval in hashvals:
        notify('----')
        notify('QUERY HASHVAL: {}', hashval)

        mh = MinHash(0, ksize, scaled=scaled)
        result = execute_query(hashval, catlas, hashval_to_contig_id, mh=mh)
        notify('done searching!')
        if not result:
            notify("no result for hashval {}", hashval)
            continue

        result.retrieve_contigs(contigs_file)
        result.write(csv_writer, csvoutfp, outdir)

        assert hashval in mh.get_mins()
        n_found += 1

    # end main loop!

    notify('----')
    notify("Done! Found {} hashvals of {} in {} with k={}",
           n_found, len(hashvals), args.catlas_prefix, ksize)
    notify("Results are in directory '{}'", outdir)

    return 0
def test_dna_mh(track_abundance):
    e1 = MinHash(n=5, ksize=4, track_abundance=track_abundance)
    e2 = MinHash(n=5, ksize=4, track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCAG'
    e1.add_sequence(seq)
    for i in range(len(seq) - 3):
        e2.add(seq[i:i + 4])

    assert e1.get_mins() == e2.get_mins()
    print(e1.get_mins())
    assert 726311917625663847 in e1.get_mins()
    assert 3697418565283905118 in e1.get_mins()
def main():
    p = argparse.ArgumentParser()
    p.add_argument('hashfile')          # file that contains hashes
    p.add_argument('-o', '--output', default=None,
                   help='file to output signature to')
    p.add_argument('-k', '--ksize', default=None, type=int)
    p.add_argument('--scaled', default=None, type=int)
    p.add_argument('--num', default=None, type=int)
    p.add_argument('--name', default='', help='signature name')
    p.add_argument('--filename', default='',
                   help='filename to add to signature')
    args = p.parse_args()

    # check arguments.
    if args.scaled and args.num:
        error('cannot specify both --num and --scaled! exiting.')
        return -1

    if not args.ksize:
        error('must specify --ksize')
        return -1

    if not args.output:
        error('must specify --output')
        return -1

    # first, load in all the hashes
    hashes = set()
    for line in open(args.hashfile, 'rt'):
        hashval = int(line.strip())
        hashes.add(hashval)

    if not hashes:
        error("ERROR, no hashes loaded from {}!", args.hashfile)
        return -1

    notify('loaded {} distinct hashes from {}', len(hashes), args.hashfile)

    # now, create the MinHash object that we'll use.
    scaled = 0
    num = 0
    if args.scaled:
        scaled = args.scaled
    elif args.num:
        num = args.num
    else:
        notify('setting --num automatically from the number of hashes.')
        num = len(hashes)

    # construct empty MinHash object according to args
    minhash = MinHash(n=num, ksize=args.ksize, scaled=scaled)

    # add hashes into!
    minhash.add_many(hashes)

    if len(minhash) < len(hashes):
        notify("WARNING: loaded {} hashes, but only {} made it into MinHash.",
               len(hashes), len(minhash))
        if scaled:
            notify("This is probably because of the scaled argument.")
        elif args.num:
            notify("This is probably because your --num is set to {}",
                   args.num)

    if num > len(minhash):
        notify("WARNING: --num set to {}, but only {} hashes in signature.",
               num, len(minhash))

    sigobj = sourmash.SourmashSignature(minhash, name=args.name,
                                        filename=args.filename)

    with open(args.output, 'wt') as fp:
        sourmash.save_signatures([sigobj], fp)
    notify('wrote signature to {}', args.output)
def fetchneighborhood2(index, features_upstream=0, features_downstream=0):
    cluster = iaa_positive_df.iloc[index, :]
    acc = cluster['accession']
    assembly = re.sub('.gbff', '_proteins.fa.indexprot', cluster['filename'])

    # make the genome database from the .fa.index file
    assembly_index_file = 'index_files/' + assembly
    print(assembly_index_file)
    db = pd.read_csv(assembly_index_file, sep="!!", header=None,
                     engine='python')
    #db.columns = ["filename","assembly","accession","locus_tag","old_locus_tag","name","biosample","protein_name","coordinates","protein_id"]
    db.columns = [
        "filename", "assembly", "accession", "locus_tag", "old_locus_tag",
        "name", "biosample", "protein_name", "coordinates", "protein_id",
        "pseudogene", "protein_seq"
    ]
    db['direction'] = [
        -1 if re.match('complement', c) else 1 for c in db['coordinates']
    ]
    db['start_coord'] = [
        re.search('\d+?(?=\.\.(\d|\>))', str(c)).group(0)
        for c in db['coordinates']
    ]
    db['start_coord'] = [
        re.sub('complement|>|<|\)|\(', "", c) for c in db['start_coord']
    ]
    db['start_coord'] = db['start_coord'].astype(int)
    db['end_coord'] = [
        re.search('(?<=\.(\.|\>))\d+', str(c)).group(0)
        for c in db['coordinates']
    ]
    db['end_coord'] = [re.sub('>|<|\)|\(', "", c) for c in db['end_coord']]
    db['end_coord'] = db['end_coord'].astype(int)

    hit_list = cluster['hit_list']
    query_list = cluster['query_list']
    cluster_number = cluster['cluster_number']
    hit_dict = dict(zip(hit_list, query_list))

    genome = db.loc[db['accession'] == acc].copy()
    start = genome[genome['locus_tag'] == hit_list[0]].index.values.astype(
        int)[0] - features_upstream
    stop = genome[genome['locus_tag'] == hit_list[-1]].index.values.astype(
        int)[0] + features_downstream

    neighborhood = genome.loc[start:stop, ].copy()
    neighborhood['query_match'] = neighborhood['locus_tag'].map(hit_dict)
    coord_list = list(
        zip(neighborhood['start_coord'], neighborhood['end_coord'],
            neighborhood['direction'], neighborhood['query_match']))

    # find GC content of cluster vs genome
    gbff_str = str(db['filename'][0][1:])
    with open("gbff_files_unzipped/" + gbff_str) as file:
        gbff_file = file.read()

    genome_seq = "".join(re.findall("(?<=ORIGIN)[\s+\S+]+?(?=\/\/)",
                                    gbff_file))
    genome_seq = re.sub('\s|\d|\n', '', genome_seq)
    Gg = genome_seq.count("g")
    Gc = genome_seq.count("c")
    Ga = genome_seq.count("a")
    Gt = genome_seq.count("t")
    genomeGC = (Gg + Gc) / (Gg + Gc + Ga + Gt)

    start = min(coord_list)[0]
    end = max(coord_list)[1]
    regex_str = acc + "[\s+\S+]+\/\/"
    all_cluster_fasta = re.findall(regex_str, gbff_file)[0]
    all_cluster_fasta = re.findall("(?<=ORIGIN)[\s+\S+]+(?=\/\/)",
                                   all_cluster_fasta)[0]
    all_cluster_fasta = re.sub(" |\d|\n", "", all_cluster_fasta)
    cluster_seq = all_cluster_fasta[start - 1:end - 1]
    g = cluster_seq.count("g")
    c = cluster_seq.count("c")
    a = cluster_seq.count("a")
    t = cluster_seq.count("t")
    clusterGC = (g + c) / (g + c + a + t)
    diffGC = abs(clusterGC - genomeGC)

    # compare minhash values between cluster and genome
    kmer_size = 5
    n = 0
    sc = 1
    cluster_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    cluster_minhash.add_sequence(cluster_seq, force=True)
    cluster_minhash.add_sequence(complement(cluster_seq), force=True)

    genome_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    genome_minhash.add_sequence(genome_seq, force=True)
    genome_minhash.add_sequence(complement(genome_seq), force=True)

    minhash_sim = cluster_minhash.similarity(genome_minhash)
    # genome_minus_cluster = re.sub(cluster_seq, '', genome_seq)
    # #print(len(genome_seq)-len(genome_minus_cluster))
    # genome_minus_cluster_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    # genome_minus_cluster_minhash.add_sequence(genome_minus_cluster, force=True)
    # genome_minus_cluster_minhash.add_sequence(complement(genome_minus_cluster), force=True)
    # minhash_sim_minus_cluster = cluster_minhash.similarity(genome_minus_cluster_minhash)
    #print(minhash_sim)

    # compare tetranucleotide frequency between cluster and genome
    bases = ['a', 't', 'g', 'c']
    four_mers = [''.join(p) for p in itertools.product(bases, repeat=4)]
    four_mer_count_genome = np.add(
        [genome_seq.count(i) for i in four_mers],
        [complement(genome_seq).count(i) for i in four_mers])
    four_mer_freq_genome = [
        i / sum(four_mer_count_genome) for i in four_mer_count_genome
    ]
    four_mer_count_cluster = np.add(
        [cluster_seq.count(i) for i in four_mers],
        [complement(cluster_seq).count(i) for i in four_mers])
    four_mer_freq_cluster = [
        i / sum(four_mer_count_cluster) for i in four_mer_count_cluster
    ]
    four_mer_distance = scipy.spatial.distance.cityblock(
        four_mer_freq_cluster, four_mer_freq_genome)

    # flip the neighborhood if most matched genes are on the reverse strand
    if sum(neighborhood[neighborhood['query_match'].notnull()]
           ['direction']) < 0:
        neighborhood['actual_start_tmp'] = neighborhood['start_coord']
        neighborhood['start_coord'] = neighborhood['end_coord'] * -1
        neighborhood['end_coord'] = neighborhood['actual_start_tmp'] * -1
        neighborhood['direction'] = neighborhood['direction'] * -1
        neighborhood = neighborhood.sort_values(by='start_coord')

    neighborhood['query_match'] = neighborhood['query_match'].replace(
        np.nan, "x")

    nhbrhood_hit_list = list(neighborhood['query_match'])
    nhbrhood_locus_tags = list(neighborhood['locus_tag'])
    nhbrhood_old_locus_tags = list(neighborhood['old_locus_tag'])
    nhbrhood_prot_ids = list(neighborhood['protein_id'])
    nhbrhood_prot_name = list(neighborhood['protein_name'])
    nhbrhood_prot_seq = list(neighborhood['protein_seq'])

    order = [("| " + gene['query_match'] + " 〉") if gene['direction'] == 1
             else ("〈 " + gene['query_match'] + " |")
             for index, gene in neighborhood.iterrows()]
    dist = list(
        np.array(neighborhood['start_coord'][1:]) -
        np.array(neighborhood['end_coord'][:-1]))
    dist = ["-" + str(d) + "-" for d in dist]

    adj_coord_list = list(
        zip(neighborhood['start_coord'], neighborhood['end_coord'],
            neighborhood['direction'], neighborhood['query_match']))

    if min(neighborhood['start_coord']) < 0:
        tare_value = abs(min(neighborhood['start_coord']))
        tared_adj_coord_list = list(
            zip([v + tare_value for v in neighborhood['start_coord']],
                [v + tare_value for v in neighborhood['end_coord']],
                neighborhood['direction'], neighborhood['query_match']))
    else:
        tare_value = min(neighborhood['start_coord'])
        tared_adj_coord_list = list(
            zip([v - tare_value for v in neighborhood['start_coord']],
                [v - tare_value for v in neighborhood['end_coord']],
                neighborhood['direction'], neighborhood['query_match']))

    # making an ITOL compatible string
    gene_color_dict = {
        'IaaP': '#ff5969',
        'IaaQ': '#2db34e',
        'IaaR': '#fb77e0',
        'IaaA': '#00bc7e',
        'IaaB': '#8d006e',
        'IaaC': '#cfdd63',
        'IaaD': '#0060d0',
        'IaaE': '#bb7b00',
        'IaaF': '#7c2c29',
        'IaaG': '#f1d17a',
        'IaaH': '#37589E',
        'IaaI': '#ACC92A',
        'IaaJ': '#752AC9',
        'IaaK': '#D4B5E6',
        'IaaL': '#211E45',
        'IaaM': '#BFB3E6',
        'x': '#d1d1d1'
    }

    max_len = tared_adj_coord_list[-1][1]
    itol_diagram = []
    for g in tared_adj_coord_list:
        gene_string = []
        gene_length = g[1] - g[0]
        if g[2] > 0:
            gene_string.append('RE')
            gene_string.append(str(g[0]))
            gene_string.append(str(g[1] - (0.1 * gene_length)))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append(str(g[3]))
            gene_string.append(',')
            gene_string.append('TR')
            gene_string.append(str(g[1] - (0.1 * gene_length)))
            gene_string.append(str(g[1]))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append('')
        else:
            gene_string.append('TL')
            gene_string.append(str(g[0]))
            gene_string.append(str(g[0] + (0.1 * gene_length)))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append('')
            gene_string.append(',')
            gene_string.append('RE')
            gene_string.append(str(g[0] + (0.1 * gene_length)))
            gene_string.append(str(g[1]))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append(str(g[3]))
        itol_gene = '|'.join(gene_string)
        itol_diagram.append(itol_gene)

    itol_diagram_joined = ",".join(map(str, itol_diagram))
    itol_diagram_string = str(max_len) + ',' + itol_diagram_joined
    itol_diagram_string = re.sub(',\|', ',', itol_diagram_string)

    # obtains "| A 〉-23-| B 〉-23-| C 〉"
    synteny_dir_dist = ''.join(sum(zip(order, dist + [0]), ())[:-1])
    synteny_dir_dist = re.sub("iaa", "", synteny_dir_dist)

    # obtains "| A 〉| B 〉| C 〉"
    synteny_dir = ''.join(order)
    synteny_dir = re.sub("iaa", "", synteny_dir)

    # obtains "| A:23.23 〉| B:23.23 〉| C:23.23 〉"
    #synteny_dir_pident = ''.join(order_pident)
    #synteny_dir_pident = re.sub("iaa", "", synteny_dir_pident)

    # obtains "A-B-C"
    synteny = re.sub("\n", "-",
                     neighborhood['query_match'].to_string(index=False))
    synteny = re.sub("Iaa| ", "", synteny)

    synteny_alphabet = "".join([
        gene['query_match'].replace("Iaa", "").upper()
        if gene['direction'] == 1
        else gene['query_match'].replace("Iaa", "").lower()
        for index, gene in neighborhood.iterrows()
    ])

    cluster_len = max(neighborhood['end_coord']) - min(
        neighborhood['start_coord'])
    assembly = re.sub("\{|\}|\'|>", "", str(set(neighborhood['assembly'])))
    accession = re.sub("\{|\}|\'", "", str(set(neighborhood['accession'])))
    title = re.sub("\{|\}|\'", "", str(set(neighborhood['name'])))
    print(assembly_index_file + " successfully used")

    return ([
        accession, assembly, title, len(neighborhood), cluster_len, synteny,
        synteny_alphabet, synteny_dir_dist, synteny_dir, cluster_number,
        coord_list, adj_coord_list, tared_adj_coord_list, itol_diagram_string,
        nhbrhood_hit_list, nhbrhood_locus_tags, nhbrhood_old_locus_tags,
        nhbrhood_prot_ids, nhbrhood_prot_name, nhbrhood_prot_seq, clusterGC,
        genomeGC, diffGC, minhash_sim, four_mer_distance,
        four_mer_freq_cluster, cluster_seq
    ])