def test_pickle(track_abundance):
    """Round-trip a MinHash through pickle and verify nothing is lost."""
    import pickle
    from io import BytesIO

    original = MinHash(n=5, ksize=6, is_protein=False,
                       track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCG'
    # Add the same sequence twice so abundances (when tracked) exceed 1.
    original.add_sequence(seq)
    original.add_sequence(seq)

    buf = BytesIO()
    pickle.dump(original, buf)
    restored = pickle.load(BytesIO(buf.getvalue()))

    # Hash contents (with abundances, if tracked) must survive the trip.
    assert original.get_mins(with_abundance=track_abundance) == \
        restored.get_mins(with_abundance=track_abundance)
    # All sketch parameters must survive as well.
    for attr in ('num', 'ksize', 'is_protein', 'max_hash', 'seed'):
        assert getattr(original, attr) == getattr(restored, attr)
def __init__(self, query_file, ksize, scaled, catlas_name, debug=True):
    """Load a query sequence file and prepare its k-mer set and signature.

    Reads every record in *query_file*, collecting exact k-mer hashes
    (via a khmer Nodetable) into ``self.kmers`` and building a sourmash
    MinHash plus a SourmashSignature for later searching.

    Parameters
    ----------
    query_file : str
        Path to the query sequence file (opened with screed).
    ksize : int
        K-mer size used for both the Nodetable and the MinHash.
    scaled : int
        sourmash "scaled" parameter for the MinHash.
    catlas_name : str
        Name of the catlas this query is associated with; stored as-is.
    debug : bool, optional
        Stored on the instance; not otherwise used in this method.
    """
    self.filename = query_file
    self.ksize = ksize
    self.kmers = set()  # hashes of all query k-mers (filled in below)
    self.name = None    # taken from the first record's name below
    mh = MinHash(0, ksize, scaled=scaled)
    self.mh = mh
    self.catlas_name = catlas_name
    self.debug = debug

    notify('----')
    notify('QUERY FILE: {}', self.filename)

    # build hashes for all the query k-mers & create signature
    notify('loading query kmers...', end=' ')
    # Minimal-size Nodetable (1, 1): used only for get_kmer_hashes, not
    # for membership queries, so table size does not matter.
    bf = khmer.Nodetable(ksize, 1, 1)

    for record in screed.open(self.filename):
        if self.name is None:
            self.name = record.name
        # Guard: khmer cannot hash sequences shorter than k.
        if len(record.sequence) >= int(ksize):
            self.kmers.update(bf.get_kmer_hashes(record.sequence))
        # NOTE(review): add_sequence is applied to every record here,
        # including those shorter than ksize (force=True tolerates
        # non-ACGT characters) — confirm this nesting is the intended one.
        mh.add_sequence(record.sequence, True)

    self.sig = sourmash.SourmashSignature(mh, name=self.name,
                                          filename=self.filename)

    notify('got {} k-mers from query', len(self.kmers))

    self.cdbg_match_counts = {}    # presumably filled in later — TODO confirm
    self.catlas_match_counts = {}  # presumably filled in later — TODO confirm
def test_dna_mh(track_abundance):
    """Adding a whole sequence must equal adding each of its 4-mers by hand."""
    ksize = 4
    whole = MinHash(n=5, ksize=ksize, track_abundance=track_abundance)
    piecewise = MinHash(n=5, ksize=ksize, track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCAG'
    whole.add_sequence(seq)
    for kmer in (seq[i:i + ksize] for i in range(len(seq) - ksize + 1)):
        piecewise.add(kmer)

    assert whole.get_mins() == piecewise.get_mins()
    print(whole.get_mins())
    # Two known hash values for this sequence must be present.
    assert 726311917625663847 in whole.get_mins()
    assert 3697418565283905118 in whole.get_mins()
def test_protein_mh(track_abundance):
    """Protein MinHash: whole-sequence add equals adding each 6-mer by hand."""
    ksize = 6
    whole = MinHash(n=5, ksize=ksize, is_protein=True,
                    track_abundance=track_abundance)
    piecewise = MinHash(n=5, ksize=ksize, is_protein=True,
                        track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCG'
    whole.add_sequence(seq)
    for start in range(len(seq) - ksize + 1):
        piecewise.add(seq[start:start + ksize])

    assert whole.get_mins() == piecewise.get_mins()
    # A known hash value for this sequence must be present.
    assert 901193879228338100 in whole.get_mins()
def sketch(args):
    """Build sourmash MinHash sketches for every SKESA assembly in ./fasta.

    Expects ``<args.name>.db`` and a ``fasta/`` folder (containing
    ``*_skesa.fasta`` assemblies) in the current working directory, and
    writes all signatures to ``<args.name>.sig``.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``args.name``, the database / sample-set name.
    """
    cwd = os.getcwd()
    db_path = os.path.join(cwd, args.name + '.db')
    # check for the existence of the database and tables
    if not os.path.exists(db_path):
        print(
            "Database file not found. Please make sure the name is correct or run mashpit build."
        )
        exit(0)
    fasta_folder = os.path.join(cwd, 'fasta')
    if not os.path.exists(fasta_folder):
        print("Fasta folder not found.")
        exit(0)

    sig_file_name = args.name + '.sig'
    # BUG FIX: the glob previously looked for '*_skeasa.fasta' (typo) and so
    # never matched the '_skesa.fasta' files referenced below.
    all_fasta_path = os.path.join(fasta_folder, "*_skesa.fasta")
    genomes_list = glob.glob(all_fasta_path)

    suffix = '_skesa.fasta'
    siglist = []
    for genome_path in genomes_list:
        # One scaled-independent sketch per assembly: 1000 hashes, k=31.
        mh = MinHash(n=1000, ksize=31)
        for record in screed.open(genome_path):
            mh.add_sequence(record.sequence, True)
        # BUG FIX: str.strip() removes a *character set*, not a substring,
        # so the old .strip(fasta_folder).strip('_skesa.fasta') mangled
        # names whose edges shared characters with those sets. Derive the
        # signature name from the basename and drop the known suffix.
        base = os.path.basename(genome_path)
        signame = base[:-len(suffix)] if base.endswith(suffix) else base
        siglist.append(SourmashSignature(mh, name=signame))

    with open(sig_file_name, 'w') as f:
        save_signatures(siglist, fp=f)
def fetchneighborhood2(index, features_upstream=0, features_downstream=0):
    """Extract and characterize the gene neighborhood around an iaa hit cluster.

    Looks up row *index* of the module-level ``iaa_positive_df`` table,
    loads the matching per-assembly protein index file (under
    ``index_files/``) and GenBank flat file (under ``gbff_files_unzipped/``),
    and returns a list of summary values describing the neighborhood that
    spans the cluster's hits, optionally widened by *features_upstream* /
    *features_downstream* feature rows.

    NOTE(review): relies on module-level names not visible here
    (``iaa_positive_df``, ``complement``, ``MinHash``, ``pd``, ``np``,
    ``re``, ``itertools``, ``scipy``) — confirm they are defined/imported
    at module scope.

    Parameters
    ----------
    index : int
        Positional row index into ``iaa_positive_df``.
    features_upstream : int, optional
        Extra feature rows to include before the first hit (default 0).
    features_downstream : int, optional
        Extra feature rows to include after the last hit (default 0).

    Returns
    -------
    list
        [accession, assembly, title, feature count, cluster length,
         synteny, synteny_alphabet, synteny_dir_dist, synteny_dir,
         cluster_number, coord_list, adj_coord_list,
         tared_adj_coord_list, itol_diagram_string, neighborhood hit
         list, locus tags, old locus tags, protein ids, protein names,
         protein seqs, clusterGC, genomeGC, diffGC, minhash_sim,
         four_mer_distance, four_mer_freq_cluster, cluster_seq]
    """
    cluster = iaa_positive_df.iloc[index, :]
    acc = cluster['accession']
    # Derive the protein index filename from the gbff filename.
    # NOTE(review): '.gbff' is an unescaped regex, so '.' matches any char.
    assembly = re.sub('.gbff', '_proteins.fa.indexprot', cluster['filename'])
    #make the genome database from the .fa.index file
    assembly_index_file = 'index_files/' + assembly
    print(assembly_index_file)
    # '!!'-separated table: one protein feature per row.
    db = pd.read_csv(assembly_index_file, sep="!!", header=None,
                     engine='python')
    #db.columns = ["filename","assembly","accession","locus_tag","old_locus_tag","name","biosample","protein_name","coordinates","protein_id"]
    db.columns = [
        "filename", "assembly", "accession", "locus_tag", "old_locus_tag",
        "name", "biosample", "protein_name", "coordinates", "protein_id",
        "pseudogene", "protein_seq"
    ]
    # Strand: GenBank 'complement(...)' locations mean the minus strand.
    db['direction'] = [
        -1 if re.match('complement', c) else 1 for c in db['coordinates']
    ]
    # Parse start/end out of GenBank location strings such as
    # 'complement(<123..>456)'.
    db['start_coord'] = [
        re.search('\d+?(?=\.\.(\d|\>))', str(c)).group(0)
        for c in db['coordinates']
    ]
    db['start_coord'] = [
        re.sub('complement|>|<|\)|\(', "", c) for c in db['start_coord']
    ]
    db['start_coord'] = db['start_coord'].astype(int)
    db['end_coord'] = [
        re.search('(?<=\.(\.|\>))\d+', str(c)).group(0)
        for c in db['coordinates']
    ]
    db['end_coord'] = [re.sub('>|<|\)|\(', "", c) for c in db['end_coord']]
    db['end_coord'] = db['end_coord'].astype(int)
    hit_list = cluster['hit_list']
    query_list = cluster['query_list']
    cluster_number = cluster['cluster_number']
    # locus_tag -> query gene name, for every hit in this cluster.
    hit_dict = dict(zip(hit_list, query_list))
    genome = db.loc[db['accession'] == acc].copy()
    # Row labels spanning the first..last hit, widened by the requested
    # number of flanking features. (.loc slicing is inclusive of `stop`.)
    start = genome[genome['locus_tag'] == hit_list[0]].index.values.astype(
        int)[0] - features_upstream
    stop = genome[genome['locus_tag'] == hit_list[-1]].index.values.astype(
        int)[0] + features_downstream
    neighborhood = genome.loc[start:stop, ].copy()
    neighborhood['query_match'] = neighborhood['locus_tag'].map(hit_dict)
    coord_list = list(
        zip(neighborhood['start_coord'], neighborhood['end_coord'],
            neighborhood['direction'], neighborhood['query_match']))
    #function to find GC content of cluster vs genome
    # NOTE(review): the filename is sliced from position 1 — presumably it
    # drops a leading marker character; confirm against the index format.
    gbff_str = str(db['filename'][0][1:])
    with open("gbff_files_unzipped/" + gbff_str) as file:
        gbff_file = file.read()
    # Concatenate all sequence blocks (ORIGIN .. //), then strip whitespace
    # and coordinate digits to obtain the bare lowercase genome sequence.
    genome_seq = "".join(
        re.findall("(?<=ORIGIN)[\s+\S+]+?(?=\/\/)", gbff_file))
    genome_seq = re.sub('\s|\d|\n', '', genome_seq)
    Gg = genome_seq.count("g")
    Gc = genome_seq.count("c")
    Ga = genome_seq.count("a")
    Gt = genome_seq.count("t")
    genomeGC = (Gg + Gc) / (Gg + Gc + Ga + Gt)
    # Cluster span in GenBank coordinates (reuses/overwrites `start`).
    start = min(coord_list)[0]
    end = max(coord_list)[1]
    # Pull the record for this accession, then its sequence block.
    regex_str = acc + "[\s+\S+]+\/\/"
    all_cluster_fasta = re.findall(regex_str, gbff_file)[0]
    all_cluster_fasta = re.findall("(?<=ORIGIN)[\s+\S+]+(?=\/\/)",
                                   all_cluster_fasta)[0]
    all_cluster_fasta = re.sub(" |\d|\n", "", all_cluster_fasta)
    # 1-based GenBank coordinates -> 0-based Python slice.
    # NOTE(review): slicing to `end - 1` drops the final base of an
    # inclusive 1-based range — confirm whether this is intended.
    cluster_seq = all_cluster_fasta[start - 1:end - 1]
    g = cluster_seq.count("g")
    c = cluster_seq.count("c")
    a = cluster_seq.count("a")
    t = cluster_seq.count("t")
    clusterGC = (g + c) / (g + c + a + t)
    diffGC = abs(clusterGC - genomeGC)
    #compare minhash values between cluster and genome
    kmer_size = 5
    n = 0   # n=0 with a `scaled` value: sourmash "scaled" sketch mode
    sc = 1  # scaled=1 keeps every hash
    cluster_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    cluster_minhash.add_sequence(cluster_seq, force=True)
    # Hash both strands; `complement` is defined elsewhere in this module.
    cluster_minhash.add_sequence(complement(cluster_seq), force=True)
    #
    genome_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    genome_minhash.add_sequence(genome_seq, force=True)
    genome_minhash.add_sequence(complement(genome_seq), force=True)
    minhash_sim = cluster_minhash.similarity(genome_minhash)
    # genome_minus_cluster=re.sub(cluster_seq,'',genome_seq)
    # #print(len(genome_seq)-len(genome_minus_cluster))
    # genome_minus_cluster_minhash=MinHash(n=n, ksize=kmer_size,scaled=sc)
    # genome_minus_cluster_minhash.add_sequence(genome_minus_cluster,force=True)
    # genome_minus_cluster_minhash.add_sequence(complement(genome_minus_cluster),force=True)
    # minhash_sim_minus_cluster=cluster_minhash.similarity(genome_minus_cluster_minhash)
    #print(minhash_sim)
    #compare tetranucleotide frequency between cluster and genomes
    bases = ['a', 't', 'g', 'c']
    four_mers = [''.join(p) for p in itertools.product(bases, repeat=4)]
    # Count each 4-mer on both strands; compare frequency vectors with an
    # L1 (cityblock) distance.
    four_mer_count_genome = np.add(
        [genome_seq.count(i) for i in four_mers],
        [complement(genome_seq).count(i) for i in four_mers])
    four_mer_freq_genome = [
        i / sum(four_mer_count_genome) for i in four_mer_count_genome
    ]
    four_mer_count_cluster = np.add(
        [cluster_seq.count(i) for i in four_mers],
        [complement(cluster_seq).count(i) for i in four_mers])
    four_mer_freq_cluster = [
        i / sum(four_mer_count_cluster) for i in four_mer_count_cluster
    ]
    four_mer_distance = scipy.spatial.distance.cityblock(
        four_mer_freq_cluster, four_mer_freq_genome)
    ####
    # If the matched genes point mostly backwards, mirror the whole
    # neighborhood (negate and swap coordinates, flip directions) so the
    # cluster reads left-to-right.
    if sum(neighborhood[neighborhood['query_match'].notnull()]
           ['direction']) < 0:
        neighborhood['actual_start_tmp'] = neighborhood['start_coord']
        neighborhood['start_coord'] = neighborhood['end_coord'] * -1
        neighborhood['end_coord'] = neighborhood['actual_start_tmp'] * -1
        neighborhood['direction'] = neighborhood['direction'] * -1
        neighborhood = neighborhood.sort_values(by='start_coord')
    # Features with no query match are labelled "x".
    neighborhood['query_match'] = neighborhood['query_match'].replace(
        np.nan, "x")
    nhbrhood_hit_list = list(neighborhood['query_match'])
    nhbrhood_locus_tags = list(neighborhood['locus_tag'])
    nhbrhood_old_locus_tags = list(neighborhood['old_locus_tag'])
    nhbrhood_prot_ids = list(neighborhood['protein_id'])
    nhbrhood_prot_name = list(neighborhood['protein_name'])
    nhbrhood_prot_seq = list(neighborhood['protein_seq'])
    # Arrow-style gene glyphs: "| A 〉" forward, "〈 A |" reverse.
    # NOTE(review): the comprehension variable `index` shadows the function
    # parameter of the same name (the parameter is not used after this).
    order = [("| " + gene['query_match'] + " 〉") if gene['direction'] == 1
             else ("〈 " + gene['query_match'] + " |")
             for index, gene in neighborhood.iterrows()]
    # Intergenic distances between consecutive features.
    dist = list(
        np.array(neighborhood['start_coord'][1:]) -
        np.array(neighborhood['end_coord'][:-1]))
    dist = ["-" + str(d) + "-" for d in dist]
    adj_coord_list = list(
        zip(neighborhood['start_coord'], neighborhood['end_coord'],
            neighborhood['direction'], neighborhood['query_match']))
    # "Tare" the coordinates so the neighborhood starts at 0.
    if min(neighborhood['start_coord']) < 0:
        tare_value = abs(min(neighborhood['start_coord']))
        tared_adj_coord_list = list(
            zip([v + tare_value for v in neighborhood['start_coord']],
                [v + tare_value for v in neighborhood['end_coord']],
                neighborhood['direction'], neighborhood['query_match']))
    else:
        tare_value = min(neighborhood['start_coord'])
        tared_adj_coord_list = list(
            zip([v - tare_value for v in neighborhood['start_coord']],
                [v - tare_value for v in neighborhood['end_coord']],
                neighborhood['direction'], neighborhood['query_match']))
    # making an ITOL compatible string
    gene_color_dict = {
        'IaaP': '#ff5969',
        'IaaQ': '#2db34e',
        'IaaR': '#fb77e0',
        'IaaA': '#00bc7e',
        'IaaB': '#8d006e',
        'IaaC': '#cfdd63',
        'IaaD': '#0060d0',
        'IaaE': '#bb7b00',
        'IaaF': '#7c2c29',
        'IaaG': '#f1d17a',
        'IaaH': '#37589E',
        'IaaI': '#ACC92A',
        'IaaJ': '#752AC9',
        'IaaK': '#D4B5E6',
        'IaaL': '#211E45',
        'IaaM': '#BFB3E6',
        'x': '#d1d1d1'
    }
    # NOTE(review): uses the *last* feature's end as the diagram length;
    # after the sort above this is presumably also the max — confirm.
    max_len = tared_adj_coord_list[-1][1]
    itol_diagram = []
    for g in tared_adj_coord_list:  # g = (start, end, direction, name)
        gene_string = []
        gene_length = g[1] - g[0]
        if g[2] > 0:
            # Forward gene: rectangle (RE) body + right triangle (TR) head,
            # the head occupying the last 10% of the gene's length.
            gene_string.append('RE')
            gene_string.append(str(g[0]))
            gene_string.append(str(g[1] - (0.1 * gene_length)))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append(str(g[3]))
            gene_string.append(',')
            gene_string.append('TR')
            gene_string.append(str(g[1] - (0.1 * gene_length)))
            gene_string.append(str(g[1]))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append('')
        else:
            # Reverse gene: left triangle (TL) head + rectangle (RE) body.
            gene_string.append('TL')
            gene_string.append(str(g[0]))
            gene_string.append(str(g[0] + (0.1 * gene_length)))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append('')
            gene_string.append(',')
            gene_string.append('RE')
            gene_string.append(str(g[0] + (0.1 * gene_length)))
            gene_string.append(str(g[1]))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append(str(g[3]))
        itol_gene = '|'.join(gene_string)
        itol_diagram.append(itol_gene)
    itol_diagram_joined = ",".join(map(str, itol_diagram))
    itol_diagram_string = str(max_len) + ',' + itol_diagram_joined
    # The ',' separators were joined with '|' on both sides; collapse ',|'.
    itol_diagram_string = re.sub(',\|', ',', itol_diagram_string)
    #obtains "| A 〉-23-| B 〉-23-| C 〉"
    # (the trailing [0] sentinel is dropped by the [:-1] before joining)
    synteny_dir_dist = ''.join(sum(zip(order, dist + [0]), ())[:-1])
    synteny_dir_dist = re.sub("iaa", "", synteny_dir_dist)
    #obtains "| A 〉| B 〉| C 〉"
    synteny_dir = ''.join(order)
    synteny_dir = re.sub("iaa", "", synteny_dir)
    #obtains "| A:23.23 〉| B:23.23〉| C:23.23 〉"
    #synteny_dir_pident =''.join(order_pident)
    #synteny_dir_pident = re.sub("iaa" ,"", synteny_dir_pident)
    #obtains "A-B-C"
    synteny = re.sub("\n", "-",
                     neighborhood['query_match'].to_string(index=False))
    synteny = re.sub("Iaa| ", "", synteny)
    # Single-letter encoding: upper case = forward, lower case = reverse.
    synteny_alphabet = "".join([
        gene['query_match'].replace("Iaa", "").upper()
        if gene['direction'] == 1 else
        gene['query_match'].replace("Iaa", "").lower()
        for index, gene in neighborhood.iterrows()
    ])
    cluster_len = max(neighborhood['end_coord']) - min(
        neighborhood['start_coord'])
    # Collapse single-element sets such as {'GCF_...'} to plain strings.
    assembly = re.sub("\{|\}|\'|>", "", str(set(neighborhood['assembly'])))
    accession = re.sub("\{|\}|\'", "", str(set(neighborhood['accession'])))
    title = re.sub("\{|\}|\'", "", str(set(neighborhood['name'])))
    print(assembly_index_file + " successfully used")
    return ([
        accession, assembly, title, len(neighborhood), cluster_len, synteny,
        synteny_alphabet, synteny_dir_dist, synteny_dir, cluster_number,
        coord_list, adj_coord_list, tared_adj_coord_list,
        itol_diagram_string, nhbrhood_hit_list, nhbrhood_locus_tags,
        nhbrhood_old_locus_tags, nhbrhood_prot_ids, nhbrhood_prot_name,
        nhbrhood_prot_seq, clusterGC, genomeGC, diffGC, minhash_sim,
        four_mer_distance, four_mer_freq_cluster, cluster_seq
    ])