def test_abund_similarity():
    """Check similarity of two abundance-tracking sketches, with and without abundance.

    One sketch holds hash 1, the other holds hashes 1 and 2. Self-similarity
    must round to 1.0 and cross-similarity must round to 0.5 in both modes.
    """
    sketch_kwargs = dict(n=5, ksize=20, track_abundance=True)
    one_hash = MinHash(**sketch_kwargs)
    two_hashes = MinHash(**sketch_kwargs)

    one_hash.add_hash(1)
    for hash_value in (1, 2):
        two_hashes.add_hash(hash_value)

    # Abundance-aware comparison.
    self_score = one_hash.similarity(one_hash)
    assert round(self_score) == 1.0
    cross_score = one_hash.similarity(two_hashes)
    assert round(cross_score, 2) == 0.5

    # Same comparisons with abundance explicitly ignored.
    self_score_flat = one_hash.similarity(one_hash, ignore_abundance=True)
    assert round(self_score_flat) == 1.0
    cross_score_flat = one_hash.similarity(two_hashes, ignore_abundance=True)
    assert round(cross_score_flat, 2) == 0.5
def test_abund_similarity_zero():
    """Check that a sketch with one hash has similarity 0.0 to an empty sketch."""
    sketch_kwargs = dict(n=5, ksize=20, track_abundance=True)
    populated = MinHash(**sketch_kwargs)
    empty = MinHash(**sketch_kwargs)

    # Only the first sketch receives a hash; the second stays empty.
    populated.add_hash(1)

    score = populated.similarity(empty)
    assert score == 0.0
# Fill colours for the ITOL diagram, keyed by query-match name; 'x' is the
# placeholder given to genes that matched no query.
_ITOL_GENE_COLORS = {
    'IaaP': '#ff5969',
    'IaaQ': '#2db34e',
    'IaaR': '#fb77e0',
    'IaaA': '#00bc7e',
    'IaaB': '#8d006e',
    'IaaC': '#cfdd63',
    'IaaD': '#0060d0',
    'IaaE': '#bb7b00',
    'IaaF': '#7c2c29',
    'IaaG': '#f1d17a',
    'IaaH': '#37589E',
    'IaaI': '#ACC92A',
    'IaaJ': '#752AC9',
    'IaaK': '#D4B5E6',
    'IaaL': '#211E45',
    'IaaM': '#BFB3E6',
    'x': '#d1d1d1',
}


def _gc_fraction(seq):
    """Return (g + c) / (g + c + a + t), counting lowercase bases in ``seq``."""
    g_count = seq.count("g")
    c_count = seq.count("c")
    a_count = seq.count("a")
    t_count = seq.count("t")
    return (g_count + c_count) / (g_count + c_count + a_count + t_count)


def _four_mer_frequencies(seq, seq_complement, four_mers):
    """Return each 4-mer's share of the summed counts over ``seq`` and its complement."""
    # NOTE(review): str.count() counts non-overlapping occurrences, so runs
    # such as "aaaaa" are under-counted. Left unchanged so results stay
    # comparable with earlier runs -- confirm whether that is intended.
    counts = np.add([seq.count(kmer) for kmer in four_mers],
                    [seq_complement.count(kmer) for kmer in four_mers])
    total = sum(counts)
    return [count / total for count in counts]


def _itol_gene_string(gene, colors):
    """Return the '|'-joined ITOL shape fields (body plus arrow head) for one gene."""
    begin, end, direction, label = gene
    gene_length = end - begin
    color = colors[label]
    if direction > 0:
        # Forward gene: rectangle body, then a right-pointing triangle.
        head_start = end - (0.1 * gene_length)
        fields = [
            'RE', str(begin), str(head_start), color, str(label), ',',
            'TR', str(head_start), str(end), color, ''
        ]
    else:
        # Reverse gene: left-pointing triangle, then the rectangle body.
        head_end = begin + (0.1 * gene_length)
        fields = [
            'TL', str(begin), str(head_end), color, '', ',',
            'RE', str(head_end), str(end), color, str(label)
        ]
    return '|'.join(fields)


def fetchneighborhood2(index, features_upstream=0, features_downstream=0):
    """Describe the gene neighborhood around one cluster of hits.

    Reads row ``index`` of the module-level ``iaa_positive_df``, loads the
    matching protein index file from ``index_files/`` and the GenBank file
    from ``gbff_files_unzipped/``, and derives synteny strings, an ITOL
    diagram string and sequence-composition statistics for the cluster.
    Also relies on the module-level names ``MinHash`` and ``complement``.

    Args:
        index: Positional row number in ``iaa_positive_df``.
        features_upstream: Number of index rows to include before the row
            of the first hit.
        features_downstream: Number of index rows to include after the row
            of the last hit.

    Returns:
        A list, in this order: accession, assembly, title, number of genes,
        cluster length, synteny, synteny_alphabet, synteny_dir_dist,
        synteny_dir, cluster_number, coord_list (original coordinates),
        adj_coord_list (strand-adjusted), tared_adj_coord_list (shifted so
        the smallest start is 0), itol_diagram_string, then per-gene lists
        of query matches, locus tags, old locus tags, protein ids, protein
        names and protein sequences, then clusterGC, genomeGC, diffGC,
        minhash_sim, four_mer_distance, four_mer_freq_cluster and
        cluster_seq.

    Raises:
        ValueError: If the selected neighborhood contains no rows.
        KeyError: If a gene's query match has no entry in the colour table.
    """
    cluster = iaa_positive_df.iloc[index, :]
    acc = cluster['accession']
    # The dot is escaped so only a literal ".gbff" is rewritten.
    assembly = re.sub(r'\.gbff', '_proteins.fa.indexprot', cluster['filename'])

    # make the genome database from the .fa.index file
    assembly_index_file = 'index_files/' + assembly
    print(assembly_index_file)
    db = pd.read_csv(assembly_index_file,
                     sep="!!",
                     header=None,
                     engine='python')
    db.columns = [
        "filename", "assembly", "accession", "locus_tag", "old_locus_tag",
        "name", "biosample", "protein_name", "coordinates", "protein_id",
        "pseudogene", "protein_seq"
    ]
    db['direction'] = [
        -1 if re.match('complement', c) else 1 for c in db['coordinates']
    ]
    # Start is the first run of digits followed by ".."; end is the first run
    # of digits preceded by ".." or ".>". Both matches are digits only.
    db['start_coord'] = [
        re.search(r'\d+?(?=\.\.(\d|\>))', str(c)).group(0)
        for c in db['coordinates']
    ]
    db['start_coord'] = db['start_coord'].astype(int)
    db['end_coord'] = [
        re.search(r'(?<=\.(\.|\>))\d+', str(c)).group(0)
        for c in db['coordinates']
    ]
    db['end_coord'] = db['end_coord'].astype(int)

    hit_list = cluster['hit_list']
    query_list = cluster['query_list']
    cluster_number = cluster['cluster_number']
    hit_dict = dict(zip(hit_list, query_list))

    # Restrict to the hit's accession, then take the index rows from the
    # first hit to the last hit, widened by the requested flanks.
    genome = db.loc[db['accession'] == acc].copy()
    first_row = genome[genome['locus_tag'] == hit_list[0]].index.values.astype(
        int)[0] - features_upstream
    last_row = genome[genome['locus_tag'] == hit_list[-1]].index.values.astype(
        int)[0] + features_downstream
    neighborhood = genome.loc[first_row:last_row].copy()
    if neighborhood.empty:
        raise ValueError("no features selected for cluster at index " +
                         str(index))
    neighborhood['query_match'] = neighborhood['locus_tag'].map(hit_dict)
    coord_list = list(
        zip(neighborhood['start_coord'], neighborhood['end_coord'],
            neighborhood['direction'], neighborhood['query_match']))

    # GC content of cluster vs genome
    gbff_str = str(db['filename'][0][1:])
    with open("gbff_files_unzipped/" + gbff_str) as file:
        gbff_file = file.read()
    # Concatenate every ORIGIN ... // sequence block in the file.
    genome_seq = "".join(
        re.findall(r"(?<=ORIGIN)[\s+\S+]+?(?=\/\/)", gbff_file))
    genome_seq = re.sub(r'\s|\d|\n', '', genome_seq)
    genomeGC = _gc_fraction(genome_seq)

    # Use the column extremes rather than min()/max() over the coordinate
    # tuples: tuple ordering picks the end of the gene with the largest
    # start, and can fail when a tie reaches the str/NaN element.
    cluster_start = neighborhood['start_coord'].min()
    cluster_end = neighborhood['end_coord'].max()
    # The accession is escaped because it is data, not a pattern. The match
    # stays greedy so a "//" inside record text cannot end it early.
    regex_str = re.escape(acc) + r"[\s+\S+]+\/\/"
    all_cluster_fasta = re.findall(regex_str, gbff_file)[0]
    all_cluster_fasta = re.findall(r"(?<=ORIGIN)[\s+\S+]+(?=\/\/)",
                                   all_cluster_fasta)[0]
    all_cluster_fasta = re.sub(r" |\d|\n", "", all_cluster_fasta)
    # GenBank locations are 1-based and inclusive, so the half-open slice
    # must stop at cluster_end; stopping at cluster_end - 1 lost a base.
    cluster_seq = all_cluster_fasta[cluster_start - 1:cluster_end]
    clusterGC = _gc_fraction(cluster_seq)
    diffGC = abs(clusterGC - genomeGC)

    # Complements feed both the MinHash and the 4-mer comparison.
    cluster_complement = complement(cluster_seq)
    genome_complement = complement(genome_seq)

    # compare minhash values between cluster and genome
    kmer_size = 5
    n = 0
    sc = 1
    cluster_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    cluster_minhash.add_sequence(cluster_seq, force=True)
    cluster_minhash.add_sequence(cluster_complement, force=True)
    genome_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    genome_minhash.add_sequence(genome_seq, force=True)
    genome_minhash.add_sequence(genome_complement, force=True)
    minhash_sim = cluster_minhash.similarity(genome_minhash)

    # compare tetranucleotide frequency between cluster and genome
    bases = ['a', 't', 'g', 'c']
    four_mers = [''.join(p) for p in itertools.product(bases, repeat=4)]
    four_mer_freq_genome = _four_mer_frequencies(genome_seq,
                                                 genome_complement, four_mers)
    four_mer_freq_cluster = _four_mer_frequencies(cluster_seq,
                                                  cluster_complement,
                                                  four_mers)
    four_mer_distance = scipy.spatial.distance.cityblock(
        four_mer_freq_cluster, four_mer_freq_genome)

    # If the matched genes are mostly on the reverse strand, mirror the
    # neighborhood: negate and swap the coordinates, invert the directions
    # and re-sort by the new start.
    matched = neighborhood[neighborhood['query_match'].notnull()]
    if sum(matched['direction']) < 0:
        original_start = neighborhood['start_coord'].copy()
        neighborhood['start_coord'] = neighborhood['end_coord'] * -1
        neighborhood['end_coord'] = original_start * -1
        neighborhood['direction'] = neighborhood['direction'] * -1
        neighborhood = neighborhood.sort_values(by='start_coord')

    # Genes without a query match are labelled "x".
    neighborhood['query_match'] = neighborhood['query_match'].replace(
        np.nan, "x")
    nhbrhood_hit_list = list(neighborhood['query_match'])
    nhbrhood_locus_tags = list(neighborhood['locus_tag'])
    nhbrhood_old_locus_tags = list(neighborhood['old_locus_tag'])
    nhbrhood_prot_ids = list(neighborhood['protein_id'])
    nhbrhood_prot_name = list(neighborhood['protein_name'])
    nhbrhood_prot_seq = list(neighborhood['protein_seq'])

    order = [
        ("| " + name + " 〉") if direction == 1 else ("〈 " + name + " |")
        for name, direction in zip(neighborhood['query_match'],
                                   neighborhood['direction'])
    ]
    # Gap between each gene's end and the next gene's start.
    starts = neighborhood['start_coord'].to_numpy()
    ends = neighborhood['end_coord'].to_numpy()
    dist = ["-" + str(d) + "-" for d in list(starts[1:] - ends[:-1])]

    adj_coord_list = list(
        zip(neighborhood['start_coord'], neighborhood['end_coord'],
            neighborhood['direction'], neighborhood['query_match']))
    # Shift every coordinate so the smallest start becomes 0. Subtracting
    # the minimum covers both the negative (mirrored) and positive case.
    tare_value = min(neighborhood['start_coord'])
    tared_adj_coord_list = list(
        zip([v - tare_value for v in neighborhood['start_coord']],
            [v - tare_value for v in neighborhood['end_coord']],
            neighborhood['direction'], neighborhood['query_match']))

    # making an ITOL compatible string
    max_len = tared_adj_coord_list[-1][1]
    itol_diagram = [
        _itol_gene_string(gene, _ITOL_GENE_COLORS)
        for gene in tared_adj_coord_list
    ]
    itol_diagram_joined = ",".join(map(str, itol_diagram))
    itol_diagram_string = str(max_len) + ',' + itol_diagram_joined
    # The ',' field was joined with '|' on both sides; drop the one after it.
    itol_diagram_string = re.sub(r',\|', ',', itol_diagram_string)

    # obtains "| A 〉-23-| B 〉-23-| C 〉"
    # Interleave genes and gaps; the padding '' pairs with the last gene.
    synteny_dir_dist = ''.join(
        itertools.chain.from_iterable(zip(order, dist + [''])))
    # NOTE(review): this pattern is lowercase while the colour-table names
    # start with "Iaa", so it probably removes nothing -- confirm intent.
    synteny_dir_dist = re.sub("iaa", "", synteny_dir_dist)

    # obtains "| A 〉| B 〉| C 〉"
    synteny_dir = ''.join(order)
    synteny_dir = re.sub("iaa", "", synteny_dir)

    # obtains "A-B-C"
    synteny = re.sub("\n", "-",
                     neighborhood['query_match'].to_string(index=False))
    synteny = re.sub("Iaa| ", "", synteny)
    # One token per gene: upper case for direction 1, lower case otherwise.
    synteny_alphabet = "".join([
        name.replace("Iaa", "").upper()
        if direction == 1 else name.replace("Iaa", "").lower()
        for name, direction in zip(neighborhood['query_match'],
                                   neighborhood['direction'])
    ])

    cluster_len = max(neighborhood['end_coord']) - min(
        neighborhood['start_coord'])
    # Render the distinct values of each column as text without set braces
    # or quotes.
    assembly = re.sub(r"\{|\}|'|>", "", str(set(neighborhood['assembly'])))
    accession = re.sub(r"\{|\}|'", "", str(set(neighborhood['accession'])))
    title = re.sub(r"\{|\}|'", "", str(set(neighborhood['name'])))
    print(assembly_index_file + " successfully used")
    return [
        accession, assembly, title,
        len(neighborhood), cluster_len, synteny, synteny_alphabet,
        synteny_dir_dist, synteny_dir, cluster_number, coord_list,
        adj_coord_list, tared_adj_coord_list, itol_diagram_string,
        nhbrhood_hit_list, nhbrhood_locus_tags, nhbrhood_old_locus_tags,
        nhbrhood_prot_ids, nhbrhood_prot_name, nhbrhood_prot_seq, clusterGC,
        genomeGC, diffGC, minhash_sim, four_mer_distance,
        four_mer_freq_cluster, cluster_seq
    ]