Python MinHash.similarityの例、sourmash.MinHash.similarity Pythonの例

コード例 #1

0

ファイルを表示

def test_abund_similarity():
    """Check similarity of abundance-tracking sketches holding {1} and {1, 2}.

    The sketch compared with itself must round to 1.0 and the two sketches
    compared with each other must round to 0.5, both with and without
    ``ignore_abundance=True``.
    """
    single, double = (
        MinHash(n=5, ksize=20, track_abundance=True) for _ in range(2)
    )

    single.add_hash(1)
    for hashval in (1, 2):
        double.add_hash(hashval)

    # First pass uses the plain call, second pass ignores abundances.
    for options in ({}, {"ignore_abundance": True}):
        assert round(single.similarity(single, **options)) == 1.0
        assert round(single.similarity(double, **options), 2) == 0.5

コード例 #2

0

ファイルを表示

ファイル: test_jaccard.py プロジェクト: dib-lab/sourmash

def test_abund_similarity():
    """Compare a one-hash sketch against itself and against a two-hash sketch.

    Both sketches track abundance.  Self-similarity is expected to round to
    1.0 and the cross-similarity to 0.5, whether or not abundances are
    ignored.
    """
    sketch_options = dict(n=5, ksize=20, track_abundance=True)
    one_hash = MinHash(**sketch_options)
    two_hashes = MinHash(**sketch_options)

    one_hash.add_hash(1)
    two_hashes.add_hash(1)
    two_hashes.add_hash(2)

    # Abundance-aware comparison.
    self_score = one_hash.similarity(one_hash)
    assert round(self_score) == 1.0
    cross_score = one_hash.similarity(two_hashes)
    assert round(cross_score, 2) == 0.5

    # Same comparison with abundances ignored.
    self_score = one_hash.similarity(one_hash, ignore_abundance=True)
    assert round(self_score) == 1.0
    cross_score = one_hash.similarity(two_hashes, ignore_abundance=True)
    assert round(cross_score, 2) == 0.5

コード例 #3

0

ファイルを表示

def test_abund_similarity_zero():
    """A sketch holding one hash has similarity 0.0 to a sketch holding none."""
    sketch_options = dict(n=5, ksize=20, track_abundance=True)
    populated = MinHash(**sketch_options)
    empty = MinHash(**sketch_options)

    populated.add_hash(1)

    assert populated.similarity(empty) == 0.0

コード例 #4

0

ファイルを表示

ファイル: test_jaccard.py プロジェクト: dib-lab/sourmash

def test_abund_similarity_zero():
    """Similarity between a one-hash sketch and an untouched sketch is 0.0."""
    with_hash, without_hash = (
        MinHash(n=5, ksize=20, track_abundance=True) for _ in range(2)
    )

    # Only the first sketch receives a hash; the second stays empty.
    with_hash.add_hash(1)

    score = with_hash.similarity(without_hash)
    assert score == 0.0

コード例 #5

0

ファイルを表示

# Fill colour for each query label in the ITOL diagram; 'x' marks genes in
# the neighbourhood that did not match any query.
_IAA_GENE_COLORS = {
    'IaaP': '#ff5969',
    'IaaQ': '#2db34e',
    'IaaR': '#fb77e0',
    'IaaA': '#00bc7e',
    'IaaB': '#8d006e',
    'IaaC': '#cfdd63',
    'IaaD': '#0060d0',
    'IaaE': '#bb7b00',
    'IaaF': '#7c2c29',
    'IaaG': '#f1d17a',
    'IaaH': '#37589E',
    'IaaI': '#ACC92A',
    'IaaJ': '#752AC9',
    'IaaK': '#D4B5E6',
    'IaaL': '#211E45',
    'IaaM': '#BFB3E6',
    'x': '#d1d1d1'
}


def _gc_fraction(seq):
    """Return (g + c) / (g + c + a + t) from the lowercase base counts of seq."""
    g = seq.count("g")
    c = seq.count("c")
    a = seq.count("a")
    t = seq.count("t")
    return (g + c) / (g + c + a + t)


def _four_mer_frequencies(seq, seq_complement, four_mers):
    """Return the relative frequency of each 4-mer over seq and its complement.

    NOTE(review): ``str.count`` counts non-overlapping occurrences only, so
    self-overlapping 4-mers such as 'aaaa' are under-counted.  Kept as-is to
    preserve the existing distance values.
    """
    counts = np.add([seq.count(kmer) for kmer in four_mers],
                    [seq_complement.count(kmer) for kmer in four_mers])
    total = sum(counts)
    return [count / total for count in counts]


def _itol_gene_fields(gene_start, gene_end, direction, label, color):
    """Return the '|'-joined ITOL shape fields describing one gene.

    A gene is drawn as a rectangle ('RE') covering 90% of its length plus a
    triangle covering the remaining 10% on the side it points to ('TR' for
    direction > 0, 'TL' otherwise).  Only the rectangle carries the label.
    The bare ',' field separates the two shapes; the caller strips the '|'
    that the join puts after it.
    """
    gene_length = gene_end - gene_start
    if direction > 0:
        body_end = str(gene_end - (0.1 * gene_length))
        fields = [
            'RE', str(gene_start), body_end, color, str(label),
            ',',
            'TR', body_end, str(gene_end), color, '',
        ]
    else:
        body_start = str(gene_start + (0.1 * gene_length))
        fields = [
            'TL', str(gene_start), body_start, color, '',
            ',',
            'RE', body_start, str(gene_end), color, str(label),
        ]
    return '|'.join(fields)


def fetchneighborhood2(index, features_upstream=0, features_downstream=0):
    """Describe the gene neighbourhood around one cluster of query hits.

    Row ``index`` of the module-level ``iaa_positive_df`` supplies the
    accession, source filename, hit/query lists and cluster number.  The
    matching '!!'-separated protein index file under ``index_files/`` and the
    GenBank file under ``gbff_files_unzipped/`` are read from disk, and two
    progress lines are printed.

    Args:
        index: Positional row number in ``iaa_positive_df``.
        features_upstream: Number of extra index rows to include before the
            first hit.
        features_downstream: Number of extra index rows to include after the
            last hit.

    Returns:
        A 27-element list, in this order: accession, assembly, title, number
        of genes, cluster length, synteny ("A-B-C"), synteny_alphabet
        (upper case = direction 1, lower case otherwise), synteny with
        directions and intergenic distances, synteny with directions only,
        cluster number, coord_list, adj_coord_list (after optional strand
        flip), tared_adj_coord_list (shifted so the smallest start is 0),
        ITOL diagram string, query-match labels, locus tags, old locus tags,
        protein ids, protein names, protein sequences, cluster GC fraction,
        genome GC fraction, absolute GC difference, MinHash similarity,
        city-block distance between 4-mer frequency profiles, the cluster's
        4-mer frequencies, and the cluster nucleotide sequence.

    Raises:
        KeyError: If a query label has no entry in ``_IAA_GENE_COLORS``.
        IndexError: If a hit locus tag or the accession record is not found.
    """
    cluster = iaa_positive_df.iloc[index, :]
    acc = cluster['accession']
    # The dot is escaped so only a literal ".gbff" is rewritten.
    assembly = re.sub(r'\.gbff', '_proteins.fa.indexprot',
                      cluster['filename'])

    # Build the genome table from the .fa.index file.
    assembly_index_file = 'index_files/' + assembly
    print(assembly_index_file)
    db = pd.read_csv(assembly_index_file,
                     sep="!!",
                     header=None,
                     engine='python')
    db.columns = [
        "filename", "assembly", "accession", "locus_tag", "old_locus_tag",
        "name", "biosample", "protein_name", "coordinates", "protein_id",
        "pseudogene", "protein_seq"
    ]
    db['direction'] = [
        -1 if re.match('complement', c) else 1 for c in db['coordinates']
    ]
    # Both patterns capture digits only, so no further clean-up of '<', '>',
    # parentheses or 'complement' is needed before the integer conversion.
    db['start_coord'] = [
        re.search(r'\d+?(?=\.\.(\d|\>))', str(c)).group(0)
        for c in db['coordinates']
    ]
    db['start_coord'] = db['start_coord'].astype(int)
    db['end_coord'] = [
        re.search(r'(?<=\.(\.|\>))\d+', str(c)).group(0)
        for c in db['coordinates']
    ]
    db['end_coord'] = db['end_coord'].astype(int)

    hit_list = cluster['hit_list']
    query_list = cluster['query_list']
    cluster_number = cluster['cluster_number']
    hit_dict = dict(zip(hit_list, query_list))

    # Select the rows from the first to the last hit (label-based, inclusive),
    # widened by the requested number of flanking features.
    genome = db.loc[db['accession'] == acc].copy()
    first_row = genome[genome['locus_tag'] == hit_list[0]].index.values.astype(
        int)[0] - features_upstream
    last_row = genome[genome['locus_tag'] == hit_list[-1]].index.values.astype(
        int)[0] + features_downstream
    neighborhood = genome.loc[first_row:last_row, ].copy()
    neighborhood['query_match'] = neighborhood['locus_tag'].map(hit_dict)
    coord_list = list(
        zip(neighborhood['start_coord'], neighborhood['end_coord'],
            neighborhood['direction'], neighborhood['query_match']))

    # GC content of the cluster versus the whole genome file.
    gbff_str = str(db['filename'][0][1:])
    with open("gbff_files_unzipped/" + gbff_str) as file:
        gbff_file = file.read()
    # Every ORIGIN ... // section in the file contributes to the genome.
    genome_seq = "".join(
        re.findall(r"(?<=ORIGIN)[\s+\S+]+?(?=\/\/)", gbff_file))
    genome_seq = re.sub(r'\s|\d|\n', '', genome_seq)
    genomeGC = _gc_fraction(genome_seq)

    # Use the true extremes of the neighbourhood, matching cluster_len below.
    cluster_start = min(neighborhood['start_coord'])
    cluster_end = max(neighborhood['end_coord'])
    # The greedy patterns run to the last '//' in the file on purpose: a
    # lazy match could stop at a '//' inside a URL in the record header.  The
    # trailing text is harmless because only a prefix of it is sliced.
    record_text = re.findall(re.escape(acc) + r"[\s+\S+]+\/\/", gbff_file)[0]
    record_seq = re.findall(r"(?<=ORIGIN)[\s+\S+]+(?=\/\/)", record_text)[0]
    record_seq = re.sub(r" |\d|\n", "", record_seq)
    # Feature coordinates are 1-based and inclusive, so base `cluster_end`
    # sits at index cluster_end - 1 and the slice must stop at cluster_end.
    cluster_seq = record_seq[cluster_start - 1:cluster_end]
    clusterGC = _gc_fraction(cluster_seq)
    diffGC = abs(clusterGC - genomeGC)

    # Compare MinHash sketches of the cluster and the genome, adding each
    # sequence together with its complement.
    # NOTE(review): n=0 with scaled=1 presumably keeps every k-mer hash --
    # confirm against the sourmash MinHash documentation.
    kmer_size = 5
    n = 0
    sc = 1
    cluster_complement = complement(cluster_seq)
    genome_complement = complement(genome_seq)
    cluster_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    cluster_minhash.add_sequence(cluster_seq, force=True)
    cluster_minhash.add_sequence(cluster_complement, force=True)
    genome_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    genome_minhash.add_sequence(genome_seq, force=True)
    genome_minhash.add_sequence(genome_complement, force=True)
    minhash_sim = cluster_minhash.similarity(genome_minhash)

    # Compare tetranucleotide frequencies between cluster and genome.
    bases = ['a', 't', 'g', 'c']
    four_mers = [''.join(p) for p in itertools.product(bases, repeat=4)]
    four_mer_freq_genome = _four_mer_frequencies(genome_seq,
                                                 genome_complement, four_mers)
    four_mer_freq_cluster = _four_mer_frequencies(cluster_seq,
                                                  cluster_complement,
                                                  four_mers)
    four_mer_distance = scipy.spatial.distance.cityblock(
        four_mer_freq_cluster, four_mer_freq_genome)

    # If the matched genes mostly point backwards, mirror the neighbourhood:
    # negate and swap the coordinates, invert directions, and re-sort.
    if sum(neighborhood[neighborhood['query_match'].notnull()]
           ['direction']) < 0:
        neighborhood['actual_start_tmp'] = neighborhood['start_coord']
        neighborhood['start_coord'] = neighborhood['end_coord'] * -1
        neighborhood['end_coord'] = neighborhood['actual_start_tmp'] * -1
        neighborhood['direction'] = neighborhood['direction'] * -1
        neighborhood = neighborhood.sort_values(by='start_coord')
    neighborhood['query_match'] = neighborhood['query_match'].replace(
        np.nan, "x")

    nhbrhood_hit_list = list(neighborhood['query_match'])
    nhbrhood_locus_tags = list(neighborhood['locus_tag'])
    nhbrhood_old_locus_tags = list(neighborhood['old_locus_tag'])
    nhbrhood_prot_ids = list(neighborhood['protein_id'])
    nhbrhood_prot_name = list(neighborhood['protein_name'])
    nhbrhood_prot_seq = list(neighborhood['protein_seq'])

    order = [("| " + gene['query_match'] + " 〉") if gene['direction'] == 1 else
             ("〈 " + gene['query_match'] + " |")
             for _, gene in neighborhood.iterrows()]
    # Gap between each gene's start and the previous gene's end.
    dist = list(
        np.array(neighborhood['start_coord'][1:]) -
        np.array(neighborhood['end_coord'][:-1]))
    dist = ["-" + str(d) + "-" for d in dist]

    adj_coord_list = list(
        zip(neighborhood['start_coord'], neighborhood['end_coord'],
            neighborhood['direction'], neighborhood['query_match']))
    # Shift all coordinates so the smallest start becomes 0.  Subtracting the
    # minimum covers both the negative (mirrored) and the positive case.
    tare_value = min(neighborhood['start_coord'])
    tared_adj_coord_list = list(
        zip([v - tare_value for v in neighborhood['start_coord']],
            [v - tare_value for v in neighborhood['end_coord']],
            neighborhood['direction'], neighborhood['query_match']))

    # Build an ITOL-compatible string: total length, then one entry per gene.
    max_len = tared_adj_coord_list[-1][1]
    itol_diagram = [
        _itol_gene_fields(gene_start, gene_end, direction, label,
                          _IAA_GENE_COLORS[label])
        for gene_start, gene_end, direction, label in tared_adj_coord_list
    ]
    itol_diagram_joined = ",".join(itol_diagram)
    itol_diagram_string = str(max_len) + ',' + itol_diagram_joined
    # Remove the '|' that the field join left after each in-gene ','.
    itol_diagram_string = re.sub(r',\|', ',', itol_diagram_string)

    # Obtains "| A 〉-23-| B 〉-23-| C 〉".  The labels carry an "Iaa" prefix
    # (see _IAA_GENE_COLORS), so that exact casing is what must be stripped.
    synteny_dir_dist = ''.join(
        itertools.chain.from_iterable(zip(order, dist + [''])))
    synteny_dir_dist = re.sub("Iaa", "", synteny_dir_dist)
    # Obtains "| A 〉| B 〉| C 〉".
    synteny_dir = ''.join(order)
    synteny_dir = re.sub("Iaa", "", synteny_dir)
    # Obtains "A-B-C".
    synteny = re.sub("\n", "-",
                     neighborhood['query_match'].to_string(index=False))
    synteny = re.sub("Iaa| ", "", synteny)
    synteny_alphabet = "".join([
        gene['query_match'].replace("Iaa", "").upper() if gene['direction']
        == 1 else gene['query_match'].replace("Iaa", "").lower()
        for _, gene in neighborhood.iterrows()
    ])

    cluster_len = max(neighborhood['end_coord']) - min(
        neighborhood['start_coord'])
    # Render the distinct values as text without set braces and quotes.
    assembly = re.sub(r"[{}'>]", "", str(set(neighborhood['assembly'])))
    accession = re.sub(r"[{}']", "", str(set(neighborhood['accession'])))
    title = re.sub(r"[{}']", "", str(set(neighborhood['name'])))
    print(assembly_index_file + " successfully used")
    return ([
        accession, assembly, title,
        len(neighborhood), cluster_len, synteny, synteny_alphabet,
        synteny_dir_dist, synteny_dir, cluster_number, coord_list,
        adj_coord_list, tared_adj_coord_list, itol_diagram_string,
        nhbrhood_hit_list, nhbrhood_locus_tags, nhbrhood_old_locus_tags,
        nhbrhood_prot_ids, nhbrhood_prot_name, nhbrhood_prot_seq, clusterGC,
        genomeGC, diffGC, minhash_sim, four_mer_distance,
        four_mer_freq_cluster, cluster_seq
    ])