Exemple #1
0
def test_pickle(track_abundance):
    """Round-trip a MinHash through pickle and compare it to the original.

    The restored object must report the same mins (with abundances when
    `track_abundance` is set) and the same num, ksize, is_protein, max_hash
    and seed attributes as the object that was pickled.
    """
    import pickle
    from io import BytesIO

    original = MinHash(n=5,
                       ksize=6,
                       is_protein=False,
                       track_abundance=track_abundance)

    # The same sequence is added twice before pickling.
    dna = 'ATGGCAGTGACGATGCCG'
    for _ in range(2):
        original.add_sequence(dna)

    sink = BytesIO()
    pickle.dump(original, sink)

    # Load from a fresh buffer holding the pickled bytes.
    restored = pickle.load(BytesIO(sink.getvalue()))

    assert original.get_mins(with_abundance=track_abundance) == \
           restored.get_mins(with_abundance=track_abundance)
    for attribute in ('num', 'ksize', 'is_protein', 'max_hash', 'seed'):
        assert getattr(original, attribute) == getattr(restored, attribute)
Exemple #2
0
    def __init__(self, query_file, ksize, scaled, catlas_name, debug=True):
        """Load the query sequences and build their k-mer set and signature.

        Every record read from `query_file` is added to a MinHash created
        with `scaled`; records whose sequence is at least `ksize` long also
        contribute their k-mer hashes to `self.kmers`. The query name is taken
        from the first record whose name is not None.
        """
        self.filename = query_file
        self.ksize = ksize
        self.catlas_name = catlas_name
        self.debug = debug
        self.name = None
        self.kmers = set()
        self.mh = MinHash(0, ksize, scaled=scaled)

        notify('----')
        notify('QUERY FILE: {}', self.filename)

        # build hashes for all the query k-mers & create signature
        notify('loading query kmers...', end=' ')
        hasher = khmer.Nodetable(ksize, 1, 1)

        for record in screed.open(self.filename):
            sequence = record.sequence
            if self.name is None:
                self.name = record.name
            # Only sequences of at least ksize characters are hashed here.
            if not len(sequence) < int(ksize):
                self.kmers.update(hasher.get_kmer_hashes(sequence))
            self.mh.add_sequence(sequence, True)

        self.sig = sourmash.SourmashSignature(self.mh,
                                              name=self.name,
                                              filename=self.filename)

        notify('got {} k-mers from query', len(self.kmers))

        # Match counters start out empty.
        self.cdbg_match_counts = {}
        self.catlas_match_counts = {}
Exemple #3
0
def test_dna_mh(track_abundance):
    """Compare add_sequence() against add() called on every 4-character window.

    Both sketches must report the same mins, and two known hash values must
    be present in the result.
    """
    seq = 'ATGGCAGTGACGATGCCAG'
    from_sequence = MinHash(n=5, ksize=4, track_abundance=track_abundance)
    from_windows = MinHash(n=5, ksize=4, track_abundance=track_abundance)

    from_sequence.add_sequence(seq)
    for offset in range(len(seq) - 3):
        window = seq[offset:offset + 4]
        from_windows.add(window)

    assert from_sequence.get_mins() == from_windows.get_mins()
    print(from_sequence.get_mins())
    for expected_hash in (726311917625663847, 3697418565283905118):
        assert expected_hash in from_sequence.get_mins()
Exemple #4
0
def test_dna_mh(track_abundance):
    """Check that whole-sequence and per-window insertion give equal mins.

    One sketch receives the full sequence through add_sequence(); the other
    receives each 4-character slice through add(). Two known hash values are
    then looked up in the first sketch's mins.
    """
    whole = MinHash(n=5, ksize=4, track_abundance=track_abundance)
    piecewise = MinHash(n=5, ksize=4, track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCAG'
    whole.add_sequence(seq)

    window_starts = range(len(seq) - 3)
    for begin in window_starts:
        piecewise.add(seq[begin:begin + 4])

    assert whole.get_mins() == piecewise.get_mins()
    print(whole.get_mins())

    known_hashes = [726311917625663847, 3697418565283905118]
    for known in known_hashes:
        assert known in whole.get_mins()
Exemple #5
0
def test_protein_mh(track_abundance):
    """Compare add_sequence() with add() on 6-character windows (is_protein=True).

    Both sketches must report the same mins, and one known hash value must be
    present in the result.
    """
    settings = dict(n=5, ksize=6, is_protein=True,
                    track_abundance=track_abundance)
    from_sequence = MinHash(**settings)
    from_windows = MinHash(**settings)

    seq = 'ATGGCAGTGACGATGCCG'
    from_sequence.add_sequence(seq)

    for offset in range(len(seq) - 5):
        from_windows.add(seq[offset:offset + 6])

    assert from_sequence.get_mins() == from_windows.get_mins()
    assert 901193879228338100 in from_sequence.get_mins()
Exemple #6
0
def test_protein_mh(track_abundance):
    """Check whole-sequence vs. per-window insertion with is_protein=True.

    One sketch gets the full sequence via add_sequence(); the other gets each
    6-character slice via add(). Their mins must match and contain a known
    hash value.
    """
    seq = 'ATGGCAGTGACGATGCCG'

    whole = MinHash(n=5, ksize=6, is_protein=True,
                    track_abundance=track_abundance)
    whole.add_sequence(seq)

    piecewise = MinHash(n=5, ksize=6, is_protein=True,
                        track_abundance=track_abundance)
    windows = [seq[begin:begin + 6] for begin in range(len(seq) - 5)]
    for window in windows:
        piecewise.add(window)

    assert whole.get_mins() == piecewise.get_mins()
    assert 901193879228338100 in whole.get_mins()
Exemple #7
0
def test_pickle(track_abundance):
    """Pickle a MinHash, unpickle it, and verify nothing was lost.

    Compared: the mins (with abundances when `track_abundance` is set) and
    the num, ksize, is_protein, max_hash and seed attributes.
    """
    import pickle
    from io import BytesIO

    before = MinHash(n=5, ksize=6, is_protein=False,
                     track_abundance=track_abundance)

    # The sequence is inserted twice before serialising.
    seq = 'ATGGCAGTGACGATGCCG'
    before.add_sequence(seq)
    before.add_sequence(seq)

    out_buffer = BytesIO()
    pickle.dump(before, out_buffer)
    pickled_bytes = out_buffer.getvalue()

    in_buffer = BytesIO(pickled_bytes)
    after = pickle.load(in_buffer)

    mins_before = before.get_mins(with_abundance=track_abundance)
    mins_after = after.get_mins(with_abundance=track_abundance)
    assert mins_before == mins_after

    assert before.num == after.num
    assert before.ksize == after.ksize
    assert before.is_protein == after.is_protein
    assert before.max_hash == after.max_hash
    assert before.seed == after.seed
Exemple #8
0
def _genome_name(genome_path):
    """Return the signature name for a genome FASTA path.

    The directory part is dropped and a trailing '_skesa.fasta' or
    '_skeasa.fasta' is removed as a whole suffix. (`str.strip` would instead
    remove any of those *characters* from both ends and could eat into the
    sample name itself.)
    """
    base = os.path.basename(genome_path)
    for suffix in ('_skesa.fasta', '_skeasa.fasta'):
        if base.endswith(suffix):
            return base[:-len(suffix)]
    return base


def sketch(args):
    """Sketch every assembled genome in ./fasta and save the signatures.

    Looks for '<args.name>.db' and a 'fasta' folder in the current working
    directory, builds one MinHash (n=1000, ksize=31) per matching FASTA file,
    and writes all signatures to '<args.name>.sig'.

    Args:
        args: object with a `name` attribute (the database base name).

    Raises:
        SystemExit: with status 1 when the database file or the fasta folder
            is missing.
    """
    # Local import: the module-level import block is not visible from here.
    import sys

    cwd = os.getcwd()
    db_path = os.path.join(cwd, args.name + '.db')
    # check for the existence of the database and tables
    if not os.path.exists(db_path):
        print(
            "Database file not found. Please make sure the name is correct or run mashpit build."
        )
        # A missing prerequisite is a failure, so report a non-zero status.
        sys.exit(1)

    fasta_folder = os.path.join(cwd, 'fasta')
    if not os.path.exists(fasta_folder):
        print("Fasta folder not found.")
        sys.exit(1)

    sig_file_name = args.name + '.sig'

    # NOTE(review): the pattern spells the suffix "_skeasa" while the name
    # clean-up originally used "_skesa"; the pattern is kept unchanged because
    # the on-disk naming is not visible here -- confirm which one is intended.
    all_fasta_path = os.path.join(fasta_folder, "*_skeasa.fasta")
    genomes_list = glob.glob(all_fasta_path)

    siglist = []
    for genome in genomes_list:
        mh = MinHash(n=1000, ksize=31)
        for record in screed.open(genome):
            mh.add_sequence(record.sequence, True)
        siglist.append(SourmashSignature(mh, name=_genome_name(genome)))

    with open(sig_file_name, 'w') as f:
        save_signatures(siglist, fp=f)
Exemple #9
0
def fetchneighborhood2(index, features_upstream=0, features_downstream=0):
    """Describe the gene neighborhood around one cluster of hits.

    The cluster is row `index` of the module-level `iaa_positive_df`. The
    function loads the matching protein index file from 'index_files/',
    selects the rows between the first and last hit (widened by
    `features_upstream` / `features_downstream` rows), and derives:

    * coordinate lists (raw, orientation-adjusted, and shifted to start at 0),
    * an iTOL-style diagram string,
    * several synteny strings,
    * GC content of the cluster and of the genome, and their absolute
      difference,
    * a MinHash similarity (ksize=5, scaled=1) between cluster and genome,
    * a city-block distance between tetranucleotide frequency vectors.

    Args:
        index: positional row number in `iaa_positive_df` (used with iloc).
        features_upstream: number subtracted from the row label of the first
            hit to obtain the start of the neighborhood.
        features_downstream: number added to the row label of the last hit to
            obtain the end of the neighborhood.

    Returns:
        A list of 27 items, in this order: accession, assembly, title,
        number of rows in the neighborhood, cluster length, synteny,
        synteny_alphabet, synteny_dir_dist, synteny_dir, cluster_number,
        coord_list, adj_coord_list, tared_adj_coord_list,
        itol_diagram_string, hit list, locus tags, old locus tags,
        protein ids, protein names, protein sequences, clusterGC, genomeGC,
        diffGC, minhash_sim, four_mer_distance, four_mer_freq_cluster,
        cluster_seq.

    Side effects:
        Reads 'index_files/<assembly>' and a file under
        'gbff_files_unzipped/', and prints the index file path twice.
    """
    cluster = iaa_positive_df.iloc[index, :]
    acc = cluster['accession']
    # NOTE(review): the '.' in '.gbff' is an unescaped regex wildcard, so any
    # character followed by "gbff" is replaced -- confirm this is harmless.
    assembly = re.sub('.gbff', '_proteins.fa.indexprot', cluster['filename'])
    # Build the genome table from the index file; fields are separated by "!!"
    # and the file has no header row.
    assembly_index_file = 'index_files/' + assembly
    print(assembly_index_file)
    db = pd.read_csv(assembly_index_file,
                     sep="!!",
                     header=None,
                     engine='python')
    #db.columns = ["filename","assembly","accession","locus_tag","old_locus_tag","name","biosample","protein_name","coordinates","protein_id"]
    db.columns = [
        "filename", "assembly", "accession", "locus_tag", "old_locus_tag",
        "name", "biosample", "protein_name", "coordinates", "protein_id",
        "pseudogene", "protein_seq"
    ]
    # Direction is -1 when the coordinate string starts with "complement"
    # (re.match anchors at the beginning of the string), otherwise +1.
    db['direction'] = [
        -1 if re.match('complement', c) else 1 for c in db['coordinates']
    ]
    # Start coordinate: the digits immediately before ".." followed by a
    # digit or ">".
    db['start_coord'] = [
        re.search('\d+?(?=\.\.(\d|\>))', str(c)).group(0)
        for c in db['coordinates']
    ]
    db['start_coord'] = [
        re.sub('complement|>|<|\)|\(', "", c) for c in db['start_coord']
    ]
    db['start_coord'] = db['start_coord'].astype(int)
    # End coordinate: the digits immediately after ".." or ".>".
    db['end_coord'] = [
        re.search('(?<=\.(\.|\>))\d+', str(c)).group(0)
        for c in db['coordinates']
    ]
    db['end_coord'] = [re.sub('>|<|\)|\(', "", c) for c in db['end_coord']]
    db['end_coord'] = db['end_coord'].astype(int)
    hit_list = cluster['hit_list']
    query_list = cluster['query_list']
    cluster_number = cluster['cluster_number']
    # Maps each hit locus tag to its query name (pairs taken positionally).
    hit_dict = dict(zip(hit_list, query_list))
    genome = db.loc[db['accession'] == acc].copy()
    # Row labels of the first and last hit, widened by the requested number
    # of features on each side.
    start = genome[genome['locus_tag'] == hit_list[0]].index.values.astype(
        int)[0] - features_upstream
    stop = genome[genome['locus_tag'] == hit_list[-1]].index.values.astype(
        int)[0] + features_downstream
    # Label-based slice, so both `start` and `stop` rows are included.
    neighborhood = genome.loc[start:stop, ].copy()
    # Rows that are not hits get NaN in 'query_match'.
    neighborhood['query_match'] = neighborhood['locus_tag'].map(hit_dict)
    coord_list = list(
        zip(neighborhood['start_coord'], neighborhood['end_coord'],
            neighborhood['direction'], neighborhood['query_match']))
    # GC content of the cluster vs. the genome.
    # The first character of the filename in row label 0 is dropped before
    # it is used as a path component.
    gbff_str = str(db['filename'][0][1:])
    with open("gbff_files_unzipped/" + gbff_str) as file:
        gbff_file = file.read()
    # Concatenate every ORIGIN ... // section found in the file (non-greedy),
    # then remove whitespace and digits.
    genome_seq = "".join(re.findall("(?<=ORIGIN)[\s+\S+]+?(?=\/\/)",
                                    gbff_file))
    genome_seq = re.sub('\s|\d|\n', '', genome_seq)
    # Only lower-case a/c/g/t are counted.
    Gg = genome_seq.count("g")
    Gc = genome_seq.count("c")
    Ga = genome_seq.count("a")
    Gt = genome_seq.count("t")
    genomeGC = (Gg + Gc) / (Gg + Gc + Ga + Gt)
    # `start` is reused here: from this point it is a sequence coordinate,
    # no longer a row label. min()/max() compare whole tuples, so `end` is
    # the end coordinate of the tuple that sorts last, which begins with the
    # largest start coordinate.
    start = min(coord_list)[0]
    end = max(coord_list)[1]
    # NOTE(review): both patterns below are greedy, so the match runs from
    # the first occurrence to the LAST "//" in the text -- confirm this is
    # intended for files that contain more than one record.
    regex_str = acc + "[\s+\S+]+\/\/"
    all_cluster_fasta = re.findall(regex_str, gbff_file)[0]
    all_cluster_fasta = re.findall("(?<=ORIGIN)[\s+\S+]+(?=\/\/)",
                                   all_cluster_fasta)[0]
    all_cluster_fasta = re.sub(" |\d|\n", "", all_cluster_fasta)
    # NOTE(review): the slice stops at index end - 2; if coordinates are
    # 1-based and inclusive, the last base is left out -- TODO confirm.
    cluster_seq = all_cluster_fasta[start - 1:end - 1]
    g = cluster_seq.count("g")
    c = cluster_seq.count("c")
    a = cluster_seq.count("a")
    t = cluster_seq.count("t")
    clusterGC = (g + c) / (g + c + a + t)
    diffGC = abs(clusterGC - genomeGC)
    # Compare MinHash values between cluster and genome.
    kmer_size = 5
    n = 0
    sc = 1
    # `complement` is defined outside this block; each sequence and the
    # result of complement() on it are both added to the sketch.
    cluster_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    cluster_minhash.add_sequence(cluster_seq, force=True)
    cluster_minhash.add_sequence(complement(cluster_seq), force=True)
    #
    genome_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    genome_minhash.add_sequence(genome_seq, force=True)
    genome_minhash.add_sequence(complement(genome_seq), force=True)
    minhash_sim = cluster_minhash.similarity(genome_minhash)
    # genome_minus_cluster=re.sub(cluster_seq,'',genome_seq)
    # #print(len(genome_seq)-len(genome_minus_cluster))
    # genome_minus_cluster_minhash=MinHash(n=n, ksize=kmer_size,scaled=sc)
    # genome_minus_cluster_minhash.add_sequence(genome_minus_cluster,force=True)
    # genome_minus_cluster_minhash.add_sequence(complement(genome_minus_cluster),force=True)
    # minhash_sim_minus_cluster=cluster_minhash.similarity(genome_minus_cluster_minhash)
    #print(minhash_sim)
    # Compare tetranucleotide frequency between cluster and genome.
    bases = ['a', 't', 'g', 'c']
    # All 256 four-letter combinations of the bases.
    four_mers = [''.join(p) for p in itertools.product(bases, repeat=4)]
    # str.count() counts non-overlapping occurrences; counts from the
    # sequence and from complement() of it are added element-wise.
    four_mer_count_genome = np.add(
        [genome_seq.count(i) for i in four_mers],
        [complement(genome_seq).count(i) for i in four_mers])
    four_mer_freq_genome = [
        i / sum(four_mer_count_genome) for i in four_mer_count_genome
    ]
    four_mer_count_cluster = np.add(
        [cluster_seq.count(i) for i in four_mers],
        [complement(cluster_seq).count(i) for i in four_mers])
    four_mer_freq_cluster = [
        i / sum(four_mer_count_cluster) for i in four_mer_count_cluster
    ]
    four_mer_distance = scipy.spatial.distance.cityblock(
        four_mer_freq_cluster, four_mer_freq_genome)
    ####
    # If the matched genes point mostly in the -1 direction, mirror the
    # neighborhood: swap and negate start/end, flip every direction, and
    # re-sort by the new start coordinate.
    if sum(neighborhood[neighborhood['query_match'].notnull()]
           ['direction']) < 0:
        neighborhood['actual_start_tmp'] = neighborhood['start_coord']
        neighborhood['start_coord'] = neighborhood['end_coord'] * -1
        neighborhood['end_coord'] = neighborhood['actual_start_tmp'] * -1
        neighborhood['direction'] = neighborhood['direction'] * -1
        neighborhood = neighborhood.sort_values(by='start_coord')
    # Genes without a query match are labelled "x" from here on.
    neighborhood['query_match'] = neighborhood['query_match'].replace(
        np.nan, "x")
    nhbrhood_hit_list = list(neighborhood['query_match'])
    nhbrhood_locus_tags = list(neighborhood['locus_tag'])
    nhbrhood_old_locus_tags = list(neighborhood['old_locus_tag'])
    nhbrhood_prot_ids = list(neighborhood['protein_id'])
    nhbrhood_prot_name = list(neighborhood['protein_name'])
    nhbrhood_prot_seq = list(neighborhood['protein_seq'])
    # One arrow-like label per gene, pointing right for direction 1 and left
    # otherwise.
    order = [("| " + gene['query_match'] + " 〉") if gene['direction'] == 1 else
             ("〈 " + gene['query_match'] + " |")
             for index, gene in neighborhood.iterrows()]
    # Gap between consecutive genes: next start minus previous end.
    dist = list(
        np.array(neighborhood['start_coord'][1:]) -
        np.array(neighborhood['end_coord'][:-1]))
    dist = ["-" + str(d) + "-" for d in dist]
    adj_coord_list = list(
        zip(neighborhood['start_coord'], neighborhood['end_coord'],
            neighborhood['direction'], neighborhood['query_match']))
    # "Tare" the coordinates so that the smallest start coordinate becomes 0.
    if min(neighborhood['start_coord']) < 0:
        tare_value = abs(min(neighborhood['start_coord']))
        tared_adj_coord_list = list(
            zip([v + tare_value for v in neighborhood['start_coord']],
                [v + tare_value for v in neighborhood['end_coord']],
                neighborhood['direction'], neighborhood['query_match']))
    else:
        tare_value = min(neighborhood['start_coord'])
        tared_adj_coord_list = list(
            zip([v - tare_value for v in neighborhood['start_coord']],
                [v - tare_value for v in neighborhood['end_coord']],
                neighborhood['direction'], neighborhood['query_match']))
    # making an ITOL compatible string
    # Colour per query name; "x" (no match) is grey. A query name missing
    # from this dict raises KeyError in the loop below.
    gene_color_dict = {
        'IaaP': '#ff5969',
        'IaaQ': '#2db34e',
        'IaaR': '#fb77e0',
        'IaaA': '#00bc7e',
        'IaaB': '#8d006e',
        'IaaC': '#cfdd63',
        'IaaD': '#0060d0',
        'IaaE': '#bb7b00',
        'IaaF': '#7c2c29',
        'IaaG': '#f1d17a',
        'IaaH': '#37589E',
        'IaaI': '#ACC92A',
        'IaaJ': '#752AC9',
        'IaaK': '#D4B5E6',
        'IaaL': '#211E45',
        'IaaM': '#BFB3E6',
        'x': '#d1d1d1'
    }
    # Diagram length is the end coordinate of the last tuple in the list.
    max_len = tared_adj_coord_list[-1][1]
    itol_diagram = []
    # Each gene is drawn as two shapes: a body covering 90% of its length
    # ('RE') and a tip covering the remaining 10% ('TR' on the right for
    # direction > 0, 'TL' on the left otherwise). Note that `g` is reused
    # here; the earlier base count held in `g` is no longer needed.
    for g in tared_adj_coord_list:
        gene_string = []
        gene_length = g[1] - g[0]
        if g[2] > 0:
            gene_string.append('RE')
            gene_string.append(str(g[0]))
            gene_string.append(str(g[1] - (0.1 * gene_length)))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append(str(g[3]))
            gene_string.append(',')
            gene_string.append('TR')
            gene_string.append(str(g[1] - (0.1 * gene_length)))
            gene_string.append(str(g[1]))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append('')
        else:
            gene_string.append('TL')
            gene_string.append(str(g[0]))
            gene_string.append(str(g[0] + (0.1 * gene_length)))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append('')
            gene_string.append(',')
            gene_string.append('RE')
            gene_string.append(str(g[0] + (0.1 * gene_length)))
            gene_string.append(str(g[1]))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append(str(g[3]))
        itol_gene = '|'.join(gene_string)
        itol_diagram.append(itol_gene)

    itol_diagram_joined = ",".join(map(str, itol_diagram))
    itol_diagram_string = str(max_len) + ',' + itol_diagram_joined
    # Joining with '|' leaves a stray '|' after the ',' separator element;
    # remove it.
    itol_diagram_string = re.sub(',\|', ',', itol_diagram_string)
    #obtains "| A 〉-23-| B 〉-23-| C 〉"
    # Interleave gene labels with gap strings; the padding 0 appended to
    # `dist` is dropped again by the [:-1] slice.
    synteny_dir_dist = ''.join(sum(zip(order, dist + [0]), ())[:-1])
    # NOTE(review): the pattern is lower-case "iaa" while the keys of
    # gene_color_dict start with "Iaa" -- confirm the intended case.
    synteny_dir_dist = re.sub("iaa", "", synteny_dir_dist)
    #obtains "| A 〉| B 〉| C 〉"
    synteny_dir = ''.join(order)
    synteny_dir = re.sub("iaa", "", synteny_dir)
    #obtains "| A:23.23 〉| B:23.23〉| C:23.23 〉"
    #synteny_dir_pident =''.join(order_pident)
    #synteny_dir_pident = re.sub("iaa" ,"", synteny_dir_pident)
    #obtains "A-B-C"
    synteny = re.sub("\n", "-",
                     neighborhood['query_match'].to_string(index=False))
    synteny = re.sub("Iaa| ", "", synteny)
    # One string with "Iaa" removed from each query name: upper-case for
    # direction 1, lower-case otherwise.
    synteny_alphabet = "".join([
        gene['query_match'].replace("Iaa", "").upper() if gene['direction']
        == 1 else gene['query_match'].replace("Iaa", "").lower()
        for index, gene in neighborhood.iterrows()
    ])
    cluster_len = max(neighborhood['end_coord']) - min(
        neighborhood['start_coord'])
    # Render the set of distinct values as text and strip the set/quote
    # punctuation (and ">" for the assembly).
    assembly = re.sub("\{|\}|\'|>", "", str(set(neighborhood['assembly'])))
    accession = re.sub("\{|\}|\'", "", str(set(neighborhood['accession'])))
    title = re.sub("\{|\}|\'", "", str(set(neighborhood['name'])))
    print(assembly_index_file + " successfully used")
    return ([
        accession, assembly, title,
        len(neighborhood), cluster_len, synteny, synteny_alphabet,
        synteny_dir_dist, synteny_dir, cluster_number, coord_list,
        adj_coord_list, tared_adj_coord_list, itol_diagram_string,
        nhbrhood_hit_list, nhbrhood_locus_tags, nhbrhood_old_locus_tags,
        nhbrhood_prot_ids, nhbrhood_prot_name, nhbrhood_prot_seq, clusterGC,
        genomeGC, diffGC, minhash_sim, four_mer_distance,
        four_mer_freq_cluster, cluster_seq
    ])