Example 1
def test_pickle(track_abundance):
    import pickle
    from io import BytesIO

    e1 = MinHash(n=5,
                 ksize=6,
                 is_protein=False,
                 track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCG'
    e1.add_sequence(seq)
    e1.add_sequence(seq)

    fp = BytesIO()
    pickle.dump(e1, fp)

    fp2 = BytesIO(fp.getvalue())
    e2 = pickle.load(fp2)

    assert e1.get_mins(with_abundance=track_abundance) == \
           e2.get_mins(with_abundance=track_abundance)
    assert e1.num == e2.num
    assert e1.ksize == e2.ksize
    assert e1.is_protein == e2.is_protein
    assert e1.max_hash == e2.max_hash
    assert e1.seed == e2.seed
Example 2
    def __init__(self, query_file, ksize, scaled, catlas_name, debug=True):
        self.filename = query_file
        self.ksize = ksize
        self.kmers = set()
        self.name = None
        mh = MinHash(0, ksize, scaled=scaled)
        self.mh = mh
        self.catlas_name = catlas_name
        self.debug = debug

        notify('----')
        notify('QUERY FILE: {}', self.filename)

        # build hashes for all the query k-mers & create signature
        notify('loading query kmers...', end=' ')
        bf = khmer.Nodetable(ksize, 1, 1)

        for record in screed.open(self.filename):
            if self.name is None:
                self.name = record.name
            if len(record.sequence) >= int(ksize):
                self.kmers.update(bf.get_kmer_hashes(record.sequence))
            mh.add_sequence(record.sequence, True)

        self.sig = sourmash.SourmashSignature(mh,
                                              name=self.name,
                                              filename=self.filename)

        notify('got {} k-mers from query', len(self.kmers))

        self.cdbg_match_counts = {}
        self.catlas_match_counts = {}
Example 3
def test_abund_similarity_zero():
    E1 = MinHash(n=5, ksize=20, track_abundance=True)
    E2 = MinHash(n=5, ksize=20, track_abundance=True)

    for i in [1]:
        E1.add_hash(i)

    assert E1.similarity(E2) == 0.0
Example 4
def test_common_1(track_abundance):
    E1 = MinHash(n=5, ksize=20, track_abundance=track_abundance)
    E2 = MinHash(n=5, ksize=20, track_abundance=track_abundance)

    for i in [1, 2, 3, 4, 5]:
        E1.add_hash(i)
    for i in [1, 2, 3, 4, 6]:
        E2.add_hash(i)

    assert E1.count_common(E2) == 4
    assert E2.count_common(E1) == 4
Example 5
def test_diff_seed(track_abundance):
    E1 = MinHash(n=5, ksize=20, track_abundance=track_abundance, seed=1)
    E2 = MinHash(n=5, ksize=20, track_abundance=track_abundance, seed=2)

    for i in [1, 2, 3, 4, 5]:
        E1.add_hash(i)
    for i in [1, 2, 3, 4, 6]:
        E2.add_hash(i)

    with pytest.raises(ValueError):
        E1.count_common(E2)
Example 6
def test_set_abundance_num_hypothesis(hashes, abundances, sketch_size):
    a = MinHash(sketch_size, 10, track_abundance=True)
    oracle = dict(zip(hashes, abundances))

    a.set_abundances(oracle)

    mins = a.get_mins(with_abundance=True)
    size = min(sum(1 for v in oracle.values() if v > 0), sketch_size)
    assert len(mins) == size

    for k, v in mins.items():
        assert oracle[k] == v
Example 7
def test_jaccard_1(track_abundance):
    E1 = MinHash(n=5, ksize=20, track_abundance=track_abundance)
    E2 = MinHash(n=5, ksize=20, track_abundance=track_abundance)

    for i in [1, 2, 3, 4, 5]:
        E1.add_hash(i)
    for i in [1, 2, 3, 4, 6]:
        E2.add_hash(i)

    # with n=5, the union sketch keeps [1, 2, 3, 4, 5]
    # and the intersection is [1, 2, 3, 4] => 4/5.

    assert round(E1.jaccard(E2), 2) == round(4 / 5.0, 2)
    assert round(E2.jaccard(E1), 2) == round(4 / 5.0, 2)
Example 8
def test_abund_similarity():
    E1 = MinHash(n=5, ksize=20, track_abundance=True)
    E2 = MinHash(n=5, ksize=20, track_abundance=True)

    for i in [1]:
        E1.add_hash(i)
    for i in [1, 2]:
        E2.add_hash(i)

    assert round(E1.similarity(E1)) == 1.0
    assert round(E1.similarity(E2), 2) == 0.5

    assert round(E1.similarity(E1, ignore_abundance=True)) == 1.0
    assert round(E1.similarity(E2, ignore_abundance=True), 2) == 0.5
Example 9
def test_jaccard_2_difflen(track_abundance):
    E1 = MinHash(n=5, ksize=20, track_abundance=track_abundance)
    E2 = MinHash(n=5, ksize=20, track_abundance=track_abundance)

    for i in [1, 2, 3, 4, 5]:
        E1.add_hash(i)
    for i in [1, 2, 3, 4]:
        E2.add_hash(i)

    print(E1.jaccard(E2))
    assert round(E1.jaccard(E2), 2) == 4 / 5.0
    assert round(E2.jaccard(E1), 2) == 4 / 5.0
Example 10
def test_bad_construct_2(track_abundance):
    try:
        e1 = MinHash(n=100, is_protein=False,
                     track_abundance=track_abundance)
        assert 0, "require ksize in constructor"
    except TypeError:
        pass
Example 11
def test_set_abundance_scaled_hypothesis(hashes, abundances, scaled):
    a = MinHash(0, 10, track_abundance=True, scaled=scaled)
    oracle = dict(zip(hashes, abundances))

    a.set_abundances(oracle)

    max_hash = get_max_hash_for_scaled(scaled)
    below_max_hash = sum(1 for (k, v) in oracle.items()
                         if k <= max_hash and v > 0)

    mins = a.get_mins(with_abundance=True)
    assert len(mins) == below_max_hash

    for k, v in mins.items():
        assert oracle[k] == v
        assert k <= max_hash
        assert v > 0
Example 14
def test_protein_mh(track_abundance):
    e1 = MinHash(n=5, ksize=6, is_protein=True,
                 track_abundance=track_abundance)
    e2 = MinHash(n=5, ksize=6, is_protein=True,
                 track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCG'
    e1.add_sequence(seq)

    for i in range(len(seq) - 5):
        kmer = seq[i:i + 6]
        e2.add(kmer)

    assert e1.get_mins() == e2.get_mins()
    assert 901193879228338100 in e1.get_mins()
Example 15
    def _signatures(self):
        "Create a _signatures member dictionary that contains {idx: sigobj}."
        from sourmash import MinHash, SourmashSignature

        is_protein = False
        is_hp = False
        is_dayhoff = False
        if self.moltype == 'protein':
            is_protein = True
        elif self.moltype == 'hp':
            is_hp = True
        elif self.moltype == 'dayhoff':
            is_dayhoff = True
        minhash = MinHash(n=0,
                          ksize=self.ksize,
                          scaled=self.scaled,
                          is_protein=is_protein,
                          hp=is_hp,
                          dayhoff=is_dayhoff)

        debug('creating signatures for LCA DB...')
        mhd = defaultdict(minhash.copy_and_clear)
        temp_vals = defaultdict(list)

        # invert the hashval_to_idx dictionary
        for (hashval, idlist) in self.hashval_to_idx.items():
            for idx in idlist:
                temp_hashes = temp_vals[idx]
                temp_hashes.append(hashval)

                # 50 is an arbitrary number. If you really want
                # to micro-optimize: CPython lists are resized to grow in
                # this pattern: 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
                # (from https://github.com/python/cpython/blob/b2b4a51f7463a0392456f7772f33223e57fa4ccc/Objects/listobject.c#L57)
                if len(temp_hashes) > 50:
                    mhd[idx].add_many(temp_hashes)

                    # Sigh, python 2... when it goes away,
                    # we can do `temp_hashes.clear()` instead.
                    del temp_vals[idx]

        # We loop over temp_vals again to add any remaining hashes
        # (each leftover list holds at most 50 items)
        for sig, vals in temp_vals.items():
            mhd[sig].add_many(vals)

        sigd = {}
        for idx, mh in mhd.items():
            ident = self.idx_to_ident[idx]
            name = self.ident_to_name[ident]
            sigd[idx] = SourmashSignature(mh, name=name)

        debug('=> {} signatures!', len(sigd))
        return sigd
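
The buffering idiom above (collect hash values in a plain list, then flush them into the per-index MinHash with add_many) trades a little memory for far fewer per-hash calls. A minimal standalone sketch of the same idiom, assuming only that sourmash is installed:

from collections import defaultdict
from sourmash import MinHash

template = MinHash(n=0, ksize=21, scaled=1)
mhd = defaultdict(template.copy_and_clear)  # a fresh, empty MinHash per key

buffered = [1, 2, 3, 4, 5]  # illustrative hash values
mhd[0].add_many(buffered)   # one bulk insert instead of five add_hash calls
assert set(mhd[0].get_mins()) == {1, 2, 3, 4, 5}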
Example 18
def test_build_hashCounter():
    mh1 = MinHash(0, 21, scaled=1, track_abundance=True)
    mh2 = MinHash(0, 21, scaled=1, track_abundance=True)
    mh1.add_many((1, 2, 3, 4))
    mh2.add_many((1, 2, 5))
    true_res = Counter({1: 2, 2: 2, 3: 1, 4: 1, 5: 1})

    ss1 = SourmashSignature(mh1)
    ss2 = SourmashSignature(mh2)

    counts = Counter()
    hc = build_hashCounter([ss1, ss2], counts)
    print("Hash Counter: ", hc)
    assert hc == true_res
Example 20
def test_drop_below_mincount_threshold():
    mh1 = MinHash(0, 21, scaled=1, track_abundance=True)
    mh2 = MinHash(0, 21, scaled=1, track_abundance=True)
    mh1.add_many((1, 2, 3, 4))
    mh2.add_many((1, 1, 2, 5))

    ss1 = SourmashSignature(mh1)
    ss2 = SourmashSignature(mh2)

    counts = Counter()
    hc = build_hashCounter([ss1, ss2], counts)
    kept_hashes = drop_below_mincount(hc, 3)
    true_kept = Counter({1: 3})
    print("kept hashes: ", kept_hashes)
    assert kept_hashes == true_kept
Example 21
def sketch(args):
    cwd = os.getcwd()
    db_path = os.path.join(cwd, args.name + '.db')
    # check for the existence of the database and the fasta folder
    if not os.path.exists(db_path):
        print(
            "Database file not found. Please make sure the name is correct or run mashpit build."
        )
        exit(1)

    fasta_folder = os.path.join(cwd, 'fasta')
    if not os.path.exists(fasta_folder):
        print("Fasta folder not found.")
        exit(1)

    sig_file_name = args.name + '.sig'

    all_fasta_path = os.path.join(fasta_folder, "*_skesa.fasta")
    genomes_list = glob.glob(all_fasta_path)
    minhashes = []
    for genome in genomes_list:
        mh = MinHash(n=1000, ksize=31)
        for record in screed.open(genome):
            mh.add_sequence(record.sequence, True)
        minhashes.append(mh)
    siglist = []

    for i in range(len(minhashes)):
        # str.strip() removes characters, not a substring; use basename/replace
        signame = os.path.basename(genomes_list[i]).replace('_skesa.fasta', '')
        siglist.append(SourmashSignature(minhashes[i], name=signame))
    with open(sig_file_name, 'w') as f:
        save_signatures(siglist, fp=f)
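
To check the result of sketch(), the .sig file can be read back with sourmash's loader; a minimal sketch, assuming a signature file written as above (the file name is illustrative):

import sourmash

with open('example.sig') as fp:
    for sig in sourmash.load_signatures(fp):
        print(sig, len(sig.minhash))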
Example 23
        def to_sourmash(self):
            try:
                from sourmash import MinHash
            except ImportError:
                print(
                    'Must install python sourmash to convert to sourmash.MinHash',
                    file=sys.stderr)
                return None

            sig = MinHash(self.num(),
                          self.ksize(),
                          is_protein=self.is_protein(),
                          dayhoff=self.dayhoff(),
                          hp=self.hp(),
                          track_abundance=self.track_abundance(),
                          seed=self.seed(),
                          mins=self.mins(),
                          max_hash=self.max_hash())

            return sig
Example 24
            # remove hashes that occur only once
            for hashval, ct in counts.copy().items():
                print(f"{hashval}:{ct}")
                if ct < min_count:
                    counts.pop(hashval)
            # write out hashes

            # let's try building a sig. we will use this sig later to intersect with sample-specific sigs
            new_mins = set(counts.keys())
            print(len(new_mins))
            with open(outhashes, "w") as out:
                for hsh in new_mins:
                    out.write(str(hsh) + '\n')
            if len(new_mins) > 0:
                # scaled=1 keeps all hashes (though these were previously
                # computed at some other scaled value)
                minhash = MinHash(n=0, ksize=ksize, scaled=scaled)
                minhash.add_many(new_mins)
                # write sig to file
                sigobj = sourmash.SourmashSignature(
                    minhash,
                    name=f"aggregated_hashvals_above_{min_count}",
                    filename="generated with drop_unique_hashes.py")
                sigobjs += [sigobj]

## this part only handles one output file -- doesn't take care of case with many ksizes/moltypes
with open(outsig, 'wt') as sigout:
    sourmash.save_signatures(sigobjs, sigout)
    #notify('wrote signature to {}', args.output)

Example 25
def main(argv):
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('mh_index_picklefile', help='pickled hashval index')
    p.add_argument('hashval_list', help='file with list of hashvals')
    p.add_argument('output')
    p.add_argument('-k',
                   '--ksize',
                   default=31,
                   type=int,
                   help='k-mer size (default: 31)')
    p.add_argument('--scaled',
                   default=1000,
                   type=float,
                   help="scaled value for contigs minhash output")
    p.add_argument('-v', '--verbose', action='store_true')

    args = p.parse_args(argv)

    # create output directory if it doesn't exist.
    outdir = args.output
    notify('putting output in {}', outdir)
    os.makedirs(os.path.join(outdir, "contigs"), exist_ok=True)

    if not os.path.isdir(outdir):
        error("output '{}' is not a directory and cannot be made", outdir)
        sys.exit(-1)

    # load picklefile
    with open(args.mh_index_picklefile, 'rb') as fp:
        hashval_to_contig_id = pickle.load(fp)
    notify('loaded {} hash value -> cdbg_id mappings from {}',
           len(hashval_to_contig_id), args.mh_index_picklefile)

    # load list of desired hashvals
    hashvals = [int(x.strip()) for x in open(args.hashval_list, 'rt')]
    hashvals = set(hashvals)
    notify('loaded {} search hashvalues from {}', len(hashvals),
           args.hashval_list)

    if not len(hashvals):
        print('No hash values to search!', file=sys.stderr)
        sys.exit(-1)

    # load catlas DAG
    catlas = CAtlas(args.catlas_prefix)
    notify('loaded {} nodes from catlas {}', len(catlas), args.catlas_prefix)
    notify('loaded {} layer 1 catlas nodes', len(catlas.layer1_to_cdbg))

    # find the contigs filename
    contigs_file = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # get a single ksize & scaled
    ksize = int(args.ksize)
    scaled = int(args.scaled)

    # record command line
    with open(os.path.join(outdir, 'command.txt'), 'wt') as fp:
        fp.write(str(sys.argv))
        fp.write("\n")

    # output results.csv in the output directory:
    csvoutfp = open(os.path.join(outdir, 'hashval_results.csv'), 'wt')
    csv_writer = csv.writer(csvoutfp)
    csv_writer.writerow(['hashval', 'bp', 'contigs'])

    # iterate over each query, do the thing.
    n_found = 0
    for hashval in hashvals:
        notify('----')
        notify('QUERY HASHVAL: {}', hashval)

        mh = MinHash(0, ksize, scaled=scaled)
        result = execute_query(hashval, catlas, hashval_to_contig_id, mh=mh)
        notify('done searching!')
        if not result:
            notify("no result for hashval {}", hashval)
            continue

        result.retrieve_contigs(contigs_file)
        result.write(csv_writer, csvoutfp, outdir)

        assert hashval in mh.get_mins()

        n_found += 1
    # end main loop!

    notify('----')
    notify("Done! Found {} hashvals of {} in {} with k={}", n_found,
           len(hashvals), args.catlas_prefix, ksize)
    notify("Results are in directory '{}'", outdir)

    return 0
Example 27
def test_dna_mh(track_abundance):
    e1 = MinHash(n=5, ksize=4, track_abundance=track_abundance)
    e2 = MinHash(n=5, ksize=4, track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCAG'
    e1.add_sequence(seq)
    for i in range(len(seq) - 3):
        e2.add(seq[i:i + 4])

    assert e1.get_mins() == e2.get_mins()
    print(e1.get_mins())
    assert 726311917625663847 in e1.get_mins()
    assert 3697418565283905118 in e1.get_mins()
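
Both construction routes in this test agree because sourmash hashes DNA k-mers canonically, i.e. a k-mer and its reverse complement map to the same hash value. A minimal sketch of that property, assuming sourmash is installed:

from sourmash import MinHash

a = MinHash(n=1, ksize=4)
b = MinHash(n=1, ksize=4)
a.add('ATGG')
b.add('CCAT')  # reverse complement of ATGG
assert a.get_mins() == b.get_mins()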
Example 29
def main():
    p = argparse.ArgumentParser()
    p.add_argument('hashfile')  # file that contains hashes
    p.add_argument('-o', '--output', default=None,
                   help='file to output signature to')
    p.add_argument('-k', '--ksize', default=None, type=int)
    p.add_argument('--scaled', default=None, type=int)
    p.add_argument('--num', default=None, type=int)
    p.add_argument('--name', default='', help='signature name')
    p.add_argument('--filename', default='',
                   help='filename to add to signature')
    args = p.parse_args()

    # check arguments.
    if args.scaled and args.num:
        error('cannot specify both --num and --scaled! exiting.')
        return -1

    if not args.ksize:
        error('must specify --ksize')
        return -1

    if not args.output:
        error('must specify --output')
        return -1

    # first, load in all the hashes
    hashes = set()
    for line in open(args.hashfile, 'rt'):
        hashval = int(line.strip())
        hashes.add(hashval)

    if not hashes:
        error("ERROR, no hashes loaded from {}!", args.hashfile)
        return -1

    notify('loaded {} distinct hashes from {}', len(hashes), args.hashfile)

    # now, create the MinHash object that we'll use.
    scaled = 0
    num = 0
    if args.scaled:
        scaled = args.scaled
    elif args.num:
        num = args.num
    else:
        notify('setting --num automatically from the number of hashes.')
        num = len(hashes)

    # construct empty MinHash object according to args
    minhash = MinHash(n=num, ksize=args.ksize, scaled=scaled)

    # add hashes into!
    minhash.add_many(hashes)

    if len(minhash) < len(hashes):
        notify("WARNING: loaded {} hashes, but only {} made it into MinHash.",
               len(hashes), len(minhash))
        if scaled:
            notify("This is probably because of the scaled argument.")
        elif args.num:
            notify("This is probably because your --num is set to {}",
                   args.num)

    if num > len(minhash):
        notify("WARNING: --num set to {}, but only {} hashes in signature.",
               num, len(minhash))

    sigobj = sourmash.SourmashSignature(minhash, name=args.name,
                                        filename=args.filename)

    with open(args.output, 'wt') as fp:
        sourmash.save_signatures([sigobj], fp)
    notify('wrote signature to {}', args.output)
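
Stripped of argument checking and warnings, the core of this script reduces to a few calls; a minimal sketch with made-up hash values and file name:

import sourmash
from sourmash import MinHash

hashes = {10, 20, 30}                       # stand-in for the hashfile contents
minhash = MinHash(n=len(hashes), ksize=31)  # num sketch sized to hold them all
minhash.add_many(hashes)

sigobj = sourmash.SourmashSignature(minhash, name='example')
with open('example.sig', 'wt') as fp:
    sourmash.save_signatures([sigobj], fp)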
Example 30
def fetchneighborhood2(index, features_upstream=0, features_downstream=0):
    cluster = iaa_positive_df.iloc[index, :]
    acc = cluster['accession']
    assembly = re.sub(r'\.gbff', '_proteins.fa.indexprot', cluster['filename'])
    #make the genome database from the .fa.index file
    assembly_index_file = 'index_files/' + assembly
    print(assembly_index_file)
    db = pd.read_csv(assembly_index_file,
                     sep="!!",
                     header=None,
                     engine='python')
    #db.columns = ["filename","assembly","accession","locus_tag","old_locus_tag","name","biosample","protein_name","coordinates","protein_id"]
    db.columns = [
        "filename", "assembly", "accession", "locus_tag", "old_locus_tag",
        "name", "biosample", "protein_name", "coordinates", "protein_id",
        "pseudogene", "protein_seq"
    ]
    db['direction'] = [
        -1 if re.match('complement', c) else 1 for c in db['coordinates']
    ]
    db['start_coord'] = [
        re.search(r'\d+?(?=\.\.(\d|\>))', str(c)).group(0)
        for c in db['coordinates']
    ]
    db['start_coord'] = [
        re.sub(r'complement|>|<|\)|\(', "", c) for c in db['start_coord']
    ]
    db['start_coord'] = db['start_coord'].astype(int)
    db['end_coord'] = [
        re.search(r'(?<=\.(\.|\>))\d+', str(c)).group(0)
        for c in db['coordinates']
    ]
    db['end_coord'] = [re.sub(r'>|<|\)|\(', "", c) for c in db['end_coord']]
    db['end_coord'] = db['end_coord'].astype(int)
    hit_list = cluster['hit_list']
    query_list = cluster['query_list']
    cluster_number = cluster['cluster_number']
    hit_dict = dict(zip(hit_list, query_list))
    genome = db.loc[db['accession'] == acc].copy()
    start = genome[genome['locus_tag'] == hit_list[0]].index.values.astype(
        int)[0] - features_upstream
    stop = genome[genome['locus_tag'] == hit_list[-1]].index.values.astype(
        int)[0] + features_downstream
    neighborhood = genome.loc[start:stop, ].copy()
    neighborhood['query_match'] = neighborhood['locus_tag'].map(hit_dict)
    coord_list = list(
        zip(neighborhood['start_coord'], neighborhood['end_coord'],
            neighborhood['direction'], neighborhood['query_match']))
    #function to find GC content of cluster vs genome
    gbff_str = str(db['filename'][0][1:])
    with open("gbff_files_unzipped/" + gbff_str) as file:
        gbff_file = file.read()
    genome_seq = "".join(re.findall("(?<=ORIGIN)[\s+\S+]+?(?=\/\/)",
                                    gbff_file))
    genome_seq = re.sub('\s|\d|\n', '', genome_seq)
    Gg = genome_seq.count("g")
    Gc = genome_seq.count("c")
    Ga = genome_seq.count("a")
    Gt = genome_seq.count("t")
    genomeGC = (Gg + Gc) / (Gg + Gc + Ga + Gt)
    start = min(coord_list)[0]
    end = max(coord_list)[1]
    regex_str = acc + r"[\s+\S+]+\/\/"
    all_cluster_fasta = re.findall(regex_str, gbff_file)[0]
    all_cluster_fasta = re.findall(r"(?<=ORIGIN)[\s+\S+]+(?=\/\/)",
                                   all_cluster_fasta)[0]
    all_cluster_fasta = re.sub(r" |\d|\n", "", all_cluster_fasta)
    cluster_seq = all_cluster_fasta[start - 1:end - 1]
    g = cluster_seq.count("g")
    c = cluster_seq.count("c")
    a = cluster_seq.count("a")
    t = cluster_seq.count("t")
    clusterGC = (g + c) / (g + c + a + t)
    diffGC = abs(clusterGC - genomeGC)
    #compare minhash values between cluster and genome
    kmer_size = 5
    n = 0
    sc = 1
    cluster_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    cluster_minhash.add_sequence(cluster_seq, force=True)
    cluster_minhash.add_sequence(complement(cluster_seq), force=True)
    #
    genome_minhash = MinHash(n=n, ksize=kmer_size, scaled=sc)
    genome_minhash.add_sequence(genome_seq, force=True)
    genome_minhash.add_sequence(complement(genome_seq), force=True)
    minhash_sim = cluster_minhash.similarity(genome_minhash)
    # genome_minus_cluster=re.sub(cluster_seq,'',genome_seq)
    # #print(len(genome_seq)-len(genome_minus_cluster))
    # genome_minus_cluster_minhash=MinHash(n=n, ksize=kmer_size,scaled=sc)
    # genome_minus_cluster_minhash.add_sequence(genome_minus_cluster,force=True)
    # genome_minus_cluster_minhash.add_sequence(complement(genome_minus_cluster),force=True)
    # minhash_sim_minus_cluster=cluster_minhash.similarity(genome_minus_cluster_minhash)
    #print(minhash_sim)
    #compare tetranucleotide frequency between cluster and genomes
    bases = ['a', 't', 'g', 'c']
    four_mers = [''.join(p) for p in itertools.product(bases, repeat=4)]
    four_mer_count_genome = np.add(
        [genome_seq.count(i) for i in four_mers],
        [complement(genome_seq).count(i) for i in four_mers])
    four_mer_freq_genome = [
        i / sum(four_mer_count_genome) for i in four_mer_count_genome
    ]
    four_mer_count_cluster = np.add(
        [cluster_seq.count(i) for i in four_mers],
        [complement(cluster_seq).count(i) for i in four_mers])
    four_mer_freq_cluster = [
        i / sum(four_mer_count_cluster) for i in four_mer_count_cluster
    ]
    four_mer_distance = scipy.spatial.distance.cityblock(
        four_mer_freq_cluster, four_mer_freq_genome)
    ####
    if sum(neighborhood[neighborhood['query_match'].notnull()]
           ['direction']) < 0:
        neighborhood['actual_start_tmp'] = neighborhood['start_coord']
        neighborhood['start_coord'] = neighborhood['end_coord'] * -1
        neighborhood['end_coord'] = neighborhood['actual_start_tmp'] * -1
        neighborhood['direction'] = neighborhood['direction'] * -1
        neighborhood = neighborhood.sort_values(by='start_coord')
    neighborhood['query_match'] = neighborhood['query_match'].replace(
        np.nan, "x")
    nhbrhood_hit_list = list(neighborhood['query_match'])
    nhbrhood_locus_tags = list(neighborhood['locus_tag'])
    nhbrhood_old_locus_tags = list(neighborhood['old_locus_tag'])
    nhbrhood_prot_ids = list(neighborhood['protein_id'])
    nhbrhood_prot_name = list(neighborhood['protein_name'])
    nhbrhood_prot_seq = list(neighborhood['protein_seq'])
    order = [("| " + gene['query_match'] + " 〉") if gene['direction'] == 1 else
             ("〈 " + gene['query_match'] + " |")
             for index, gene in neighborhood.iterrows()]
    dist = list(
        np.array(neighborhood['start_coord'][1:]) -
        np.array(neighborhood['end_coord'][:-1]))
    dist = ["-" + str(d) + "-" for d in dist]
    adj_coord_list = list(
        zip(neighborhood['start_coord'], neighborhood['end_coord'],
            neighborhood['direction'], neighborhood['query_match']))
    if min(neighborhood['start_coord']) < 0:
        tare_value = abs(min(neighborhood['start_coord']))
        tared_adj_coord_list = list(
            zip([v + tare_value for v in neighborhood['start_coord']],
                [v + tare_value for v in neighborhood['end_coord']],
                neighborhood['direction'], neighborhood['query_match']))
    else:
        tare_value = min(neighborhood['start_coord'])
        tared_adj_coord_list = list(
            zip([v - tare_value for v in neighborhood['start_coord']],
                [v - tare_value for v in neighborhood['end_coord']],
                neighborhood['direction'], neighborhood['query_match']))
    # making an ITOL compatible string
    gene_color_dict = {
        'IaaP': '#ff5969',
        'IaaQ': '#2db34e',
        'IaaR': '#fb77e0',
        'IaaA': '#00bc7e',
        'IaaB': '#8d006e',
        'IaaC': '#cfdd63',
        'IaaD': '#0060d0',
        'IaaE': '#bb7b00',
        'IaaF': '#7c2c29',
        'IaaG': '#f1d17a',
        'IaaH': '#37589E',
        'IaaI': '#ACC92A',
        'IaaJ': '#752AC9',
        'IaaK': '#D4B5E6',
        'IaaL': '#211E45',
        'IaaM': '#BFB3E6',
        'x': '#d1d1d1'
    }
    max_len = tared_adj_coord_list[-1][1]
    itol_diagram = []
    for g in tared_adj_coord_list:
        gene_string = []
        gene_length = g[1] - g[0]
        if g[2] > 0:
            gene_string.append('RE')
            gene_string.append(str(g[0]))
            gene_string.append(str(g[1] - (0.1 * gene_length)))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append(str(g[3]))
            gene_string.append(',')
            gene_string.append('TR')
            gene_string.append(str(g[1] - (0.1 * gene_length)))
            gene_string.append(str(g[1]))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append('')
        else:
            gene_string.append('TL')
            gene_string.append(str(g[0]))
            gene_string.append(str(g[0] + (0.1 * gene_length)))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append('')
            gene_string.append(',')
            gene_string.append('RE')
            gene_string.append(str(g[0] + (0.1 * gene_length)))
            gene_string.append(str(g[1]))
            #gene_string.append('#34b4eb')
            gene_string.append(gene_color_dict[g[3]])
            gene_string.append(str(g[3]))
        itol_gene = '|'.join(gene_string)
        itol_diagram.append(itol_gene)

    itol_diagram_joined = ",".join(map(str, itol_diagram))
    itol_diagram_string = str(max_len) + ',' + itol_diagram_joined
    itol_diagram_string = re.sub(',\|', ',', itol_diagram_string)
    #obtains "| A 〉-23-| B 〉-23-| C 〉"
    synteny_dir_dist = ''.join(sum(zip(order, dist + [0]), ())[:-1])
    synteny_dir_dist = re.sub("iaa", "", synteny_dir_dist)
    #obtains "| A 〉| B 〉| C 〉"
    synteny_dir = ''.join(order)
    synteny_dir = re.sub("iaa", "", synteny_dir)
    #obtains "| A:23.23 〉| B:23.23〉| C:23.23 〉"
    #synteny_dir_pident =''.join(order_pident)
    #synteny_dir_pident = re.sub("iaa" ,"", synteny_dir_pident)
    #obtains "A-B-C"
    synteny = re.sub("\n", "-",
                     neighborhood['query_match'].to_string(index=False))
    synteny = re.sub("Iaa| ", "", synteny)
    synteny_alphabet = "".join([
        gene['query_match'].replace("Iaa", "").upper() if gene['direction']
        == 1 else gene['query_match'].replace("Iaa", "").lower()
        for index, gene in neighborhood.iterrows()
    ])
    cluster_len = max(neighborhood['end_coord']) - min(
        neighborhood['start_coord'])
    assembly = re.sub("\{|\}|\'|>", "", str(set(neighborhood['assembly'])))
    accession = re.sub("\{|\}|\'", "", str(set(neighborhood['accession'])))
    title = re.sub("\{|\}|\'", "", str(set(neighborhood['name'])))
    print(assembly_index_file + " successfully used")
    return ([
        accession, assembly, title,
        len(neighborhood), cluster_len, synteny, synteny_alphabet,
        synteny_dir_dist, synteny_dir, cluster_number, coord_list,
        adj_coord_list, tared_adj_coord_list, itol_diagram_string,
        nhbrhood_hit_list, nhbrhood_locus_tags, nhbrhood_old_locus_tags,
        nhbrhood_prot_ids, nhbrhood_prot_name, nhbrhood_prot_seq, clusterGC,
        genomeGC, diffGC, minhash_sim, four_mer_distance,
        four_mer_freq_cluster, cluster_seq
    ])