Example #1
0
def count_kmers(
        fg_genome_fp,
        bg_genome_fp,
        min_size,
        max_size,
        min_fg_bind,
        max_bg_bind,
        max_dimer_bp,
        primer_db,
        exclude_fp,
        exclude_threshold,
        **kwargs):
    """Count k-mers for each size in a range and write them to the primer db.

    For every k from ``min_size`` to ``max_size`` (inclusive), k-mers are
    counted in the foreground and background genomes. Each foreground k-mer
    that is not present in the exclusion counts is passed to ``primer_dict``;
    results equal to ``{}`` are discarded and the remainder are handed to
    ``database.add_primers`` with ``add_revcomp=True``.

    Args:
        fg_genome_fp: Path to the foreground genome file; must exist.
        bg_genome_fp: Path to the background genome file; must exist.
        min_size: Smallest k-mer length to count.
        max_size: Largest k-mer length to count (inclusive).
        min_fg_bind: Forwarded unchanged to ``primer_dict``.
        max_bg_bind: Forwarded unchanged to ``primer_dict``.
        max_dimer_bp: Forwarded unchanged to ``primer_dict``.
        primer_db: Passed to ``check_create_tables``.
        exclude_fp: Optional path to a file whose k-mers are excluded. When
            falsy, nothing is excluded.
        exclude_threshold: Extra positional argument given to
            ``swga.primers.count_kmers`` when counting ``exclude_fp``.
        **kwargs: Accepted and ignored.

    Raises:
        AssertionError: If any of the supplied file paths is not a file.
    """
    assert os.path.isfile(fg_genome_fp)
    assert os.path.isfile(bg_genome_fp)
    if exclude_fp:
        # Validate once, before any counting, rather than on every k.
        assert os.path.isfile(exclude_fp)

    check_create_tables(primer_db)
    # NOTE(review): output_dir is not a parameter of this function; it is
    # expected to be resolvable from the enclosing module scope -- confirm.
    swga.utils.mkdirp(output_dir)

    chunk_size = 199
    for k in range(min_size, max_size + 1):
        fg = swga.primers.count_kmers(k, fg_genome_fp, output_dir)
        bg = swga.primers.count_kmers(k, bg_genome_fp, output_dir)

        if exclude_fp:
            ex = swga.primers.count_kmers(
                k, exclude_fp, output_dir, exclude_threshold)
        else:
            ex = {}

        # Keep kmers found in foreground, merging bg binding values, and
        # excluding those found in the excluded fasta. primer_dict signals a
        # rejected kmer with an empty dict, which is skipped here.
        kmers = []
        for seq in fg:
            if seq in ex:
                continue
            primer = primer_dict(
                seq, fg, bg, min_fg_bind, max_bg_bind, max_dimer_bp)
            if primer != {}:
                kmers.append(primer)

        # The reported count is doubled because reverse complements are
        # requested from add_primers below.
        swga.message("Writing {n} {k}-mers into db in blocks of {cs}..."
                     .format(n=len(kmers) * 2, k=k, cs=chunk_size))
        database.add_primers(kmers, chunk_size, add_revcomp=True)

    swga.message("Counted kmers in range %d-%d" % (min_size, max_size))
Example #2
0
def count_specific_kmers(
        kmers,
        fg_genome_fp,
        bg_genome_fp,
        primer_db,
        **kwargs):
    """Count a given set of k-mers and write them to the primer db.

    K-mers whose sequence is already stored as a ``Primer`` are reported and
    skipped. The rest are grouped by length so each genome is counted once
    per length, turned into primer records with ``primer_dict`` and handed to
    ``database.add_primers`` with ``add_revcomp=False``.

    Args:
        kmers: Sequences to count.
        fg_genome_fp: Path to the foreground genome file.
        bg_genome_fp: Path to the background genome file.
        primer_db: Passed to ``check_create_tables`` if the lookup of
            existing primers raises ``OperationalError``.
        **kwargs: Accepted and ignored.
    """
    try:
        existing = [p.seq for p in Primer.select().where(Primer.seq << kmers)]
    except OperationalError:
        # If this fails due to an OperationalError, it probably means the
        # database tables haven't been created yet
        check_create_tables(primer_db)
        # NOTE(review): output_dir is not a parameter of this function; it is
        # expected to be resolvable from the enclosing module scope -- confirm.
        swga.utils.mkdirp(output_dir)
    else:
        # Skip primers that already exist and warn users
        for seq in existing:
            swga.message("{} already exists in db, skipping...".format(seq))
        already_stored = set(existing)
        kmers = [kmer for kmer in kmers if kmer not in already_stored]

    # Group the kmers by length to avoid repeatedly counting kmers of the same size
    kmers_by_length = defaultdict(list)
    for kmer in kmers:
        kmers_by_length[len(kmer)].append(kmer)

    chunk_size = 199
    for k, mers in kmers_by_length.items():
        fg = swga.primers.count_kmers(k, fg_genome_fp, output_dir, 1)
        bg = swga.primers.count_kmers(k, bg_genome_fp, output_dir, 1)
        primers = []
        for mer in mers:
            try:
                primer = primer_dict(mer, fg, bg, 0, INF, INF)
            except KeyError:
                swga.message(
                    "{} does not exist in foreground genome, skipping..."
                    .format(mer))
                continue
            # Omit any primer that was returned empty so that no blank
            # record is handed to add_primers.
            if primer != {}:
                primers.append(primer)

        swga.message(
            "Writing {n} {k}-mers into db in blocks of {cs}..."
            .format(n=len(primers), k=k, cs=chunk_size))
        database.add_primers(primers, chunk_size, add_revcomp=False)
Example #3
0
 def test_add_primers(self, initdb):
     """Adding with add_revcomp=True must also store the reverse complement."""
     database.add_primers([{'seq': "AAAA"}], add_revcomp=True)
     # "TTTT" is the reverse complement of the single primer added above.
     revcomp_rows = Primer.select().where(Primer.seq == "TTTT")
     assert revcomp_rows.count() == 1