Esempio n. 1
0
def chunk_iterator(itr, fn, n=100, show_progress=True, label=None):
    '''Feed `itr` to `fn` piece by piece, optionally behind a progress bar.

    The pieces come from `chunks(itr, n)`; `fn` is called once per piece.

    Arguments:
    - itr the sized iterable to be chunked (its `len()` is taken)
    - fn the function called with each chunk
    - n the chunk size handed to `chunks`
    - show_progress wrap the chunks in a progress bar
    - label the label for the progress bar (None is treated as "")

    Returns None. Nothing is called when `itr` is empty.
    '''
    total = len(itr)
    if total == 0:
        return

    if label is None:
        label = ""

    # With at most one chunk's worth of items a progress bar is pointless, so
    # the label is printed as an ordinary message instead.
    n_chunks = total/n
    if n_chunks <= 1:
        show_progress = False
        swga.message(label)

    pieces = chunks(itr, n)
    if show_progress:
        pieces = progress.bar(
            pieces,
            label=label,
            expected_size=max(n_chunks, 1))

    for piece in pieces:
        fn(piece)
Esempio n. 2
0
    def run(self):
        """Reset stored sets, build the primer graph and process found sets.

        Any sets already in the database are removed first (after a
        confirmation prompt unless `self.force` is set). The output of
        `sets.find` is then handed to `self.process_lines`.
        """
        # A limit below 1 is treated as "no limit"
        if self.max_sets < 1:
            self.max_sets = float("inf")

        # Previously-used sets must be cleared each time because of
        # uniqueness constraints
        has_old_sets = Set.select().count() > 0
        if has_old_sets and not self.force:
            click.confirm("Remove all previously-found sets?", abort=True)
        if has_old_sets:
            self.workspace.reset_sets()

        self.chr_ends = locate.chromosome_ends(self.fg_genome_fp)

        # Bind the scoring expression string to the default scoring function
        # so it can be called later as a plain callable
        self.score_fun = functools.partial(
            score.default_score_set, expression=self.score_expression)

        graph.build_graph(self.max_dimer_bp, GRAPH_FP)

        message(
            "Finding sets. If nothing appears, try relaxing your parameters.")

        finder_options = dict(
            min_bg_bind_dist=self.min_bg_bind_dist,
            min_size=self.min_size,
            max_size=self.max_size,
            bg_length=self.bg_length,
            graph_fp=GRAPH_FP,
            workers=self.workers)
        self.process_lines(sets.find(**finder_options))
Esempio n. 3
0
    def run(self):
        """Clear old sets, compose the compatibility graph and score sets.

        Existing sets are deleted up front (the user is asked to confirm
        unless `self.force` is set); the lines produced by `sets.find` are
        passed on to `self.process_lines`.
        """
        # Anything below 1 means the number of sets is unlimited
        if self.max_sets < 1:
            self.max_sets = float("inf")

        # Uniqueness constraints require wiping previously-used sets first
        n_existing = Set.select().count()
        if n_existing > 0:
            if not self.force:
                click.confirm("Remove all previously-found sets?", abort=True)
            self.workspace.reset_sets()

        self.chr_ends = locate.chromosome_ends(self.fg_genome_fp)

        # Turn the scoring expression string into a callable by binding it
        # to the default scoring function
        scorer = functools.partial(
            score.default_score_set,
            expression=self.score_expression)
        self.score_fun = scorer

        graph.build_graph(self.max_dimer_bp, GRAPH_FP)

        message(
            "Finding sets. If nothing appears, try relaxing your parameters.")

        found_lines = sets.find(
            min_bg_bind_dist=self.min_bg_bind_dist,
            min_size=self.min_size,
            max_size=self.max_size,
            bg_length=self.bg_length,
            graph_fp=GRAPH_FP,
            workers=self.workers)
        self.process_lines(found_lines)
Esempio n. 4
0
    def run(self):
        """Score the primers given in `self.input` and optionally store them.

        The set is scored interactively; `self.user_add_set` decides whether
        it should be written to the database and under which id.
        """
        self.chr_ends = locate.chromosome_ends(self.fg_genome_fp)

        # Bind the scoring expression string to the default scoring function
        # so that it can be used as a callable
        self.score_fun = functools.partial(
            score.default_score_set, expression=self.score_expression)

        primers = Primers(self.input)
        if len(primers) == 0:
            error(
                "No primers specified exist in database, aborting.",
                exception=False)

        bg_dist_mean = score.calculate_bg_dist_mean(primers, self.bg_length)

        # The third element of the result is not needed here
        set_score, variables, _unused = score.score_set(
            primers=primers,
            max_fg_bind_dist=0,
            bg_dist_mean=bg_dist_mean,
            chr_ends=self.chr_ends,
            score_fun=self.score_fun,
            interactive=True)

        do_add_set, set_id = self.user_add_set(set_score, variables)
        if not do_add_set:
            return

        added = workspace.Set.add(
            _id=set_id,
            primers=primers,
            score=set_score,
            scoring_fn=self.score_expression,
            **variables)

        # `Set.add` handing back None is reported as a duplicate set
        if added is not None:
            message("Set {} added successfully.".format(set_id))
        else:
            message("That primer set already exists.")
Esempio n. 5
0
 def _pprint_args(self):
     """Print a green name/version banner followed by every argument."""
     banner = "swga {}, v{}".format(self.name, __version__)
     message(click.style(banner, fg='green'))
     for key, value in self.args.iteritems():
         # Unset or empty arguments are shown as a red "None"
         if value is None or value == "":
             value = click.style("None", fg='red')
         entry = "  - {}: {}".format(key, value)
         message(click.style(entry, fg='blue'))
Esempio n. 6
0
 def filter_min_fg_rate(self, min_bind):
     """Select listed primers whose `fg_freq` is at least `min_bind`.

     Prints how many of the `self.n` primers qualify and returns the query.
     """
     in_list = Primer.seq << self.primers
     binds_enough = Primer.fg_freq >= min_bind
     kept = Primer.select().where(in_list & binds_enough)
     summary = '{}/{} primers bind the foreground genome >= {} times'
     message(summary.format(kept.count(), self.n, min_bind))
     return kept
Esempio n. 7
0
    def filter_max_bg_rate(self, rate):
        """Select listed primers whose `bg_freq` is at most `rate`.

        Prints how many of the `self.n` primers qualify and returns the
        query.
        """
        in_list = Primer.seq << self.primers
        rare_enough = Primer.bg_freq <= rate
        kept = Primer.select().where(in_list & rare_enough)
        summary = '{}/{} primers bind the background genome <= {} times'
        message(summary.format(kept.count(), self.n, rate))
        return kept
Esempio n. 8
0
 def pprint_args(self):
     """Print the command name and its arguments; silent without args."""
     if not self.args:
         return

     swga.message(colored.green("Command: " + self.name))
     with indent(2, quote='-'):
         for key, value in self.args.iteritems():
             # Unset or empty arguments are shown as a red "None"
             missing = value is None or value == ""
             shown = colored.red("None") if missing else value
             swga.message(colored.blue("{}: {}".format(key, shown)))
Esempio n. 9
0
 def update_locations(self, fg_genome_fp):
     """Fill in binding locations for listed primers that lack them.

     Returns the list of primers that were updated.
     """
     lacks_locations = (
         (Primer.seq << self.primers) &
         (Primer._locations >> None))
     pending = list(Primer.select().where(lacks_locations))
     if pending:
         notice = 'Finding binding locations for {} primers...'
         message(notice.format(len(pending)))
     for primer in pending:
         primer._update_locations(fg_genome_fp)
     return pending
Esempio n. 10
0
 def update_melt_temps(self):
     """Fill in the melting temp for listed primers that lack one.

     Returns the list of primers that were updated.
     """
     lacks_tm = (
         (Primer.seq << self.primers) &
         (Primer.tm >> None))
     pending = list(Primer.select().where(lacks_tm))
     if pending:
         notice = 'Finding melting temps for {} primers...'
         message(notice.format(len(pending)))
     for primer in pending:
         primer.update_tm()
     return pending
Esempio n. 11
0
 def update_gini(self, fg_genome_fp):
     """Fill in the Gini coefficient for listed primers that lack one.

     Returns the list of primers that were updated.
     """
     lacks_gini = (
         (Primer.seq << self.primers) &
         (Primer.gini >> None))
     pending = list(Primer.select().where(lacks_gini))
     if pending:
         notice = 'Finding Gini coefficients for {} primers...'
         message(notice.format(len(pending)))
     for primer in pending:
         primer._update_gini(fg_genome_fp)
     return pending
Esempio n. 12
0
    def process_lines(self, setfinder_lines):
        """Score every set emitted by the set finder and store the passers.

        Unparseable lines are skipped with a warning. A status line is
        printed after each scored set, and processing stops once
        `self.max_sets` sets have passed. `setfinder_lines` is always closed
        on the way out.
        """
        n_processed = 0
        n_passed = 0
        best_max_dist = float('inf')

        try:
            for line in setfinder_lines:
                try:
                    primer_ids, bg_dist_mean = score.read_set_finder_line(line)
                except ValueError:
                    warn("Could not parse line:\n\t" + line)
                    continue

                primers = Primers.select_by_ids(primer_ids)
                n_processed += 1

                set_score, variables, max_dist = score.score_set(
                    primers=primers,
                    max_fg_bind_dist=self.max_fg_bind_dist,
                    bg_dist_mean=bg_dist_mean,
                    chr_ends=self.chr_ends,
                    score_fun=self.score_fun,
                    interactive=False)

                # Track the smallest max distance seen across all sets
                if max_dist < best_max_dist:
                    best_max_dist = max_dist

                # The status is printed before the pass counter is bumped
                status = STATUS_LINE.format(
                    n_processed, n_passed, best_max_dist)
                message(status, newline=False)

                # A score of exactly False marks a set that did not pass
                if set_score is False:
                    continue
                n_passed += 1

                Set.add(
                    _id=n_passed,
                    primers=primers,
                    score=set_score,
                    scoring_fn=self.score_expression,
                    **variables)

                if n_passed >= self.max_sets:
                    message("\nDone (scored %i sets)" % n_passed)
                    break
        finally:
            # Raises a GeneratorExit inside the find_sets command, prompting it
            # to quit the subprocess
            setfinder_lines.close()
Esempio n. 13
0
    def filter_tm_range(self, min_tm, max_tm):
        """Select listed primers whose melting temp is within the range.

        `update_melt_temps` is run first so that primers without a stored
        melting temp get one. Both bounds are inclusive.
        """
        self.update_melt_temps()
        criteria = (
            (Primer.seq << self.primers) &
            (Primer.tm <= max_tm) &
            (Primer.tm >= min_tm))
        kept = Primer.select().where(criteria)
        summary = '{}/{} primers have a melting temp between {} and {} C'
        message(summary.format(kept.count(), self.n, min_tm, max_tm))
        return kept
Esempio n. 14
0
    def process_lines(self, setfinder_lines):
        """Read set-finder output line by line, scoring and saving sets.

        Lines that cannot be parsed produce a warning and are skipped. After
        each scored set a status line is written; the loop ends as soon as
        `self.max_sets` sets have passed. The line source is closed in every
        case.
        """
        seen = 0
        accepted = 0
        min_max_dist = float('inf')

        try:
            for raw_line in setfinder_lines:
                try:
                    parsed = score.read_set_finder_line(raw_line)
                    primer_ids, bg_dist_mean = parsed
                except ValueError:
                    warn("Could not parse line:\n\t" + raw_line)
                    continue

                primers = Primers.select_by_ids(primer_ids)
                seen += 1

                set_score, variables, max_dist = score.score_set(
                    primers=primers,
                    max_fg_bind_dist=self.max_fg_bind_dist,
                    bg_dist_mean=bg_dist_mean,
                    chr_ends=self.chr_ends,
                    score_fun=self.score_fun,
                    interactive=False)

                # Remember the smallest max distance seen so far
                if max_dist < min_max_dist:
                    min_max_dist = max_dist

                # Printed with the pass count as it was before this set
                message(
                    STATUS_LINE.format(seen, accepted, min_max_dist),
                    newline=False)

                # Only sets whose score is not exactly False are stored
                if set_score is not False:
                    accepted += 1
                    Set.add(
                        _id=accepted,
                        primers=primers,
                        score=set_score,
                        scoring_fn=self.score_expression,
                        **variables)

                    if accepted >= self.max_sets:
                        message("\nDone (scored %i sets)" % accepted)
                        break
        finally:
            # Raises a GeneratorExit inside the find_sets command, prompting it
            # to quit the subprocess
            setfinder_lines.close()
Esempio n. 15
0
def build_graph(max_hetdimer_bind, outfile):
    '''Write the compatibility graph of all active primers to `outfile`.

    Reports an error when `build_edges` finds no edges at all.
    '''
    # IDs are reassigned first because they are only used for set_finder
    active = Primers.select_active().assign_ids()

    message("Composing primer compatibility graph...")
    compat_edges = build_edges(active, max_hetdimer_bind)
    if len(compat_edges) == 0:
        error(
            "No compatible primers. Try relaxing your parameters.",
            exception=False)

    with open(outfile, 'wb') as handle:
        write_graph(active, compat_edges, handle)
Esempio n. 16
0
def count_kmers(
        fg_genome_fp,
        bg_genome_fp,
        min_size,
        max_size,
        min_fg_bind,
        max_bg_bind,
        max_dimer_bp,
        primer_db,
        exclude_fp,
        exclude_threshold,
        **kwargs):
    '''Count k-mers for every size in the range and add them to the database.

    For each k from `min_size` to `max_size` (inclusive) the foreground and
    background genomes are counted, plus the exclusion fasta when one is
    given. A primer entry is built for every foreground k-mer absent from the
    exclusion counts; entries that come back as `{}` are dropped and the rest
    are written to the database together with their reverse complements.

    Both genome paths (and `exclude_fp`, if set) must be existing files.
    Extra keyword arguments are accepted and ignored.
    '''
    for genome_fp in (fg_genome_fp, bg_genome_fp):
        assert os.path.isfile(genome_fp)

    check_create_tables(primer_db)
    swga.utils.mkdirp(output_dir)

    block_size = 199
    for k in xrange(min_size, max_size + 1):
        fg_counts = swga.primers.count_kmers(k, fg_genome_fp, output_dir)
        bg_counts = swga.primers.count_kmers(k, bg_genome_fp, output_dir)

        if exclude_fp:
            assert os.path.isfile(exclude_fp)
            excluded = swga.primers.count_kmers(
                k, exclude_fp, output_dir, exclude_threshold)
        else:
            excluded = {}

        # Keep kmers found in foreground, merging bg binding values, and
        # excluding those found in the excluded fasta
        candidates = [
            primer_dict(
                seq, fg_counts, bg_counts,
                min_fg_bind, max_bg_bind, max_dimer_bp)
            for seq in fg_counts.viewkeys()
            if seq not in excluded.viewkeys()
        ]
        kept = [entry for entry in candidates if entry != {}]

        # The reported count is doubled, matching add_revcomp=True below
        swga.message("Writing {n} {k}-mers into db in blocks of {cs}..."
                     .format(n=len(kept)*2, k=k, cs=block_size))
        database.add_primers(kept, block_size, add_revcomp=True)

    swga.message("Counted kmers in range %d-%d" % (min_size, max_size))
Esempio n. 17
0
def read_primer_list(lines, fg_genome_fp, bg_genome_fp):
    '''
    Reads in a list of primers, one per line, and returns the corresponding
    records from the primer database.

    The primer sequence is the first space/tab-delimited field of each line
    (trailing line endings, including Windows `\\r\\n`, are stripped first).
    Sequences that have no record in the database are NOT created here: each
    one is reported with a message and skipped.

    :param lines: iterable of text lines, one primer per line
    :param fg_genome_fp: unused; kept so existing callers keep working
    :param bg_genome_fp: unused; kept so existing callers keep working
    :returns: list of the Primer records found in the database
    '''
    seqs = [re.split(r'[ \t]+', line.strip('\r\n'))[0] for line in lines]
    primers = list(Primer.select().where(Primer.seq << seqs).execute())
    if len(primers) < len(seqs):
        # A set keeps the membership test constant-time instead of scanning
        # the list of found sequences once per input line
        found = set(p.seq for p in primers)
        for seq in seqs:
            if seq not in found:
                swga.message(seq + " not in the database; skipping. Add it "
                             "manually with `swga count --input <file>` ")
    return primers
Esempio n. 18
0
def build_graph(max_hetdimer_bind, outfile):
    '''Compose the primer compatibility graph and save it to `outfile`.

    All active primers are used; an error is reported when no edges result.
    '''
    # Primer IDs only matter to set_finder, so they are reset here
    primers = Primers.select_active().assign_ids()
    message("Composing primer compatibility graph...")

    edges = build_edges(primers, max_hetdimer_bind)
    no_edges = len(edges) == 0
    if no_edges:
        error("No compatible primers. Try relaxing your parameters.",
              exception=False)

    with open(outfile, 'wb') as graph_file:
        write_graph(primers, edges, graph_file)
Esempio n. 19
0
    def activate(self, min_active=1):
        """Mark every primer in the list as active.

        :param min_active: The minimum number of primers expected to be
        activated. A note is printed if fewer than this number were updated.
        :returns: self, so calls can be chained.
        """
        update_query = Primer.update(active=True).where(
            Primer.seq << self.primers)
        n_activated = update_query.execute()
        message('Marked {} primers as active.'.format(n_activated))

        if n_activated < min_active:
            note = (
                'Note: Fewer than {} primers were selected ({} passed all the '
                'filters). You may want to try less restrictive filtering '
                'parameters.')
            message(note.format(min_active, n_activated))
        return self
Esempio n. 20
0
    def count_kmers(self):
        """Count k-mers of every configured size and store them as primers.

        Existing primers are removed first (after a confirmation prompt
        unless `self.force` is set). For each k from `self.min_size` to
        `self.max_size` inclusive, a primer entry is built for every
        foreground k-mer that is absent from the exclusion counts; entries
        equal to `{}` are dropped and the rest are added with reverse
        complements.
        """
        # All previous primers must be cleared each time because of
        # uniqueness constraints
        have_primers = Primer.select().count() > 0
        if have_primers and not self.force:
            click.confirm(
                "Remove all previously-found primers and re-count?",
                abort=True)
        if have_primers:
            self.workspace.reset_primers()

        mkdirp(output_dir)

        # Only reported in the progress message below
        block_size = 199
        for k in xrange(self.min_size, self.max_size + 1):
            fg_counts = swga.kmers.count_kmers(
                k, self.fg_genome_fp, output_dir)
            bg_counts = swga.kmers.count_kmers(
                k, self.bg_genome_fp, output_dir)

            if self.exclude_fp:
                assert os.path.isfile(self.exclude_fp)
                excluded = swga.kmers.count_kmers(
                    k, self.exclude_fp, output_dir, self.exclude_threshold)
            else:
                excluded = {}

            # Keep kmers found in foreground, merging bg binding values, and
            # excluding those found in the excluded fasta
            candidates = [
                primer_dict(
                    seq, fg_counts, bg_counts,
                    self.min_fg_bind, self.max_bg_bind, self.max_dimer_bp)
                for seq in fg_counts.viewkeys()
                if seq not in excluded.viewkeys()
            ]
            kept = [entry for entry in candidates if entry != {}]

            # The reported count is doubled, matching add_revcomp=True below
            message(
                "Writing {n} {k}-mers into db in blocks of {cs}..."
                .format(n=len(kept) * 2, k=k, cs=block_size))
            Primers.add(kept, add_revcomp=True)

        message("Counted kmers in range %d-%d" % (self.min_size, self.max_size))
Esempio n. 21
0
    def count_kmers(self):
        """Re-count k-mers for each size in the range and save the primers.

        Primers already in the database are wiped first; the user is asked
        to confirm unless `self.force` is set. For every k between
        `self.min_size` and `self.max_size` (inclusive), foreground k-mers
        not present in the exclusion counts are turned into primer entries,
        empty (`{}`) entries are discarded, and the remainder is added along
        with reverse complements.
        """
        # Uniqueness constraints force us to clear previous primers first
        n_existing = Primer.select().count()
        if n_existing > 0:
            if not self.force:
                click.confirm(
                    "Remove all previously-found primers and re-count?",
                    abort=True)
            self.workspace.reset_primers()

        mkdirp(output_dir)

        size_range = (self.min_size, self.max_size)
        # Only shown in the progress message below
        chunk_size = 199
        for k in xrange(self.min_size, self.max_size + 1):
            fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir)
            bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir)

            ex = {}
            if self.exclude_fp:
                assert os.path.isfile(self.exclude_fp)
                ex = swga.kmers.count_kmers(
                    k, self.exclude_fp, output_dir, self.exclude_threshold)

            # Keep kmers found in foreground, merging bg binding values, and
            # excluding those found in the excluded fasta
            built = [
                primer_dict(
                    seq, fg, bg,
                    self.min_fg_bind, self.max_bg_bind, self.max_dimer_bp)
                for seq in fg.viewkeys()
                if seq not in ex.viewkeys()
            ]
            usable = [entry for entry in built if entry != {}]

            # Doubled in the message to match add_revcomp=True below
            progress_msg = "Writing {n} {k}-mers into db in blocks of {cs}..."
            message(progress_msg.format(
                n=len(usable) * 2, k=k, cs=chunk_size))
            Primers.add(usable, add_revcomp=True)

        message("Counted kmers in range %d-%d" % size_range)
Esempio n. 22
0
    def write(self, output_fp):
        """Write the set's hits as a bedgraph file and report its path.

        The file is named `set_<id>.bedgraph` and placed in the folder
        returned by `swga.export._mk_folder`. It starts with a track header
        built from `self.opts_str`, followed by one line per entry yielded
        by `self._hits_per_record()`.
        """
        set_id = self.set._id

        # Folder handling is delegated to the export helper
        output_folder = swga.export._mk_folder(
            output_fp, self.fg_genome_fp, "set_%s" % set_id)
        bedgraph_fp = os.path.join(
            output_folder, "set_{}.bedgraph".format(set_id))

        with open(bedgraph_fp, 'wb') as out:
            out.write("track type=bedGraph {}\n".format(self.opts_str))
            for name, mid, n_hits in self._hits_per_record():
                # The midpoint serves as both start and end of the interval
                out.write("{} {} {} {}\n".format(name, mid, mid, n_hits))

        swga.message("Bedfile written to {}".format(bedgraph_fp))
Esempio n. 23
0
 def user_add_set(self, set_score, variables):
     """Output set statistics and prompt the user to add the set.

     The prompt is skipped when `self.force` is set.

     :param set_score: the score calculated for the set
     :param variables: mapping of additional set statistics to display
     :returns: a tuple `(add_set, set_id)`. `add_set` is True when the set
         should be stored; `set_id` is the (negative) id to store it under,
         or None when the user declined.
     """
     # Entries in `variables` take precedence over the two defaults
     set_dict = {
         'score': set_score,
         'scoring_fn': self.score_expression
     }
     set_dict.update(variables)
     message("Set statistics:\n - " +
             "\n - ".join(utils.fmtkv(k, v) for k, v in set_dict.items()))

     if not (self.force or click.confirm(
             "Add set to database?", default=True)):
         # No id is allocated for a set that will not be stored
         return False, None

     # User-provided sets have negative numbers, so we find the
     # smallest and decrement by 1
     min_set_id = Set.select(fn.Min(Set._id)).scalar()
     # This is None if there are no other sets yet
     if min_set_id is None:
         min_set_id = 0
     # Clamp at 0 so the new id is negative even when every existing set
     # has a positive id
     set_id = min(min_set_id, 0) - 1
     return True, set_id
Esempio n. 24
0
 def user_add_set(self, set_score, variables):
     """Output set statistics and prompt the user to add the set.

     No prompt is shown when `self.force` is set.

     :param set_score: the score calculated for the set
     :param variables: mapping of additional set statistics to display
     :returns: a tuple `(add_set, set_id)`. `add_set` is True when the set
         should be stored; `set_id` is the (negative) id to store it under,
         or None when the user declined.
     """
     # Entries in `variables` take precedence over the two defaults
     set_dict = {'score': set_score, 'scoring_fn': self.score_expression}
     set_dict.update(variables)
     stat_lines = [utils.fmtkv(k, v) for k, v in set_dict.items()]
     message("Set statistics:\n - " + "\n - ".join(stat_lines))

     add_set = bool(
         self.force or click.confirm("Add set to database?", default=True))
     if not add_set:
         # Previously `set_id` was left unbound here, which made the return
         # statement fail; None signals that no id was allocated
         return False, None

     # User-provided sets have negative numbers, so we find the
     # smallest and decrement by 1
     min_set_id = Set.select(fn.Min(Set._id)).scalar()
     # This is None if there are no other sets yet
     if min_set_id is None:
         min_set_id = 0
     # Clamp at 0 so the new id is negative even when every existing set
     # has a positive id
     set_id = min(min_set_id, 0) - 1
     return True, set_id
Esempio n. 25
0
    def run(self):
        """Score the user's primers as one set and store it on request.

        Scoring runs in interactive mode. Whether the set is stored, and its
        id, come from `self.user_add_set`.
        """
        self.chr_ends = locate.chromosome_ends(self.fg_genome_fp)

        # The scoring expression string becomes a callable by binding it to
        # the default scoring function
        scorer = functools.partial(
            score.default_score_set, expression=self.score_expression)
        self.score_fun = scorer

        primers = Primers(self.input)
        if len(primers) == 0:
            error("No primers specified exist in database, aborting.",
                  exception=False)

        bg_dist_mean = score.calculate_bg_dist_mean(primers, self.bg_length)

        scoring_args = dict(
            primers=primers,
            max_fg_bind_dist=0,
            bg_dist_mean=bg_dist_mean,
            chr_ends=self.chr_ends,
            score_fun=self.score_fun,
            interactive=True)
        # The third element of the result is not needed here
        set_score, variables, _ignored = score.score_set(**scoring_args)

        do_add_set, set_id = self.user_add_set(set_score, variables)

        if do_add_set:
            stored = workspace.Set.add(
                _id=set_id,
                primers=primers,
                score=set_score,
                scoring_fn=self.score_expression,
                **variables)

            # `Set.add` handing back None is reported as a duplicate set
            if stored is None:
                message("That primer set already exists.")
            else:
                message("Set {} added successfully.".format(set_id))
Esempio n. 26
0
def count_kmers(k, genome_fp, cwd, threshold=1):
    '''Return a dict of k-mer -> frequency for the given genome.

    The binary k-mer file `<genome>-<k>mers.solid_kmers_binary` in `cwd` is
    reused when it already exists; otherwise dsk is run in `cwd` to create
    it. Only k-mers with a frequency of at least `threshold` are returned.

    `threshold` must be an int.
    '''
    assert isinstance(threshold, int)
    dsk = resources.get_dsk()

    genome_name = genome_fp.split(os.sep)[-1]
    prefix = '%s-%dmers' % (genome_name, k)
    outfile = os.path.join(cwd, prefix + '.solid_kmers_binary')

    if not os.path.isfile(outfile):
        cmdarr = [dsk, genome_fp, str(k), '-o', prefix, '-t', str(threshold)]
        swga.message("In {cwd}:\n> {cmdstr}".format(
            cwd=cwd, cmdstr=" ".join(cmdarr)))
        try:
            subprocess.check_call(cmdarr, cwd=cwd)
        except:
            # Whatever went wrong (including an interrupt), don't leave an
            # output file behind, then let the error propagate
            if os.path.isfile(outfile):
                os.remove(outfile)
            raise
    else:
        swga.message("Binary kmer file already exists at %s; parsing..."
                     % outfile)

    return {
        kmer: freq
        for kmer, freq in _parse_kmer_binary(outfile)
        if freq >= threshold
    }
Esempio n. 27
0
def count_kmers(k, genome_fp, cwd, threshold=1):
    '''Count the k-mers in a genome and return them with their frequencies.

    An existing `<genome>-<k>mers.solid_kmers_binary` file in `cwd` is parsed
    as-is; if there is none, dsk is invoked in `cwd` to produce it.

    :param k: the number of nucleotides in the k-mers
    :param genome_fp: the file path to the genome/fasta file
    :param cwd: the working directory (holds the intermediate cache file)
    :param threshold: the minimum k-mer frequency; must be an int
    :returns: dict mapping each k-mer with frequency >= threshold to its
        frequency
    '''
    assert isinstance(threshold, int)

    genome_name = genome_fp.split(os.sep)[-1]
    prefix = '%s-%dmers' % (genome_name, k)
    outfile = os.path.join(cwd, prefix + '.solid_kmers_binary')

    cached = os.path.isfile(outfile)
    if cached:
        message("Binary kmer file already exists at %s; parsing..." % outfile)
    else:
        cmdarr = [swga.utils.dsk(), genome_fp]
        cmdarr += [str(k), '-o', prefix, '-t', str(threshold)]
        message("In {cwd}:\n> {cmdstr}".format(
            cwd=cwd, cmdstr=" ".join(cmdarr)))
        try:
            subprocess.check_call(cmdarr, cwd=cwd)
        except:
            # Whatever went wrong (including an interrupt), don't leave an
            # output file behind, then let the error propagate
            if os.path.isfile(outfile):
                os.remove(outfile)
            raise

    counts = {}
    for kmer, freq in _parse_kmer_binary(outfile):
        if freq >= threshold:
            counts[kmer] = freq
    return counts
Esempio n. 28
0
    def filter_max_gini(self, gini_max, fg_genome_fp):
        """Keep only primers whose Gini coefficient is at most `gini_max`.

        Finds binding locations and Gini coefficients for primers that do not
        have them already.

        :param gini_max: max Gini coefficient (0-1, inclusive)
        :param fg_genome_fp: path to the foreground genome, passed on to the
            location and Gini updates
        :returns: the query selecting the primers that pass the filter
        :raises ValueError: if `gini_max` is not between 0 and 1
        """
        # The previous check (`0 > gini_max > 1`) could never be true, so
        # out-of-range values were silently accepted
        if not 0 <= gini_max <= 1:
            raise ValueError('Gini coefficient must be between 0-1')

        # Called one after the other rather than chained, so this works
        # whether or not the update helpers return `self`
        self.update_locations(fg_genome_fp)
        self.update_gini(fg_genome_fp)

        results = Primer.select().where(
            (Primer.seq << self.primers) &
            (Primer.gini <= gini_max))

        message(
            '{}/{} primers have a Gini coefficient <= {}'
            .format(results.count(), self.n, gini_max))

        return results
Esempio n. 29
0
def count_specific_kmers(
        kmers,
        fg_genome_fp,
        bg_genome_fp,
        primer_db,
        **kwargs):
    '''Count the given k-mers in both genomes and add them to the database.

    Sequences already present in the database are reported and skipped. The
    remaining ones are grouped by length so each k is counted only once per
    genome; sequences for which `primer_dict` raises KeyError are reported
    as missing from the foreground genome. No reverse complements are added.

    Extra keyword arguments are accepted and ignored.
    '''
    try:
        # Skip primers that already exist and warn users
        known = [p.seq for p in Primer.select().where(Primer.seq << kmers)]
        for seq in known:
            swga.message("{} already exists in db, skipping...".format(seq))
        kmers = [seq for seq in kmers if seq not in known]
    except OperationalError:
        # If this fails due to an OperationalError, it probably means the
        # database tables haven't been created yet
        check_create_tables(primer_db)
        swga.utils.mkdirp(output_dir)

    # Group the kmers by length to avoid repeatedly counting kmers of the
    # same size
    by_length = defaultdict(list)
    for seq in kmers:
        by_length[len(seq)].append(seq)

    block_size = 199
    for k, mers in by_length.items():
        fg_counts = swga.primers.count_kmers(k, fg_genome_fp, output_dir, 1)
        bg_counts = swga.primers.count_kmers(k, bg_genome_fp, output_dir, 1)

        rows = []
        for mer in mers:
            try:
                row = primer_dict(mer, fg_counts, bg_counts, 0, INF, INF)
            except KeyError:
                swga.message(
                    "{} does not exist in foreground genome, skipping..."
                    .format(mer))
            else:
                rows.append(row)

        swga.message(
            "Writing {n} {k}-mers into db in blocks of {cs}..."
            .format(n=len(rows), k=k, cs=block_size))
        database.add_primers(rows, block_size, add_revcomp=False)
Esempio n. 30
0
    def count_specific_kmers(self, kmers):
        """Count the given k-mers in both genomes and add them as primers.

        K-mers already in the database are reported and skipped. The rest
        are grouped by length so each k is counted only once per genome;
        k-mers for which `primer_dict` raises KeyError are reported as
        missing from the foreground genome. No reverse complements are added.
        """
        try:
            # Skip primers that already exist and warn users
            known = Primers.select_by_seqs(kmers)
            for entry in known:
                message("{} already exists in db, skipping...".format(entry))
            kmers = [seq for seq in kmers if seq not in known]
        except OperationalError:
            # If this fails due to an OperationalError, it probably means the
            # database tables haven't been created yet.
            error(
                "It doesn't appear that the workspace has been initialized: "
                "run `swga init' first.")
        mkdirp(output_dir)

        # Group the kmers by length to avoid repeatedly counting kmers of the
        # same size
        by_length = defaultdict(list)
        for seq in kmers:
            by_length[len(seq)].append(seq)

        # Only reported in the progress message below
        block_size = 199
        for k, mers in by_length.items():
            fg_counts = swga.kmers.count_kmers(
                k, self.fg_genome_fp, output_dir, 1)
            bg_counts = swga.kmers.count_kmers(
                k, self.bg_genome_fp, output_dir, 1)

            rows = []
            for mer in mers:
                try:
                    row = primer_dict(mer, fg_counts, bg_counts, 0, INF, INF)
                except KeyError:
                    message(
                        "{} does not exist in foreground genome, skipping..."
                        .format(mer))
                else:
                    rows.append(row)

            message(
                "Writing {n} {k}-mers into db in blocks of {cs}..."
                .format(n=len(rows), k=k, cs=block_size))
            Primers.add(rows, add_revcomp=False)
Esempio n. 31
0
    def count_specific_kmers(self, kmers):
        """Add the listed k-mers to the database with their genome counts.

        Anything already stored is reported and left out. The remaining
        k-mers are bucketed by length, so each genome is counted once per
        length; a k-mer for which `primer_dict` raises KeyError is reported
        as absent from the foreground genome. Reverse complements are not
        added.
        """
        try:
            # Skip primers that already exist and warn users
            existing = Primers.select_by_seqs(kmers)
            for found in existing:
                message(
                    "{} already exists in db, skipping...".format(found))
            kmers = [kmer for kmer in kmers if kmer not in existing]
        except OperationalError:
            # If this fails due to an OperationalError, it probably means the
            # database tables haven't been created yet.
            error("It doesn't appear that the workspace has been initialized: "
                  "run `swga init' first.")
        mkdirp(output_dir)

        # Group the kmers by length to avoid repeatedly counting kmers of the
        # same size
        buckets = defaultdict(list)
        for kmer in kmers:
            buckets[len(kmer)].append(kmer)

        missing_msg = "{} does not exist in foreground genome, skipping..."
        writing_msg = "Writing {n} {k}-mers into db in blocks of {cs}..."
        for k, mers in buckets.items():
            fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir, 1)
            bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir, 1)

            new_primers = []
            for mer in mers:
                try:
                    new_primers.append(primer_dict(mer, fg, bg, 0, INF, INF))
                except KeyError:
                    message(missing_msg.format(mer))

            # Only reported in the message; not passed to `Primers.add`
            chunk_size = 199
            message(writing_msg.format(
                n=len(new_primers), k=k, cs=chunk_size))
            Primers.add(new_primers, add_revcomp=False)
Esempio n. 32
0
 def summarize(self):
     """Report how many primers are in the list so far; returns self."""
     status = '{} primers satisfy all filters so far.'.format(self.n)
     message(status)
     return self