Ejemplo n.º 1
0
    def run(self):
        """Pick the primers to work on, optionally filter them, then activate.

        When ``self.input`` is set, the primers are built from that file.
        Otherwise ``Primers()`` is built with no arguments (the database,
        according to the original comment) and ``self.skip_filtering`` is
        forced to ``False``.
        """
        if not self.input:
            # Without an input file, filtering is never skipped.
            self.skip_filtering = False
            primers = Primers()
        else:
            with open(self.input, 'rb') as handle:
                primers = Primers(handle)

        assert isinstance(primers, Primers)

        # Clear any existing "active" marks before activating anew.
        deactivate_all = Primer.update(active=False)
        deactivate_all.execute()

        if not self.skip_filtering:
            # The filter pipeline, one step per line; each step is invoked on
            # whatever the previous step returned.
            stage = primers.filter_min_fg_rate(self.min_fg_bind)
            stage = stage.filter_max_bg_rate(self.max_bg_bind)
            stage = stage.summarize()
            stage = stage.filter_tm_range(self.min_tm, self.max_tm)
            stage = stage.limit_to(self.max_primers)
            stage.filter_max_gini(self.max_gini, self.fg_genome_fp)

        primers.activate(self.max_primers)
Ejemplo n.º 2
0
    def run(self):
        """Update and activate the primers built from ``self.input``.

        Calls ``update_melt_temps()``, ``update_locations(self.fg_genome_fp)``
        and ``activate()`` in sequence on ``Primers(self.input)``.

        Raises:
            AttributeError: re-raised unchanged after a warning is emitted.
        """
        primers = Primers(self.input)

        try:
            (primers
             .update_melt_temps()
             .update_locations(self.fg_genome_fp)
             .activate())
        except AttributeError as e:
            # Format the exception itself rather than ``e.message``: that
            # attribute is deprecated (PEP 352) and missing on Python 3, where
            # reading it would raise a second AttributeError and hide this one.
            warn("Error updating database: '{}'".format(e))
            # A bare ``raise`` keeps the original traceback; ``raise e`` would
            # restart it at this line on Python 2.
            raise
Ejemplo n.º 3
0
    def run(self):
        """Score the primers from ``self.input`` as one set and offer to save it.

        The set is scored with ``score.score_set`` in interactive mode. What
        happens next depends on ``self.user_add_set``: if it declines, nothing
        is stored; otherwise the set is added through ``workspace.Set.add``
        and the outcome is reported.
        """
        self.chr_ends = locate.chromosome_ends(self.fg_genome_fp)
        # Pre-bind the scoring expression string so that score_fun can later
        # be called without it.
        self.score_fun = functools.partial(
            score.default_score_set, expression=self.score_expression)

        primers = Primers(self.input)
        primer_count = len(primers)
        if primer_count == 0:
            error(
                "No primers specified exist in database, aborting.",
                exception=False)

        bg_dist_mean = score.calculate_bg_dist_mean(primers, self.bg_length)

        set_score, variables, _ = score.score_set(
            primers=primers,
            max_fg_bind_dist=0,
            bg_dist_mean=bg_dist_mean,
            chr_ends=self.chr_ends,
            score_fun=self.score_fun,
            interactive=True)

        do_add_set, set_id = self.user_add_set(set_score, variables)
        if not do_add_set:
            return

        stored_set = workspace.Set.add(
            _id=set_id,
            primers=primers,
            score=set_score,
            scoring_fn=self.score_expression,
            **variables)

        # Set.add returning None is treated as "this set already exists".
        if stored_set is None:
            message("That primer set already exists.")
        else:
            message("Set {} added successfully.".format(set_id))
Ejemplo n.º 4
0
    def count_kmers(self):
        """Count k-mers of every size from min_size to max_size and store them.

        For each size, k-mers are counted in the foreground and background
        genomes (and in the exclusion file, when ``self.exclude_fp`` is set).
        Foreground k-mers not present in the exclusion counts are turned into
        primer dicts and added to the database with their reverse complements.
        """
        # Previous primers have to go first because of uniqueness constraints;
        # ask before wiping unless --force was given.
        already_stored = Primer.select().count()
        if already_stored > 0:
            if not self.force:
                click.confirm(
                    "Remove all previously-found primers and re-count?",
                    abort=True)
            self.workspace.reset_primers()

        mkdirp(output_dir)

        # Only used in the progress message below.
        chunk_size = 199

        for k in xrange(self.min_size, self.max_size + 1):
            fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir)
            bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir)

            if not self.exclude_fp:
                ex = {}
            else:
                assert os.path.isfile(self.exclude_fp)
                ex = swga.kmers.count_kmers(
                    k, self.exclude_fp, output_dir, self.exclude_threshold)

            # Walk the foreground k-mers, skip the excluded ones, and keep
            # every non-empty primer dict.
            kept = []
            for seq in fg.viewkeys():
                if seq in ex.viewkeys():
                    continue
                entry = primer_dict(seq, fg, bg, self.min_fg_bind,
                                    self.max_bg_bind, self.max_dimer_bp)
                if entry != {}:
                    kept.append(entry)

            # Doubled because reverse complements are added as well.
            total = len(kept) * 2
            message(
                "Writing {n} {k}-mers into db in blocks of {cs}...".format(
                    n=total, k=k, cs=chunk_size))
            Primers.add(kept, add_revcomp=True)

        size_range = (self.min_size, self.max_size)
        message("Counted kmers in range %d-%d" % size_range)
Ejemplo n.º 5
0
    def count_kmers(self):
        """Count and store k-mers for each size in ``min_size..max_size``.

        Existing primers are reset first (after confirmation unless
        ``self.force`` is set). Each k-mer size is then counted in the
        foreground, background and optional exclusion files. The surviving
        foreground k-mers are written to the database together with their
        reverse complements.
        """
        # Uniqueness constraints mean old primers must be cleared every time.
        have_primers = Primer.select().count() > 0
        if have_primers:
            if not self.force:
                click.confirm(
                    "Remove all previously-found primers and re-count?",
                    abort=True)
            self.workspace.reset_primers()

        mkdirp(output_dir)

        for k in xrange(self.min_size, self.max_size + 1):
            fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir)
            bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir)

            ex = {}
            if self.exclude_fp:
                assert os.path.isfile(self.exclude_fp)
                ex = swga.kmers.count_kmers(k, self.exclude_fp, output_dir,
                                            self.exclude_threshold)

            # Lazily build a primer dict for each foreground k-mer that is not
            # in the exclusion counts, then drop the empty ones.
            candidates = (
                primer_dict(seq, fg, bg, self.min_fg_bind, self.max_bg_bind,
                            self.max_dimer_bp)
                for seq in fg.viewkeys()
                if seq not in ex.viewkeys())
            kmers = [cand for cand in candidates if cand != {}]

            chunk_size = 199
            status = "Writing {n} {k}-mers into db in blocks of {cs}..."
            # n is doubled because reverse complements are added too.
            message(status.format(n=len(kmers) * 2, k=k, cs=chunk_size))
            Primers.add(kmers, add_revcomp=True)

        message(
            "Counted kmers in range %d-%d" % (self.min_size, self.max_size))
Ejemplo n.º 6
0
    def count_specific_kmers(self, kmers):
        """Count the given k-mers in both genomes and add them to the database.

        Sequences that ``Primers.select_by_seqs`` already finds are reported
        and skipped. The rest are grouped by length so that each k-mer size is
        counted only once per genome. No reverse complements are added.
        """
        remaining = kmers
        try:
            # Report primers that are already stored and leave them out.
            existing = Primers.select_by_seqs(kmers)
            for found in existing:
                message("{} already exists in db, skipping...".format(found))
            remaining = [seq for seq in kmers if seq not in existing]
        except OperationalError:
            # An OperationalError here most likely means the database tables
            # have not been created yet.
            error(
                "It doesn't appear that the workspace has been initialized: "
                "run `swga init' first.")
        mkdirp(output_dir)

        # Bucket by length so each size is counted a single time.
        by_length = defaultdict(list)
        for seq in remaining:
            by_length[len(seq)].append(seq)

        chunk_size = 199
        for k, mers in by_length.items():
            fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir, 1)
            bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir, 1)

            primers = []
            for mer in mers:
                try:
                    entry = primer_dict(mer, fg, bg, 0, INF, INF)
                except KeyError:
                    message(
                        "{} does not exist in foreground genome, skipping..."
                        .format(mer))
                else:
                    primers.append(entry)

            # NOTE(review): any empty dicts returned by primer_dict are not
            # filtered out here -- confirm whether that can happen with these
            # thresholds.
            message(
                "Writing {n} {k}-mers into db in blocks of {cs}...".format(
                    n=len(primers), k=k, cs=chunk_size))
            Primers.add(primers, add_revcomp=False)
Ejemplo n.º 7
0
    def process_lines(self, setfinder_lines):
        """Score each set read from ``setfinder_lines`` and store the passing ones.

        Lines that cannot be parsed are warned about and skipped. Processing
        stops once ``self.max_sets`` sets have passed, and ``setfinder_lines``
        is always closed on the way out.
        """
        processed = 0
        passed = 0
        smallest_max_dist = float('inf')

        try:
            for raw_line in setfinder_lines:
                try:
                    parsed = score.read_set_finder_line(raw_line)
                    primer_ids, bg_dist_mean = parsed
                except ValueError:
                    warn("Could not parse line:\n\t" + raw_line)
                    continue

                primers = Primers.select_by_ids(primer_ids)
                processed += 1

                set_score, variables, max_dist = score.score_set(
                    primers=primers,
                    max_fg_bind_dist=self.max_fg_bind_dist,
                    bg_dist_mean=bg_dist_mean,
                    chr_ends=self.chr_ends,
                    score_fun=self.score_fun,
                    interactive=False)

                # Track the smallest max distance seen so far for the status
                # line.
                smallest_max_dist = min(smallest_max_dist, max_dist)

                status = STATUS_LINE.format(
                    processed, passed, smallest_max_dist)
                message(status, newline=False)

                # A score of exactly False means the set did not pass.
                if set_score is False:
                    continue
                passed += 1

                Set.add(_id=passed,
                        primers=primers,
                        score=set_score,
                        scoring_fn=self.score_expression,
                        **variables)

                if passed >= self.max_sets:
                    message("\nDone (scored %i sets)" % passed)
                    break
        finally:
            # Closing raises GeneratorExit inside the find_sets command, which
            # prompts it to quit its subprocess.
            setfinder_lines.close()
Ejemplo n.º 8
0
    def count_specific_kmers(self, kmers):
        """Add the requested k-mers to the database with fg/bg counts.

        K-mers already returned by ``Primers.select_by_seqs`` are announced
        and dropped. The others are counted per length in the foreground and
        background genomes and written without reverse complements.
        """
        try:
            # Tell the user about primers that are already present, then
            # remove them from the work list.
            existing = Primers.select_by_seqs(kmers)
            for known in existing:
                message("{} already exists in db, skipping...".format(known))
            kmers = [kmer for kmer in kmers if kmer not in existing]
        except OperationalError:
            # Most likely the database tables do not exist yet.
            not_initialized = (
                "It doesn't appear that the workspace has been initialized: "
                "run `swga init' first.")
            error(not_initialized)
        mkdirp(output_dir)

        # Group by k-mer length so that each size is only counted once.
        kmers_by_length = defaultdict(list)
        for kmer in kmers:
            same_length = kmers_by_length[len(kmer)]
            same_length.append(kmer)

        for k, mers in kmers_by_length.items():
            fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir, 1)
            bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir, 1)

            primers = []
            for mer in mers:
                try:
                    primer = primer_dict(mer, fg, bg, 0, INF, INF)
                except KeyError:
                    missing = (
                        "{} does not exist in foreground genome, skipping...")
                    message(missing.format(mer))
                else:
                    primers.append(primer)

            # NOTE(review): empty primer dicts, if primer_dict can return them
            # here, are written as-is -- confirm.
            chunk_size = 199
            progress = "Writing {n} {k}-mers into db in blocks of {cs}..."
            message(progress.format(n=len(primers), k=k, cs=chunk_size))
            Primers.add(primers, add_revcomp=False)
Ejemplo n.º 9
0
    def process_lines(self, setfinder_lines):
        """Read set_finder output lines, score each set, and save passing sets.

        Unparseable lines produce a warning and are skipped. The loop ends
        early when ``self.max_sets`` sets have passed. ``setfinder_lines`` is
        closed whether the loop finishes, breaks, or raises.
        """
        passed = 0
        processed = 0
        smallest_max_dist = float('inf')

        try:
            for line in setfinder_lines:
                try:
                    primer_ids, bg_dist_mean = score.read_set_finder_line(line)
                except ValueError:
                    warn("Could not parse line:\n\t" + line)
                    continue

                primers = Primers.select_by_ids(primer_ids)
                processed += 1

                scored = score.score_set(
                    primers=primers,
                    max_fg_bind_dist=self.max_fg_bind_dist,
                    bg_dist_mean=bg_dist_mean,
                    chr_ends=self.chr_ends,
                    score_fun=self.score_fun,
                    interactive=False)
                set_score, variables, max_dist = scored

                # Keep the running minimum of max_dist for the status line.
                smallest_max_dist = (
                    max_dist if max_dist < smallest_max_dist
                    else smallest_max_dist)

                message(
                    STATUS_LINE.format(processed, passed, smallest_max_dist),
                    newline=False)

                # Move on to the next line if this set did not pass.
                if set_score is False:
                    continue

                passed += 1
                Set.add(
                    _id=passed,
                    primers=primers,
                    score=set_score,
                    scoring_fn=self.score_expression,
                    **variables)

                if passed >= self.max_sets:
                    message("\nDone (scored %i sets)" % passed)
                    break
        finally:
            # close() raises GeneratorExit inside the find_sets command so
            # that it quits its subprocess.
            setfinder_lines.close()
Ejemplo n.º 10
0
def build_graph(max_hetdimer_bind, outfile):
    '''Write a primer compatibility graph for all active primers to outfile.

    The active primers are selected and given fresh IDs, edges are built with
    ``build_edges`` using ``max_hetdimer_bind``, and the result is written
    with ``write_graph``. An error is reported when no edges are produced.
    '''
    # IDs are reassigned here because they are only used for set_finder.
    active = Primers.select_active()
    primers = active.assign_ids()

    message("Composing primer compatibility graph...")
    edges = build_edges(primers, max_hetdimer_bind)

    edge_count = len(edges)
    if edge_count == 0:
        error(
            "No compatible primers. Try relaxing your parameters.",
            exception=False)

    with open(outfile, 'wb') as graph_file:
        write_graph(primers, edges, graph_file)
Ejemplo n.º 11
0
def build_graph(max_hetdimer_bind, outfile):
    '''Build the compatibility graph of the active primers and save it.

    ``outfile`` is opened in binary write mode and handed to ``write_graph``
    together with the primers and the edges from ``build_edges``.
    '''
    # Primer IDs only matter to set_finder, so they are reset on each build.
    selected = Primers.select_active()
    primers = selected.assign_ids()
    message("Composing primer compatibility graph...")

    edges = build_edges(primers, max_hetdimer_bind)
    no_edges = len(edges) == 0
    if no_edges:
        error("No compatible primers. Try relaxing your parameters.",
              exception=False)

    with open(outfile, 'wb') as sink:
        write_graph(primers, edges, sink)
Ejemplo n.º 12
0
    def run(self):
        """Load primers, apply the filter pipeline unless skipped, and activate.

        With ``self.input`` set, primers come from that file (opened in binary
        mode). Without it, ``Primers()`` is used and ``self.skip_filtering``
        is reset to ``False``.
        """
        use_file = bool(self.input)
        if use_file:
            with open(self.input, 'rb') as source:
                primers = Primers(source)
        else:
            # No input file (the original comment says primers then come from
            # the db); filtering is always applied in this case.
            self.skip_filtering = False
            primers = Primers()

        assert isinstance(primers, Primers)

        # Remove any existing active marks.
        Primer.update(active=False).execute()

        if not self.skip_filtering:
            # One filter per line; the value of the chain is not used.
            (
                primers
                .filter_min_fg_rate(self.min_fg_bind)
                .filter_max_bg_rate(self.max_bg_bind)
                .summarize()
                .filter_tm_range(self.min_tm, self.max_tm)
                .limit_to(self.max_primers)
                .filter_max_gini(self.max_gini, self.fg_genome_fp)
            )

        primers.activate(self.max_primers)
Ejemplo n.º 13
0
 def test_add_primers(self):
     '''Adding with add_revcomp=True must also store the reverse complement.'''
     Primers.add([{'seq': "AAAA"}], add_revcomp=True)
     # "TTTT" is the reverse complement of "AAAA".
     revcomp_rows = Primer.select().where(Primer.seq == "TTTT").count()
     assert revcomp_rows == 1