def chunk_iterator(itr, fn, n=100, show_progress=True, label=None):
    '''Breaks an iterable into chunks and applies a function to each chunk.

    Arguments:
    - itr  the iterable to be chunked (must support len())
    - fn   the function to be applied to each chunk
    - n    the size of each chunk
    - show_progress  show a progress bar
    - label  the label to show on the progress bar
    '''
    if len(itr) == 0:
        return
    label = "" if label is None else label
    # Ceiling division gives the true number of chunks produced by
    # chunks(itr, n). The original floor division under-counted (e.g.
    # 150 items with n=100 gave 1), which both disabled the progress bar
    # for two-chunk runs and under-sized expected_size.
    n_chunks = -(-len(itr) // n)
    if n_chunks <= 1:
        show_progress = False
    swga.message(label)
    chunked_itr = chunks(itr, n)
    if show_progress:
        chunked = progress.bar(
            chunked_itr, label=label, expected_size=n_chunks)
    else:
        chunked = chunked_itr
    for chunk in chunked:
        fn(chunk)
def run(self):
    """Search for and score compatible primer sets.

    Clears previously-found sets (prompting unless ``self.force``),
    builds the primer compatibility graph, then streams candidate sets
    from the set finder into ``process_lines``.
    """
    if self.max_sets < 1:
        # Non-positive max_sets means "no limit"
        self.max_sets = float("inf")
    # We need to clear all the previously-used sets each time due to
    # uniqueness constraints
    if Set.select().count() > 0:
        if not self.force:
            click.confirm("Remove all previously-found sets?", abort=True)
        self.workspace.reset_sets()
    self.chr_ends = locate.chromosome_ends(self.fg_genome_fp)
    # Evaluate the scoring expression from a string and return it as a
    # callable function
    self.score_fun = functools.partial(
        score.default_score_set,
        expression=self.score_expression)
    graph.build_graph(self.max_dimer_bp, GRAPH_FP)
    message(
        "Finding sets. If nothing appears, try relaxing your parameters.")
    # sets.find streams results; process_lines consumes them lazily
    setfinder_lines = sets.find(
        min_bg_bind_dist=self.min_bg_bind_dist,
        min_size=self.min_size,
        max_size=self.max_size,
        bg_length=self.bg_length,
        graph_fp=GRAPH_FP,
        workers=self.workers)
    self.process_lines(setfinder_lines)
def run(self):
    """Search for and score compatible primer sets.

    Clears previously-found sets (prompting unless ``self.force``),
    builds the primer compatibility graph, then streams candidate sets
    from the set finder into ``process_lines``.
    """
    if self.max_sets < 1:
        # Non-positive max_sets means "no limit"
        self.max_sets = float("inf")
    # We need to clear all the previously-used sets each time due to
    # uniqueness constraints
    if Set.select().count() > 0:
        if not self.force:
            click.confirm("Remove all previously-found sets?", abort=True)
        self.workspace.reset_sets()
    self.chr_ends = locate.chromosome_ends(self.fg_genome_fp)
    # Evaluate the scoring expression from a string and return it as a
    # callable function
    self.score_fun = functools.partial(score.default_score_set,
                                       expression=self.score_expression)
    graph.build_graph(self.max_dimer_bp, GRAPH_FP)
    message(
        "Finding sets. If nothing appears, try relaxing your parameters.")
    # sets.find streams results; process_lines consumes them lazily
    setfinder_lines = sets.find(min_bg_bind_dist=self.min_bg_bind_dist,
                                min_size=self.min_size,
                                max_size=self.max_size,
                                bg_length=self.bg_length,
                                graph_fp=GRAPH_FP,
                                workers=self.workers)
    self.process_lines(setfinder_lines)
def run(self): self.chr_ends = locate.chromosome_ends(self.fg_genome_fp) # Evaluate the scoring expression from a string and return it as a # callable function self.score_fun = functools.partial(score.default_score_set, expression=self.score_expression) primers = Primers(self.input) if len(primers) == 0: error("No primers specified exist in database, aborting.", exception=False) bg_dist_mean = score.calculate_bg_dist_mean(primers, self.bg_length) set_score, variables, _ = score.score_set(primers=primers, max_fg_bind_dist=0, bg_dist_mean=bg_dist_mean, chr_ends=self.chr_ends, score_fun=self.score_fun, interactive=True) do_add_set, set_id = self.user_add_set(set_score, variables) if do_add_set: s = workspace.Set.add(_id=set_id, primers=primers, score=set_score, scoring_fn=self.score_expression, **variables) set_added = s is not None if set_added: message("Set {} added successfully.".format(set_id)) else: message("That primer set already exists.")
def _pprint_args(self):
    """Print the command name/version header and each argument pair."""
    header = "swga {}, v{}".format(self.name, __version__)
    message(click.style(header, fg='green'))
    for name, value in self.args.iteritems():
        # Unset arguments are highlighted in red
        if value is None or value == "":
            value = click.style("None", fg='red')
        message(click.style(" - {}: {}".format(name, value), fg='blue'))
def filter_min_fg_rate(self, min_bind):
    """Keep only primers binding the foreground at least `min_bind` times."""
    passing = Primer.select().where(
        (Primer.seq << self.primers) &
        (Primer.fg_freq >= min_bind))
    report = '{}/{} primers bind the foreground genome >= {} times'
    message(report.format(passing.count(), self.n, min_bind))
    return passing
def filter_max_bg_rate(self, rate):
    """Keep only primers binding the background at most `rate` times."""
    passing = Primer.select().where(
        (Primer.seq << self.primers) &
        (Primer.bg_freq <= rate))
    report = '{}/{} primers bind the background genome <= {} times'
    message(report.format(passing.count(), self.n, rate))
    return passing
def pprint_args(self):
    """Pretty-print the command name and its arguments, if any."""
    if not self.args:
        return
    swga.message(colored.green("Command: " + self.name))
    with indent(2, quote='-'):
        for name, value in self.args.iteritems():
            # Flag missing values in red
            if value is None or value == "":
                value = colored.red("None")
            swga.message(colored.blue("{}: {}".format(name, value)))
def update_locations(self, fg_genome_fp):
    """Find binding locations for any primers that don't have them.

    :param fg_genome_fp: path to the foreground genome
    :returns: the primers that were updated
    """
    needing_update = list(Primer.select().where(
        (Primer.seq << self.primers) &
        (Primer._locations >> None)))
    if needing_update:
        message(
            'Finding binding locations for {} primers...'
            .format(len(needing_update)))
    for primer in needing_update:
        primer._update_locations(fg_genome_fp)
    return needing_update
def update_melt_temps(self):
    """Calculate melting temp for any primers that don't have it.

    :returns: the primers that were updated
    """
    needing_update = list(Primer.select().where(
        (Primer.seq << self.primers) &
        (Primer.tm >> None)))
    if needing_update:
        message(
            'Finding melting temps for {} primers...'
            .format(len(needing_update)))
    for primer in needing_update:
        primer.update_tm()
    return needing_update
def update_gini(self, fg_genome_fp):
    """Calculate Gini coef for any primers that don't have it.

    :param fg_genome_fp: path to the foreground genome
    :returns: the primers that were updated
    """
    needing_update = list(Primer.select().where(
        (Primer.seq << self.primers) &
        (Primer.gini >> None)))
    if needing_update:
        message(
            'Finding Gini coefficients for {} primers...'
            .format(len(needing_update)))
    for primer in needing_update:
        primer._update_gini(fg_genome_fp)
    return needing_update
def process_lines(self, setfinder_lines):
    """Score candidate sets streamed from the set finder.

    :param setfinder_lines: generator of set-finder output lines; it is
        always closed on exit so the underlying subprocess terminates.
    """
    passed = processed = 0
    # Tracks the best (smallest) max foreground distance seen so far,
    # reported on the status line.
    smallest_max_dist = float('inf')
    try:
        for line in setfinder_lines:
            try:
                primer_ids, bg_dist_mean = score.read_set_finder_line(line)
            except ValueError:
                # Skip malformed lines instead of aborting the stream
                warn("Could not parse line:\n\t" + line)
                continue
            primers = Primers.select_by_ids(primer_ids)
            processed += 1
            set_score, variables, max_dist = score.score_set(
                primers=primers,
                max_fg_bind_dist=self.max_fg_bind_dist,
                bg_dist_mean=bg_dist_mean,
                chr_ends=self.chr_ends,
                score_fun=self.score_fun,
                interactive=False
            )
            if max_dist < smallest_max_dist:
                smallest_max_dist = max_dist
            message(
                STATUS_LINE.format(processed, passed, smallest_max_dist),
                newline=False)
            # Return early if the set doesn't pass
            if set_score is False:
                continue
            else:
                passed += 1
                Set.add(
                    _id=passed,
                    primers=primers,
                    score=set_score,
                    scoring_fn=self.score_expression,
                    **variables)
            if passed >= self.max_sets:
                message("\nDone (scored %i sets)" % passed)
                break
    finally:
        # Raises a GeneratorExit inside the find_sets command, prompting it
        # to quit the subprocess
        setfinder_lines.close()
def filter_tm_range(self, min_tm, max_tm):
    """Keep only primers whose melting temp lies in [min_tm, max_tm].

    Computes any missing melting temps first.
    """
    self.update_melt_temps()
    within_range = Primer.select().where(
        (Primer.seq << self.primers) &
        (Primer.tm <= max_tm) &
        (Primer.tm >= min_tm))
    report = '{}/{} primers have a melting temp between {} and {} C'
    message(report.format(within_range.count(), self.n, min_tm, max_tm))
    return within_range
def process_lines(self, setfinder_lines):
    """Score candidate sets streamed from the set finder.

    :param setfinder_lines: generator of set-finder output lines; it is
        always closed on exit so the underlying subprocess terminates.
    """
    passed = processed = 0
    # Tracks the best (smallest) max foreground distance seen so far,
    # reported on the status line.
    smallest_max_dist = float('inf')
    try:
        for line in setfinder_lines:
            try:
                primer_ids, bg_dist_mean = score.read_set_finder_line(line)
            except ValueError:
                # Skip malformed lines instead of aborting the stream
                warn("Could not parse line:\n\t" + line)
                continue
            primers = Primers.select_by_ids(primer_ids)
            processed += 1
            set_score, variables, max_dist = score.score_set(
                primers=primers,
                max_fg_bind_dist=self.max_fg_bind_dist,
                bg_dist_mean=bg_dist_mean,
                chr_ends=self.chr_ends,
                score_fun=self.score_fun,
                interactive=False)
            if max_dist < smallest_max_dist:
                smallest_max_dist = max_dist
            message(STATUS_LINE.format(processed, passed, smallest_max_dist),
                    newline=False)
            # Return early if the set doesn't pass
            if set_score is False:
                continue
            else:
                passed += 1
                Set.add(_id=passed,
                        primers=primers,
                        score=set_score,
                        scoring_fn=self.score_expression,
                        **variables)
            if passed >= self.max_sets:
                message("\nDone (scored %i sets)" % passed)
                break
    finally:
        # Raises a GeneratorExit inside the find_sets command, prompting it
        # to quit the subprocess
        setfinder_lines.close()
def build_graph(max_hetdimer_bind, outfile):
    """Select all active primers and write a primer compatibility graph.

    :param max_hetdimer_bind: max allowed heterodimer binding between primers
    :param outfile: path the graph is written to
    """
    # Reset all the primer IDs (ids are only used for set_finder)
    primers = Primers.select_active().assign_ids()
    message("Composing primer compatibility graph...")
    edges = build_edges(primers, max_hetdimer_bind)
    if not edges:
        error("No compatible primers. Try relaxing your parameters.",
              exception=False)
    with open(outfile, 'wb') as out:
        write_graph(primers, edges, out)
def count_kmers(
        fg_genome_fp, bg_genome_fp, min_size, max_size, min_fg_bind,
        max_bg_bind, max_dimer_bp, primer_db, exclude_fp, exclude_threshold,
        **kwargs):
    """Count k-mers of each size in the fg/bg genomes and store them.

    For each k in [min_size, max_size], counts k-mers in the foreground
    and background genomes, drops k-mers found in the optional exclusion
    fasta, and writes the survivors (plus reverse complements) to the db.
    """
    assert os.path.isfile(fg_genome_fp)
    assert os.path.isfile(bg_genome_fp)
    check_create_tables(primer_db)
    # NOTE(review): `output_dir` is not defined in this function;
    # presumably a module-level constant -- verify.
    swga.utils.mkdirp(output_dir)
    kmers = []
    for k in xrange(min_size, max_size + 1):
        fg = swga.primers.count_kmers(k, fg_genome_fp, output_dir)
        bg = swga.primers.count_kmers(k, bg_genome_fp, output_dir)
        if exclude_fp:
            assert os.path.isfile(exclude_fp)
            ex = swga.primers.count_kmers(
                k, exclude_fp, output_dir, exclude_threshold)
        else:
            ex = {}
        # Keep kmers found in foreground, merging bg binding values, and
        # excluding those found in the excluded fasta
        kmers = [
            primer_dict(seq, fg, bg, min_fg_bind, max_bg_bind, max_dimer_bp)
            for seq in fg.viewkeys() if seq not in ex.viewkeys()
        ]
        # primer_dict returns {} for filtered-out kmers; drop those
        kmers = filter(lambda x: x != {}, kmers)
        nkmers = len(kmers)
        chunk_size = 199
        # n is doubled because reverse complements are added as well
        swga.message("Writing {n} {k}-mers into db in blocks of {cs}..."
                     .format(n=nkmers*2, k=k, cs=chunk_size))
        database.add_primers(kmers, chunk_size, add_revcomp=True)
    swga.message("Counted kmers in range %d-%d" % (min_size, max_size))
def read_primer_list(lines, fg_genome_fp, bg_genome_fp):
    '''
    Reads in a list of primers, one per line, and returns the corresponding
    records from the primer database.

    Primers not present in the database are reported and skipped.
    (fg_genome_fp and bg_genome_fp are currently unused but kept for
    interface compatibility.)
    '''
    # First whitespace-separated token on each line is the sequence
    seqs = [re.split(r'[ \t]+', line.strip('\n'))[0] for line in lines]
    primers = list(Primer.select().where(Primer.seq << seqs).execute())
    if len(primers) < len(seqs):
        found = [p.seq for p in primers]
        missing = [s for s in seqs if s not in found]
        for seq in missing:
            swga.message(seq + " not in the database; skipping. Add it "
                         "manually with `swga count --input <file>` ")
    return primers
def build_graph(max_hetdimer_bind, outfile):
    """Select all active primers and write a primer compatibility graph.

    :param max_hetdimer_bind: max allowed heterodimer binding between primers
    :param outfile: path the graph is written to
    """
    # Reset all the primer IDs (ids are only used for set_finder)
    primers = Primers.select_active().assign_ids()
    message("Composing primer compatibility graph...")
    edges = build_edges(primers, max_hetdimer_bind)
    if not edges:
        error(
            "No compatible primers. Try relaxing your parameters.",
            exception=False)
    with open(outfile, 'wb') as out:
        write_graph(primers, edges, out)
def activate(self, min_active=1):
    """Activate all the primers in the list.

    :param min_active: the minimum number of primers expected to be
        activated; a note is printed if fewer than this many were marked
        active. (The original docstring said "maximum", which contradicts
        the `n < min_active` check below.)
    :returns: self, for chaining
    """
    n = (Primer.update(active=True)
         .where(Primer.seq << self.primers)
         .execute())
    message('Marked {} primers as active.'.format(n))
    if n < min_active:
        message(
            'Note: Fewer than {} primers were selected ({} passed all the '
            'filters). You may want to try less restrictive filtering '
            'parameters.'.format(min_active, n))
    return self
def count_kmers(self):
    """Count k-mers of each size in the fg/bg genomes and store them.

    Prompts (unless ``self.force``) before wiping previously-counted
    primers, then for each k in [min_size, max_size] counts k-mers,
    drops those in the optional exclusion fasta, and writes the
    survivors plus reverse complements to the database.
    """
    # We need to clear all previous primers each time due to uniqueness
    # constraints
    if Primer.select().count() > 0:
        if not self.force:
            click.confirm(
                "Remove all previously-found primers and re-count?",
                abort=True)
        self.workspace.reset_primers()
    # NOTE(review): `output_dir` is not defined in this method;
    # presumably a module-level constant -- verify.
    mkdirp(output_dir)
    kmers = []
    for k in xrange(self.min_size, self.max_size + 1):
        fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir)
        bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir)
        if self.exclude_fp:
            assert os.path.isfile(self.exclude_fp)
            ex = swga.kmers.count_kmers(
                k, self.exclude_fp, output_dir, self.exclude_threshold)
        else:
            ex = {}
        # Keep kmers found in foreground, merging bg binding values, and
        # excluding those found in the excluded fasta
        kmers = [
            primer_dict(seq, fg, bg, self.min_fg_bind, self.max_bg_bind,
                        self.max_dimer_bp)
            for seq in fg.viewkeys() if seq not in ex.viewkeys()
        ]
        # primer_dict returns {} for filtered-out kmers; drop those
        kmers = filter(lambda x: x != {}, kmers)
        nkmers = len(kmers)
        # NOTE(review): chunk_size is only used in the message below;
        # Primers.add is not given a block size -- verify intent.
        chunk_size = 199
        message(
            "Writing {n} {k}-mers into db in blocks of {cs}..."
            .format(n=nkmers * 2, k=k, cs=chunk_size))
        Primers.add(kmers, add_revcomp=True)
    message("Counted kmers in range %d-%d" % (self.min_size, self.max_size))
def count_kmers(self):
    """Count k-mers of each size in the fg/bg genomes and store them.

    Prompts (unless ``self.force``) before wiping previously-counted
    primers, then for each k in [min_size, max_size] counts k-mers,
    drops those in the optional exclusion fasta, and writes the
    survivors plus reverse complements to the database.
    """
    # We need to clear all previous primers each time due to uniqueness
    # constraints
    if Primer.select().count() > 0:
        if not self.force:
            click.confirm(
                "Remove all previously-found primers and re-count?",
                abort=True)
        self.workspace.reset_primers()
    # NOTE(review): `output_dir` is not defined in this method;
    # presumably a module-level constant -- verify.
    mkdirp(output_dir)
    kmers = []
    for k in xrange(self.min_size, self.max_size + 1):
        fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir)
        bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir)
        if self.exclude_fp:
            assert os.path.isfile(self.exclude_fp)
            ex = swga.kmers.count_kmers(k, self.exclude_fp, output_dir,
                                        self.exclude_threshold)
        else:
            ex = {}
        # Keep kmers found in foreground, merging bg binding values, and
        # excluding those found in the excluded fasta
        kmers = [
            primer_dict(seq, fg, bg, self.min_fg_bind, self.max_bg_bind,
                        self.max_dimer_bp)
            for seq in fg.viewkeys() if seq not in ex.viewkeys()
        ]
        # primer_dict returns {} for filtered-out kmers; drop those
        kmers = filter(lambda x: x != {}, kmers)
        nkmers = len(kmers)
        # NOTE(review): chunk_size is only used in the message below;
        # Primers.add is not given a block size -- verify intent.
        chunk_size = 199
        message("Writing {n} {k}-mers into db in blocks of {cs}...".format(
            n=nkmers * 2, k=k, cs=chunk_size))
        Primers.add(kmers, add_revcomp=True)
    message("Counted kmers in range %d-%d" % (self.min_size, self.max_size))
def write(self, output_fp): """Writes the bedgraph to a file in a directory named after the set.""" # Create the output folder if it doesn't exist already output_folder = swga.export._mk_folder( output_fp, self.fg_genome_fp, "set_%s" % self.set._id) bedgraph_fp = os.path.join( output_folder, "set_{}.bedgraph".format(self.set._id)) with open(bedgraph_fp, 'wb') as bedgraph_file: typestr = "track type=bedGraph {}\n".format(self.opts_str) bedgraph_file.write(typestr) for record_name, midpoint, hits in self._hits_per_record(): linestr = "{} {} {} {}\n".format( record_name, midpoint, midpoint, hits) bedgraph_file.write(linestr) swga.message("Bedfile written to {}".format(bedgraph_fp))
def user_add_set(self, set_score, variables):
    """Output set statistics and prompt the user to add the set.

    :param set_score: the computed score of the candidate set
    :param variables: dict of extra scoring statistics to display
    :returns: (add_set, set_id); `set_id` is a negative id when
        `add_set` is True, and None otherwise.
    """
    set_dict = dict({
        'score': set_score,
        'scoring_fn': self.score_expression
    }.items() + variables.items())
    message("Set statistics:\n - " + "\n - ".join(utils.fmtkv(k, v)
            for k, v in set_dict.items()))
    # `A or (not A and B)` simplifies to `A or B`: prompt only when not
    # forced.
    if self.force or click.confirm("Add set to database?", default=True):
        # User-provided sets have negative numbers, so we find the
        # smallest and decrement by 1
        min_set_id = Set.select(fn.Min(Set._id)).scalar()
        # This is None if there are no other sets yet
        if min_set_id is None:
            min_set_id = 0
        set_id = min_set_id - 1
        add_set = True
    else:
        # Bug fix: set_id was previously unassigned on this path, so the
        # `return` below raised a NameError when the user declined.
        add_set = False
        set_id = None
    return add_set, set_id
def user_add_set(self, set_score, variables):
    """Output set statistics and prompt the user to add the set.

    :param set_score: the computed score of the candidate set
    :param variables: dict of extra scoring statistics to display
    :returns: (add_set, set_id); `set_id` is a negative id when
        `add_set` is True, and None otherwise.
    """
    set_dict = dict(
        {'score': set_score,
         'scoring_fn': self.score_expression}.items() + variables.items())
    message("Set statistics:\n - " + "\n - ".join(
        utils.fmtkv(k, v) for k, v in set_dict.items()))
    # `A or (not A and B)` simplifies to `A or B`: prompt only when not
    # forced.
    if self.force or click.confirm("Add set to database?", default=True):
        # User-provided sets have negative numbers, so we find the
        # smallest and decrement by 1
        min_set_id = Set.select(fn.Min(Set._id)).scalar()
        # This is None if there are no other sets yet
        if min_set_id is None:
            min_set_id = 0
        set_id = min_set_id - 1
        add_set = True
    else:
        # Bug fix: set_id was previously unassigned on this path, so the
        # `return` below raised a NameError when the user declined.
        add_set = False
        set_id = None
    return add_set, set_id
def run(self): self.chr_ends = locate.chromosome_ends(self.fg_genome_fp) # Evaluate the scoring expression from a string and return it as a # callable function self.score_fun = functools.partial( score.default_score_set, expression=self.score_expression) primers = Primers(self.input) if len(primers) == 0: error( "No primers specified exist in database, aborting.", exception=False) bg_dist_mean = score.calculate_bg_dist_mean(primers, self.bg_length) set_score, variables, _ = score.score_set( primers=primers, max_fg_bind_dist=0, bg_dist_mean=bg_dist_mean, chr_ends=self.chr_ends, score_fun=self.score_fun, interactive=True ) do_add_set, set_id = self.user_add_set(set_score, variables) if do_add_set: s = workspace.Set.add( _id=set_id, primers=primers, score=set_score, scoring_fn=self.score_expression, **variables) set_added = s is not None if set_added: message("Set {} added successfully.".format(set_id)) else: message("That primer set already exists.")
def count_kmers(k, genome_fp, cwd, threshold=1):
    """Count k-mers of size `k` in a genome using dsk, caching results.

    :param k: k-mer length
    :param genome_fp: path to the genome/fasta file
    :param cwd: working directory where dsk's binary output is cached
    :param threshold: minimum frequency for a k-mer to be kept
    :returns: dict mapping k-mer sequence -> frequency
    """
    assert isinstance(threshold, int)
    dsk = resources.get_dsk()
    # Last path component names the cache file
    genome = genome_fp.split(os.sep).pop()
    out = '%s-%dmers' % (genome, k)
    outfile = os.path.join(cwd, out + '.solid_kmers_binary')
    if os.path.isfile(outfile):
        # Reuse the cached binary output instead of re-running dsk
        swga.message("Binary kmer file already exists at %s; parsing..."
                     % outfile)
    else:
        cmdarr = [dsk, genome_fp, str(k), '-o', out, '-t', str(threshold)]
        cmdstr = " ".join(cmdarr)
        swga.message("In {cwd}:\n> {cmdstr}".format(**locals()))
        try:
            subprocess.check_call(cmdarr, cwd=cwd)
        # Bare except is deliberate here: remove the partial output file
        # on ANY failure (including KeyboardInterrupt), then re-raise.
        except:
            if os.path.isfile(outfile):
                os.remove(outfile)
            raise
    primers = dict((kmer, freq)
                   for kmer, freq in _parse_kmer_binary(outfile)
                   if freq >= threshold)
    return primers
def count_kmers(k, genome_fp, cwd, threshold=1):
    '''Counts k-mers in the specified genome.

    Results from dsk are cached in `cwd`; if a binary k-mer file for this
    genome and k already exists there, it is parsed instead of re-running
    dsk.

    :param k: the number of nucleotides in the k-mers
    :param genome_fp: the file path to the genome/fasta file
    :param cwd: the current working directory (to store intermediate cache)
    :param threshold: the minimum k-mer frequency
    :returns: dict mapping k-mer sequence -> frequency
    '''
    assert isinstance(threshold, int)
    # Last path component names the cache file
    genome = genome_fp.split(os.sep).pop()
    out = '%s-%dmers' % (genome, k)
    outfile = os.path.join(cwd, out + '.solid_kmers_binary')
    if os.path.isfile(outfile):
        message("Binary kmer file already exists at %s; parsing..." % outfile)
    else:
        cmdarr = [
            swga.utils.dsk(),
            genome_fp,
            str(k),
            '-o', out,
            '-t', str(threshold)
        ]
        cmdstr = " ".join(cmdarr)
        message("In {cwd}:\n> {cmdstr}".format(**locals()))
        try:
            subprocess.check_call(cmdarr, cwd=cwd)
        except BaseException:
            # Remove the partial output file on ANY failure (including
            # KeyboardInterrupt), then re-raise. `except BaseException`
            # replaces the original bare `except:` with identical
            # semantics but without the lint-flagged form.
            if os.path.isfile(outfile):
                os.remove(outfile)
            raise
    primers = dict((kmer, freq)
                   for kmer, freq in _parse_kmer_binary(outfile)
                   if freq >= threshold)
    return primers
def filter_max_gini(self, gini_max, fg_genome_fp):
    """Remove primers with Gini coefficients greater than `gini_max`.

    Finds binding locations and Gini coefficients for primers that do
    not have them already.

    :param gini_max: max Gini coefficient (0-1)
    :param fg_genome_fp: foreground genome path, used to compute any
        missing locations/coefficients
    :raises ValueError: if `gini_max` is outside [0, 1]
    """
    # Bug fix: the original check `0 > gini_max > 1` is a chained
    # comparison (gini_max < 0 AND gini_max > 1) that can never be true,
    # so out-of-range values were silently accepted.
    if not 0 <= gini_max <= 1:
        raise ValueError('Gini coefficient must be between 0-1')
    (self
        .update_locations(fg_genome_fp)
        .update_gini(fg_genome_fp))
    results = Primer.select().where(
        (Primer.seq << self.primers) &
        (Primer.gini <= gini_max))
    message(
        '{}/{} primers have a Gini coefficient <= {}'
        .format(results.count(), self.n, gini_max))
    return results
def count_specific_kmers(
        kmers, fg_genome_fp, bg_genome_fp, primer_db, **kwargs):
    """Count the given k-mers in the fg/bg genomes and add them to the db.

    K-mers already in the database are skipped with a warning, as are
    k-mers absent from the foreground genome.
    """
    try:
        # Skip primers that already exist and warn users
        existing = [p.seq for p in Primer.select().where(Primer.seq << kmers)]
        for p in existing:
            swga.message("{} already exists in db, skipping...".format(p))
        kmers = filter(lambda p: p not in existing, kmers)
    except OperationalError:
        # If this fails due to an OperationalError, it probably means the
        # database tables haven't been created yet
        check_create_tables(primer_db)
    # NOTE(review): `output_dir` is not defined in this function;
    # presumably a module-level constant -- verify.
    swga.utils.mkdirp(output_dir)
    # Group the kmers by length to avoid repeatedly counting kmers of the
    # same size
    kmers_by_length = defaultdict(list)
    for kmer in kmers:
        kmers_by_length[len(kmer)].append(kmer)
    for k, mers in kmers_by_length.items():
        fg = swga.primers.count_kmers(k, fg_genome_fp, output_dir, 1)
        bg = swga.primers.count_kmers(k, bg_genome_fp, output_dir, 1)
        primers = []
        for mer in mers:
            try:
                # primer_dict raises KeyError if mer is absent from fg
                primers.append(primer_dict(mer, fg, bg, 0, INF, INF))
            except KeyError:
                swga.message(
                    "{} does not exist in foreground genome, skipping..."
                    .format(mer))
        # Omitting any primers that were returned empty
        # primers = filter(lambda p: p == {}, primers)
        chunk_size = 199
        swga.message(
            "Writing {n} {k}-mers into db in blocks of {cs}..."
            .format(n=len(primers), k=k, cs=chunk_size))
        database.add_primers(primers, chunk_size, add_revcomp=False)
def count_specific_kmers(self, kmers):
    """Count the given k-mers in the fg/bg genomes and add them to the db.

    K-mers already in the database are skipped with a warning, as are
    k-mers absent from the foreground genome.
    """
    try:
        # Skip primers that already exist and warn users
        existing = Primers.select_by_seqs(kmers)
        for p in existing:
            message("{} already exists in db, skipping...".format(p))
        # NOTE(review): this tests kmer strings against a collection of
        # Primer records; it only filters correctly if membership compares
        # by sequence -- verify Primers/Primer equality semantics.
        kmers = [p for p in kmers if p not in existing]
    except OperationalError:
        # If this fails due to an OperationalError, it probably means the
        # database tables haven't been created yet.
        error(
            "It doesn't appear that the workspace has been initialized: "
            "run `swga init' first.")
    # NOTE(review): `output_dir` is not defined in this method;
    # presumably a module-level constant -- verify.
    mkdirp(output_dir)
    # Group the kmers by length to avoid repeatedly counting kmers of the
    # same size
    kmers_by_length = defaultdict(list)
    for kmer in kmers:
        kmers_by_length[len(kmer)].append(kmer)
    for k, mers in kmers_by_length.items():
        fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir, 1)
        bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir, 1)
        primers = []
        for mer in mers:
            try:
                # primer_dict raises KeyError if mer is absent from fg
                primers.append(primer_dict(mer, fg, bg, 0, INF, INF))
            except KeyError:
                message(
                    "{} does not exist in foreground genome, skipping..."
                    .format(mer))
        # Omitting any primers that were returned empty
        # primers = filter(lambda p: p == {}, primers)
        chunk_size = 199
        message(
            "Writing {n} {k}-mers into db in blocks of {cs}..."
            .format(n=len(primers), k=k, cs=chunk_size))
        Primers.add(primers, add_revcomp=False)
def count_specific_kmers(self, kmers):
    """Count the given k-mers in the fg/bg genomes and add them to the db.

    K-mers already in the database are skipped with a warning, as are
    k-mers absent from the foreground genome.
    """
    try:
        # Skip primers that already exist and warn users
        existing = Primers.select_by_seqs(kmers)
        for p in existing:
            message("{} already exists in db, skipping...".format(p))
        # NOTE(review): this tests kmer strings against a collection of
        # Primer records; it only filters correctly if membership compares
        # by sequence -- verify Primers/Primer equality semantics.
        kmers = [p for p in kmers if p not in existing]
    except OperationalError:
        # If this fails due to an OperationalError, it probably means the
        # database tables haven't been created yet.
        error("It doesn't appear that the workspace has been initialized: "
              "run `swga init' first.")
    # NOTE(review): `output_dir` is not defined in this method;
    # presumably a module-level constant -- verify.
    mkdirp(output_dir)
    # Group the kmers by length to avoid repeatedly counting kmers of the
    # same size
    kmers_by_length = defaultdict(list)
    for kmer in kmers:
        kmers_by_length[len(kmer)].append(kmer)
    for k, mers in kmers_by_length.items():
        fg = swga.kmers.count_kmers(k, self.fg_genome_fp, output_dir, 1)
        bg = swga.kmers.count_kmers(k, self.bg_genome_fp, output_dir, 1)
        primers = []
        for mer in mers:
            try:
                # primer_dict raises KeyError if mer is absent from fg
                primers.append(primer_dict(mer, fg, bg, 0, INF, INF))
            except KeyError:
                message(
                    "{} does not exist in foreground genome, skipping...".
                    format(mer))
        # Omitting any primers that were returned empty
        # primers = filter(lambda p: p == {}, primers)
        chunk_size = 199
        message("Writing {n} {k}-mers into db in blocks of {cs}...".format(
            n=len(primers), k=k, cs=chunk_size))
        Primers.add(primers, add_revcomp=False)
def summarize(self):
    """Report how many primers remain in the list after filtering.

    :returns: self, for chaining
    """
    count_msg = '{} primers satisfy all filters so far.'.format(self.n)
    message(count_msg)
    return self