def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Summarize bowtie1 alignment logs into a per-sample table.

    Finds the bowtie1 log files under ``in_data.identifier``, parses
    each with ``alignlib.parse_bowtie1_output``, writes a tab-delimited
    table to ``summary.txt`` in the current directory, and runs the
    configured ``txt2xls`` tool on it with its output redirected to
    ``outfile``.

    Raises AssertionError if no log files are found or if a log file
    does not have a ``.log`` extension.
    """
    import os
    from genomicode import filelib
    from genomicode import parselib
    from genomicode import alignlib
    from genomicode import config
    from genomicode import parallel

    log_filenames = _find_output_logs(in_data.identifier)
    assert log_filenames

    results = {}  # dict of sample -> dictionary of output
    for filename in log_filenames:
        # <path>/<sample>.log
        file_ = os.path.split(filename)[1]
        sample, ext = os.path.splitext(file_)
        assert ext == ".log"
        results[sample] = alignlib.parse_bowtie1_output(filename)

    # Make table where the rows are the samples and the columns
    # are the statistics.
    table = [("Sample", "Aligned Reads", "Total Reads", "Perc Aligned")]
    for sample in sorted(results):
        stats = results[sample]
        total_reads = stats["reads_processed"]
        aligned_reads = stats["aligned_reads"]
        # A sample with no processed reads would otherwise raise
        # ZeroDivisionError; report it as 0% aligned instead.
        perc_aligned = 0.0
        if total_reads:
            perc_aligned = float(aligned_reads) / total_reads * 100
        row = (
            sample,
            parselib.pretty_int(aligned_reads),
            parselib.pretty_int(total_reads),
            "%.2f%%" % perc_aligned,
        )
        table.append(row)

    # Write out the table as text file.  The context manager makes
    # sure the file is flushed and closed before txt2xls reads it.
    TXT_FILE = "summary.txt"
    with open(TXT_FILE, 'w') as handle:
        for row in table:
            handle.write("\t".join(row) + "\n")

    txt2xls = filelib.which_assert(config.txt2xls)
    # Quote the output path as well as the executable so that paths
    # containing spaces or shell metacharacters survive the shell.
    # NOTE(review): assumes parallel.quote is a general shell-word
    # quoter (it is already used that way for the executable).
    os.system("%s -b %s > %s" % (
        parallel.quote(txt2xls), TXT_FILE, parallel.quote(outfile)))
def read_matrices(filenames):
    """Read the given matrix files and return the aligned matrices.

    Returns an empty list when no filenames are given.  Otherwise the
    files are read with ``matrixlib.read_matrices``, the size of each
    one is printed to stdout (plus the gene count of the merged data
    when more than one file is given), and the aligned matrices are
    returned.
    """
    from genomicode import parselib
    from genomicode import matrixlib

    if not filenames:
        return []

    DATA, ALIGNED = matrixlib.read_matrices(filenames)

    # Report the dimensions of every input file.
    for matrix, filename in zip(DATA, filenames):
        basename = os.path.split(filename)[1]
        num_genes = parselib.pretty_int(matrix.nrow())
        num_samples = parselib.pretty_int(matrix.ncol())
        print("%s has %s genes and %s samples." % (
            basename, num_genes, num_samples))

    if len(filenames) > 1:
        merged_genes = parselib.pretty_int(ALIGNED[0].nrow())
        print("The merged file has %s genes." % merged_genes)
    sys.stdout.flush()
    return ALIGNED
def resolve_sequence(
        name, bp_upstream, length, ra_chrom_path, knowngene_file,
        default_transcript=None, skip_unknown_genes=False):
    """Resolve a "<gene symbol>[,<transcript index>]" name to a promoter.

    Looks the gene up with ``genomelib.get_promoters`` and returns the
    tuple (gene_symbol, chrom, prom_base, prom_length, txn_strand, tss)
    for the selected transcript.

    The transcript index comes from the part of ``name`` after the
    comma if present, otherwise from ``default_transcript``.  If
    neither is given and the gene has several promoters, the choices
    are printed and the process exits with status 0.

    Returns None when the gene is not found and ``skip_unknown_genes``
    is true; otherwise an unknown gene or an out-of-range transcript
    index raises AssertionError.
    """
    from genomicode import parselib
    from genomicode import genomelib

    gene_symbol = name
    transcript_num = default_transcript
    if "," in name:
        gene_symbol, transcript_num = name.split(",", 1)
        assert transcript_num, "Invalid gene symbol: %r" % name
        transcript_num = int(transcript_num)

    proms = genomelib.get_promoters(
        gene_symbol, -bp_upstream, length, gene_file=knowngene_file,
        ra_path=ra_chrom_path)
    if not proms and skip_unknown_genes:
        return None
    assert proms, "I could not find gene: %s" % gene_symbol

    if transcript_num is None:
        if len(proms) == 1:
            # Only one promoter, so the choice is unambiguous.
            transcript_num = 0
        elif len(proms) > 1:
            # Ambiguous: list the candidates and stop.
            print("Multiple transcripts for %s." % gene_symbol)
            print("Please specify a specific transcript.")
            columns = "Index", "Chrom", "Strand", "TSS"
            print("\t".join([str(c) for c in columns]))
            for index, prom in enumerate(proms):
                p_chrom, p_tss, p_strand, p_start, p_length, p_seq = prom
                row = index, p_chrom, p_strand, parselib.pretty_int(p_tss)
                print("\t".join([str(c) for c in row]))
            sys.exit(0)

    assert transcript_num is not None
    assert 0 <= transcript_num < len(proms), \
        "Invalid transcript: %s" % transcript_num
    selected = proms[transcript_num]
    chrom, tss, txn_strand, prom_base, prom_length, prom_seq = selected
    return gene_symbol, chrom, prom_base, prom_length, txn_strand, tss
def pretty_runtime(start_time, end_time):
    """Format the elapsed time between two timestamps (in seconds).

    Returns:
      "<N>s"            when the run took less than a minute (whole
                        seconds only, formatted by parselib.pretty_int),
      "MM:SS.mmm"       when it took less than an hour,
      "HH:MM:SS.mmm"    otherwise.
    """
    elapsed = end_time - start_time
    whole_secs = int(elapsed)
    millis = int((elapsed - whole_secs) * 1000)

    # divmod performs floor division regardless of whether true
    # division is in effect, so the components are always integers.
    num_hours, remainder = divmod(whole_secs, 3600)
    num_mins, num_secs = divmod(remainder, 60)

    if num_hours == 0 and num_mins == 0:
        # Only this branch needs the project formatter, so import it
        # here rather than on every call.
        from genomicode import parselib
        return "%ss" % parselib.pretty_int(num_secs)
    if num_hours == 0:
        return "%02d:%02d.%03d" % (num_mins, num_secs, millis)
    return "%02d:%02d:%02d.%03d" % (num_hours, num_mins, num_secs, millis)
def summarize_report(filenames, matrices, num_factors, start_time,
                     file_layout):
    """Write the BFRMNormalize HTML report to ``file_layout.REPORT``.

    filenames    paths of the input data sets.
    matrices     one matrix per filename; only ``ncol()`` is used, to
                 report the number of samples.
    num_factors  number of factors used in the normalization.
    start_time   timestamp (seconds) when the analysis started; used
                 for the footer.
    file_layout  object providing the DS_PROC, DS_FINAL,
                 DS_PROC_HEATMAP, DS_FINAL_HEATMAP, DS_PROC_SCATTER,
                 DS_FINAL_SCATTER and REPORT paths.

    Raises AssertionError if the filenames and matrices differ in
    number, or if the host name cannot be determined.
    """
    import time
    import subprocess
    from genomicode import htmllib
    from genomicode import parselib

    #def highlight(s):
    #    return htmllib.SPAN(s, style="background-color:yellow")

    assert len(filenames) == len(matrices)

    lines = []
    w = lines.append
    w("<HTML>")
    w(htmllib.HEAD(htmllib.TITLE("BFRMNormalize Report")))
    w("<BODY>")
    w(htmllib.CENTER(htmllib.H1(htmllib.EM("BFRMNormalize") + " Report")))

    w(htmllib.H3("I. Overview"))
    files = [os.path.split(x)[1] for x in filenames]
    x1 = "one data set"
    if len(files) > 1:
        x1 = "the following data sets"
    x2 = "factor"
    if num_factors > 1:
        x2 = "factors"
    items = ["I normalized %s using %d %s." % (x1, num_factors, x2)]
    for name, matrix in zip(files, matrices):
        x = "%s (%d samples)" % (name, matrix.ncol())
        items.append(htmllib.LI() + x)
    w(htmllib.UL("\n".join(items)))
    w(htmllib.P())

    # Links are relative: only the base name of each file is used.
    x = os.path.split(file_layout.DS_PROC)[1]
    w("The merged gene expression data set is available at " +
      htmllib.A(x, href=x) + ".")
    w(htmllib.BR())
    x = os.path.split(file_layout.DS_FINAL)[1]
    w("The normalized data set is available at " +
      htmllib.A(x, href=x) + ".")
    w(htmllib.P())

    w(htmllib.H3("II. Results"))

    # Make the table of the heatmaps.
    x = os.path.split(file_layout.DS_PROC_HEATMAP)[1]
    x1 = htmllib.CENTER(
        htmllib.B("Before Normalization") + htmllib.BR() +
        htmllib.A(htmllib.IMG(height=480, src=x), href=x))
    x = os.path.split(file_layout.DS_FINAL_HEATMAP)[1]
    x2 = htmllib.CENTER(
        htmllib.B("After Normalization") + htmllib.BR() +
        htmllib.A(htmllib.IMG(height=480, src=x), href=x))
    row1 = htmllib.TR(htmllib.TD(x1) + htmllib.TD(x2))
    x = htmllib.TD(
        htmllib.B("Figure 1: Heatmaps. ") +
        "These heatmaps show the expression patterns in the data before "
        "and after normalization. "
        "The rows contain the %d genes that exhibit the highest variance "
        "in gene expression across the original data set. "
        "The columns contain the samples in the data sets provided. "
        "The genes and samples are in the same order in both heatmaps. "
        "Warm colors indicate high expression of the gene, and cool colors "
        "indicate low expression." % NUM_FILTERED_GENES,
        colspan=2)
    row2 = htmllib.TR(x)
    w(htmllib.TABLE(row1 + row2, border=0, cellspacing=10, width="50%%"))

    w(htmllib.P())

    # Make the table of the scatter plots.
    x = os.path.split(file_layout.DS_PROC_SCATTER)[1]
    x1 = htmllib.CENTER(
        htmllib.B("Before Normalization") + htmllib.BR() +
        htmllib.A(htmllib.IMG(height=400, src=x), href=x))
    x = os.path.split(file_layout.DS_FINAL_SCATTER)[1]
    x2 = htmllib.CENTER(
        htmllib.B("After Normalization") + htmllib.BR() +
        htmllib.A(htmllib.IMG(height=400, src=x), href=x))
    row1 = htmllib.TR(htmllib.TD(x1) + htmllib.TD(x2))
    x1 = (
        "These plots show the samples projected onto the first two principal "
        "components of the expression profiles of the %d genes that "
        "exhibit the highest variance across the original data set. " %
        NUM_FILTERED_GENES)
    x2 = ("Each point represents a sample, and samples from the same data "
          "set have the same color. "
          "If there are batch effects, the samples from the same data set "
          "(the same color) will cluster together. "
          "If there are no batch effects, the colors should be mixed.")
    if len(filenames) == 1:
        # The batch-effect explanation is meaningless for one data set.
        x2 = ""
    x = htmllib.TD(htmllib.B("Figure 2: PCA Plots. ") + x1 + x2, colspan=2)
    row2 = htmllib.TR(x)
    w(htmllib.TABLE(row1 + row2, border=0, cellspacing=10, width="50%%"))

    # Format the current time.
    end_time = time.time()
    time_str = parselib.pretty_date(start_time)
    # divmod keeps the minutes integral even under true division.
    num_min, num_secs = divmod(int(end_time - start_time), 60)
    if num_min == 0:
        run_time = "%ss" % parselib.pretty_int(num_secs)
    else:
        run_time = "%sm %ss" % (parselib.pretty_int(num_min), num_secs)

    # Get the hostname.  communicate() closes stdin, drains stdout and
    # waits for the child, so no pipe or zombie process is left behind.
    cmd = "hostname"
    p = subprocess.Popen(
        cmd, shell=True, bufsize=0, stdin=subprocess.PIPE,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True)
    hostname = p.communicate()[0].strip()
    assert hostname, "I could not get the hostname."

    w(htmllib.P())
    w(htmllib.HR())
    w(htmllib.EM(
        "This analysis was run on %s on %s. It took %s to complete." %
        (time_str, hostname, run_time)))

    w("</BODY>")
    w("</HTML>")

    report = "\n".join(lines) + "\n"
    outfile = file_layout.REPORT
    with open(outfile, 'w') as handle:
        handle.write(report)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Summarize tophat ``align_summary.txt`` files into a per-sample table.

    Finds every ``align_summary.txt`` under ``in_data.identifier``,
    parses each with ``alignlib.parse_tophat_align_summary``, writes a
    tab-delimited table to ``summary.txt`` in the current directory,
    and runs the configured ``txt2xls`` tool on it with its output
    redirected to ``outfile``.

    Raises AssertionError if no summary files are found or if a file
    is not located in a ``<sample>.tophat`` directory.
    """
    import os
    from genomicode import filelib
    from genomicode import parselib
    from genomicode import alignlib
    from genomicode import config
    from genomicode import parallel

    TOPHAT_SUFFIX = ".tophat"

    align_node = in_data
    align_filenames = filelib.list_files_in_path(
        align_node.identifier, endswith="align_summary.txt")
    assert align_filenames, "Missing align_summary.txt"

    results = {}  # dict of sample -> dictionary of output
    for filename in align_filenames:
        # Names must be in the format:
        # <path>/<sample>.tophat/align_summary.txt
        # full_path   <path>/<sample>.tophat
        # tophat_dir  <sample>.tophat
        # file_       align_summary.txt
        # sample      <sample>
        full_path, file_ = os.path.split(filename)
        tophat_dir = os.path.split(full_path)[1]
        assert file_ == "align_summary.txt"
        assert tophat_dir.endswith(TOPHAT_SUFFIX)
        sample = tophat_dir[:-len(TOPHAT_SUFFIX)]
        results[sample] = alignlib.parse_tophat_align_summary(filename)

    # Make table where the rows are the samples and the columns
    # are the statistics.
    table = [("Sample", "Aligned Reads", "Total Reads", "Perc Aligned")]
    for sample in sorted(results):
        stats = results[sample]
        total_reads = stats["reads_processed"]
        aligned_reads = stats["aligned_reads"]
        # A sample with no processed reads would otherwise raise
        # ZeroDivisionError; report it as 0% aligned instead.
        perc_aligned = 0.0
        if total_reads:
            perc_aligned = float(aligned_reads) / total_reads * 100
        row = (
            sample,
            parselib.pretty_int(aligned_reads),
            parselib.pretty_int(total_reads),
            "%.2f%%" % perc_aligned,
        )
        table.append(row)

    # Write out the table as text file.  The context manager makes
    # sure the file is flushed and closed before txt2xls reads it.
    TXT_FILE = "summary.txt"
    with open(TXT_FILE, 'w') as handle:
        for row in table:
            handle.write("\t".join(row) + "\n")

    txt2xls = filelib.which_assert(config.txt2xls)
    # Quote the output path as well as the executable so that paths
    # containing spaces or shell metacharacters survive the shell.
    # NOTE(review): assumes parallel.quote is a general shell-word
    # quoter (it is already used that way for the executable).
    os.system("%s -b %s > %s" % (
        parallel.quote(txt2xls), TXT_FILE, parallel.quote(outfile)))
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Count reads and alignments per sample and write a summary table.

    ``antecedents`` is unpacked as (fastq_node, sample_node, bam_node).
    For every BAM file found, the reads in the sample's first FASTQ
    file and the alignments in the BAM file are counted in parallel
    (at most MAX_CORES processes).  The table is written to
    ``summary.txt`` in the current directory and passed through
    ``txt2xls``, whose output is redirected to ``outfile``.

    Returns a metadata dict containing "num_cores", the number of
    processes actually used.

    Raises AssertionError if a BAM file has no matching FASTQ entry.
    """
    from genomicode import parselib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    MAX_CORES = 4  # I/O intensive.

    fastq_node, sample_node, bam_node = antecedents
    bam_filenames = mlib.find_bam_files(bam_node.identifier)
    sample2fastq = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier, as_dict=True)

    metadata = {}

    jobs = []  # list of (sample, bam_file, fastq_file)
    for filename in bam_filenames:
        sample = mlib.splitpath(filename)[1]
        assert sample in sample2fastq, "Missing fastq: %s" % sample
        # Only the first file of the pair is counted.
        fastq1, _fastq2 = sample2fastq[sample]
        jobs.append((sample, filename, fastq1))

    # Two function calls per job: reads first, then alignments.
    funcalls = []
    for sample, bam_filename, fastq_filename in jobs:
        # Count the number of reads.
        funcalls.append((count_reads, (fastq_filename, ), {}))
        # Count the number of alignments.
        funcalls.append((count_alignments, (bam_filename, ), {}))
    assert len(funcalls) == len(jobs) * 2

    nc = min(num_cores, MAX_CORES)
    results = parallel.pyfun(funcalls, num_procs=nc)
    metadata["num_cores"] = nc

    # list of (sample, aligns, aligned_reads, total_reads, perc_aligned).
    summary = []
    for i, job in enumerate(jobs):
        sample = job[0]
        total_reads = results[i * 2]
        aligned_reads, alignments = results[i * 2 + 1]
        # An empty FASTQ file would otherwise raise ZeroDivisionError;
        # report it as 0% aligned instead.
        perc_aligned = 0.0
        if total_reads:
            perc_aligned = float(aligned_reads) / total_reads
        summary.append(
            (sample, alignments, aligned_reads, total_reads, perc_aligned))
    # sort by sample name
    summary.sort()

    # Make table where the rows are the samples and the columns
    # are the statistics.
    header = ("Sample", "Alignments", "Aligned Reads", "Total Reads",
              "Perc Aligned")
    table = [header]
    for sample, alignments, aligned_reads, total_reads, perc_aligned \
            in summary:
        row = (
            sample,
            parselib.pretty_int(alignments),
            parselib.pretty_int(aligned_reads),
            parselib.pretty_int(total_reads),
            "%.2f%%" % (perc_aligned * 100),
        )
        assert len(row) == len(header)
        table.append(row)

    # Write out the table as text file.  The context manager makes
    # sure the file is flushed and closed before txt2xls reads it.
    TXT_FILE = "summary.txt"
    with open(TXT_FILE, 'w') as handle:
        for row in table:
            handle.write("\t".join(row) + "\n")

    txt2xls = mlib.findbin("txt2xls", quote=True)
    parallel.sshell("%s -b %s > %s" % (txt2xls, TXT_FILE, outfile))

    return metadata
def main():
    """Plot (or tabulate) transcription factor binding sites near a TSS.

    Parses the command line, resolves each requested gene to a
    promoter region, scores the requested matrices against that region
    with ``motiflib.score_tfbs_genome``, prints the hits to stdout
    (as a listing, or as a tab-delimited table with
    ``--output_as_table``), and writes one image containing a track
    per gene to ``<jobname>.png`` or ``<jobname>.svg``.
    """
    from optparse import OptionParser
    import math
    from genomicode import genomelib
    from genomicode import motiflib
    from genomicode import parselib

    usage = "usage: %prog [options] <GENE SYMBOL>[,#] [...]"
    parser = OptionParser(usage=usage, version="%prog 01")

    # Matrix options.
    parser.add_option(
        "-m", "--matrix", dest="matrices", default=[], action="append",
        help="Plot binding sites for this matrix or gene. "
        "Format: <matrix>[,<color>[,<f|o>]. Color is 0xRRGGBB format. "
        "f(illed)|o(outline); def=f.")
    parser.add_option(
        "--all_matrices", default=False, action="store_true",
        help="Search all matrices.")

    # Gene options.
    parser.add_option(
        "--genome", default="hg19",
        help="Genome to search, e.g. hg19 (default), mm11.")
    parser.add_option(
        "--upstream", dest="upstream", default=250, type="int",
        help="Number of base pairs upstream of the TSS (def 250).")
    parser.add_option(
        "--downstream", dest="downstream", default=50, type="int",
        help="Number of base pairs downstream of the TSS (def 50).")
    parser.add_option(
        "-s", dest="strict", default=False, action="store_true",
        help="Use strict checking of gene names.")

    # Plotting options.
    parser.add_option(
        "--output_as_table", default=False, action="store_true")
    parser.add_option(
        "--pvalue", dest="pvalue_cutoff", default=None, type="float",
        help="p-value cutoff.")
    parser.add_option(
        "--max_genes", dest="max_genes", default=None, type="int",
        help="Maximum number of genes to plot.")
    parser.add_option(
        "-l", dest="label", default=False, action="store_true",
        help="Label the gene names.")
    parser.add_option(
        "-g", dest="gain", default=1, type="float",
        help="Increase the gain of the colors of the TFBS (def 1).")
    parser.add_option(
        "--format", dest="image_format", type="choice",
        choices=["png", "svg"], default="png",
        help="Image format: png (default) or svg.")

    # Running options.
    parser.add_option(
        "-j", "--jobname", dest="jobname", type="string", default="out",
        help="Name of output file.")
    parser.add_option(
        "--num_procs", dest="num_procs", type="int", default=1,
        help="Number of jobs to run in parallel.")

    options, args = parser.parse_args()
    if not args:
        parser.error("Please specify a gene to analyze.")
    if options.num_procs < 1 or options.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")

    ra_chrom_path, knowngene_file = resolve_genome_files(options.genome)

    gene_symbols = []
    for symbol_or_file in args:
        gene_symbols.extend(resolve_symbol_or_file(symbol_or_file))

    # Unless strict checking is requested, silently fall back to the
    # first transcript and drop genes that cannot be found.
    default_transcript = None
    skip_unknown_genes = False
    if not options.strict:
        default_transcript = 0
        skip_unknown_genes = True

    if not options.matrices and not options.all_matrices:
        parser.error("Please specify a matrix to plot.")
    if options.upstream < 0:
        parser.error("Upstream should be 0 or positive.")
    if options.downstream < 0:
        parser.error("Downstream should be 0 or positive.")
    seq_length = options.upstream + options.downstream
    if seq_length <= 0:
        parser.error("No sequence.")

    # Choose a plotting library.
    if options.jobname.lower().endswith(".png"):
        options.jobname = options.jobname[:-4]
    if options.image_format == "svg":
        plotlib = __import__(
            "genomicode.svgplot", globals(), locals(), ["svgplot"])
        outfile = "%s.svg" % options.jobname
        write_mode = "w"
    else:
        plotlib = __import__(
            "genomicode.pilplot", globals(), locals(), ["pilplot"])
        outfile = "%s.png" % options.jobname
        # PNG is binary data; text mode corrupts it on platforms that
        # translate line endings.
        write_mode = "wb"

    # Figure out the genes to plot.
    GENES = []  # list of gene_symbol, chrom, start, length, strand, tss
    for name in gene_symbols:
        # Figure out where the gene lies on the chromosome.
        x = resolve_sequence(
            name, options.upstream, seq_length, ra_chrom_path,
            knowngene_file, default_transcript=default_transcript,
            skip_unknown_genes=skip_unknown_genes)
        if x is None:
            continue
        GENES.append(x)
    if options.max_genes is not None:
        if options.max_genes <= 0:
            parser.error("Invalid max_genes.")
        GENES = GENES[:options.max_genes]

    # Figure out the matrices to search for.
    x = resolve_matrices(options.matrices, options.all_matrices)
    matrices, matrix2color, matrix2style = x

    # The cutoff is the negative log of the p-value.  It does not
    # depend on the gene, so compute it once.
    nlp_cutoff = 0
    if options.pvalue_cutoff:
        nlp_cutoff = -math.log(options.pvalue_cutoff)

    # Constants governing how the figures are drawn.
    PIXELS_PER_BP = 4          # How many pixels for each base pair.
    X_BORDER = 5               # Number of pixels of border around the figure.
    Y_BORDER = 5
    GENE_BUFFER = 4            # Buffer between genes, in pixels.
    SEQ_COLOR = (0, 0, 0)      # Color for the sequence.
    SEQ_WIDTH = 1              # Width of line for sequence, in pixels.
    TSS_HEIGHT = 3             # Height of the TSS, in base pairs.
    TSS_WIDTH = TSS_HEIGHT*2   # Width of the TSS
    TFBS_HEIGHT = 2            # Height of TFBS, in base pairs.
    HASH_INTERVAL = 50         # Number of base pairs per hash mark.
    HASH_HEIGHT = 0.5          # Height of hash marks, in base pairs
    LABEL_HEIGHT = 5           # Height of label, in base pairs.

    NUM_GENES = len(GENES)

    # Figure out the geometry for a single gene.
    plot_width = int(PIXELS_PER_BP * seq_length)
    # Hack: Add some to TFBS_HEIGHT to handle overlapping TFBS.
    plot_height = int(PIXELS_PER_BP * max(TSS_HEIGHT, TFBS_HEIGHT+6)*2)

    # Figure out the geometry for the entire figure.
    total_width = X_BORDER*2 + plot_width
    total_height = (
        Y_BORDER*2 + plot_height*NUM_GENES + GENE_BUFFER*(NUM_GENES-1))

    image = plotlib.image(total_width, total_height)

    if options.output_as_table:
        header = [
            "Gene Symbol", "Chromosome", "Strand", "Transcription Start",
            "Matrix ID", "Matrix Gene Symbol", "TFBS Pos", "TFBS Strand",
            "TFBS Offset", "P-value", "Sequence"]
        print("\t".join(header))

    # Plot each gene.
    for gene_num, x in enumerate(GENES):
        gene_symbol, chrom, gen_start, gen_length, txn_strand, tss = x

        if not options.output_as_table:
            print("%s: %+d to %+d relative to TSS at chr%s:%s:%s." % (
                gene_symbol, -options.upstream, -options.upstream+seq_length,
                chrom, parselib.pretty_int(tss), txn_strand))

        # Get the TFBS from that site.
        data = motiflib.score_tfbs_genome(
            chrom, gen_start, gen_length, matrices=matrices,
            nlp=nlp_cutoff, num_procs=options.num_procs,
            ra_path=ra_chrom_path)

        ## # If multiple matrices for the same gene symbol hit the same
        ## # place, then keep the one with the highest NLP.
        ## # Sort by gene_symbol, position, decreasing NLP.
        ## x = [(matid2info[x[0]].Gene_Symbol, x[3], -x[4], x) for x in data]
        ## x.sort()
        ## data = [x[-1] for x in x]
        ## i = 0
        ## while i < len(data)-1:
        ##     di, dj = data[i], data[i+1]
        ##     gs0 = matid2info[di[0]].Gene_Symbol
        ##     gs1 = matid2info[dj[0]].Gene_Symbol
        ##     # Same matrix and position.
        ##     if gs0 == gs1 and di[3] == dj[3]:
        ##         del data[i+1]
        ##     else:
        ##         i += 1

        # Sort by position relative to TSS, decreasing NLP.
        # (Decorate-sort-undecorate, so ties fall back to comparing
        # the records themselves.)
        decorated = [
            (genomelib.genbase2tssbase(d[3], tss, txn_strand), -d[4], d)
            for d in data]
        decorated.sort()
        data = [d[-1] for d in decorated]

        # Inspect the data to improve the formatting of the results.
        name_len = 0
        gs_len = 0
        tss_dist_len = 0
        #NLP_len = 0
        for x in data:
            matrix, chrom, strand, pos, NLP = x
            m = motiflib.matid2matrix(matrix)
            name_len = max(name_len, len(matrix))
            gs_len = max(gs_len, len(m.gene_symbol))
            tss_dist = genomelib.calc_tss_seq_dist(
                tss, txn_strand, pos, m.length)
            tss_dist_len = max(
                tss_dist_len, len(parselib.pretty_int(tss_dist)))
            #NLP_len = max(NLP_len, len("%.2f" % NLP))

        # Print the data.
        binding_sites = []
        for x in data:
            matrix, chrom, strand, pos, NLP = x
            m = motiflib.matid2matrix(matrix)
            binding_sites.append((matrix, pos, m.length, strand, NLP))

            # Get the matrix sequence, with some flanking bases.
            FLANK = int(math.ceil((20-m.length)/2.0))
            FLANK = max(FLANK, 2)
            # Do not run off the start of the chromosome.
            left_flank = min(FLANK, pos)
            right_flank = FLANK
            seq_pos = pos - left_flank
            seq_len = m.length + left_flank + right_flank
            seq = genomelib.get_sequence(
                chrom, seq_pos, seq_len, ra_path=ra_chrom_path)
            # Show the motif in upper case and the flanks in lower case.
            s1 = seq[:left_flank]
            s2 = seq[left_flank:left_flank+m.length]
            s3 = seq[left_flank+m.length:]
            seq = s1.lower() + s2.upper() + s3.lower()

            # Print to stdout.
            mat_strand = "+"
            if strand != txn_strand:
                mat_strand = "-"
            tss_dist = genomelib.calc_tss_seq_dist(
                tss, txn_strand, pos, m.length)
            pvalue = parselib.pretty_pvalue(math.exp(-NLP), nsig=2)
            if options.output_as_table:
                x = (gene_symbol, chrom, txn_strand, tss,
                     matrix, m.gene_symbol, pos, strand, tss_dist, pvalue,
                     seq)
                assert len(x) == len(header)
                print("\t".join(map(str, x)))
            else:
                #print " %-*s [%*s] %s:%s:%s (%*s:%s) %*.2f %s" % (
                print(" %-*s [%*s] %s:%s (%*s) %-8s %s" % (
                    gs_len, m.gene_symbol, name_len, matrix,
                    parselib.pretty_int(pos), strand,
                    tss_dist_len, parselib.pretty_int(tss_dist),
                    pvalue, seq))

        # Initialize some objects to plot the figures.
        bp_start = gen_start
        if txn_strand == "-":
            bp_start = gen_start+gen_length
        x_bp = bp_start
        y_bp = 0
        x_pix = X_BORDER
        # Floor division keeps the pixel coordinate an integer.
        y_pix = (
            Y_BORDER + gene_num*(plot_height+GENE_BUFFER) + plot_height // 2)
        bp2pix = BasePair2PixConverter(
            PIXELS_PER_BP, x_bp, y_bp, x_pix, y_pix, txn_strand)
        seq_layout = SequenceLayout(
            bp2pix, SEQ_WIDTH, TSS_HEIGHT, TSS_WIDTH, HASH_INTERVAL,
            HASH_HEIGHT, SEQ_COLOR)
        tfbs_layout = TFBSLayout(
            bp2pix, TFBS_HEIGHT, matrix2color, matrix2style, options.gain)
        label_layout = None
        if options.label:
            label_layout = LabelLayout(bp2pix, LABEL_HEIGHT)

        # Actually plot the figures.
        plot(
            plotlib, image, seq_layout, tfbs_layout, label_layout,
            gene_symbol, gen_start, gen_length, tss, binding_sites)
        sys.stdout.flush()

    # Close the output file explicitly so the image is fully flushed.
    with open(outfile, write_mode) as handle:
        plotlib.write(image, handle)
genes = genomelib.filter_unique_tss(x) assert genes # If there is only 1 gene, then use that one. if len(genes) == 1 and transcript_num is None: transcript_num = 0 if len(genes) > 1 and transcript_num is None: print "Multiple transcripts for %s." % gene_symbol print "Please specify a specific transcript." x = "Index", "Chrom", "Strand", "TSS" print "\t".join(map(str, x)) for i in range(len(genes)): tss = genomelib.transcript2tss( genes[i].txn_start, genes[i].txn_length,genes[i].strand) tss = parselib.pretty_int(tss) x = i, genes[i].chrom, genes[i].strand, tss print "\t".join(map(str, x)) return assert transcript_num is not None assert transcript_num >= 0 and transcript_num < len(genes), \ "Invalid transcript: %s" % transcript_num gene = genes[transcript_num] x = genomelib.transcript2promoter( gene.txn_start, gene.txn_length, gene.strand, gene_offset, gene_length) base, length, strand = x chrom = gene.chrom chrom_base = base
def main():
    """Extract a subset of reads from a FASTQ file.

    A read is kept if its title is one of the requested ``--titles``
    or if ``aptamerlib.guess_sequence_orientation`` finds that it
    matches the library given by ``--library_file``.  Matching reads
    are written to ``--match_file``; all other reads are written to
    ``--leftover_file`` when one is given.  Reads shorter than
    ``--min_seqlen`` are discarded entirely.

    Raises AssertionError on invalid arguments or when an output file
    exists and ``--clobber`` was not given.
    """
    import os
    import sys
    import time
    import argparse
    #import multiprocessing

    from genomicode import aptamerlib
    from genomicode import parselib

    parser = argparse.ArgumentParser(description="Pull out a subset of reads.")
    parser.add_argument("sequence_file", help="FASTQ-formatted sequence file.")
    parser.add_argument(
        "-j", dest="num_procs", type=int, default=1,
        help="Number of jobs to run in parallel.")
    parser.add_argument(
        "--match_file", help="File for reads that match this library.")
    parser.add_argument(
        "--leftover_file", help="File for leftover reads that don't match.")
    parser.add_argument("--clobber", default=False, action="store_true")
    parser.add_argument(
        "--min_seqlen", type=int, default=None,
        help="Discard sequences less than this minimum length.")

    parser.add_argument(
        "--library_file", help="Want reads that match this library.")
    parser.add_argument(
        "--titles", default=[], action="append",
        help="Want reads with these titles. "
        "Comma-separated titles, parameter can be used multiple times.")

    args = parser.parse_args()

    # Check the inputs.
    assert os.path.exists(args.sequence_file), \
        "File not found: %s" % args.sequence_file
    assert args.num_procs >= 1 and args.num_procs < 256
    assert args.min_seqlen is None or (
        args.min_seqlen >= 0 and args.min_seqlen < 100)
    assert not args.library_file or os.path.exists(args.library_file), \
        "File not found: %s" % args.library_file

    assert args.match_file, "Please specify a match_file."
    if not args.clobber and (
            args.match_file and os.path.exists(args.match_file)):
        raise AssertionError(
            "match_file %s exists. "
            "Please use --clobber to overwrite." % args.match_file)
    if not args.clobber and (
            args.leftover_file and os.path.exists(args.leftover_file)):
        raise AssertionError(
            "leftover_file %s exists. "
            "Please use --clobber to overwrite." % args.leftover_file)

    titles = _parse_titles(args.titles)

    TIME_FORMAT = "%m/%d/%Y %H:%M:%S"

    match_handle = open(args.match_file, 'w')
    leftover_handle = None
    # try/finally makes sure the output files are flushed and closed
    # even if reading the library or parsing the FASTQ file fails.
    try:
        if args.leftover_file:
            leftover_handle = open(args.leftover_file, 'w')

        library = None
        if args.library_file:
            library = aptamerlib.read_library(args.library_file)

        #manager = multiprocessing.Manager()
        #lock = manager.Lock()
        #pool = multiprocessing.Pool(args.num_procs)

        last_time = None
        for i, x in enumerate(aptamerlib.parse_fastq(args.sequence_file)):
            title, sequence, quality = x

            # Report progress at most once every 5 seconds.
            t = time.time()
            if last_time is None or t > last_time + 5:
                last_time = t
                now = time.strftime(TIME_FORMAT, time.localtime(t))
                print("%s\t%s" % (
                    now,
                    "Extracting read %s." % parselib.pretty_int(i + 1)))
                sys.stdout.flush()

            if args.min_seqlen is not None and \
                    len(sequence) < args.min_seqlen:
                continue

            # Keep if either the title matches or the library matches.
            is_match = False
            if titles and title in titles:
                is_match = True
            if library:
                orientation = aptamerlib.guess_sequence_orientation(
                    sequence, library)
                assert orientation in [-1, 0, 1]
                if orientation in [-1, 1]:
                    is_match = True

            # One four-line FASTQ record.
            record = "%s\n%s\n+\n%s\n" % (title, sequence, quality)
            if is_match:
                match_handle.write(record)
            elif leftover_handle:
                leftover_handle.write(record)
    finally:
        match_handle.close()
        if leftover_handle:
            leftover_handle.close()
def main():
    """Pick PCR primers on the exons of a gene and print them.

    Takes one positional argument, ``<gene>[,<transcript index>]``.
    The transcript sequence is fetched with ``genomelib``, primers are
    chosen with ``primer3.primer3`` on the exon-only sequence, and
    primer pairs whose two primers cover the same exons are skipped.
    With ``-v`` the pairs are printed as a tab-delimited table,
    otherwise as the two sequences followed by the product size.

    If the gene has several transcripts and none was selected, the
    choices are listed and the function returns without picking
    primers.
    """
    from optparse import OptionParser
    from genomicode import genomelib
    from genomicode import primer3
    from genomicode import parselib

    # gene E2F1 ENSA,0 (<gene>,<tss>).
    usage = "usage: %prog [options] <gene>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "--product_size", dest="product_size", default=[], action="append",
        help="Add a product size to search, e.g. 75-100.")
    parser.add_option(
        "-n", dest="num_primers", type="int", default=None,
        help="Number of primer pairs to pick.")
    parser.add_option(
        "-v", dest="verbose", action="store_true", default=False,
        help="Make output verbose.")

    options, args = parser.parse_args()
    if len(args) != 1:
        print(usage)
        sys.exit(-1)
    gene, = args

    assert options.num_primers is None or options.num_primers > 0

    # The option values arrive as strings such as "75-100".  Convert
    # them to (min, max) integer pairs, the same shape as the defaults
    # below.
    product_sizes = []
    for value in options.product_size:
        try:
            mn, mx = [int(y) for y in value.split("-")]
        except ValueError:
            # Raised both for non-numeric parts and for a wrong number
            # of parts.
            parser.error(
                "Invalid product size (expected <min>-<max>): %s" % value)
        product_sizes.append((mn, mx))
    if not product_sizes:
        # Default Product Size Range for web is:
        # 150-250 100-300 301-400 401-500 501-600 601-700 701-850 851-1000
        product_sizes = [
            (50, 100),
            (101, 150),
            (151, 200),
            (201, 300),
        ]
    for mn, mx in product_sizes:
        assert mn > 0
        assert mn < mx

    gene_symbol, transcript_num = gene.upper(), None
    # See if a transcript num was specified.
    if "," in gene_symbol:
        gene_symbol, x = gene_symbol.split(",", 1)
        transcript_num = int(x)
        assert transcript_num >= 0

    genes = genomelib.get_gene_coords(gene_symbol)
    assert genes

    # If there is only 1 gene, then use that one.
    if len(genes) == 1 and transcript_num is None:
        transcript_num = 0
    if len(genes) > 1 and transcript_num is None:
        print("Multiple transcripts for %s." % gene_symbol)
        print("Please specify a specific transcript.")
        x = "Index", "Chrom", "Strand", "TSS"
        print("\t".join(map(str, x)))
        for i, g in enumerate(genes):
            tss = genomelib.transcript2tss(
                g.txn_start, g.txn_length, g.strand)
            tss = parselib.pretty_int(tss)
            x = i, g.chrom, g.strand, tss
            print("\t".join(map(str, x)))
        return
    assert transcript_num is not None
    assert transcript_num >= 0 and transcript_num < len(genes), \
        "Invalid transcript: %s" % transcript_num
    gene = genes[transcript_num]

    seq = genomelib.get_transcript(
        gene.chrom, gene.strand, gene.txn_start, gene.txn_length,
        gene.exon_starts, gene.exon_lengths)
    #print gene.txn_start, gene.txn_length
    #genomelib.write_fasta("HELLO", seq)

    # Search for primers on just the exons.
    exon_seq = ExonSequence(seq)
    primers = primer3.primer3(
        exon_seq.sub_seq, product_size=product_sizes,
        num_return=options.num_primers)
    if not primers:
        print("No primers found.")
        return

    # Revcomp the right primer.
    for d1, d2, size in primers:
        d2.seq_rc = genomelib.revcomp(d2.seq)

    if options.verbose:
        x = [
            "Index", "L_Seq", "L_Pos", "L_Length", "L_Tm", "L_GC", "L_Exon",
            "R_Seq", "R_Pos", "R_Length", "R_Tm", "R_GC", "R_Exon",
            "Genome Size", "Product Size"
        ]
        print("\t".join(x))
    for zzz, x in enumerate(primers):
        d1, d2, size = x

        # Calculate the size of the genomic product.  It stays 0 when
        # either primer is not found in the full sequence.
        gen_left = exon_seq.full_seq.find(d1.seq.upper())
        gen_right = exon_seq.full_seq.find(d2.seq_rc.upper())
        gen_size = 0
        if gen_left >= 0 and gen_right >= 0:
            gen_size = gen_right - gen_left + len(d2.seq_rc)

        # Calculate the size of the PCR product.
        pcr_left = exon_seq.sub_seq.find(d1.seq.upper())
        pcr_right = exon_seq.sub_seq.find(d2.seq_rc.upper())
        pcr_size = pcr_right - pcr_left + len(d2.seq)
        assert pcr_size == size

        # Figure out the exons of the primers.
        L_exon_ID = [
            exon_seq.sub_exonid[pcr_left + i] for i in range(len(d1.seq))
        ]
        R_exon_ID = [
            exon_seq.sub_exonid[pcr_right + i] for i in range(len(d2.seq))
        ]
        L_exon_ID_str = format_exon_ids(L_exon_ID)
        R_exon_ID_str = format_exon_ids(R_exon_ID)

        # Ignore primers on the same exon.
        if L_exon_ID == R_exon_ID:
            continue

        if options.verbose:
            x = (zzz + 1, d1.seq, pcr_left, d1.length, d1.tm, d1.gc_percent,
                 L_exon_ID_str, d2.seq, pcr_right, d2.length, d2.tm,
                 d2.gc_percent, R_exon_ID_str, gen_size, size)
            print("\t".join(map(str, x)))
        else:
            print(d1.seq)
            print(d2.seq)
            print("Product Size=%d" % size)
            print("")