def buildFastQCSummaryStatus(infiles, outfile, datadir):
    '''load fastqc status summaries into a single table.

    For every entry of *infiles* the track name is the basename with the
    ``.fastqc`` suffix removed. Every file matching
    ``<datadir>/<track>*_fastqc/fastqc_data.txt`` is parsed with
    ``FastqcSectionIterator`` and the status of each section is recorded.

    *outfile* receives a tab-separated table with the columns ``track``,
    ``filename`` (the directory holding ``fastqc_data.txt``) and one column
    per section name seen in any of the files. A section that is missing
    from a file is written as an empty string.

    Section columns are written in sorted order so that the layout of the
    table is reproducible between runs. The output file is only opened once
    all input has been parsed, so a parse error does not leave a truncated
    table behind.
    '''
    names = set()
    results = []
    for infile in infiles:
        track = P.snip(os.path.basename(infile), ".fastqc")
        pattern = os.path.join(datadir,
                               track + "*_fastqc",
                               "fastqc_data.txt")

        for fn in glob.glob(pattern):
            # there can be missing sections, hence a mapping with an
            # empty-string default instead of a positional list
            stats = collections.defaultdict(str)
            with IOTools.openFile(fn) as inf:
                for name, status, header, data in FastqcSectionIterator(inf):
                    stats[name] = status

            results.append((track, fn, stats))
            names.update(stats.keys())

    names = sorted(names)
    with IOTools.openFile(outfile, "w") as outf:
        outf.write("track\tfilename\t%s\n" % "\t".join(names))
        for track, fn, stats in results:
            outf.write("%s\t%s\t%s\n" %
                       (track,
                        os.path.dirname(fn),
                        "\t".join(stats[x] for x in names)))
def summarizeAllProcessing(infiles, outfile):
    '''summarize processing information.

    Each file in *infiles* is a tab-separated table with exactly five
    columns: ``track``, ``step``, ``pair``, ``ninput`` and ``noutput``.
    Rows whose first field is the literal ``track`` are header rows and are
    skipped.

    Rows are grouped by ``(track, pair)``; within a group the steps are
    ordered by decreasing ``ninput``. One line per group is written to
    *outfile*: the input count of the first step, the output count of every
    step, the output count of the last step, and the same output counts as
    a percentage of the first step's input.

    The header line is derived from the steps of the first group only.
    Percentages are reported as ``nan`` if the first step of a group had no
    input reads.

    Raises
    ------
    ValueError
        If a row does not have exactly five tab-separated fields or a count
        is not an integer.
    '''

    data = []
    for infile in infiles:
        with IOTools.openFile(infile) as inf:
            for line in inf:
                # rstrip instead of line[:-1]: do not eat the last
                # character of a file without trailing newline.
                track, step, pair, ninput, noutput = \
                    line.rstrip("\n").split("\t")
                if track == "track":
                    continue
                data.append((track, step, pair, ninput, noutput))

    # sort by track, pair, input (largest input first)
    data.sort(key=lambda x: (x[0], x[2], -int(x[3])))

    def percent(value, total):
        # avoid ZeroDivisionError for tracks without any input reads
        if total == 0:
            return "%5.2f" % float("nan")
        return "%5.2f" % (100.0 * value / total)

    with IOTools.openFile(outfile, "w") as outf:
        first = True
        for key, group in itertools.groupby(data, lambda x: (x[0], x[2])):
            vals = list(group)
            track, pair = key
            ninput = int(vals[0][3])
            outputs = [int(x[4]) for x in vals]
            if first:
                outf.write("track\tpair\tninput\t%s\t%s\t%s\t%s\n" % (
                    "\t".join([x[1] for x in vals]),
                    "noutput",
                    "\t".join(["percent_%s" % x[1] for x in vals]),
                    "percent_output"))
                first = False
            outf.write("%s\t%s\t%i\t%s\t%i\t%s\t%s\n" % (
                track, pair, ninput,
                "\t".join(map(str, outputs)),
                outputs[-1],
                "\t".join([percent(x, ninput) for x in outputs]),
                percent(outputs[-1], ninput)))
def annotate(infile, outfile, geneset):
    '''
    annotate NOGs into functional categories

    *geneset* is a tab-separated file; its second column is used as the NOG
    identifier and its fourth column as the functional category.

    *infile* is a tab-separated table with a header line whose first column
    holds the NOG identifier. Every row is copied to *outfile* with one
    extra column appended holding the functional category, or
    ``Function unknown`` if the NOG is not present in *geneset*. The
    appended column is named ``taxa`` in the header.

    Both inputs are streamed line by line and all file handles are closed
    on exit.
    '''
    annotation = {}
    E.info("loading geneset")
    with IOTools.openFile(geneset) as anno:
        for line in anno:
            data = line.rstrip("\n").split("\t")
            annotation[data[1]] = data[3]
    E.info("finished loading gene set")

    E.info("annotating infile")
    with IOTools.openFile(infile) as inf, \
            IOTools.openFile(outfile, "w") as outf:
        header = inf.readline()
        outf.write(header.rstrip("\n") + "\ttaxa\n")
        for line in inf:
            # rstrip instead of line[:-1]: keep the last character of a
            # final line that has no trailing newline
            row = line.rstrip("\n")
            nog = row.split("\t")[0]
            pathway = annotation.get(nog, "Function unknown")
            outf.write(row + "\t" + pathway + "\n")
def filterGTF(gtf, filterstring, tempout):
    '''write the entries of *gtf* that pass *filterstring* to *tempout*.

    *filterstring* has the form ``<column><operator><value>``. The
    operators are tested in this order and the first one found is used:

    ``!=``
        keep entries whose column is not one of the ``+`` separated values
    ``=``
        keep entries whose column is one of the ``+`` separated values
    ``-in_file-``
        keep entries whose column is listed in the file ``<value>``
        (one identifier per line)
    ``-notin_file-``
        keep entries whose column is not listed in the file ``<value>``
    ``-morethan-``
        keep entries whose column, converted to float, is ``> value``
    ``-lessthan-``
        keep entries whose column, converted to float, is ``< value``

    ``<column>`` may be any key of ``item.asDict()`` or one of ``contig``,
    ``source``, ``feature``, ``start``, ``end``, ``strand``, ``frame``.

    Raises
    ------
    ValueError
        If *filterstring* contains none of the operators above (previously
        this surfaced as a NameError inside the loop), or if it cannot be
        split into exactly one column and one value.
    '''

    def read_ids(path):
        # set membership is O(1) per record; the handle is closed here
        with IOTools.openFile(path) as inf:
            return set(line.strip() for line in inf)

    if "!=" in filterstring:
        column, value = filterstring.split("!=")
        value = set(value.split("+"))
        filtertype = "notin"
    elif "=" in filterstring:
        column, value = filterstring.split("=")
        value = set(value.split("+"))
        filtertype = "in"
    elif "-in_file-" in filterstring:
        column, value = filterstring.split("-in_file-")
        value = read_ids(value)
        filtertype = "in_file"
    elif "-notin_file-" in filterstring:
        column, value = filterstring.split("-notin_file-")
        value = read_ids(value)
        filtertype = "notin_file"
    elif "-morethan-" in filterstring:
        column, value = filterstring.split("-morethan-")
        value = float(value)
        filtertype = "morethan"
    elif "-lessthan-" in filterstring:
        column, value = filterstring.split("-lessthan-")
        value = float(value)
        filtertype = "lessthan"
    else:
        raise ValueError(
            "unrecognised filter string: %s" % filterstring)

    # choose the test once instead of re-dispatching for every record
    if filtertype in ("in", "in_file"):
        def keep(field):
            return field in value
    elif filtertype in ("notin", "notin_file"):
        def keep(field):
            return field not in value
    elif filtertype == "morethan":
        def keep(field):
            return float(field) > value
    else:
        def keep(field):
            return float(field) < value

    with IOTools.openFile(gtf) as gfile, \
            IOTools.openFile(tempout, "w") as out:
        for item in GTF.iterator(gfile):
            D = item.asDict()
            # make the fixed GTF columns addressable next to the attributes
            D['contig'] = item.contig
            D['source'] = item.source
            D['feature'] = item.feature
            D['start'] = item.start
            D['end'] = item.end
            D['strand'] = item.strand
            D['frame'] = item.frame

            if keep(D[column]):
                out.write("%s\n" % str(item))
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for

    The first ``*.lib`` file found in ``mispriming.dir`` is registered in
    ``PARAMS`` as ``constraints_primer_mispriming_library``. The fasta file
    ``infile[0]`` is then scanned and, for every sequence whose title is
    returned by ``readIdentifiers("identifiers.tsv")``, a file
    ``input.dir/<title>.input`` is written (blanks and ``/`` in the title
    are replaced by ``_``, double quotes are removed from the path).

    Each file contains ``SEQUENCE_ID``, one ``KEY=value`` line for every
    entry of ``PARAMS`` whose key contains ``constraints`` (the
    ``constraints_`` prefix is stripped and the key upper-cased),
    ``SEQUENCE_TEMPLATE`` and the terminating ``=`` record separator.

    *outfiles* is not used by this function.

    Raises
    ------
    IndexError
        If ``mispriming.dir`` contains no ``*.lib`` file.
    '''
    libraries = glob.glob("mispriming.dir/*.lib")
    if not libraries:
        # same exception type as the bare [0] lookup, but with a message
        raise IndexError(
            "no mispriming library (*.lib) found in mispriming.dir")
    PARAMS["constraints_primer_mispriming_library"] = libraries[0]

    fasta, identifiers = infile[0], "identifiers.tsv"

    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)

    # the constraint lines are the same for every sequence: build them once
    constraints = "".join(
        "%s=%s\n" % (key.replace("constraints_", "").upper(), value)
        for key, value in PARAMS.items()
        if "constraints" in key)

    E.info("collecting sequences")
    with IOTools.openFile(fasta) as inf:
        for f in FastaIterator.iterate(inf):
            if f.title not in ids:
                continue
            outname = os.path.join(
                "input.dir",
                f.title.replace(" ", "_").replace("/", "_") + ".input"
            ).replace('"', '')
            with IOTools.openFile(outname, "w") as outf:
                outf.write("SEQUENCE_ID=%s\n" % f.title)
                outf.write(constraints)
                outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % f.sequence)
def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """Split *infile* into chunk files by counting lines.

    This is a generator. Chunk files are named ``<prefix>/<counter>.in``
    where ``<counter>`` is the zero-padded, ten-digit number of data lines
    read when the chunk was opened. The name of a chunk is yielded after
    the chunk has been closed.

    ``args[0]`` is the chunk size. Lines starting with ``#`` are dropped.
    If *use_header* is true, the first remaining line is taken as a header
    and written at the top of every chunk.

    The line counter is incremented before it is tested against the chunk
    size, so the first chunk holds one data line less than ``args[0]``.
    A final chunk is always yielded, even if it is empty.
    """

    def start_chunk(index):
        # open the chunk file for the given line counter
        name = "%s/%010i.in" % (prefix, index)
        return name, IOTools.openFile(name, "w")

    lines_per_chunk = args[0]
    count = 0
    current, out = start_chunk(count)
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if use_header and count == 0 and not header:
            header = line
            out.write(header)
            continue

        count += 1
        if count % lines_per_chunk == 0:
            out.close()
            yield current
            current, out = start_chunk(count)
            if header:
                out.write(header)

        out.write(line)

    out.close()
    yield current
def copy(src, dst, name):
    """Install the template file *src* into directory *dst* as pipeline *name*.

    The source is ``<srcdir>/pipeline_template_data/<src>``. The
    destination file name is *src* with ``rx_file`` matches replaced by
    *name* and ``rx_type`` matches removed, placed below
    ``<destination_dir>/<dst>``.

    ``.png`` files are copied verbatim. Any other file is copied line by
    line, replacing ``rx_template`` matches with *name* and then
    ``rx_reportdir`` matches with ``reportdir``.

    Raises OSError if the destination exists and ``options.force`` is
    not set.
    """
    # remove "template" and the pipeline type from file/directory
    # names.
    target_name = rx_type.sub("", rx_file.sub(name, src))
    fn_dest = os.path.join(destination_dir, dst, target_name)
    fn_src = os.path.join(srcdir, "pipeline_template_data", src)

    E.debug("fn_src=%s, fn_dest=%s, src=%s, dest=%s" %
            (fn_src, fn_dest, src, dst))

    if os.path.exists(fn_dest) and not options.force:
        raise OSError(
            "file %s already exists - not overwriting." % fn_dest)

    if fn_src.endswith(".png"):
        shutil.copyfile(fn_src, fn_dest)
        return

    with IOTools.openFile(fn_dest, "w") as outfile, \
            IOTools.openFile(fn_src) as infile:
        for line in infile:
            renamed = rx_template.sub(name, line)
            outfile.write(rx_reportdir.sub(reportdir, renamed))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two fastq files (``--fastq1`` and ``--fastq2``) in parallel and
    writes each read pair to ``options.stdout`` as two consecutive fasta
    records (identifier and sequence of read 1, then of read 2).

    Raises
    ------
    PairedReadError
        If one file runs out of records before the other.
    AssertionError
        If the identifier of a read in file 1 does not end in ``/1`` or
        the identifier of its mate in file 2 does not end in ``/2``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--fastq1", dest="fastq1", type="string",
                      help="supply read1 fastq file")
    parser.add_option("-b", "--fastq2", dest="fastq2", type="string",
                      help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # izip_longest was renamed to zip_longest in Python 3; accept either
    zip_longest = getattr(itertools, "zip_longest", None) or \
        itertools.izip_longest

    with IOTools.openFile(options.fastq1) as fastq1, \
            IOTools.openFile(options.fastq2) as fastq2:

        E.info("iterating over fastq files")
        f1_count = 0
        for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                                  Fastq.iterate(fastq2)):
            # the shorter iterator is padded with a false value
            if not (f1 and f2):
                raise PairedReadError(
                    "unpaired reads detected. Are files sorted? are files of equal length?")

            # raised explicitly (not via ``assert``) so that the check is
            # still performed when python runs with -O
            if not (f1.identifier.endswith("/1") and
                    f2.identifier.endswith("/2")):
                raise AssertionError(
                    "Reads in file 1 must end with /1 and reads in file 2 with /2")

            options.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1
def buildForegroundSets(infiles, outfile):
    '''
    build multiset of genes that are differentially expressed
    based on cluster assignments

    *infiles* is a pair ``(clusters, probe2gene_file)``:

    * ``probe2gene_file`` - two tab-separated columns, probe and gene,
      without header; double quotes are stripped from both fields.
    * ``clusters`` - two tab-separated columns, probe and cluster, with
      one header line.

    For every cluster ``c`` the genes of its probes are written, one per
    line, to ``pathways.dir/C<c>.foreground``. *outfile* is not used by
    this function.

    Raises
    ------
    KeyError
        If a probe listed in ``clusters`` has no entry in
        ``probe2gene_file``.
    ValueError
        If a line does not have exactly two tab-separated fields.
    '''

    clusters, probe2gene_file = infiles

    # read probe 2 gene map
    probe2gene = {}
    with IOTools.openFile(probe2gene_file) as inf:
        for line in inf:
            probe, gene = [x.replace('"', '')
                           for x in line.rstrip("\n").split("\t")]
            probe2gene[probe] = gene

    # read probe 2 cluster map
    probe2cluster = {}
    with IOTools.openFile(clusters) as inf:
        inf.readline()  # skip header
        for line in inf:
            probe, cluster = line.rstrip("\n").split("\t")
            probe2cluster[probe] = cluster

    # group the probes once instead of rescanning the full probe map for
    # every cluster
    cluster2probes = {}
    for probe, cluster in probe2cluster.items():
        cluster2probes.setdefault(cluster, []).append(probe)

    # output genes in each cluster
    for cluster, probes in cluster2probes.items():
        outname = "pathways.dir/C%s.foreground" % cluster
        with IOTools.openFile(outname, "w") as outf:
            for probe in probes:
                outf.write("%s\n" % probe2gene[probe])
def mergeAdaptorFasta(infiles, outfile):
    '''
    Merge fasta files of adapter contamination,
    include reverse complement, remove duplicate sequences

    Every header line (``>name``) in *infiles* starts a record ``name`` and
    a companion record ``name_R``. Each following sequence line is added to
    ``name`` and its reverse complement to ``name_R``. If a name occurs
    more than once, the later occurrence replaces the earlier one.

    One sequence per record is written to *outfile*. If a record collected
    several distinct sequence lines, the lexicographically smallest one is
    written so that the result is reproducible. Blank lines are ignored and
    records without any sequence line are omitted.

    Raises
    ------
    AttributeError
        If no adapter record was found in any of *infiles*.
    ValueError
        If a file contains a sequence line before its first header line.
    '''
    fasta_dict = {}
    for each in infiles:
        with IOTools.openFile(each, "r") as infle:
            # reset per file so that a malformed file cannot silently
            # extend the last record of the previous file
            adapt = None
            for line in infle:
                line = line.rstrip("\n")
                if not line:
                    # blank lines used to be stored as empty sequences
                    continue
                if line[0] == '>':
                    adapt = line.lstrip(">")
                    fasta_dict[adapt] = set()
                    fasta_dict[adapt + "_R"] = set()
                else:
                    if adapt is None:
                        raise ValueError(
                            "sequence found before first fasta header "
                            "in %s" % each)
                    fasta_dict[adapt].add(line)
                    fasta_dict[adapt + "_R"].add(reverseComplement(line))

    # if there are no adapters to remove break the pipeline here
    if not fasta_dict:
        raise AttributeError("There are no overrepresented sequences in "
                             "these fastq files. Please turn off this "
                             "feature and re-run the pipeline")

    with IOTools.openFile(outfile, "w") as outfle:
        for key, value in fasta_dict.items():
            if not value:
                # header without sequence: nothing to write
                continue
            outfle.write(">%s\n%s\n" % (key, sorted(value)[0]))
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*

    Sequences are taken from the forward strand of the genome given by
    ``PARAMS["genome_dir"]`` and ``PARAMS["genome"]`` and are passed
    through ``maskSequences(seqs, masker)`` before being written.

    *mode* selects which sequences are exported:

    ``intervals``
        the interval itself; the fasta title is
        ``<track>_<name> <contig>:<start>..<end>``
    ``leftright``
        two segments per interval, shifted by the interval length to the
        left (suffix ``_l``, start clipped at 0) and to the right
        (suffix ``_r``, end clipped at the contig length)

    ``<track>`` is *infile* without its ``.bed.gz`` suffix. Records are
    separated by newlines; no newline is written after the last record.

    Raises
    ------
    ValueError
        If *mode* is not one of the values above (previously an unknown
        mode silently produced an empty file).
    '''
    if mode not in ("intervals", "leftright"):
        raise ValueError(
            "unknown mode '%s', expected 'intervals' or 'leftright'" % mode)

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    ids, seqs = [], []
    with IOTools.openFile(infile) as inf:
        for bed in Bed.setName(Bed.iterator(inf)):
            lcontig = fasta.getLength(bed.contig)

            if mode == "intervals":
                seqs.append(fasta.getSequence(
                    bed.contig, "+", bed.start, bed.end))
                ids.append("%s_%s %s:%i..%i" % (
                    track, bed.name, bed.contig, bed.start, bed.end))
            else:
                # mode == "leftright"
                l = bed.end - bed.start

                start, end = max(0, bed.start - l), bed.end - l
                ids.append("%s_%s_l %s:%i..%i" % (
                    track, bed.name, bed.contig, start, end))
                seqs.append(fasta.getSequence(bed.contig, "+", start, end))

                start, end = bed.start + l, min(lcontig, bed.end + l)
                ids.append("%s_%s_r %s:%i..%i" % (
                    track, bed.name, bed.contig, start, end))
                seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)
    with IOTools.openFile(outfile, "w") as outs:
        outs.write("\n".join(
            [">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to reference fasta and add read groups using
    SAMtools, realigns around indels and recalibrates base quality scores
    using GATK

    The work is delegated to three ``PipelineExome`` steps that are run in
    this order:

    1. ``GATKReadGroups``: *infile* -> ``<outfile>`` with ``.bqsr``
       replaced by ``.readgroups.bqsr``
    2. ``GATKIndelRealign``: previous result -> ``<outfile>`` with
       ``.bqsr`` replaced by ``.realign.bqsr``
    3. ``GATKBaseRecal``: previous result -> *outfile*

    Each intermediate file is passed to ``IOTools.zapFile`` as soon as the
    following step has consumed it.
    '''

    # NOTE(review): to_cluster, track, tmpdir_gatk and job_memory are not
    # referenced again in this function. They are presumably read from the
    # local scope by the pipeline framework - confirm before removing.
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    # NOTE(review): the value returned by P.getTempDir() is neither used
    # nor cleaned up here - confirm whether the call is still needed.
    tmpdir_gatk = P.getTempDir()
    job_memory = PARAMS["gatk_memory"]

    # reference genome fasta located next to the bwa index
    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    # names of the two intermediate files; note that str.replace
    # substitutes every occurrence of ".bqsr" in the path
    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    PipelineExome.GATKReadGroups(infile, outfile1, genome,
                                 PARAMS["readgroup_library"],
                                 PARAMS["readgroup_platform"],
                                 PARAMS["readgroup_platform_unit"])

    PipelineExome.GATKIndelRealign(outfile1, outfile2, genome,
                                   PARAMS["gatk_threads"])

    # the read-group BAM is no longer needed once realignment is done
    IOTools.zapFile(outfile1)

    PipelineExome.GATKBaseRecal(outfile2, outfile, genome,
                                PARAMS["gatk_dbsnp"],
                                PARAMS["gatk_solid_options"])
    # the realigned BAM is no longer needed once recalibration is done
    IOTools.zapFile(outfile2)
def buildCoverageStats(infile, outfile): '''Generate coverage statistics for regions of interest from a bed file using Picard''' # TS check whether this is always required or specific to current baits file # baits file requires modification to make picard accept it # this is performed before CalculateHsMetrics to_cluster = USECLUSTER baits = PARAMS["roi_baits"] modified_baits = infile + "_temp_baits_final.bed" regions = PARAMS["roi_regions"] statement = '''samtools view -H %(infile)s > %(infile)s_temp_header.txt; awk 'NR>2' %(baits)s | awk -F '\\t' 'BEGIN { OFS="\\t" } {print $1,$2,$3,"+",$4;}' > %(infile)s_temp_baits.bed; cat %(infile)s_temp_header.txt %(infile)s_temp_baits.bed > %(modified_baits)s; checkpoint ; rm -rf %(infile)s_temp_baits.bed %(infile)s_temp_header.txt ''' P.run() PipelineMappingQC.buildPicardCoverageStats( infile, outfile, modified_baits, modified_baits) IOTools.zapFile(modified_baits)
def buildFastQCSummaryStatus(infiles, outfile):
    '''load fastqc status summaries into a single table.

    For every entry of *infiles* the track name is the file name with the
    ``.fastqc`` suffix removed. Every file matching
    ``<exportdir>/fastqc/<track>*_fastqc/fastqc_data.txt`` is parsed with
    ``FastqcSectionIterator`` and the status of each section is recorded.

    *outfile* receives a tab-separated table with the columns ``track``,
    ``filename`` (the directory holding ``fastqc_data.txt``) and one column
    per section, in the order in which the sections were first seen.
    A section that is missing from a file is written as an empty string, so
    rows always line up with the header. If no fastqc output is found the
    file is created empty, without a header.
    '''
    names = []      # section names in first-seen order
    seen = set()
    results = []

    for infile in infiles:
        track = P.snip(infile, ".fastqc")
        pattern = os.path.join(
            PARAMS["exportdir"], "fastqc",
            track + "*_fastqc", "fastqc_data.txt")

        for fn in glob.glob(pattern):
            # there can be missing sections: key the status by section
            # name instead of relying on the position
            stats = {}
            with IOTools.openFile(fn) as inf:
                for name, status, header, data in FastqcSectionIterator(inf):
                    stats[name] = status
                    if name not in seen:
                        seen.add(name)
                        names.append(name)
            results.append((track, fn, stats))

    with IOTools.openFile(outfile, "w") as outf:
        if results:
            outf.write("track\tfilename\t%s\n" % "\t".join(names))
        for track, fn, stats in results:
            outf.write("%s\t%s\t%s\n" %
                       (track,
                        os.path.dirname(fn),
                        "\t".join(stats.get(x, "") for x in names)))
def extractEBioinfo(eBio_ids, vcfs, outfile):
    '''find the number of mutations identitified in previous studies (eBio_ids)
    for the mutated genes in the vcfs

    Gene names are collected from the ``SNPEFF_GENE_NAME`` entries of the
    INFO field of every VCF record whose filter is not ``REJECT``.

    *eBio_ids* is a tab-separated file with the columns tissue, study and
    table. For every line and every gene the cBioPortal web service is
    queried and the result is accumulated per tissue and gene.

    *outfile* receives one row per gene with one ``<tissue>_frequency``
    column per tissue (mutations divided by total). A tissue/gene
    combination without any data has a total of 0; ``np.divide`` then
    yields ``nan`` instead of raising.

    The queried URL and the per-gene counts are printed to stdout.
    '''

    genes = set()

    for vcf in vcfs:
        with IOTools.openFile(vcf) as vcf_handle:
            for vcf_entry in VCF.VCFFile(vcf_handle):
                # assumes all vcf entries without "REJECT" are "PASS"
                if vcf_entry.filter != "REJECT":
                    for entry in vcf_entry.info.split(";"):
                        if "SNPEFF_GENE_NAME" in entry:
                            genes.add(entry.split("=")[1])
        # NOTE(review): only the first VCF is read. The original code
        # enforced this with a counter that looks like a debugging
        # leftover (the docstring talks about "the vcfs"). The behaviour
        # is kept because every additional gene triggers web queries -
        # remove this break to process all files.
        break

    tissue_counts = collections.defaultdict(
        lambda: collections.defaultdict(
            lambda: collections.defaultdict(int)))

    with IOTools.openFile(eBio_ids, "r") as id_file:
        for line in id_file:
            tissue, study, table = line.strip().split("\t")
            for gene in genes:
                url = ("http://www.cbioportal.org/webservice.do?cmd=getProfileData&"
                       "case_set_id=%(study)s_all&genetic_profile_id=%(table)s&"
                       "gene_list=%(gene)s" % {"study": study,
                                               "table": table,
                                               "gene": gene})
                print(url)
                df = pd.io.parsers.read_csv(
                    url, comment="#", sep="\t", header=False, index_col=0)

                # check dataframe contains data!
                if df.shape[0] != 0:
                    tissue_counts[tissue][gene]["total"] += df.shape[1] - 2
                    tissue_counts[tissue][gene]["mutations"] += \
                        int(df.count(1)) - 1

    tissues = list(tissue_counts)
    with IOTools.openFile(outfile, "w") as out:
        out.write("gene\t%s\n" % "\t".join([
            "%s_frequency" % x.replace(" ", "_") for x in tissues]))

        for gene in genes:
            freq_values = []
            for tissue in tissues:
                total = tissue_counts[tissue][gene]["total"]
                mutations = tissue_counts[tissue][gene]["mutations"]
                # same text as the former python 2 print statement
                print("total:  %s mutations:  %s" % (total, mutations))
                freq_values.append(np.divide(float(mutations), total))

            out.write("%s\t%s\n" % (gene, "\t".join(map(str, freq_values))))
def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within
    a file are merged) before intersection.

    Intervals are renumbered starting from 1.

    Special cases:

    * a single input file is copied to *outfile* unchanged;
    * if any input file is empty the intersection is empty and *outfile*
      is only touched.

    The shell statements are executed through ``P.run()``, which is called
    without arguments: the local names ``statement``, ``fn``, ``tmpfile``
    and ``outfile`` used in the statements must therefore not be renamed.
    '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)
        return

    if len(infiles) == 2:
        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = ''' intersectBed -u -a %s -b %s | cut -f 1,2,3,4,5 | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}' | bgzip > %%(outfile)s ''' % (infiles[0], infiles[1])
            P.run()
        return

    # Test every file for emptiness before any work is done. Previously
    # the loop re-tested infiles[0] instead of the current file, and the
    # temporary file was left behind when the first file was empty.
    for candidate in infiles:
        if IOTools.isEmpty(candidate):
            P.touch(outfile)
            return

    tmpfile = P.getTempFilename(".")
    try:
        # need to merge incrementally
        fn = infiles[0]
        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s | cut -f 1,2,3,4,5 | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}' | bgzip > %(outfile)s '''
        P.run()
    finally:
        # also clean up if one of the statements failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def fetchProbeFragments(probe_bed, digest_bed, outfile, lookup_out):
    '''find the digest fragment that each probe lies in.

    For every interval in *probe_bed* the overlapping fragments are
    fetched from the tabix-indexed *digest_bed*. Probes that overlap no
    fragment or more than one fragment are skipped with a warning.

    For the remaining probes two records are written:

    * *outfile*: a bed entry with the coordinates of the fragment, the
      name of the probe, score ``.`` and strand ``+``
    * *lookup_out*: a line ``<probe name>\\t<fragment name>`` below the
      header ``probe\\tfragment``

    All file handles, including the tabix handle, are closed on exit.
    '''

    digest_fragments = pysam.TabixFile(digest_bed)
    # a single Bed object is re-used; it is written out before the next
    # probe overwrites its fields
    bed = Bed.Bed()
    try:
        with IOTools.openFile(outfile, "w") as outf, \
                IOTools.openFile(lookup_out, "w") as lookup, \
                IOTools.openFile(probe_bed) as probes:

            lookup.write("probe\tfragment\n")
            for probe in Bed.iterator(probes):

                frag = list(digest_fragments.fetch(probe.contig,
                                                   probe.start,
                                                   probe.end,
                                                   parser=pysam.asBed()))
                if len(frag) != 1:
                    E.warn("%i fragments found for probe %s, skipping" %
                           (len(frag), probe.name))
                    continue

                frag = frag[0]
                bed.start = frag.start
                bed.end = frag.end
                bed.contig = frag.contig
                bed["name"] = probe.name
                bed["score"] = "."
                bed["strand"] = "+"

                lookup.write("%s\t%s\n" % (probe.name, frag.name))
                outf.write(str(bed) + "\n")
    finally:
        digest_fragments.close()
def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """Split *infile* into chunk files at lines matching a regular expression.

    This is a generator. ``args[0]`` is a compiled pattern, ``args[2]``
    the number of matching lines per chunk and ``args[3]`` an optional
    limit on the number of lines per chunk (ignored if false). Lines
    starting with ``#`` are dropped; *use_header* is not used.

    A line that matches the pattern (tested without its last character)
    starts a new entry. Before the entry is counted, the current chunk is
    closed and yielded if at least one entry has been seen and either the
    entry count is a multiple of ``args[2]`` or more than ``args[3]`` lines
    have been written to the chunk. Chunks are therefore only ever split
    directly in front of a matching line.

    Chunk files are named ``<prefix>/<counter>.in`` where ``<counter>`` is
    the zero-padded, ten-digit entry count at the time the chunk was
    opened. A final chunk is always yielded.
    """

    def start_chunk(index):
        # open the chunk file for the given entry counter
        name = "%s/%010i.in" % (prefix, index)
        return name, IOTools.openFile(name, "w")

    pattern = args[0]
    entries_per_chunk = args[2]
    line_limit = args[3]

    lines_in_chunk = 0
    entries = 0
    current, out = start_chunk(entries)

    for line in infile:
        if line[0] == "#":
            continue

        if pattern.search(line[:-1]):
            # nested tests keep the original short-circuit evaluation
            if entries > 0:
                if entries % entries_per_chunk == 0 or \
                        (line_limit and lines_in_chunk > line_limit):
                    out.close()
                    yield current
                    current, out = start_chunk(entries)
                    lines_in_chunk = 0
            entries += 1

        out.write(line)
        lines_in_chunk += 1

    out.close()
    yield current
def getProbeFragments(probe_bed, digest_bed, outfile, lookup_out):
    '''find the digest fragment that each probe lies in, extended by the
    restriction enzyme cut.

    The length of the restriction enzyme cut is taken as the gap between
    the first two consecutive entries of *digest_bed* that lie on the same
    contig (0 if there is no such pair).

    For every interval in *probe_bed* the overlapping fragments are
    fetched from the tabix-indexed *digest_bed*. Probes that overlap no
    fragment or more than one fragment are skipped with a warning.

    For the remaining probes two records are written:

    * *outfile*: a bed entry named after the probe, score ``.``, strand
      ``+``, spanning ``fragment start - cut length + 1`` to
      ``fragment end + cut length``
    * *lookup_out*: a line ``<probe name>\\t<fragment name>`` below the
      header ``probe\\tfragment``

    All file handles, including the tabix handle, are closed on exit.
    '''

    # First find the length of the restriction enzyme cut, required to
    # obtain the start and end coordinates from the pregenerated file.
    length_RE_cut = 0
    last_bed = None
    with IOTools.openFile(digest_bed) as digest:
        for bed_digest in Bed.iterator(digest):
            # nothing to compare against on the first entry; afterwards
            # entries can be compared if they are in the same contig
            if last_bed is not None and \
                    bed_digest.contig == last_bed.contig:
                length_RE_cut = bed_digest.start - last_bed.end
                break
            last_bed = bed_digest

    digest_fragments = pysam.TabixFile(digest_bed)
    # a single Bed object is re-used; it is written out before the next
    # probe overwrites its fields
    bed = Bed.Bed()
    try:
        with IOTools.openFile(outfile, "w") as outf, \
                IOTools.openFile(lookup_out, "w") as lookup, \
                IOTools.openFile(probe_bed) as probes:

            lookup.write("probe\tfragment\n")
            for probe in Bed.iterator(probes):

                frag = list(digest_fragments.fetch(probe.contig,
                                                   probe.start,
                                                   probe.end,
                                                   parser=pysam.asBed()))
                if len(frag) != 1:
                    E.warn("%i fragments found for probe %s, skipping" %
                           (len(frag), probe.name))
                    continue

                frag = frag[0]

                # The restriction enzyme cut on the left side of the
                # fragment is the end site of the last restriction enzyme
                # fragment + 1 (+1 because according to the manual
                # coordinates are specified in 1-origin for the bed start.)
                bed.start = frag.start - length_RE_cut + 1
                bed.end = frag.end + length_RE_cut
                bed.contig = frag.contig
                bed["name"] = probe.name
                bed["score"] = "."
                bed["strand"] = "+"

                lookup.write("%s\t%s\n" % (probe.name, frag.name))
                outf.write(str(bed) + "\n")
    finally:
        digest_fragments.close()
def iterator_psl_intervals(options):
    """iterate over psl file yield an entry together with overlapping entries.

    returns tuples of (match, list(query_intervals), list(target_intervals))

    Matches are read from ``options.stdin``. If
    ``options.filename_filter_query`` (``options.filename_filter_target``)
    is set, intervals are loaded from that file with ``readIntervals`` and
    the intervals overlapping the query (target) range of each match are
    returned as a list; the list is empty if the sequence identifier is
    unknown. Without a filter file the corresponding element of the tuple
    is ``None``.

    Iteration stops at the end of the input or, if ``options.test`` is
    set, as soon as ``options.test`` matches have been read. Progress is
    written to ``options.stdlog`` every ``options.report_step`` matches
    if ``options.loglevel >= 1``.
    """

    if options.filename_filter_query:
        intervals_query = readIntervals(
            IOTools.openFile(options.filename_filter_query, "r"), options)
    else:
        intervals_query = None

    if options.filename_filter_target:
        intervals_target = readIntervals(
            IOTools.openFile(options.filename_filter_target, "r"), options)
    else:
        intervals_target = None

    iterator = Blat.BlatIterator(options.stdin)

    ninput = 0

    while True:

        # A StopIteration escaping from a generator body is turned into a
        # RuntimeError by PEP 479, so the end of the input is handled
        # explicitly whether the iterator raises or returns a false value.
        try:
            match = iterator.next()
        except StopIteration:
            break
        if not match:
            break

        ninput += 1

        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 1 and ninput % options.report_step == 0:
            options.stdlog.write("# progress: ninput=%i\n" % (ninput))
            options.stdlog.flush()

        qx, tx = None, None
        if intervals_query:
            try:
                qx = list(intervals_query.get(
                    match.mQueryId, match.mQueryFrom, match.mQueryTo))
            except KeyError:
                qx = []

        if intervals_target:
            try:
                tx = list(intervals_target.get(
                    match.mSbjctId, match.mSbjctFrom, match.mSbjctTo))
            except KeyError:
                tx = []

        if options.loglevel >= 2:
            options.stdlog.write(
                "###################################################\n")
            options.stdlog.write("# testing match %s\n" % (str(match)))
            options.stdlog.write(
                "###################################################\n")

        yield match, qx, tx
def getRunStatement(self, infile, outfile, controlfile):
    """
    Generate a specific run statement for each peakcaller class

    Builds the command line for the SPP IDR script.

    The script is selected by ``PARAMS_PEAKCALLER["spp_options_idr_script"]``:
    ``default`` looks up ``run_spp.R`` on the path, ``nodups`` looks up
    ``run_spp_nodups.R``, any other value is taken as the path of the
    script. Exactly one of ``spp_options_npeaks`` and ``spp_options_fdr``
    must be set and selects the threshold for lax peak calling.
    ``spp_options_parameters`` is inserted verbatim into the statement.

    Returns
    -------
    statement : string
        The command line, with stdout and stderr redirected to *outfile*.

    Raises
    ------
    IOError
        If the SPP script can not be found.
    Exception
        If both or neither of ``spp_options_npeaks`` and
        ``spp_options_fdr`` are set.
    """
    params = self.PARAMS_PEAKCALLER

    # select location of the spp script to run
    script = params["spp_options_idr_script"]
    if script == "default":
        executable = IOTools.which("run_spp.R")
    elif script == "nodups":
        executable = IOTools.which("run_spp_nodups.R")
    else:
        executable = script

    # os.path.exists() returns False for a missing file instead of
    # raising, so the result has to be tested explicitly.
    if not executable or not os.path.exists(executable):
        raise IOError("SPP script not found: %s" % executable)

    # select the threshold for lax peak calling
    if params["spp_options_npeaks"]:
        if params["spp_options_fdr"]:
            raise Exception("Value specified for both SPP options"
                            " -npeaks and -fdr please select one or"
                            " other option, but not both")
        threshold = "-npeaks=" + str(params["spp_options_npeaks"])
    elif params["spp_options_fdr"]:
        threshold = "-fdr=" + str(params["spp_options_fdr"])
    else:
        raise Exception("Must specify a value for either"
                        " spp_options_npeaks or spp_options_fdr,"
                        " but not both")

    # build run statement for spp.
    # -savn is output.npeak.file (passed as NULL,
    #                             means filename based on infile)
    # -out is output.result.file
    # -odir defaults to os.path.dirname( infile )
    # -savn is save narrowpeak file
    # -savr is save regionpeak file
    #  (run_spp.R script throws an error if region peak is not output).
    statement = [("Rscript %(executable)s"
                  " -c=%(infile)s"
                  " -i=%(controlfile)s"
                  " %(threshold)s"
                  " -savn"
                  " -savr")]

    # add additional options
    statement.append(params["spp_options_parameters"])

    # specify outfile
    statement.append(" -rf"
                     " -out=/stats/phantomPeakStatsReps.tab"
                     " >& %(outfile)s")

    # the placeholders are filled from the local variables executable,
    # infile, controlfile, threshold and outfile
    statement = (" ".join(statement) % locals())

    return statement
def loadFastqc(filename,
               backend="sqlite",
               database="csvdb",
               host="",
               username="",
               password="",
               port=3306):
    '''load FASTQC statistics into database.

    Each section will be uploaded to its own table.

    For every file matching *filename*, each section except
    ``Basic Statistics`` is loaded into the table
    ``<prefix>_<section name>``, where ``<prefix>`` is the name of the
    directory containing the file and blanks in the section name are
    replaced by ``_``. The name and status of every loaded section are
    additionally loaded into the table ``<prefix>_status``.

    Arguments
    ----------
    filename : string
        Filename with FASTQC data. The value is expanded with
        ``glob.glob`` and may therefore contain wildcards.
    backend : string
        Database backend. Only this is required for an sqlite database.
    database : string
        Database name.
    host : string
        Database host name
    username : string
        Database user name
    password : string
        Database password
    port : int
        Database server port.
    '''

    parser = CSV2DB.buildParser()
    (options, args) = parser.parse_args([])

    options.database_backend = backend
    options.database_host = host
    options.database_name = database
    options.database_username = username
    options.database_password = password
    options.database_port = port
    options.allow_empty = True

    for fn in glob.glob(filename):
        prefix = os.path.basename(os.path.dirname(fn))
        results = []

        with IOTools.openFile(fn) as fastqc_data:
            for name, status, header, data in FastqcSectionIterator(
                    fastqc_data):
                # do not collect basic stats, see loadFastQCSummary
                if name == "Basic Statistics":
                    continue

                options.tablename = prefix + "_" + name.replace(" ", "_")

                section = StringIO("\n".join([header] + data) + "\n")
                CSV2DB.run(section, options)
                results.append((name, status))

        # load status table
        options.tablename = prefix + "_status"

        summary = StringIO(
            "\n".join(["name\tstatus"] +
                      ["\t".join(x) for x in results]) + "\n")
        CSV2DB.run(summary, options)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a gene set in GTF format from ``options.stdin`` and, for every
    gene, counts mismatches between the reads of the BAM file and the
    reference. One tab-separated line per gene is written to
    ``options.stdout`` with the number of counted mismatches, the number
    of aligned bases, the number of mismatches that were not counted, the
    number of matched bases per nucleotide, one count per substitution
    type, the number of ``I``/``D`` characters in the CIGAR strings and the
    number of mismatches explained by a known RNA editing site.

    Reads that are unmapped, duplicates, have an unmapped mate or an
    ``NH`` tag greater than 1 are ignored.

    Raises ValueError if the donor extracted from the BAM file name is not
    a sample of the VCF file, or if the reference base reported by the
    alignment disagrees with the fasta sequence.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--bamfile", dest="bam", type="string",
                      help="BAM formated alignment file to test. Should have MD and NH tags set")
    parser.add_option("-t", "--quality-threshold", dest="threshold",
                      type="int", default=30,
                      help="minimum quality threshold for a mismatched base to count")
    parser.add_option("-f", "--fasta-path", dest="fastapath", type="string",
                      help="path to indexed fasta file for genome of choice")
    parser.add_option("-p", "--vcf-path", dest="vcfpath", type="string",
                      help="path to indexed vcf file for dataset of choice")
    parser.add_option("-d", "--sample", dest="samppattern", type="string",
                      help="pattern to match and extract the donor name from the bam file, for use in parsing the vcf file")
    parser.add_option("-n", "--REDI-path", dest="redipath", type="string",
                      help="path to Bed format REDIportal table containing RNA editing positions")

    (options, args) = E.Start(parser, argv=argv)

    bamfile = pysam.AlignmentFile(options.bam)
    fastafile = IndexedFasta(options.fastapath)
    vcffile = vcf.Reader(open(options.vcfpath,"r"))
    BEDREDI = Bed.readAndIndex(IOTools.openFile(options.redipath),
                               with_values=True)

    # header of the output table
    options.stdout.write("\t".join(["gene_id", "strand", "mismatches",
                                    "bases", "low_qual",
                                    "a","t","c","g",
                                    "a_to_t","a_to_g","a_to_c",
                                    "t_to_a","t_to_g","t_to_c",
                                    "g_to_a","g_to_t","g_to_c",
                                    "c_to_a","c_to_t","c_to_g",
                                    "indel_count","RNA_editing_events"]) + "\n")

    samplepattern = options.samppattern
    # counters over all genes; reported at the end of the run
    (not_reverse_g_to_t, reverse_g_to_t) = 0, 0
    # the donor name is the first group of the pattern matched against the
    # BAM file name (fails with AttributeError if the pattern does not match)
    donorfrombam = re.search(r"%s"%(samplepattern),options.bam,flags=0).group(1)

    # find the donorid: the sample names are read from the first VCF
    # record; if several names contain the donor, the last one wins
    vcf_record = vcffile.next()
    samples = vcf_record.samples
    donors = [dnr.sample for dnr in samples]
    donorid = None
    for samp in donors:
        if donorfrombam in samp:
            donorid=samp

    if donorid is None:
        raise ValueError("Donor %s not found in VCF" % donorfrombam)

    reversecomplement = {"a":"t","t":"a","c":"g","g":"c"}

    for gene in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

        # the gene region spans from the smallest start to the largest
        # end of all entries of the gene
        start = min(e.start for e in gene)
        end = max(e.end for e in gene)
        strand = gene[0].strand
        # `seq` is indexed relative to `start` in the remainder of the loop
        seq = fastafile.getSequence(gene[0].contig, "+", start, end)
        # NOTE(review): `thischr` is not used below
        thischr = gene[0].contig.replace("chr","")
        reads = bamfile.fetch(gene[0].contig, start, end)

        # if not every VCF contig name contains "chr", strip "chr" from
        # the gene contig and rename "M" to "MT" before querying the VCF
        if all("chr" in c for c in vcffile.contigs.keys()) == False:
            contig = (gene[0].contig).replace("chr","")
            if contig == "M":
                contig = contig + "T"
        else:
            contig = gene[0].contig

        # a region that can not be fetched is treated as having no variants
        try:
            vcfregion = vcffile.fetch(contig,start,end)
        except ValueError:
            vcfregion = []

        regionchecker=list(vcfregion)

        # known RNA editing sites within the gene, restricted to entries
        # whose third value field equals the strand of the gene;
        # keyed by the start position of the site
        BEDREDIregion = BEDREDI[gene[0].contig].find(start,end+1)
        editpositions = {edit_pos:edit_pos_field
                         for edit_pos,edit_pos_plus,edit_pos_field
                         in BEDREDIregion
                         if edit_pos_field.fields[2] == strand}

        gene_id = gene[0].gene_id

        # per-gene counters
        mm_count = 0
        base_count = 0
        skipped = 0
        indel_count = 0
        RNA_edits = 0
        matched_bases = defaultdict(int)
        transition = {"a_to_t":0,"a_to_g":0,"a_to_c":0,"t_to_a":0,"t_to_g":0,
                      "t_to_c":0,"g_to_a":0,"g_to_t":0,"g_to_c":0,"c_to_a":0,"c_to_t":0,
                      "c_to_g":0}

        # variants for which the donor genotype is not "0/0", keyed by
        # POS - 1 so that keys can be compared with base[1] below
        snp_dict={}
        for snp in regionchecker:
            if snp.genotype(donorid)["GT"] != "0/0":
                snp_dict[snp.POS -1] = snp.ALT

        for read in reads:

            if read.is_unmapped:
                continue
            if read.is_duplicate:
                continue
            if read.mate_is_unmapped:
                continue
            # skip reads that are reported with more than one hit
            if read.get_tag("NH") > 1:
                continue

            qualities = read.query_qualities

            # counts the characters "I" and "D" in the CIGAR string, i.e.
            # the number of insertion/deletion operations of the read
            alignmentcigar = read.cigarstring
            indel_count += (alignmentcigar.count("I") + alignmentcigar.count("D"))

            # each entry `base` is indexed below as base[0] (position in
            # the read), base[1] (position in the genome) and base[2]
            # (reference base). NOTE(review): the code relies on base[2]
            # being lower case at mismatches - confirm against the pysam
            # documentation of get_aligned_pairs(with_seq=True).
            alignment = read.get_aligned_pairs(with_seq=True)

            # keep only pairs that have both a read and a genome position
            alignment = [base for base in alignment
                         if not base[0] is None and not base[1] is None]

            # list[:] is weird syntax for copying the list
            testalignment = alignment[:]

            # base_count += sum(1 for base in alignment
            #                   if start <= base[1] < end and
            #                   base[2].lower() != "n")
            #
            # aligned pairs inside the gene region with a known reference
            # base
            total_alignment = [base for base in alignment
                               if start <= base[1] < end and
                               base[2].lower() != "n"]

            base_count += len(total_alignment)

            # consistency check: the reference base reported with the
            # alignment must agree with the fasta sequence. On disagreement
            # diagnostics are written to the debug log and the run is
            # aborted with ValueError.
            for base in total_alignment:
                if seq[(base[1])-start].lower() != base[2].lower():
                    if (testalignment[0][1] is None) or (testalignment[-1][1] is None):
                        E.debug("first or last base of read is None")
                        E.debug("read sequence is %s" %(testalignment))
                        E.debug("position of first base in genome: %s" %testalignment[0][1])
                        E.debug("position of last base in genome: %s" %testalignment[-1][1])
                        E.debug("identity of first base in genome: %s" %testalignment[0][2])
                        E.debug("identity of last base in genome: %s" %testalignment[-1][2])
                        raise ValueError
                    else:
                        E.debug("identity of error causing base from read sequence: %s" %(read.query_alignment_sequence)[base[0]].lower())
                        E.debug("read sequence: %s" %(read.query_alignment_sequence))
                        E.debug("identity start and end of read as calculated from start and end as described in gtffile and extracted from fasta: %s" %(seq[(testalignment[0][1]-start):(testalignment[-1][1]-start)]))
                        E.debug("section of the read 10 bp downstream and upstream of the sequence containing the error extracted from the fasta: %s" %(seq[((base[1]-10)-start):((base[1]+10)-start)].lower()))
                        E.debug("filename?: %s" %(read.tostring(bamfile)))
                        E.debug("positions of start and end of the gene based on the gtf: %s,%s" %(start, end))
                        E.debug("identity of start of gene extratced from gtf: %s" %(seq[(base[1])-start]))
                        E.debug("identity of error causing base from reference genome: %s" %base[2])
                        E.debug("position of base in read: %s" %base[0])
                        E.debug("position of base in genome: %s" %base[1])
                        E.debug("position of base in read as calculated from position of base in genome and and start from gtf: %s" %(base[1]-start))
                        E.debug("identity of error causing base (reference), calculated from fasta and testalignment info: %s" %(seq[(testalignment[0][1]-start):(testalignment[-1][1]-start)].upper()[base[0]]))
                        #E.debug("position of base in read from first alignment genome base minus start plus position of base in in read, should equal position of base in read: %s" %((testalignment[0][1]-start) + base[0]))
                        E.debug("identity of error causing base (reference), calculated from fasta and position of base in genome from aligned pairs: %s" %(seq[(base[1])-start]))
                        #E.debug("position of start base in genome from the alignment minus position of start base in genome from the gtf, should be zero: %s" %(alignment[0][1]-start))
                        E.debug("complete aligned pairs, unfiltered: %s" %(testalignment))
                        E.debug("full fasta sequence of read: %s" %(textwrap.fill(seq,50)))
                        raise ValueError
                else:
                    matched_bases[base[2].lower()] += 1

            # reads without mismatches need no further processing; the
            # edit distance is read from the "NM" tag, falling back to
            # "nM" if "NM" is not present
            try:
                if read.get_tag("NM") == 0:
                    continue
            except KeyError:
                if read.get_tag("nM") == 0:
                    continue

            # mismatches
            readseq = read.query_sequence

            def _is_snp(base):
                # Returns False only if the genome position carries a
                # donor variant and the read shows the first ALT allele;
                # returns True otherwise, i.e. True means "not explained
                # by a SNP". Updates the counters got_snp_pos and
                # wrong_base, which are declared global (presumably
                # initialised at module level - not visible here).
                global got_snp_pos
                global wrong_base

                if snp_dict.has_key(base[1]):
                    read_base = readseq[base[0]].lower()
                    alt_base = snp_dict[base[1]][0].sequence.lower()
                    got_snp_pos += 1
                    if read_base != alt_base:
                        wrong_base += 1
                        return True
                    else:
                        return False
                else:
                    return True

            def _is_indel(base):
                # Compares a window of up to five bases of the read with
                # the corresponding window of the reference. Returns False
                # if four or more positions of the window differ, True
                # otherwise (or if the window would start before the gene).
                # The window starts at the base; close to the end of the
                # read or of the reference sequence it is shifted to the
                # left so that it does not run past the end.
                # The print statements are diagnostics emitted before an
                # IndexError is re-raised.
                if (len(readseq) >= (base[0] + 5)):
                    if (len(seq) < ((base[1] - start) + 5)):
                        # reference ends within the window: shift left
                        upperrange = len(seq)-(base[1]-start)
                        lowerrange = 5 - upperrange
                        readindelwindow = readseq[(base[0] - lowerrange):(base[0] + upperrange)]
                        seqindelwindow = seq[(base[1] - start - lowerrange):(base[1] - start + upperrange)]
                        matchwindows = list()
                        for i in range(len(readindelwindow)):
                            try:
                                matchwindows.append((readindelwindow[i].lower()==seqindelwindow[i].lower()))
                            except IndexError:
                                print i
                                print readindelwindow
                                print seqindelwindow
                                print start
                                print lowerrange
                                print upperrange
                                print base[0]
                                print (base[0] - lowerrange)
                                print (base[0] + upperrange)
                                print base[1]
                                print (base[1] - start)
                                print ((((base[1])-start) - lowerrange)-1)
                                print ((((base[1])-start) + upperrange)-1)
                                print readseq
                                print seq
                                print gene_id
                                print gene[0].contig
                                raise
                    elif len(seq) >= (base[1] - start + 5):
                        # window fits into both read and reference
                        readindelwindow = readseq[base[0]:(base[0]+5)]
                        seqindelwindow = seq[(base[1] - start):(base[1] - start + 5)]
                        matchwindows = []
                        for i in range(len(readindelwindow)):
                            try:
                                matchwindows.append(readindelwindow[i].lower()==seqindelwindow[i].lower())
                            except IndexError:
                                print i
                                print readindelwindow
                                print seqindelwindow
                                print start
                                print base[0]
                                print base[1]
                                print (base[1] - start) - 1
                                print ((base[1] - start) + 5) - 1
                                print readseq
                                print seq
                                print gene_id
                                print gene[0].contig
                                raise
                    if matchwindows.count(False) >= 4:
                        return False
                    else:
                        return True
                elif (len(readseq) < (base[0] + 5)):
                    if len(seq) < (((base[1])-start) + 5):
                        # both read and reference end within the window:
                        # use the smaller of the two remaining lengths
                        readsequpperrange = len(readseq)-base[0]
                        readseqlowerrange = 5 - readsequpperrange
                        sequpperrange = len(seq) - (base[1] - start)
                        seqlowerrange = 5 - sequpperrange
                        if readsequpperrange < sequpperrange:
                            upperrange = readsequpperrange
                            lowerrange = readseqlowerrange
                        elif sequpperrange < readsequpperrange:
                            upperrange = sequpperrange
                            lowerrange = seqlowerrange
                        elif sequpperrange == readsequpperrange:
                            upperrange = sequpperrange
                            lowerrange = seqlowerrange
                    elif ((base[1] - start) - 4) < 0:
                        return True
                    else:
                        # read ends within the window: shift left
                        upperrange = len(readseq)-base[0]
                        lowerrange = 5 - upperrange
                    readindelwindow=readseq[(base[0] - lowerrange):(base[0] + upperrange)]
                    seqindelwindow=seq[(((base[1])-start) - lowerrange):(((base[1])-start)+ upperrange)]
                    matchwindows=[]
                    for i in range(len(readindelwindow)):
                        try:
                            matchwindows.append((readindelwindow[i].lower()==seqindelwindow[i].lower()))
                        except IndexError:
                            print i
                            print readindelwindow
                            print seqindelwindow
                            print start
                            print lowerrange
                            print upperrange
                            print base[0]
                            print (base[0] - lowerrange)
                            print (base[0] + upperrange)
                            print base[1]
                            print (base[1] - start)
                            print ((((base[1])-start) - lowerrange))
                            print ((((base[1])-start) + upperrange))
                            print readseq
                            print seq
                            print gene_id
                            print gene[0].contig
                            raise
                    if matchwindows.count(False) >= 4:
                        return False
                    else:
                        return True

            def _is_RNA_edit(base,editpositions):
                # Returns False only if the position is a known editing
                # site, the reference base is lower case and not "n", the
                # read base is not "n", and reference and read base equal
                # the first and second value field of the site; returns
                # True otherwise, i.e. True means "not explained by RNA
                # editing". Updates the counters got_edit_pos and
                # wrong_edit_base, which are declared global (presumably
                # initialised at module level - not visible here).
                global got_edit_pos
                global wrong_edit_base
                genomebase = base[2]
                readbase = readseq[base[0]].lower()
                if not base[1] in editpositions.keys() or \
                   genomebase == "n" or \
                   readbase == "n" or \
                   not genomebase.islower():
                    return True
                else:
                    got_edit_pos += 1
                    if genomebase == editpositions[base[1]].fields[0].lower() and \
                       readbase == editpositions[base[1]].fields[1].lower():
                        return False
                    else:
                        wrong_edit_base += 1
                        return True

            # count the bases explained by a known RNA editing site
            # (note that each call also updates the global counters)
            for base in total_alignment:
                if _is_RNA_edit(base,editpositions) == False:
                    RNA_edits += 1

            # mismatches that are counted: lower-case reference base,
            # sufficient base quality and not explained by a SNP, the
            # window test of _is_indel or an RNA editing site
            mismatches = [base for base in total_alignment
                          if base[2].islower() and
                          qualities[base[0]] >= options.threshold and
                          _is_snp(base) and
                          _is_indel(base) and
                          _is_RNA_edit(base,editpositions) and
                          readseq[base[0]].lower() != "n"]

            # all mismatches not explained by a SNP, regardless of quality
            total_mm = sum(1 for base in total_alignment
                           if base[2].islower() and
                           _is_snp(base) and
                           readseq[base[0]].lower() != "n")

            hq_mm = len(mismatches)

            # record the substitution type; for genes on the "-" strand
            # both bases are reverse complemented first
            for base in mismatches:
                genomebase = base[2].lower()
                readbase = readseq[base[0]].lower()
                try:
                    if strand == "-":
                        revgenomebase = reversecomplement[genomebase]
                        revreadbase = reversecomplement[readbase]
                        # NOTE(review): the counters are named g_to_t and
                        # the final log message says g_to_c, but the
                        # condition tests for g -> a - confirm intent.
                        if revgenomebase == "g" and revreadbase == "a":
                            if read.is_reverse:
                                reverse_g_to_t += 1
                            else:
                                not_reverse_g_to_t +=1
                        transition["%s_to_%s"%(revgenomebase, revreadbase)] += 1
                    else:
                        transition["%s_to_%s"%(genomebase, readbase)] += 1
                except KeyError:
                    # unexpected base: print diagnostics and abort
                    print transition
                    print read.query_alignment_sequence.upper()
                    print seq[(alignment[0][1]-start):(alignment[-1][1]-start)].upper()
                    print read.tostring(bamfile)
                    raise

            mm_count += hq_mm
            # mismatches that were not counted (written as "low_qual")
            skipped += total_mm - hq_mm

        outline = "\t".join(map(str,[gene_id,
                                     strand,
                                     mm_count,
                                     base_count,
                                     skipped,
                                     matched_bases['a'],
                                     matched_bases['t'],
                                     matched_bases['c'],
                                     matched_bases['g'],
                                     transition['a_to_t'],
                                     transition['a_to_g'],
                                     transition['a_to_c'],
                                     transition['t_to_a'],
                                     transition['t_to_g'],
                                     transition['t_to_c'],
                                     transition['g_to_a'],
                                     transition['g_to_t'],
                                     transition['g_to_c'],
                                     transition['c_to_a'],
                                     transition['c_to_t'],
                                     transition['c_to_g'],
                                     indel_count,
                                     RNA_edits]))
        options.stdout.write(outline + "\n")

    # write footer and output benchmark information.
    E.info("Out of %i mismatches at snp positions %i were the wrong base" %(got_snp_pos, wrong_base))
    E.info("Out of %i mismatches at RNA edit positions %i were the wrong base" %(got_edit_pos, wrong_edit_base))
    E.info("Out of %i g_to_c transitions on - strand genes, the read was on the + strand %i times" % (not_reverse_g_to_t, reverse_g_to_t))
    E.Stop()
def peekParameters(workingdir,
                   pipeline,
                   on_error_raise=None,
                   prefix=None,
                   update_interface=False,
                   restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in workingdir, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in.
    pipeline : string
       Name of the pipeline script. The pipeline is assumed to live
       in the same directory as the current pipeline.
    on_error_raise : Bool
       If set to a boolean, an error will be raised (or not) if there
       is an error during parameter peeking, for example if
       `workingdir` can not be found. If `on_error_raise` is None, it
       will be set to the default, which is to raise an exception
       unless the calling script is imported or the option
       ``--is-test`` has been passed at the command line.
    prefix : string
       Add a prefix to all parameters. This is useful if the
       parameters are added to the configuration dictionary of the
       calling pipeline.
    update_interface : bool
       If True, this method will prefix any options in the
       ``[interface]`` section with `workingdir`. This allows
       transparent access to files in the external pipeline.
    restrict_interface : bool
       If True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.

    Raises
    ------
    ValueError
        If the pipeline script or the working directory can not be
        found (only when `on_error_raise` is true), or if the child
        process prints no or more than one configuration.
    OSError
        If the child process exits with a non-zero return code.
    '''
    caller_locals = getCallerLocals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not isTest() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    # Attempt to locate directory with pipeline source code. This is a
    # patch as pipelines might be called within the repository
    # directory or from an installed location
    dirname = PARAMS["pipelinedir"]

    if dirname == "":
        # called without a directory, use current directory
        dirname = os.path.abspath(".")
    elif not os.path.exists(dirname):
        # if not exists, assume we want the version located in the
        # directory of the calling script.
        dirname = os.path.dirname(caller_locals['__file__'])

    pipeline = os.path.join(dirname, pipeline)
    if not os.path.exists(pipeline):
        if on_error_raise:
            raise ValueError(
                "can't find pipeline at %s" % (pipeline))
        return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default
    # pipeline directory if directory is not specified
    # working dir is set to "?!"
    # The parentheses matter: without them ``and`` binds tighter than
    # ``or`` and the "?!" test would only apply to the "clone" target.
    special_target = ("config" in sys.argv or
                      "check" in sys.argv or
                      "clone" in sys.argv)
    if special_target and workingdir == "?!":
        workingdir = os.path.join(PARAMS.get("pipelinedir"),
                                  IOTools.snip(pipeline, ".py"))

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError(
                "can't find working dir %s" % workingdir)
        return {}

    statement = "python %s -f -v 0 dump" % pipeline
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    # process.stdin.close()
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") %
            (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()
    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]
    if len(stdout) > 1:
        raise ValueError("received multiple configurations")
    if not stdout:
        # fail with a meaningful message instead of an IndexError below
        raise ValueError(
            "received no configuration from statement: %s" % statement)
    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                dump[key] = os.path.join(workingdir, value)

    # keep only interface if so required
    if restrict_interface:
        dump = {k: v for k, v in dump.items()
                if k.startswith("interface")}

    # prefix all parameters
    if prefix is not None:
        dump = {"%s%s" % (prefix, x): y for x, y in dump.items()}

    return dump
def main(argv=None):
    """script main.

    Convert the reads of a BAM file (stdin) into per-base depth tracks
    and write them as bigWig, bedGraph or bed. The first positional
    argument is the output file name (bed) or output prefix (bigWig /
    bedGraph, ``_plus`` and ``_minus`` are appended).

    parses command line options in sys.argv, unless *argv* is given.

    Raises
    ------
    ValueError
        If no output file name / prefix is given on the command line.
    """

    if argv is None:
        argv = sys.argv

    # optparse requires ``choices`` to be a list or tuple; a dict view
    # (python 3) is rejected.
    profiles = list(iCLIP.getters.profiles.keys())

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--profile", dest="profile", type="choice",
                      choices=profiles,
                      default="iclip",
                      help="Experiment profile to use. Sets various things"
                      "about obtaining 1-bp position from read. Options are"
                      " %s" % ", ".join(profiles))
    parser.add_option("-c", "--use-centre", dest="centre",
                      action="store_true",
                      default=None,
                      help="Use centre of read rather than frist base."
                      "Overrides profile")
    parser.add_option(
        "-f", "--format", dest="format",
        choices=[
            "bigWig", "bigwig", "BigWig", "bedGraph", "bg", "bedgraph",
            "bed", "Bed", "BED"
        ],
        help="Output format. Either bigWig (2 files, + and - strand)"
        ", bedGraph (2 files), or bed (1 file, depth in column 5,"
        "strand in column 6",
        default="bigWig")
    parser.add_option("-w", "--wig", dest="output_wig",
                      action="store_true",
                      default=False,
                      help="Write output to bedgraph file rather than bigwig")
    parser.add_option("--dtype", dest="dtype", type="string",
                      default="uint32",
                      help="dtype for storing depths")
    parser.add_option(
        "--cpm", dest="cpm",
        action="store_true",
        default=False,
        help=
        "Normalize output depths to number of mapped reads (in millions) in BAM"
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # every output format needs args[0]; fail before doing any work
    if len(args) < 1:
        raise ValueError("please supply an output file name or prefix")

    options.format = options.format.lower()
    if options.format == "bg":
        options.format = "bedgraph"

    profile = iCLIP.getters.profiles[options.profile]

    # --use-centre is a flag: when given it overrides the profile
    if options.centre is not None:
        centre = True
    else:
        centre = profile.centre

    if options.stdin == sys.stdin:
        in_bam = pysam.Samfile("-", "rb")
    else:
        fn = options.stdin.name
        options.stdin.close()
        in_bam = pysam.Samfile(fn, "rb")

    getter = iCLIP.make_getter(in_bam, profile=profile, centre=centre)

    if options.cpm:
        scale_factor = sum(contig.mapped for contig in
                           in_bam.get_index_statistics())
        scale_factor = 1000000.0 / scale_factor

    if options.format == "bed":
        bedfile = IOTools.openFile(args[0], "w")
    else:
        plus_wig = tempfile.NamedTemporaryFile(delete=False)
        minus_wig = tempfile.NamedTemporaryFile(delete=False)

    contig_sizes = []

    for chrom, chrom_length in zip(in_bam.references, in_bam.lengths):

        # get depths over chromosome
        pos_depth, neg_depth, counter = getter(chrom, strand="both",
                                               dtype=options.dtype)
        pos_depth_sorted = pos_depth.sort_index()
        del pos_depth
        neg_depth_sorted = neg_depth.sort_index()
        del neg_depth
        # minus strand depths are reported as negative values
        neg_depth_sorted = -1 * neg_depth_sorted

        # Scale exactly once. The original repeated this on the already
        # deleted unsorted series, which raised a NameError with --cpm.
        if options.cpm:
            pos_depth_sorted = pos_depth_sorted * scale_factor
            neg_depth_sorted = neg_depth_sorted * scale_factor

        # output to temporary wig file
        if options.format == "bed":
            output2Bed(pos_depth_sorted, neg_depth_sorted, chrom, bedfile)
        else:
            outputToBG(pos_depth_sorted, chrom, chrom_length, plus_wig)
            outputToBG(neg_depth_sorted, chrom, chrom_length, minus_wig)

        contig_sizes.append([chrom, chrom_length])

        del pos_depth_sorted
        del neg_depth_sorted

    if options.format == "bed":
        bedfile.close()
    else:
        plus_wig_name = plus_wig.name
        minus_wig_name = minus_wig.name
        plus_wig.close()
        minus_wig.close()

        outname_plus = args[0] + "_plus"
        outname_minus = args[0] + "_minus"

        if options.format == "bedgraph":
            E.debug("Outputting to bedGraph")
            shutil.move(plus_wig_name, outname_plus + ".bg")
            shutil.move(minus_wig_name, outname_minus + ".bg")

        elif options.format == "bigwig":
            # text mode: the chromosome sizes are written as str
            chrom_sizes_file = tempfile.NamedTemporaryFile(
                mode="w", delete=False, dir=".")
            contig_sizes = ["\t".join(map(str, row)) for row in contig_sizes]
            contig_sizes = "\n".join(contig_sizes) + "\n"
            chrom_sizes_file.write(contig_sizes)
            chrom_sizes_filename = chrom_sizes_file.name
            chrom_sizes_file.close()

            # NOTE(review): the temporary wig and chrom-sizes files are
            # not removed here - presumably outputToBW cleans up; confirm.
            outputToBW(plus_wig_name, outname_plus, chrom_sizes_filename)
            outputToBW(minus_wig_name, outname_minus, chrom_sizes_filename)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=sys.argv):
    """script main.

    Read a multiple alignment from stdin, apply the methods given with
    ``--method`` in order and write the result to stdout.

    Methods that need arguments consume them from the front of the
    ``--parameters`` stack.

    Raises
    ------
    ValueError
        If the multiple alignment read from stdin is empty.
    """

    parser = E.OptionParser(version="%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format", type="choice",
                      choices=(
                          "plain", "fasta", "clustal", "stockholm", "phylip"),
                      help="input format of multiple alignment [default=%default].")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=(
                          "plain", "fasta", "stockholm", "phylip", "nexus", "plain-fasta"),
                      help="output format of multiple alignment [default=%default].")

    parser.add_option("--with-ranges", dest="with_ranges", action="store_true",
                      help="output alignment ranges (suffix /from-to after identifier) [default=%default].")

    parser.add_option("--without-ranges", dest="with_ranges", action="store_false",
                      help="do not output alignment ranges (suffix /from-to after identifier) [default=%default].")

    parser.add_option("-u", "--allow-duplicates", dest="allow_duplicates", action="store_true",
                      help="permit duplicate entries [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="string",
                      help="""methods to apply. Several methods can be specified in a ','-separated list [default=%default].""" )

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one [default=%default].")

    parser.add_option("-a", "--mask-char", dest="mask_char", type="string",
                      help="character to identify/set masked characters [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    # forward argv so that callers passing their own argument list are
    # honoured; the default (sys.argv) behaves as before.
    (options, args) = E.Start(parser, argv=argv)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                               allowed_matches=0,
                               minimum_matches=1,
                               delete_frame=3,
                               search_frame=3)
        elif method == "shift-alignment":
            with open(options.parameters[0], "r") as inf:
                map_id2offset = IOTools.ReadMap(inf,
                                                map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)

        elif method == "recount":
            mali.recount()

        elif method in ("mark-transitions", "filter-odd-transitions", "filter-even-transitions",
                        "keep-even-segments", "keep-odd-segments"):

            # the parameter is either a file mapping ids to transitions
            # or a ':'-separated list of positions applied to "mali"
            if os.path.exists(options.parameters[0]):
                with open(options.parameters[0], "r") as inf:
                    map_id2transitions = IOTools.readMultiMap(
                        inf, map_functions=(str, int))
            else:
                map_id2transitions = {}
                # sorted() works on the iterator returned by map() in
                # python 3, unlike map(...).sort()
                r = sorted(map(int, options.parameters[0].split(':')))
                map_id2transitions["mali"] = r

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the annotations in another.
            # Note: the first two sequence identifiers must be shared and the sequence of the
            # same length
            other_mali = Mali.Mali()
            with open(options.parameters[0], "r") as inf:
                other_mali.readFromFile(inf, format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            a, b = method.split("-")
            maskMali(mali, b)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        # this branch used to repeat "exclude-with-stop" and was
        # therefore unreachable
        elif method == "exclude-with-frameshift":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." %
               (method, time.time() - t1))

    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
def loadGLAM2SCAN(infile, outfile):
    '''parse glam2scan output and load into database.

    `infile` contains the concatenated output of several motif runs,
    each introduced by a line ``:: motif = <name> ::``. All runs are
    added to the same table, whose name is `outfile` without the
    ``.load`` suffix.

    For every sequence id the table holds the number of matches, the
    best score, all scores and the number / maximum score of the
    matches in the corresponding control sequences.

    Raises
    ------
    P.PipelineError
        If a section header can not be parsed.
    '''
    # NOTE: tablename, tmpfilename, outfile and statement are read by
    # P.run() below, which appears to interpolate the statement from
    # the local variables of this function - do not rename them.
    tablename = outfile[:-len(".load")]

    with IOTools.openFile(infile) as inf:
        lines = inf.readlines()

    # line numbers of the section headers, plus a sentinel at the end
    chunks = [x for x, line in enumerate(lines) if line.startswith("::")]
    chunks.append(len(lines))

    # text mode: only str is written to the temporary files
    tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
    tmpfilename = tmpfile.name

    try:
        tmpfile.write(
            "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

        for first, last in zip(chunks[:-1], chunks[1:]):

            try:
                motif = re.match(
                    r":: motif = (\S+) ::", lines[first]).groups()[0]
            except AttributeError:
                raise P.PipelineError(
                    "parsing error in line '%s'" % lines[first])

            if first + 1 == last:
                L.warn("no results for motif %s - ignored" % motif)
                continue

            # use real file, as parser can not deal with a
            # list of lines
            tmpfile2 = tempfile.NamedTemporaryFile(mode="w", delete=False)
            try:
                tmpfile2.write("".join(lines[first + 1:last]))
                tmpfile2.close()
                with IOTools.openFile(tmpfile2.name, "r") as glam_in:
                    glam = Glam2Scan.parse(glam_in)
                    # materialize while the file is still open in case
                    # the parser reads lazily
                    all_matches = list(glam.matches)
            finally:
                tmpfile2.close()
                os.unlink(tmpfile2.name)

            # collect control data
            # ids with exactly two '_'-separated parts are real
            # matches, anything longer is treated as a control
            full_matches = collections.defaultdict(list)
            controls = collections.defaultdict(list)
            for match in all_matches:
                m = match.id.split("_")
                seq_id = m[1]
                if len(m) == 2:
                    full_matches[seq_id].append(match)
                else:
                    controls[seq_id].append(match.score)

            for seq_id, matches in full_matches.items():

                nmatches = len(matches)
                scores = [x.score for x in matches]
                score = max(scores)

                if seq_id not in controls:
                    P.warn("no controls for %s - increase evalue?" % seq_id)

                c = controls[seq_id]
                if len(c) == 0:
                    mmax = ""
                else:
                    mmax = max(c)

                tmpfile.write("\t".join(
                    map(str,
                        (motif, seq_id,
                         nmatches,
                         score,
                         ",".join(map(str, scores)),
                         len(c),
                         mmax))) + "\n")

        tmpfile.close()

        statement = '''
        python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
        -b sqlite \
        --index=id \
        --index=motif \
        --index=id,motif \
        --table=%(tablename)s \
        --map=base_qualities:text \
        < %(tmpfilename)s > %(outfile)s
        '''

        P.run()
    finally:
        # always remove the temporary table, also on errors
        tmpfile.close()
        os.unlink(tmpfilename)
def main(argv=None):
    """script main.

    Count how many alignments/intervals of a bam or bed file overlap
    each named category of a reference bed file, using ``intersectBed``.
    Writes a two column table (``category``, ``alignments``) to stdout.

    parses command line options in sys.argv, unless *argv* is given.

    Raises
    ------
    ValueError
        If input files are missing, the reference bed file is empty or
        has no name column, the format of the first file can not be
        determined, or ``intersectBed`` fails.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--min-overlap", dest="min_overlap", type="float",
                      help="minimum overlap [%default]")

    parser.add_option("-k", "--keep-temp", dest="keep_temp", action="store_true",
                      help="do not delete temporary files [%default]")

    parser.add_option("-a", "--filename-bam", dest="filename_bam", metavar="bam", type="string",
                      help="bam-file to use [%default]")

    # metavar used to read "bam" (copy and paste slip)
    parser.add_option("-b", "--filename-bed", dest="filename_bed", metavar="bed", type="string",
                      help="bed-file to use [%default]")

    parser.set_defaults(
        min_overlap=0.5,
        keep_temp=False,
        filename_bam=None,
        filename_bed=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filename_bam = options.filename_bam
    filename_bed = options.filename_bed

    if filename_bam is None and filename_bed is None:
        if len(args) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = args

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    min_overlap = options.min_overlap

    options.stdout.write("category\talignments\n")

    # get number of columns of reference bed file
    ncolumns_bed = None
    with IOTools.openFile(filename_bed) as inf:
        for bed in Bed.iterator(inf):
            ncolumns_bed = bed.columns
            break

    if ncolumns_bed is None:
        # used to surface as a NameError further down
        raise ValueError("no intervals found in %s" % filename_bed)

    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # get information about the file to count with
    sort_key = None
    if filename_bam.endswith(".bam"):
        input_flag = "-abam"
        samfile = pysam.Samfile(filename_bam, "rb")
        total = samfile.mapped
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        input_flag = "-a"
        total = IOTools.getNumLines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        with IOTools.openFile(filename_bam) as inf:
            for bed in Bed.iterator(inf):
                ncolumns_bam = bed.columns
                break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = [
        "contig", "start", "end", "name",
        "score", "strand", "thickstart", "thickend", "rgb",
        "blockcount", "blockstarts", "blockends"][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend([
        "contig2", "start2", "end2", "name2",
        "score2", "strand2", "thickstart2", "thickend2", "rgb2",
        "blockcount2", "blockstarts2", "blockends2"][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    options.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    if sort_key is None:
        # lines were counted but no interval could be parsed; without
        # the column count the intersectBed output can not be decoded
        raise ValueError(
            "could not determine bed format of %s" % filename_bam)

    # The temporary file is only created once it is needed so that the
    # early return above does not leave a stray file behind.
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.close()
    tmpfilename = tmpfile.name

    counts_per_alignment = collections.defaultdict(int)
    take_columns = len(data._fields)

    def iterate_rows(infile):
        '''yield one `data` record per non-empty line of infile.'''
        for line in infile:
            if not line.strip():
                continue
            yield data._make(line[:-1].split()[:take_columns])

    try:
        # IMS: newer versions of intersectBed have a very high memory requirement unless
        # passed sorted bed files.
        statement = """intersectBed %(format)s %(filename_bam)s -b <( zcat %(filename_bed)s | sort -k1,1 -k2,2n) -sorted -bed -wo -f %(min_overlap)f > %(tmpfilename)s""" % {
            "format": input_flag,
            "filename_bam": filename_bam,
            "filename_bed": filename_bed,
            "min_overlap": min_overlap,
            "tmpfilename": tmpfilename}

        E.info("running %s" % statement)
        retcode = E.run(statement)

        if retcode != 0:
            raise ValueError("error while executing statement %s" % statement)

        E.info("counting")

        # NOTE(review): every overlap row is counted once; the grouping
        # by read/interval does not change the totals. If the intent was
        # to count each read once per category, this needs a set() -
        # confirm with the author.
        with open(tmpfilename, "r") as infile:
            for read, overlaps in itertools.groupby(iterate_rows(infile),
                                                    key=sort_key):
                for overlap in overlaps:
                    counts_per_alignment[overlap.name2] += 1
    finally:
        if not options.keep_temp:
            os.unlink(tmpfilename)

    for key, counts in counts_per_alignment.items():
        options.stdout.write("%s\t%i\n" % (key, counts))

    # write footer and output benchmark information.
    E.Stop()
def buildResults(bedfile, fg_file, control_file, counter, options):
    '''compute densities and peakshape parameters.

    Arguments
    ---------
    bedfile : string
        Filename of the bed file with the intervals to process.
    fg_file : object
        Foreground data, passed through to `counter`.
    control_file : object
        Control data, passed through to `counter`. If false, no
        control counts are collected.
    counter : object
        Object providing ``countInInterval`` and ``countAroundPos``.
    options : object
        Command line options. Uses ``window_size``, ``bin_size``,
        ``strand_specific``, ``report_step``, ``only_interval``,
        ``centring_method``, ``random_shift`` and ``stdout``.

    Returns
    -------
    result : list
        One tuple ``(features, bed, control, shifted)`` per interval;
        ``control`` and ``shifted`` are None if not requested.
    bins : numpy.ndarray or None
        Bin positions relative to the peak centre, or None if
        ``options.window_size`` is not set.
    '''

    options.stdout.write("\t".join(
        ("contig",
         "start",
         "end",
         "name",
         "\t".join(_bam2peakshape.PeakShapeResult._fields))) + "\n")

    # bins used to be unbound when no window size was given.
    # NOTE(review): assumes the counter accepts bins=None - confirm.
    bins = None
    if options.window_size:
        # bins are centered at peak-center and then stretching outwards.
        bins = numpy.arange(-options.window_size + options.bin_size // 2,
                            +options.window_size,
                            options.bin_size)

    strand_specific = options.strand_specific

    result = []
    c = E.Counter()
    c.input = 0

    with IOTools.openFile(bedfile) as inf:
        for bed in Bed.iterator(inf):
            c.input += 1

            if c.input % options.report_step == 0:
                E.info("iteration: %i" % c.input)

            features = counter.countInInterval(
                fg_file,
                bed.contig, bed.start, bed.end,
                window_size=options.window_size,
                bins=bins,
                only_interval=options.only_interval,
                centring_method=options.centring_method)

            if control_file:
                control = counter.countAroundPos(control_file,
                                                 bed.contig,
                                                 features.peak_center,
                                                 bins=features.bins)
            else:
                control = None

            if options.random_shift:
                # shift the window by twice the outermost bin offset,
                # randomly to the left or to the right
                direction = numpy.random.randint(0, 2)
                if direction:
                    pos = features.peak_center + 2 * bins[0]
                else:
                    pos = features.peak_center + 2 * bins[-1]
                shifted = counter.countAroundPos(fg_file,
                                                 bed.contig,
                                                 pos,
                                                 bins=features.bins)
            else:
                shifted = None

            if strand_specific and bed.strand == "-":
                # _replace() returns a new record, so the result has to
                # be kept; each record reverses its own histogram.
                features = features._replace(hist=features.hist[::-1])
                if control:
                    control = control._replace(hist=control.hist[::-1])
                if shifted:
                    shifted = shifted._replace(hist=shifted.hist[::-1])

            result.append((features, bed, control, shifted))
            c.added += 1

    E.info("interval processing: %s" % c)

    return result, bins
def compileMutationalSignature(infiles, outfiles):
    '''compile per-sample mutation signatures from mutect output files.

    Each single base change is counted in one of six classes (ref C or
    A); changes with ref G or T are counted under their complement.

    ``outfiles[0]`` receives one row per (class, sample),
    ``outfiles[1]`` one row per sample with one column per class.
    '''

    separator = ":"
    complement = {"C": "G", "G": "C", "A": "T", "T": "A"}
    mutations = ["C:T", "C:A", "C:G", "A:C", "A:T", "A:G"]

    def sample_name(path):
        '''return the sample id encoded in the file name'''
        return P.snip(os.path.basename(path),
                      ".mutect.snp.annotated.filtered.vcf")

    long_table = IOTools.openFile(outfiles[0], "w")
    long_table.write("%s\t%s\t%s\t%s\t%s\n" % ("patient_id", "base_change",
                                               "ref", "alt", "frequency"))

    # sample id -> {mutation class -> count}
    patient_freq = {}

    for infile in infiles:
        patient_id = sample_name(infile)
        counts = dict.fromkeys(mutations, 0)

        with IOTools.openFile(infile, "r") as vcf:
            for record in vcf.readlines():
                if record.startswith('#'):
                    continue
                columns = record.split("\t")
                ref, alt = columns[3], columns[4]
                key = ref + separator + alt
                if key not in counts:
                    # count on the opposite strand
                    key = complement[ref] + separator + complement[alt]
                counts[key] += 1

        patient_freq[patient_id] = counts

    for mutation in mutations:
        ref, alt = mutation.split(separator)
        for infile in infiles:
            patient_id = sample_name(infile)
            row = (patient_id, mutation, ref, alt,
                   patient_freq[patient_id][mutation])
            long_table.write("%s\t%s\t%s\t%s\t%s\n" % row)

    long_table.close()

    wide_table = IOTools.openFile(outfiles[1], "w")
    wide_table.write("%s\t%s\n" % ("patient_id", "\t".join(mutations)))

    for infile in infiles:
        patient_id = sample_name(infile)
        per_class = [str(patient_freq[patient_id][x]) for x in mutations]
        wide_table.write("%s\t%s\n" % (patient_id, "\t".join(per_class)))

    wide_table.close()
def filterMutect(infile, outfile, logfile,
                 min_t_alt, min_n_depth,
                 max_n_alt_freq, min_t_alt_freq,
                 min_ratio):
    '''filter MuTect2 snps and indels.

    Header lines are copied unchanged. Of the records, only those on
    contigs starting with ``chr`` and with FILTER ``PASS`` are
    considered; they are written to `outfile` if they pass all
    thresholds below. The number of records rejected per reason is
    written to `logfile`.

    Arguments
    ---------
    infile : string
        Input VCF with a ``NORMAL`` and a ``TUMOR`` sample column.
    outfile : string
        Filtered VCF.
    logfile : string
        Tab-separated table of rejection reasons and counts.
    min_t_alt : number
        The tumour alt count has to be strictly greater than this.
    min_n_depth : number
        Minimum depth (ref + alt) in the normal sample.
    max_n_alt_freq : float
        Maximum alt allele frequency in the normal sample.
    min_t_alt_freq : float
        Minimum alt allele frequency in the tumour sample.
    min_ratio : float
        Minimum ratio of tumour to normal alt allele frequency. A
        normal alt frequency of 0 always passes.

    Raises
    ------
    ValueError
        If a record is encountered before a ``#CHROM`` header naming
        both sample columns.
    '''

    reasons = collections.Counter()

    control_id = "NORMAL"
    tumour_id = "TUMOR"

    control_col = None
    tumor_col = None

    with IOTools.openFile(outfile, "w") as outf:
        with IOTools.openFile(infile, "r") as inf:
            # A zero alt frequency in the normal gives an infinite
            # ratio, which is accepted below; silence that warning for
            # this function only instead of changing numpy's global
            # error state.
            with np.errstate(divide='ignore'):
                for line in inf:
                    if line.startswith('#'):
                        # need to find location of control and tumor
                        # columns
                        if line.startswith('#CHROM'):
                            for x, column in enumerate(line.split("\t")):
                                if control_id in column:
                                    control_col = x
                                elif tumour_id in column:
                                    tumor_col = x
                        # write out all comment lines
                        outf.write(line)
                        continue

                    if not line.startswith('chr'):
                        continue

                    values = line.split("\t")
                    if values[6] != "PASS":
                        continue

                    if control_col is None or tumor_col is None:
                        raise ValueError(
                            "no #CHROM header with %s and %s columns "
                            "found in %s before the first record" %
                            (control_id, tumour_id, infile))

                    # second FORMAT field holds "ref_count,alt_count"
                    # NOTE(review): presumably the AD field - confirm
                    t_values = values[tumor_col].split(":")
                    t_ref, t_alt = list(
                        map(float, (t_values[1].split(","))))
                    t_depth = t_alt + t_ref

                    n_values = values[control_col].split(":")
                    n_ref, n_alt = list(
                        map(float, (n_values[1].split(","))))
                    n_depth = n_alt + n_ref

                    t_freq = np.divide(t_alt, t_depth)
                    n_freq = np.divide(n_alt, n_depth)

                    # filter
                    if not t_alt > min_t_alt:
                        reasons["Low_tumour_alt_count"] += 1
                        continue

                    if not t_freq >= min_t_alt_freq:
                        reasons["Low_tumour_alt_freq"] += 1
                        continue

                    if not n_depth >= min_n_depth:
                        reasons["Low_normal_depth"] += 1
                        continue

                    if not n_freq <= max_n_alt_freq:
                        reasons["high_normal_alt_freq"] += 1
                        continue

                    if (np.divide(t_freq, n_freq) >= min_ratio or
                            n_freq == 0):
                        outf.write(line)
                    else:
                        reasons["Mutect_reject"] += 1

    with IOTools.openFile(logfile, "w") as outf:
        outf.write("%s\n" % "\t".join(("reason", "count")))
        for reason in reasons:
            outf.write("%s\t%i\n" % (reason, reasons[reason]))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two tables given as positional arguments and sets up a csv
    writer for the joined output.

    NOTE(review): this function can not run as written - it reads
    several names that are never assigned in this scope (``fields1``,
    ``fields``, ``lines``, ``input_fields``). It looks like the remnant
    of an unfinished merge of two scripts; the intended join logic
    needs to be confirmed before it can be repaired.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: csv_intersection.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are uniq.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    # NOTE(review): argv is not forwarded to E.Start here.
    (options, args) = E.Start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    options.filename1, options.filename2 = args

    # NOTE(review): table1 and table2 are read but not used below, and
    # the file handles are never closed.
    table1 = CSV.readTable(IOTools.openFile(options.filename1, "r"))
    table2 = CSV.readTable(IOTools.openFile(options.filename2, "r"))

    # optionally suppress duplicate output rows
    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    # build new field list
    # NOTE(review): join_fields1/join_fields2 are not defined by this
    # parser - presumably provided via add_csv_options; confirm.
    new_fields = []

    for x in options.join_fields1:
        new_fields.append(x)

    # NOTE(review): fields1 is undefined; the second test probably
    # belonged to a separate loop over the fields of the second table.
    for x in fields1:
        if x not in options.join_fields1:
            new_fields.append(x)
        if x not in options.join_fields2:
            new_fields.append(x)

    # NOTE(review): `fields` is undefined at this point; new_fields is
    # built above but never used.
    writer = csv.DictWriter(outfile,
                            fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    # NOTE(review): `lines` and `input_fields` are undefined.
    if len(lines) > 0:

        old_fields = lines[0][:-1].split("\t")

        if options.remove:
            # keep every column that was not listed
            fields = []
            for x in old_fields:
                if x not in input_fields:
                    fields.append(x)
        else:
            fields = input_fields

        reader = csv.DictReader(lines,
                                dialect=options.csv_dialect)

        print("\t".join(fields))

        # first_row is assigned but never read
        first_row = True
        for row in reader:
            row = IOTools.convertDictionary(row)
            writer.writerow(row)

    E.Stop()
def main(argv=None):
    """script main.

    Compare two clusterings given as two files mapping ids to clusters.
    Clusters of both files are nodes of a graph and are connected if
    they share a member; the connected components are then reported:

    * ``components``: members of each component
    * ``counts``: how often each (n1, n2) combination of cluster
      numbers occurs
    * ``subsets``: for 1:1 components, how the members overlap

    Summary statistics for the 1:1 components are written to stdout.
    Percentages whose denominator is zero are reported as ``na``.

    parses command line options in sys.argv, unless *argv* is given.

    Raises
    ------
    ValueError
        If not exactly two filenames are given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: compare_clusters.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-o", "--output-pattern", dest="output_pattern", type="string",
                      help="output pattern for filenames.")

    parser.set_defaults(
        output_pattern=None,
        format="%5.2f",
    )

    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    if len(args) != 2:
        # raising a plain string is a TypeError in itself
        raise ValueError("please supply to filenames with the clusters.")

    def percent(count, total):
        '''return count as formatted percentage of total ("na" if total is 0).'''
        if total == 0:
            return "na"
        return options.format % (100.0 * float(count) / total)

    def finish(outfile):
        '''close outfile or, if it is stdout, write a section separator.'''
        if outfile != options.stdout:
            outfile.close()
        else:
            outfile.write("//\n")

    with open(args[0]) as inf:
        map_id2cluster1, map_cluster2ids1 = IOTools.ReadMap(
            inf, both_directions=True)
    with open(args[1]) as inf:
        map_id2cluster2, map_cluster2ids2 = IOTools.ReadMap(
            inf, both_directions=True)

    # nodes are (file number, cluster id) so that equal cluster names
    # in the two files stay distinct
    graph = networkx.Graph()

    for a in map_cluster2ids1.keys():
        graph.add_node((1, a))

    for b in map_cluster2ids2.keys():
        graph.add_node((2, b))

    # build graph between clusters
    for cluster1, ids1 in map_cluster2ids1.items():
        for id1 in ids1:
            if id1 in map_id2cluster2:
                graph.add_edge((1, cluster1), (2, map_id2cluster2[id1]))

    # materialize: recent networkx versions return a generator of sets
    components = [list(x) for x in networkx.connected_components(graph)]

    #######################################################
    # write components and compute counts
    #######################################################
    outfile = getFile("components", options)
    outfile.write("id\ttotal\tn1\tn2\tmembers1\tmembers2\n")
    n = 0
    counts = {}
    # (component index, cluster in file 1, cluster in file 2) of all
    # components made up of exactly one cluster from each file
    subsets = []
    for component in components:

        m1, m2 = [], []

        for x in component:
            if x[0] == 1:
                m1.append(x[1])
            else:
                m2.append(x[1])

        t = len(component)
        n1 = len(m1)
        n2 = len(m2)
        cc = (n1, n2)
        if cc not in counts:
            counts[cc] = 0
        counts[cc] += 1

        if cc == (1, 1):
            subsets.append((n, m1[0], m2[0]))

        n += 1
        outfile.write("%i\t%i\t%i\t%i\t%s\t%s\n" %
                      (n, t, n1, n2, ",".join(m1), ",".join(m2)))

    finish(outfile)

    #######################################################
    # write counts
    #######################################################
    outfile = getFile("counts", options)
    outfile.write("n1\tn2\tcounts\tpcounts1\tpcounts2\n")
    for cc, c in counts.items():
        outfile.write("%i\t%i\t%i\t%s\t%s\n" % (
            cc[0], cc[1], c,
            percent(c, len(map_cluster2ids1)),
            percent(c, len(map_cluster2ids2))))

    finish(outfile)

    #######################################################
    # analyze subsets - how many of the 1:1 clusters
    # contain the exact members?
    #######################################################
    outfile = getFile("subsets", options)
    outfile.write("id\tn1\tn2\tunion\tinter\tunique1\tunique2\n")

    ntrue = 0
    nrest1 = 0
    nrest2 = 0
    nother = 0

    # NOTE(review): the id written here is 0-based while the ids in the
    # "components" output are 1-based - kept as is, confirm which
    # numbering downstream tools expect.
    for component_id, id1, id2 in subsets:

        members1 = set(map_cluster2ids1[id1])
        members2 = set(map_cluster2ids2[id2])

        union = len(members1.union(members2))
        intersection = len(members1.intersection(members2))

        rest1 = len(members1.difference(members2))
        rest2 = len(members2.difference(members1))

        if rest1 == 0 and rest2 == 0:
            ntrue += 1
        elif rest1 == 0:
            nrest1 += 1
        elif rest2 == 0:
            nrest2 += 1
        else:
            nother += 1

        outfile.write("%i\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                      (component_id,
                       len(members1), len(members2),
                       union, intersection,
                       rest1, rest2))

    finish(outfile)

    # write subset statistics
    ntotal = len(subsets)
    options.stdout.write("# subset statistics of 1:1 corresponding clusters\n")
    options.stdout.write("class\tcounts\ttotal\n")
    options.stdout.write("%s\t%i\t%s\n" %
                         ("total", ntotal, options.format % 100))
    options.stdout.write("%s\t%i\t%s\n" %
                         ("true", ntrue, percent(ntrue, ntotal)))
    options.stdout.write("%s\t%i\t%s\n" %
                         ("unique1", nrest1, percent(nrest1, ntotal)))
    options.stdout.write("%s\t%i\t%s\n" %
                         ("unique2", nrest2, percent(nrest2, ntotal)))
    options.stdout.write("%s\t%i\t%s\n" %
                         ("other", nother, percent(nother, ntotal)))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads alignments from the file given on stdin redirection, groups them
    into bundles via ``get_bundles``, de-duplicates each bundle (by position
    only with ``--ignore-umi``, otherwise with ``ClusterAndReducer``) and
    writes the retained reads to the output alignment file. With
    ``--output-stats`` a set of ``<prefix>_*.tsv`` tables is written as well.

    Raises ValueError for unsupported option combinations or when the
    requested multimapping tag is not usable with the input file.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--in-sam", dest="in_sam", action="store_true",
                      help="Input file is in sam format", default=False)
    parser.add_option("-o", "--out-sam", dest="out_sam", action="store_true",
                      help="Output alignments in sam format", default=False)
    parser.add_option("--ignore-umi", dest="ignore_umi", action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    # the value is always consumed as a float below, so let optparse
    # validate it instead of accepting an arbitrary string
    parser.add_option("--subset", dest="subset", type="float",
                      help="Use only a fraction of reads, specified by subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique", dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one",
                      default=False)
    parser.add_option("--soft-clip-threshold", dest="soft", type="float",
                      help="number of bases clipped from 5' end before"
                      " read is counted as spliced",
                      default=4)
    # the original (misspelled) flag is kept for backward compatibility;
    # the correctly spelled flag is accepted as an alias
    parser.add_option("--edit-distance-theshold",
                      "--edit-distance-threshold",
                      dest="threshold", type="int",
                      help="Edit distance threshold at which to join two UMIs"
                      " when clustering",
                      default=1)
    parser.add_option("--chrom", dest="chrom", type="string",
                      help="Restrict to one chromosome", default=None)
    parser.add_option("--paired", dest="paired", action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method", dest="method", type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats", dest="stats", type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats", dest="further_stats",
                      action="store_true", default=False,
                      help="Output further stats")
    parser.add_option("--per-contig", dest="per_contig",
                      action="store_true", default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option("--whole-contig", dest="whole_contig",
                      action="store_true", default=False,
                      help="Read whole contig before outputting bundles: "
                      "guarantees that no reads are missed, but increases "
                      "memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method", type="choice",
                      choices=("NH", "X0", "XT"), default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality", dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained",
                      default=0)
    parser.add_option("--read-length", dest="read_length",
                      action="store_true", default=False,
                      help=("use read length in addition to position and UMI"
                            " to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # pysam opens the files by name, so the handles opened by E.Start
    # are closed again and only their names are kept
    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    in_mode = "r" if options.in_sam else "rb"
    out_mode = "w" if options.out_sam else "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" %
                    (options.detection_method,
                     ",".join([x for x in bam_features if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = random_read_generator(infile.filename)

    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=float(options.subset),
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):

        nOutput += 1
        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            E.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            E.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats, plus a null distribution built
            # from the same number of randomly drawn UMIs
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                outfile.write(bundle[umi]["read"])
                # IMS: add paired output for ignore_umi:
                if options.paired:
                    outfile.write(infile.mate(bundle[umi]["read"]))
        else:
            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam
            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold,
                options.stats, options.further_stats)

            for read in reads:
                outfile.write(read)
                if options.paired:
                    # TS - write out paired end mate
                    outfile.write(infile.mate(read))

            if options.stats:
                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [x.qname.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    # all alignments have been written; close the handles explicitly
    # rather than relying on interpreter shutdown to flush the output
    outfile.close()
    infile.close()

    if options.stats:
        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # generate histograms of counts per UMI at each position
        UMI_counts_df_pre = pd.DataFrame(
            stats_pre_df.pivot_table(columns=stats_pre_df["counts"],
                                     values="counts", aggfunc=len))
        UMI_counts_df_post = pd.DataFrame(
            stats_post_df.pivot_table(columns=stats_post_df["counts"],
                                      values="counts", aggfunc=len))

        UMI_counts_df_pre.columns = ["instances"]
        UMI_counts_df_post.columns = ["instances"]

        UMI_counts_df = pd.merge(UMI_counts_df_pre, UMI_counts_df_post,
                                 how='left', left_index=True,
                                 right_index=True, sort=True,
                                 suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        UMI_counts_df = UMI_counts_df.fillna(0).astype(int)

        UMI_counts_df.to_csv(
            options.stats + "_per_umi_per_position.tsv", sep="\t")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df, agg_post_df, how='left',
                          left_index=True, right_index=True,
                          sort=True, suffixes=["_pre", "_post"])

        # TS - see comment above regarding missing values
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(max(map(max, [pre_cluster_stats,
                                   post_cluster_stats,
                                   pre_cluster_stats_null,
                                   post_cluster_stats_null])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster,
                               minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        # TS - set lowest bin (-1) to "Single_UMI".
        # The label is put into the column before the frame is built;
        # assigning through df[col][0] is chained indexing and is not
        # guaranteed to modify the frame.
        edit_distance_labels = list(cluster_bins)
        edit_distance_labels[0] = "Single_UMI"

        edit_distance_df = pd.DataFrame({
            "unique": tallyCounts(pre_cluster_binned, max_ed),
            "unique_null": tallyCounts(pre_cluster_null_binned, max_ed),
            options.method: tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method: tallyCounts(
                post_cluster_null_binned, max_ed),
            "edit_distance": edit_distance_labels})

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False, sep="\t")

        if options.further_stats:
            with IOTools.openFile(options.stats + "_topologies.tsv",
                                  "w") as outf:
                outf.write("\n".join(["\t".join((x, str(y)))
                                      for x, y in
                                      topology_counts.most_common()]) + "\n")

            with IOTools.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write("\n".join(["\t".join(map(str, (x, y)))
                                      for x, y in
                                      node_counts.most_common()]) + "\n")

    # write footer and output benchmark information.
    E.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    E.Stop()
def FilterExacCols(infile, exac_suffs, exac_thresh):
    '''
    Compute ExAC allele frequencies for the genotype called on each line
    of *infile* and record the lines where both called alleles are common.

    Exac provide data as AC_xxx and AN_xxx where AC is the allele count
    - the number of times the allele has been called
    - and AN is chromosome count - the number of
    samples in which the allele could have been called - in population xxx.
    AC / AN = allele frequency.

    *exac_suffs* is a comma separated string of population suffixes for
    which an AC_xxx and an AN_xxx column exist in the file, e.g. "Adj"
    will calculate allele frequency from the AC_Adj and AN_Adj columns.
    Where no data is available an allele frequency of -1 is used.

    Returns a tuple ``(afdict, ns)``:

    * ``afdict`` maps each suffix to a list with one ``(AF1, AF2)`` entry
      per line of the file; the first two lines (indices 0 and 1) are not
      parsed and are recorded as ``('NA', 'NA')``.
    * ``ns`` is the set of line indices where, for at least one of the
      populations, the frequencies of *both* called alleles are greater
      than or equal to *exac_thresh*.

    Raises ValueError if a required column is missing from the header.
    '''
    # read columns from the input VCF
    exac_suffs = exac_suffs.split(",")
    with IOTools.openFile(infile) as header_handle:
        cols = header_handle.readline().strip().split("\t")

    nD = dict()
    afdict = dict()
    for e in exac_suffs:
        # find the columns with the appropriate information
        # Allele count
        AC_i = cols.index("AC_%s" % (e))
        # Allele Number
        AN_i = cols.index("AN_%s" % (e))
        # Genotype
        GT_i = cols.index('GT')
        nlist = set()
        n = 0
        AFS = []
        with IOTools.openFile(infile) as inf:
            for line in inf:
                if n > 1:
                    fields = line.strip().split("\t")
                    # At multi-allelic sites, comma delimited AC and AN
                    # values are provided
                    # "." and "NA" indicate no data here - this is
                    # represented as an AF of -1
                    AC = fields[AC_i].replace(".", "-1").replace(
                        "NA", "-1").split(",")
                    AN = fields[AN_i].replace(".", "1").replace(
                        "NA", "1").split(",")
                    AC = np.array([float(a) for a in AC])
                    AN = np.array([float(a) for a in AN])
                    # Chromosome count is usually the same for all minor
                    # alleles (but not always): AN then has length 1 and
                    # numpy broadcasts it across AC; otherwise AC and AN
                    # have the same length and are divided element-wise.
                    AF = AC / AN
                    # frequency of the reference allele: whatever is left
                    # after the known (non-negative) minor frequencies
                    known_minor = [af if af > 0 else 0 for af in AF]
                    AF = np.insert(AF, 0, (1 - sum(known_minor)))

                    # Record the genotype called in this sample for this SNP
                    GT = fields[GT_i]
                    GT = GT.replace(".", '0')
                    GT = GT.split("/")
                    GT[0], GT[1] = int(GT[0]), int(GT[1])

                    # If the variant is not in ExAC the ExAC columns show "."
                    # but the site may still have been called as multi
                    # allelic - use -1 for all frequencies in this case
                    if max(GT) > (len(AF) - 1):
                        AF = np.array([-1] * (max(GT) + 1))

                    AF1 = AF[GT[0]]
                    AF2 = AF[GT[1]]
                    AFS.append((AF1, AF2))
                    # Remember where both allele frequencies are
                    # greater than exac_thresh
                    if AF1 >= exac_thresh and AF2 >= exac_thresh:
                        nlist.add(n)
                else:
                    AFS.append(('NA', 'NA'))
                n += 1
        afdict[e] = AFS
        nD[e] = nlist

    ns = set.union(*list(nD.values()))
    return afdict, ns
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.

    *infiles* is a tuple ``(controlfile, dbfile, motiffiles)``. For every
    non-empty motif file MAST is run once against *dbfile* (foreground)
    and once against *controlfile* (background); the concatenated output,
    with a ``:: motif = ... ::`` separator line before each run, is
    written gzip-compressed to *outfile*. If *dbfile* is empty, *outfile*
    is only touched.

    Raises P.PipelineError if *controlfile* does not exist.
    '''
    # NOTE(review): P.run() is called without arguments; presumably it
    # picks up `statement`, `to_cluster` and the %(name)s values from this
    # function's local variables - do not rename locals without checking.
    to_cluster = True

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise P.PipelineError("control file %s for %s does not exist" %
                              (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    # clean up the temporary directory and file even if a MAST run fails
    try:
        for motiffile in motiffiles:
            if IOTools.isEmpty(motiffile):
                L.info("skipping empty motif file %s" % motiffile)
                continue

            motif, _ = os.path.splitext(motiffile)

            with IOTools.openFile(tmpfile, "a") as of:
                of.write(":: motif = %s - foreground ::\n" % motif)

            # mast bails if the number of nucleotides gets larger than
            # 2186800982?
            # To avoid this, run db and control file separately.
            statement = ''' cat %(dbfile)s | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1; cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1 '''
            P.run()

            with IOTools.openFile(tmpfile, "a") as of:
                of.write(":: motif = %s - background ::\n" % motif)

            statement = ''' cat %(controlfile)s | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1; cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1 '''
            P.run()

        statement = "gzip < %(tmpfile)s > %(outfile)s"
        P.run()
    finally:
        shutil.rmtree(tmpdir)
        os.unlink(tmpfile)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Collects prediction identifiers (from ``--ids``, positional arguments,
    a file or stdin), optionally loads taboo regions, and hands everything
    to ``processPredictions`` together with a postgres connection.

    Raises ValueError if no prediction identifiers could be obtained.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/regions2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="pattern to look for sequence filename.")

    parser.add_option(
        "-i", "--ids", dest="ids", type="string",
        help="comma separated list of prediction ids. Use 'all' to use all predictions.")

    parser.add_option("-f", "--filename-ids", dest="filename_ids",
                      type="string",
                      help="filename with prediction ids.")

    parser.add_option("-t", "--type", dest="type", type="choice",
                      choices=("genes", "mrnas", "introns", "intronic",
                               "exons", "exonic", "intergenic",
                               "exons-third-codons"),
                      help="type to output.")

    parser.add_option(
        "-e", "--extend-region", dest="extend_region", type="int",
        help="regions are extended by this margin at either end.")

    parser.add_option(
        "-r", "--shorten-region", dest="shorten_region", type="int",
        help="regions are shortened by this margin at either end.")

    parser.add_option("-m", "--min-length", dest="min_length", type="int",
                      help="minimum length of segment.")

    parser.add_option("-s", "--schema", dest="schema", type="string",
                      help="schema to take data from.")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("fasta", "table", "region"),
                      help="output formats.")

    parser.add_option("--fasta-format", dest="fasta_format", type="choice",
                      choices=("id-coordinates", "coordinates",
                               "schema-coordinates"),
                      help="output formats for fasta formatted headers.")

    parser.add_option("--orthologs", dest="orthologs", action="store_true",
                      help="lookup up orthologs of prediction ids.")

    parser.add_option("--multiple", dest="multiple", action="store_true",
                      help="""lookup up predictions in multiple species. Identifiers should be given as schema|prediction_id[|additional_fields]. Note that the genome file locations have to be consistent.""")

    parser.add_option("--id-format", dest="id_format", type="choice",
                      choices=("id", "schema-id", "full"),
                      help="output format for ids.")

    parser.add_option("--taboo-regions", dest="taboo_regions",
                      type="choice",
                      choices=("same", "both"),
                      help="check for overlap in same/both strands.")

    parser.add_option("--filename-taboo-regions",
                      dest="filename_taboo_regions", type="string",
                      help="filename with information about taboo regions.")

    parser.add_option(
        "--filename-properties", dest="filename_properties", type="string",
        help="filename with mapping information between features and properties.")

    # NOTE(review): dest is "invert-properties" while the default below is
    # registered as "invert_property" - the two never refer to the same
    # attribute. Left untouched because the consumer is not visible here;
    # confirm which name getMapFeature2Property/processPredictions reads.
    parser.add_option(
        "--invert-properties", dest="invert-properties",
        action="store_true",
        help="instead of printing features which have properties, print those that have not.")

    parser.add_option(
        "--output-coordinate-format", dest="output_coordinate_format",
        type="choice",
        choices=("full", "long"),
        help="""output format of coordinates. Output format is contig:strand:from:to in zero based /forward/reverse strand coordinates in open/closed notation. 'long' includes the contig length as fifth field""")

    # NOTE(review): "mmultiple" looks like a typo for "multiple"; kept
    # as-is in case it is read elsewhere.
    parser.set_defaults(
        genome_file="genome",
        identifiers=None,
        filename_ids="-",
        ids=None,
        extend_region=0,
        shorten_region=0,
        tablename_predictions="predictions",
        tablename_exons="exons",
        tablename_genes="genes",
        tablename_quality="quality",
        schema=None,
        output_format="fasta",
        fasta_format="id-coordinates",
        type="mrnas",
        min_length=1,
        id_format="id",
        mmultiple=False,
        separator="|",
        filename_taboo_regions=False,
        output_coordinate_format="full",
        filename_properties=None,
        invert_property=False,
        report_step=10000)

    # pass argv on, otherwise the *argv* parameter of main() is ignored
    (options, args) = E.Start(parser, argv=argv, add_psql_options=True)

    if options.orthologs:
        options.id_format = "schema-id"

    # database handle for connecting to postgres
    dbhandle = pgdb.connect(options.psql_connection)

    # Step 1 : Input of predictions

    # read identifiers from file, command line arguments or stdin.
    prediction_ids = []

    if options.ids in ("all", "nr"):
        prediction_ids = options.ids
        if options.loglevel >= 1:
            options.stdlog.write("# using all prediction ids.\n")
            options.stdlog.flush()
    elif options.ids:
        prediction_ids = options.ids.split(",")
    elif len(args) > 0:
        prediction_ids = args
    elif options.filename_ids:
        if options.filename_ids == "-":
            prediction_ids += IOTools.ReadList(sys.stdin)[0]
        else:
            with open(options.filename_ids, "r") as inf:
                prediction_ids += IOTools.ReadList(inf)[0]

    if len(prediction_ids) == 0:
        # string exceptions are not valid Python; raise a real exception
        raise ValueError("no prediction identifiers given.")

    if options.loglevel >= 1:
        options.stdlog.write("# read %i prediction ids.\n" %
                             len(prediction_ids))
        options.stdlog.flush()

    if options.filename_taboo_regions:
        # Note: the input has to be in forward coordinates in order for option
        # "both" to work.
        taboo_regions = Regions.RegionFilter()
        ignore_strand = options.taboo_regions == "both"
        with open(options.filename_taboo_regions, "r") as inf:
            taboo_regions.readFromFile(inf, ignore_strand=ignore_strand)
    else:
        taboo_regions = None

    map_feature2property = getMapFeature2Property(options)

    processPredictions(dbhandle, options.schema, options, prediction_ids,
                       taboo_regions, map_feature2property)

    E.Stop()
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same table.

    Add columns for the control data as well.

    *infile* is expected to consist of chunks that each start with a
    ``:: motif = <name> - <foreground|background> ::`` line, alternating
    foreground and background for the same motif. Every foreground match
    is written to a temporary table together with the values of its left
    ("l") and right ("r") control matches and then loaded via csv2db.

    Raises P.PipelineError if a chunk separator line cannot be parsed.
    '''
    # NOTE(review): P.run() is called without arguments; presumably it
    # reads `statement`, `tablename`, `tmpfilename` and `outfile` from
    # this function's locals - keep these names.
    tablename = P.toTable(outfile)

    with IOTools.openFile(infile) as inf:
        lines = inf.readlines()

    # line indices of the chunk separators, plus a sentinel at the end
    chunks = [i for i, line in enumerate(lines) if line.startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        '''parse chunk number *chunk*; return (motif, part, mast).'''
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.getTempFile(".")
        try:
            motif, part = re.match(
                r":: motif = (\S+) - (\S+) ::",
                lines[chunks[chunk]]).groups()
        except AttributeError:
            raise P.PipelineError(
                "parsing error in line '%s'" % lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split background match id

        has three parts: track _ id _ pos

        track might contain '_'.
        '''
        # split the id that was passed in, not whatever `match` happens
        # to be bound to in the enclosing scope
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    tmpfile = P.getTempFile(".")
    # make sure the temporary table is removed even if parsing or
    # loading fails
    try:
        tmpfile.write(MAST.Match().header +
                      "\tmotif\tcontig"
                      "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
                      "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
                      "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

        for chunk in range(0, len(chunks) - 1, 2):

            motif_fg, part, mast_fg = readChunk(lines, chunk)
            assert part == "foreground"
            motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
            assert part == "background"
            assert motif_fg == motif_bg

            # index control data
            controls = collections.defaultdict(dict)
            for match in mast_bg.matches:
                track, control_id, pos = splitId(match.id, "bg")
                controls[control_id][pos] = (
                    match.evalue, match.pvalue, match.nmotifs,
                    match.length, match.start, match.end)

            for match in mast_fg.matches:
                # remove track and pos
                track, match.id = splitId(match.id, "fg")
                # move to genomic coordinates
                contig, start, end = re.match(
                    r"(\S+):(\d+)..(\d+)", match.description).groups()
                if match.nmotifs > 0:
                    start, end = int(start), int(end)
                    match.start += start
                    match.end += start
                    match.positions = [x + start for x in match.positions]

                match_id = match.id
                if match_id not in controls:
                    P.warn("no controls for %s - increase MAST evalue"
                           % match_id)

                # fill in placeholder values for a missing left/right
                # control so that the output row is always complete
                if "l" not in controls[match_id]:
                    controls[match_id]["l"] = (
                        float(PARAMS["mast_evalue"]), 1, 0, 0, 0, 0)
                if "r" not in controls[match_id]:
                    controls[match_id]["r"] = (
                        float(PARAMS["mast_evalue"]), 1, 0, 0, 0, 0)

                left = controls[match_id]["l"]
                right = controls[match_id]["r"]
                min_evalue = min(left[0], right[0])
                min_pvalue = min(left[1], right[1])
                max_nmatches = max(left[2], right[2])

                tmpfile.write(str(match) + "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                    motif_fg,
                    contig,
                    "\t".join(map(str, left)),
                    "\t".join(map(str, right)),
                    str(min_evalue),
                    str(min_pvalue),
                    str(max_nmatches),
                ) + "\n")

        tmpfile.close()
        tmpfilename = tmpfile.name

        statement = ''' python %(scriptsdir)s/csv2db.py %(csv2db_options)s -b sqlite --index=id --index=motif --index=id,motif --table=%(tablename)s --allow-empty --map=base_qualities:text < %(tmpfilename)s > %(outfile)s '''

        P.run()
    finally:
        tmpfile.close()
        os.unlink(tmpfile.name)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a table of tag counts and an experimental design, filters the
    counts, runs the selected differential expression method and writes
    the result table to stdout plus plots and per-contrast summary tables
    using the output filename pattern as prefix.

    Raises ValueError if no design file or output filename pattern is
    given, or if the selected method has no implementation here.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "--result-tsv-file", dest="input_filename_result",
        type="string",
        help="input file with results (for plotdetagstats) "
        "[default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    # "ttest" is dispatched below, so it has to be selectable here
    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("ttest", "edger", "deseq2", "mock"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type", type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode",
                      dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion", type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-R", "--output-R-code", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model", dest="model", type="string",
                      help=("model for GLM"))

    parser.add_option("--contrasts", dest="contrasts", action="append",
                      help=("contrasts for post-hoc testing writen as comma "
                            "seperated list `condition,replicate` etc"))

    parser.set_defaults(
        input_filename_tags="-",
        input_filename_result=None,
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq2",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        edger_dispersion=0.4,
        ref_group=None,
        save_r_environment=None,
        filter_min_counts_per_row=None,
        filter_min_counts_per_sample=None,
        filter_percentile_rowsums=None,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
        contrasts=None,
        output_filename_pattern=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # fail early with a readable message instead of an obscure error
    # from opening None / concatenating None further down
    if options.input_filename_design is None:
        raise ValueError("specify --design-tsv-file")
    if options.output_filename_pattern is None:
        raise ValueError("specify --output-filename-pattern")

    # create Counts object
    if options.input_filename_tags == "-":
        counts = Counts.Counts(pd.io.parsers.read_csv(
            sys.stdin, sep="\t", index_col=0, comment="#"))
    else:
        counts = Counts.Counts(pd.io.parsers.read_csv(
            IOTools.openFile(options.input_filename_tags, "r"),
            sep="\t", index_col=0, comment="#"))

    # create Design object
    design = Expression.ExperimentalDesign(
        pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t", index_col=0, comment="#"))

    # validate design against counts and model
    design.validate(counts, options.model)

    # restrict counts to samples in design table
    counts.restrict(design)

    # remove sample with low counts
    if options.filter_min_counts_per_sample:
        counts.removeSamples(
            min_counts_per_sample=options.filter_min_counts_per_sample)

    # remove observations with low counts
    if options.filter_min_counts_per_row:
        counts.removeObservationsFreq(
            min_counts_per_row=options.filter_min_counts_per_row)

    # remove bottom percentile of observations
    if options.filter_percentile_rowsums:
        counts.removeObservationsPerc(
            percentile_rowsums=options.filter_percentile_rowsums)

    # check samples are the same in counts and design following counts
    # filtering and, if not, restrict design table and re-validate
    design.revalidate(counts, options.model)

    # set up experiment and run tests
    outfile_prefix = options.output_filename_pattern + options.method

    if options.method == "ttest":
        experiment = Expression.DEExperiment_TTest()
        results = experiment.run(counts, design)

    elif options.method == "edger":
        experiment = Expression.DEExperiment_edgeR()
        # NOTE(review): the keyword is spelled "disperion"; confirm this
        # matches the parameter name of DEExperiment_edgeR.run.
        results = experiment.run(counts,
                                 design,
                                 model=options.model,
                                 disperion=options.edger_dispersion,
                                 ref_group=options.ref_group,
                                 contrasts=options.contrasts,
                                 outfile_prefix=outfile_prefix)

    elif options.method == "deseq2":
        experiment = Expression.DEExperiment_DESeq2()
        results = experiment.run(counts,
                                 design,
                                 model=options.model,
                                 contrasts=options.contrasts,
                                 outfile_prefix=outfile_prefix,
                                 fdr=options.fdr)

    else:
        # e.g. "mock": accepted by the parser but not implemented here;
        # without this branch `results` would be undefined below
        raise ValueError(
            "method '%s' is not implemented" % options.method)

    results.getResults(fdr=options.fdr)

    results.summariseDEResults()

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix)
        results.plotMA(contrast, outfile_prefix=outfile_prefix)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    # write out summary tables for each comparison/contrast
    for test_group in results.Summary.keys():
        summary_name = "_".join([outfile_prefix, test_group, "summary.tsv"])
        with IOTools.openFile(summary_name, "w") as outf:
            outf.write("category\tcounts\n%s\n"
                       % results.Summary[test_group].asTable())

    E.Stop()
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=None,
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery. Intervals are taken from
    the table <track>_intervals in the database *dbhandle* and save to
    *filename* in :term:`fasta` format.

    If *shuffled* is set, each sequence is shuffled before it is written.
    Shuffling is done on the unmasked sequence and masking is applied
    afterwards.

    *full* is accepted for backward compatibility but is not consulted:
    if *halfwidth* is set, only the region of *halfwidth* on either side
    of the peak center is output, otherwise the whole interval.

    If *maxsize* is set, the output is truncated once *maxsize*
    characters have been collected in order to avoid creating jobs that
    take too long.

    If *proportion* is set, only the top *proportion* intervals are output
    (in the order given by *order*), but at least *min_sequences* if that
    is set. Otherwise, if *num_sequences* is set, the first
    *num_sequences* will be used.

    *masker* is a list of masks to apply, for example

       * dust, dustmasker: apply dustmasker
       * softmask: mask softmasked genomic regions

    Entries ``"unmasked"``, ``"none"`` and ``None`` are skipped.

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order) and 'score'
    (peak score, descending order); 'max' is accepted as a synonym
    for 'score'.

    If *shift* is set, intervals will be shifted. ``leftright``
    creates two intervals on the left and right of the actual
    interval. The intervals will be centered around the mid-point
    and truncated the same way as the main intervals.

    Returns the number of sequences written.

    Raises ValueError for an unknown *order*.
    '''
    # avoid a shared mutable default argument
    if masker is None:
        masker = []

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order in ("max", "score"):
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    cc = dbhandle.cursor()

    tablename = "%s_intervals" % P.quote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter FROM %(tablename)s ''' % {"tablename": tablename} + orderby

    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)

    L.info(
        "writeSequencesForIntervals %s: using at most %i sequences for pattern finding" %
        (track, cutoff))
    data = data[:cutoff]

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    # modify the ranges
    if shift:
        if shift == "leftright":
            # one interval of the same width immediately to the left and
            # one immediately to the right of each original interval
            new_data = [(contig, start - (end - start), start,
                         str(interval_id) + "_left", peakcenter)
                        for contig, start, end, interval_id, peakcenter
                        in data]
            new_data.extend([
                (contig, end, end + (end - start),
                 str(interval_id) + "_right", peakcenter)
                for contig, start, end, interval_id, peakcenter in data])
            data = new_data

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth, peakcenter + halfwidth,
                 interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size = 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            L.info(
                "writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored" %
                (track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            # the sequence that crossed the limit is still part of the
            # output, so count it
            nseq = len(sequences)
            L.info(
                "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)" %
                (track, maxsize, nseq, len(data) - nseq))
            break

    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        # NOTE(review): the whole masker list is passed here, whereas the
        # loop below passes one mask at a time - confirm maskSequences
        # accepts both forms.
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    with IOTools.openFile(filename, "w") as outs:

        for mask_name in masker:
            if mask_name not in ("unmasked", "none", None):
                sequences = maskSequences(sequences, mask_name)

        for sequence, d in zip(sequences, data):
            c.input += 1
            if len(sequence) == 0:
                c.empty += 1
                continue
            start, end, interval_id, contig = d
            header = "%s_%s %s:%i-%i" % (
                track, str(interval_id), contig, start, end)
            outs.write(">%s\n%s\n" % (header, sequence))
            c.output += 1

    E.info("%s" % c)

    return c.output
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Builds one or more meta-gene profile counters (selected with
    ``--method``) on top of a range counter chosen from the type of the
    first input file (``.bam``, ``.bed.gz`` or ``.bw``), counts either from
    the GTF file or from a previously computed count table, writes the
    profile matrices and length statistics and, if plotting is enabled,
    saves PNG figures of the aggregate profiles.

    Raises
    ------
    ValueError
        If no GTF file or no input files have been specified.
    NotImplementedError
        If the type of the input files can not be determined from the
        file name, or if counts are read from a file while more than one
        counter was requested.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=(
            "geneprofile",
            "tssprofile",
            "utrprofile",
            "intervalprofile",
            "midpointprofile",
            "geneprofilewithintrons",
            "geneprofileabsolutedistancefromthreeprimeend",
        ),
        help='counters to use. Counters describe the meta-gene structure to use '
        '[%default]. \n Note using geneprofilewithintrons, or geneprofileabsolutedistancefromthreeprimeend will automatically turn on the --base-accuracy option')

    parser.add_option(
        "-b", "--bamfile", "--bedfile", "--bigwigfile", dest="infiles",
        metavar="BAM", type="string", action="append",
        help="BAM/bed/bigwig files to use. Do not mix different types"
        "[%default]")

    parser.add_option(
        "-c", "--controlfile", dest="controlfiles", metavar="BAM",
        type="string", action="append",
        help="control/input to use. Should be of the same type as the bam/bed/bigwig file"
        " [%default]")

    parser.add_option(
        "-g", "--gtffile", dest="gtffile", type="string", metavar="GTF",
        help="GTF file to use. "
        "[%default]")

    parser.add_option(
        "-n", "--normalization", dest="normalization", type="choice",
        choices=("none", "max", "sum", "total-max", "total-sum"),
        help="normalization to apply on each transcript profile before adding to meta-gene profile. "
        "[%default]")

    parser.add_option(
        "-p", "--normalize-profile", dest="profile_normalizations",
        type="choice", action="append",
        choices=("all", "none", "area", "counts", "background"),
        help="normalization to apply on meta-gene profile normalization. "
        "[%default]")

    parser.add_option(
        "-r", "--reporter", dest="reporter", type="choice",
        choices=("gene", "transcript"),
        help="report results for genes or transcripts."
        " When 'genes` is chosen, exons across all transcripts for"
        " a gene are merged. When 'transcript' is chosen, counts are"
        " computed for each transcript separately with each transcript"
        " contributing equally to the meta-gene profile."
        " [%default]")

    parser.add_option(
        "-i", "--shift", dest="shifts", type="int", action="append",
        help="shift reads in :term:`bam` formatted file before computing densities (ChIP-Seq). "
        "[%default]")

    parser.add_option(
        "-a", "--merge-pairs", dest="merge_pairs", action="store_true",
        help="merge pairs in :term:`bam` formatted file before computing"
        " densities (ChIP-Seq)."
        "[%default]")

    parser.add_option(
        "-u", "--base-accuracy", dest="base_accuracy", action="store_true",
        help="compute densities with base accuracy. The default is to"
        " only use the start and end of the aligned region (RNA-Seq)"
        " [%default]")

    parser.add_option(
        "-e", "--extend", dest="extends", type="int", action="append",
        help="extend reads in :term:`bam` formatted file (ChIP-Seq). "
        "[%default]")

    # plain integer options: (flag, dest, help), registered in this order
    integer_options = (
        ("--resolution-upstream", "resolution_upstream",
         "resolution of upstream region in bp "
         "[%default]"),
        ("--resolution-downstream", "resolution_downstream",
         "resolution of downstream region in bp "
         "[%default]"),
        ("--resolution-upstream-utr", "resolution_upstream_utr",
         "resolution of upstream UTR region in bp "
         "[%default]"),
        ("--resolution-downstream-utr", "resolution_downstream_utr",
         "resolution of downstream UTR region in bp "
         "[%default]"),
        ("--resolution-cds", "resolution_cds",
         "resolution of cds region in bp "
         "[%default]"),
        ("--resolution-introns", "resolution_introns",
         "resolution of introns region in bp "
         "[%default]"),
        ("--resolution-exons-absolute-distance-topolya",
         "resolution_exons_absolute_distance_topolya",
         "resolution of exons absolute distance topolya in bp "
         "[%default]"),
        ("--resolution-introns-absolute-distance-topolya",
         "resolution_introns_absolute_distance_topolya",
         "resolution of introns absolute distance topolya in bp "
         "[%default]"),
        ("--extension-exons-absolute-distance-topolya",
         "extension_exons_absolute_distance_topolya",
         "extension for exons from the absolute distance from the topolya in bp"
         "[%default]"),
        ("--extension-introns-absolute-distance-topolya",
         "extension_introns_absolute_distance_topolya",
         "extension for introns from the absolute distance from the topolya in bp"
         "[%default]"),
        ("--extension-upstream", "extension_upstream",
         "extension upstream from the first exon in bp"
         "[%default]"),
        ("--extension-downstream", "extension_downstream",
         "extension downstream from the last exon in bp"
         "[%default]"),
        ("--extension-inward", "extension_inward",
         "extension inward from a TSS start site in bp"
         "[%default]"),
        ("--extension-outward", "extension_outward",
         "extension outward from a TSS start site in bp"
         "[%default]"),
        ("--scale-flank-length", "scale_flanks",
         "scale flanks to (integer multiples of) gene length"
         "[%default]"),
    )
    for flag, dest, help_text in integer_options:
        parser.add_option(flag, dest=dest, type="int", help=help_text)

    parser.add_option(
        "--matrix-format", dest="matrix_format", type="choice",
        choices=("multiple", "single"),
        help="matrix output format, either 'multiple' files or a 'single' file "
        "[%default]")

    parser.add_option(
        "--control-factor", dest="control_factor", type="float",
        help="factor for normalizing control and fg data. Computed from data "
        "if not set. "
        "[%default]")

    parser.add_option(
        "--output-all-profiles", dest="output_all_profiles",
        action="store_true",
        help="keep individual profiles for each transcript and output. "
        "[%default]")

    parser.add_option(
        "--input-filename-counts", dest="input_filename_counts",
        type="string",
        help="filename with count data for each transcript. Use this instead "
        "of recomputing the profile. Useful for plotting the meta-gene profile "
        "from previously computed counts "
        "[%default]")

    # dest must be a valid attribute name: the value is read below as
    # ``options.background_region``.
    parser.add_option(
        "--background-region", dest="background_region", type="int",
        help="number of bins on either side of the profile to be considered "
        "for background meta-gene normalizatian "
        "[%default]")

    parser.set_defaults(
        remove_rna=False,
        ignore_pairs=False,
        force_output=False,
        bin_size=10,
        extends=[],
        shifts=[],
        sort=[],
        reporter="transcript",
        resolution_cds=1000,
        resolution_introns=1000,
        # 3kb is a good balance between seeing enough of the 3 prime bias
        # and not omitting too many genes. Tim 31th Aug 2013
        resolution_exons_absolute_distance_topolya=3000,
        # introns are only used to assess the noise level and thus do not
        # need a long region; a long region has the side effect of
        # omitting more genes. Tim 31th Aug 2013
        resolution_introns_absolute_distance_topolya=500,
        # 3kb is a good balance between seeing enough of the 3 prime bias
        # and not omitting too many genes. Tim 31th Aug 2013
        extension_exons_absolute_distance_topolya=3000,
        # introns are only used to assess the noise level and thus do not
        # need a long region; a long region has the side effect of
        # omitting more genes. Tim 31th Aug 2013
        extension_introns_absolute_distance_topolya=500,
        resolution_upstream_utr=1000,
        resolution_downstream_utr=1000,
        resolution_upstream=1000,
        resolution_downstream=1000,
        # mean length of transcripts: about 2.5 kb
        extension_upstream=2500,
        extension_downstream=2500,
        extension_inward=3000,
        extension_outward=3000,
        plot=True,
        methods=[],
        infiles=[],
        controlfiles=[],
        gtffile=None,
        profile_normalizations=[],
        normalization=None,
        scale_flanks=0,
        merge_pairs=False,
        min_insert_size=0,
        max_insert_size=1000,
        base_accuracy=False,
        matrix_format="single",
        control_factor=None,
        output_all_profiles=False,
        background_region=10,
        input_filename_counts=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # Keep for backwards compatability
    if len(args) == 2:
        infile, gtf = args
        options.infiles.append(infile)
        options.gtffile = gtf

    if not options.gtffile:
        raise ValueError("no GTF file specified")

    if options.gtffile == "-":
        options.gtffile = options.stdin
    else:
        options.gtffile = IOTools.openFile(options.gtffile)

    if len(options.infiles) == 0:
        raise ValueError("no bam/wig/bed files specified")

    # Methods for which spliced-out introns or exons must not appear to be
    # covered by non-existent reads imply --base-accuracy. Add further
    # methods with that requirement to this list.
    for methodsRequiresBaseAccuracy in [
            "geneprofilewithintrons",
            "geneprofileabsolutedistancefromthreeprimeend",
    ]:
        if methodsRequiresBaseAccuracy in options.methods:
            options.base_accuracy = True

    if options.reporter == "gene":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(options.gtffile))
    elif options.reporter == "transcript":
        gtf_iterator = GTF.transcript_iterator(GTF.iterator(options.gtffile))

    # Select rangecounter based on file type
    if len(options.infiles) > 0:
        if options.infiles[0].endswith(".bam"):
            bamfiles = [pysam.Samfile(x, "rb") for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.Samfile(x, "rb") for x in options.controlfiles
                ]
            else:
                controlfiles = None

            if options.merge_pairs:
                range_counter = _bam2geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    merge_pairs=options.merge_pairs,
                    min_insert_size=options.min_insert_size,
                    max_insert_size=options.max_insert_size,
                    # same keyword as used by all other branches
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            elif options.shifts or options.extends:
                range_counter = _bam2geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            elif options.base_accuracy:
                range_counter = _bam2geneprofile.RangeCounterBAMBaseAccuracy(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            else:
                range_counter = _bam2geneprofile.RangeCounterBAM(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bed.gz"):
            bedfiles = [pysam.Tabixfile(x) for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.Tabixfile(x) for x in options.controlfiles
                ]
            else:
                controlfiles = None

            range_counter = _bam2geneprofile.RangeCounterBed(
                bedfiles,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bw"):
            # bigWig is a binary format: open in binary mode
            wigfiles = [BigWigFile(file=open(x, "rb"))
                        for x in options.infiles]
            range_counter = _bam2geneprofile.RangeCounterBigWig(wigfiles)

        else:
            raise NotImplementedError(
                "can't determine file type for %s" % options.infiles[0])

    counters = []
    for method in options.methods:
        if method == "utrprofile":
            counters.append(
                _bam2geneprofile.UTRCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_upstream_utr,
                    options.resolution_cds,
                    options.resolution_downstream_utr,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                ))

        elif method == "geneprofile":
            counters.append(
                _bam2geneprofile.GeneCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_cds,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofilewithintrons":
            counters.append(
                _bam2geneprofile.GeneCounterWithIntrons(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_cds,
                    options.resolution_introns,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofileabsolutedistancefromthreeprimeend":
            # Tim 31th Aug 2013: a possible future feature, if five prime
            # bias is of interest, is the analogous distance to the start
            # site (resolution_/extension_*_absolute_distance_tostartsite).
            # It would need another class derived from this one and is not
            # implemented yet. It differs from the TSS profile in that
            # introns are skipped.
            counters.append(
                _bam2geneprofile.GeneCounterAbsoluteDistanceFromThreePrimeEnd(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_downstream,
                    options.resolution_exons_absolute_distance_topolya,
                    options.resolution_introns_absolute_distance_topolya,
                    options.extension_upstream,
                    options.extension_downstream,
                    options.extension_exons_absolute_distance_topolya,
                    options.extension_introns_absolute_distance_topolya,
                    options.scale_flanks))

        elif method == "tssprofile":
            counters.append(
                _bam2geneprofile.TSSCounter(
                    range_counter,
                    options.extension_outward,
                    options.extension_inward))

        elif method == "intervalprofile":
            counters.append(
                _bam2geneprofile.RegionCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_cds,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

        elif method == "midpointprofile":
            counters.append(
                _bam2geneprofile.MidpointCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

    # set normalization
    for c in counters:
        c.setNormalization(options.normalization)
        if options.output_all_profiles:
            c.setOutputProfiles(
                IOTools.openFile(
                    E.getOutputFile(c.name) + ".profiles.tsv.gz", "w"))

    if options.input_filename_counts:
        # read counts from file
        E.info("reading counts from %s" % options.input_filename_counts)
        all_counts = pandas.read_csv(
            IOTools.openFile(options.input_filename_counts),
            sep='\t', header=0, index_col=0)

        if len(counters) != 1:
            raise NotImplementedError(
                'counting from matrix only implemented for 1 counter.')
        # build counter based on reference counter
        counter = _bam2geneprofile.UnsegmentedCounter(counters[0])
        counters = [counter]
        _bam2geneprofile.countFromCounts(counters, all_counts)

    else:
        E.info("starting counting with %i counters" % len(counters))
        _bam2geneprofile.countFromGTF(counters, gtf_iterator)

    # output matrices
    if not options.profile_normalizations:
        options.profile_normalizations.append("none")
    elif "all" in options.profile_normalizations:
        options.profile_normalizations = [
            "none", "area", "counts", "background"
        ]

    for method, counter in zip(options.methods, counters):
        if options.matrix_format == "multiple":
            # output multiple files, each containing results of one
            # normalization
            for norm in options.profile_normalizations:
                with IOTools.openFile(
                        E.getOutputFile(counter.name) + ".%s.tsv.gz" % norm,
                        "w") as outfile:
                    counter.writeMatrix(
                        outfile,
                        normalize=norm,
                        background_region=options.background_region)

        elif options.matrix_format == "single":
            # build a single output
            matrices = []
            for norm in options.profile_normalizations:
                # build matrix, apply normalization
                matrix = counter.buildMatrix(
                    normalize=norm,
                    background_region=options.background_region)
                # flatten into a single column
                nrows, ncols = matrix.shape
                matrix.shape = (nrows * ncols, 1)
                matrices.append(matrix)

            for x in range(1, len(matrices)):
                assert matrices[0].shape == matrices[x].shape

            # build a single matrix, one column per normalization
            matrix = numpy.hstack(matrices)
            nrows, ncols = matrix.shape

            with IOTools.openFile(
                    E.getOutputFile(counter.name) + ".matrix.tsv.gz",
                    "w") as outfile:
                outfile.write(
                    "bin\tregion\tregion_bin\t%s\n" % "\t".join(
                        options.profile_normalizations))
                fields = []
                bins = []
                for field, nbins in zip(counter.fields, counter.nbins):
                    fields.extend([field] * nbins)
                    bins.extend(list(range(nbins)))

                for row, cols in enumerate(zip(fields, bins, matrix)):
                    outfile.write("%i\t%s\t" %
                                  (row,
                                   "\t".join([str(x) for x in cols[:-1]])))
                    outfile.write("%s\n" %
                                  ("\t".join([str(x) for x in cols[-1]])))

        with IOTools.openFile(
                E.getOutputFile(counter.name) + ".lengths.tsv.gz",
                "w") as outfile:
            counter.writeLengthStats(outfile)

        if options.output_all_profiles:
            counter.closeOutputProfiles()

    if options.plot:

        import matplotlib
        # avoid Tk or any X
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        for method, counter in zip(options.methods, counters):

            if method in ("geneprofile",
                          "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "utrprofile",
                          "intervalprofile"):

                # one panel per region, sharing the same y-scale
                plt.figure()
                plt.subplots_adjust(wspace=0.05)
                max_scale = max([max(x) for x in counter.aggregate_counts])

                for x, counts in enumerate(counter.aggregate_counts):
                    plt.subplot(5, 1, x + 1)
                    plt.plot(range(len(counts)), counts)
                    plt.title(counter.fields[x])
                    plt.ylim(0, max_scale)

                figname = counter.name + ".full"

                fn = E.getOutputFile(figname) + ".png"
                plt.savefig(os.path.expanduser(fn))

                # all regions concatenated into a single curve
                plt.figure()

                points = []
                cuts = []
                for x, counts in enumerate(counter.aggregate_counts):
                    points.extend(counts)
                    cuts.append(len(counts))

                plt.plot(range(len(points)), points)

                # tick at the centre of each region, dashed line at its end
                xx, xxx = 0, []
                for x in cuts:
                    xxx.append(xx + x // 2)
                    xx += x
                    plt.axvline(xx, color="r", ls="--")

                plt.xticks(xxx, counter.fields)

                figname = counter.name + ".detail"

                fn = E.getOutputFile(figname) + ".png"
                plt.savefig(os.path.expanduser(fn))

            elif method == "tssprofile":

                plt.figure()
                plt.subplot(1, 3, 1)
                plt.plot(
                    range(-options.extension_outward,
                          options.extension_inward),
                    counter.aggregate_counts[0])
                plt.title(counter.fields[0])
                plt.subplot(1, 3, 2)
                plt.plot(
                    range(-options.extension_inward,
                          options.extension_outward),
                    counter.aggregate_counts[1])
                plt.title(counter.fields[1])
                plt.subplot(1, 3, 3)
                plt.title("combined")
                plt.plot(
                    range(-options.extension_outward,
                          options.extension_inward),
                    counter.aggregate_counts[0])
                plt.plot(
                    range(-options.extension_inward,
                          options.extension_outward),
                    counter.aggregate_counts[1])
                plt.legend(counter.fields[:2])

                fn = E.getOutputFile(counter.name) + ".png"
                plt.savefig(os.path.expanduser(fn))

            elif method == "midpointprofile":

                plt.figure()
                plt.plot(numpy.arange(-options.resolution_upstream, 0),
                         counter.aggregate_counts[0])
                plt.plot(numpy.arange(0, options.resolution_downstream),
                         counter.aggregate_counts[1])

                fn = E.getOutputFile(counter.name) + ".png"
                plt.savefig(os.path.expanduser(fn))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads tab-separated per-position coverage from stdin (contig in the
    first column, coverage in the third) and writes the mean and standard
    deviation of the coverage of each contig to stdout. With ``--bin``,
    the mean coverage within ``--num-bins`` equally sized bins along each
    contig is additionally written to ``<prefix>.binned``.

    Raises
    ------
    ValueError
        If ``--bin`` is given without ``--output-filename-prefix``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id",
                                   usage=globals()["__doc__"])

    parser.add_option("--bin", dest="bin", action="store_true",
                      help="output average in bins across the interval")
    parser.add_option("-n", "--num-bins", dest="bin_number", type="int",
                      help="number of bins for coverage profile")
    parser.add_option("-o", "--output-filename-prefix",
                      dest="output_filename_prefix",
                      help="pattern to write coverage bins to")

    parser.set_defaults(bin=False, bin_number=10)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    coverage_result = collections.defaultdict(list)
    E.info("reading in coverage data")
    # stream the input instead of loading all lines into memory
    for line in options.stdin:
        data = line[:-1].split("\t")
        coverage_result[data[0]].append(float(data[2]))
    E.info("read %i contigs" % len(coverage_result))

    options.stdout.write("contig\tcov_mean\tcov_sd\n")

    outf = None
    if options.bin:
        if not options.output_filename_prefix:
            raise ValueError(
                "--bin requires --output-filename-prefix to be set")
        outf = IOTools.openFile(
            options.output_filename_prefix + ".binned", "w")
        outf.write(
            "%s" % "\t".join(
                [str(i) for i in range(1, options.bin_number + 1, 1)]) + "\n")

    try:
        for contig, coverage in coverage_result.items():
            options.stdout.write(
                "%s\t%s\t%s\n" % (contig,
                                  str(np.mean(coverage)),
                                  str(np.std(coverage))))
            if outf is None:
                continue

            # bin_number + 1 edges delimit bin_number bins
            bins = np.linspace(0, len(coverage), options.bin_number + 1)
            if len(coverage) < len(bins) - 1:
                E.warn("will not calculate coverage means for %s: too short"
                       % contig)
                continue

            # iterate over consecutive pairs of bin edges
            bin_means = [
                np.mean(coverage[int(bins[i]):int(bins[i + 1])])
                for i in range(len(bins) - 1)]
            outf.write(
                contig + "\t" + "\t".join(map(str, bin_means)) + "\n")
    finally:
        if outf is not None:
            outf.close()

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For each gi accession number listed in ``--gi-accessions`` the tax id
    is looked up in ``--nucl-map`` and the taxonomy is walked upwards via
    ``--tree``. The names of the parents at the levels defined in
    ``--taxa-code`` are collected and one line per gi is written to stdout
    with the columns gi, sub_species, species, genus, family, order, class
    and phylum. Levels that were not found are reported as ``NA``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-g", "--gi-accessions", dest="gi_accessions",
                      type="string",
                      help="list of gi accession numbers")
    parser.add_option("-m", "--ncbi-map", dest="ncbi_map", type="string",
                      help="ncbi.map file downloaded from the MEGAN website")
    parser.add_option("-n", "--nucl-map", dest="nucl_map", type="string",
                      help="gi mapping to tax id downloaded from ncbi website")
    parser.add_option("-c", "--taxa-code", dest="taxa_code", type="string",
                      help="code for different levels of the taxonomy downloaded from the MEGAN website")
    parser.add_option("-t", "--tree", dest="tree", type="string",
                      help="description of parents in the taxonomy")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    E.info("reading gi accession numbers")
    with IOTools.openFile(options.gi_accessions) as inf:
        gi_accessions = set(line.rstrip("\n") for line in inf)
    E.info("read gi accession numbers")

    E.info("building gi2taxid map")
    gi2taxid = {}
    c_gi = 0
    with IOTools.openFile(options.nucl_map) as inf:
        for line in inf:
            data = line.rstrip("\n").split("\t")
            if data[0] not in gi_accessions:
                continue
            c_gi += 1
            gi2taxid[data[0]] = data[1]
    E.info("built gi2taxid for %i gi accession numbers" % c_gi)

    E.info("building code map")
    code2taxa = {}
    with IOTools.openFile(options.taxa_code) as inf:
        for line in inf:
            data = line.rstrip("\n").split("\t")
            code2taxa[data[0]] = data[1]
    E.info("built taxa code map")

    E.info("building taxa2name map")
    taxid2name = {}
    with IOTools.openFile(options.ncbi_map) as inf:
        for line in inf:
            data = line.rstrip("\n").split("\t")
            # keep the taxa code
            taxid2name[data[0]] = (data[1], data[3])
    E.info("built taxa2name map")

    E.info("build taxid2parentmap")
    taxid2parents = {}
    with IOTools.openFile(options.tree) as inf:
        for line in inf:
            # drop the "|" separator columns
            data = [x for x in line.rstrip("\n").split("\t") if x != "|"]
            taxid2parents[data[0]] = data[1]
    E.info("built taxid2parentmap")

    E.info("retrieving parents for each gi accession number")
    options.stdout.write(
        "gi\tsub_species\tspecies\tgenus\tfamily\torder\tclass\tphylum\n")

    for gi, taxid in gi2taxid.items():
        # this will be the sub species id
        sub_species = taxid2name[taxid][0]

        # walk through the parents, at most one step per known level code
        parents = {}
        for _ in range(len(code2taxa)):
            parent_taxid = taxid2parents.get(taxid)
            # stop when the lineage can not be followed any further: no
            # parent recorded, a node that is its own parent, or a parent
            # without a name entry
            if (parent_taxid is None or
                    parent_taxid == taxid or
                    parent_taxid not in taxid2name):
                break
            parent_name, parent_code = taxid2name[parent_taxid]

            # always move up one level, also when the level of the parent
            # is skipped below - otherwise the walk would stall at the
            # first parent with a code that is not of interest
            taxid = parent_taxid

            # ignore codes that we are not interested in
            if parent_code not in code2taxa:
                continue
            parents[code2taxa[parent_code]] = parent_name

        species = parents.get("species", "NA")
        genus = parents.get("genus", "NA")
        family = parents.get("family", "NA")
        order = parents.get("order", "NA")
        _class = parents.get("class", "NA")
        # strip the " <phylum>" suffix if present
        phylum = parents.get("phylum", "NA").replace(" <phylum>", "")

        options.stdout.write("\t".join([gi,
                                        sub_species.replace(" ", "_"),
                                        species.replace(" ", "_"),
                                        genus,
                                        family,
                                        order,
                                        _class,
                                        phylum]) + "\n")

    # write footer and output benchmark information.
    E.Stop()
def main(args=sys.argv):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import CGAT.Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    args : list
        List of command line arguments. Falls back to ``sys.argv`` if
        empty.

    Raises
    ------
    ValueError
        If the pipeline action is unknown or if the pipeline failed with
        errors (and ``--debug`` has not been set).

    """

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS

    parser = E.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--pipeline-action", dest="pipeline_action",
                      type="choice",
                      choices=(
                          "make", "show", "plot", "dump", "config", "clone",
                          "check", "regenerate", "printconfig"),
                      help="action to take [default=%default].")

    parser.add_option("--pipeline-format", dest="pipeline_format",
                      type="choice",
                      choices=("dot", "jpg", "svg", "ps", "png"),
                      help="pipeline format [default=%default].")

    parser.add_option("-n", "--dry-run", dest="dry_run",
                      action="store_true",
                      help="perform a dry run (do not execute any shell "
                      "commands) [default=%default].")

    parser.add_option("-f", "--force-output", dest="force",
                      action="store_true",
                      help="force running the pipeline even if there "
                      "are uncommited changes "
                      "in the repository [default=%default].")

    parser.add_option("-p", "--multiprocess", dest="multiprocess", type="int",
                      help="number of parallel processes to use on "
                      "submit host "
                      "(different from number of jobs to use for "
                      "cluster jobs) "
                      "[default=%default].")

    parser.add_option("-e", "--exceptions", dest="log_exceptions",
                      action="store_true",
                      help="echo exceptions immediately as they occur "
                      "[default=%default].")

    parser.add_option("-i", "--terminate", dest="terminate",
                      action="store_true",
                      help="terminate immediately at the first exception "
                      "[default=%default].")

    parser.add_option("-d", "--debug", dest="debug",
                      action="store_true",
                      help="output debugging information on console, "
                      "and not the logfile "
                      "[default=%default].")

    parser.add_option("-s", "--set", dest="variables_to_set",
                      type="string", action="append",
                      help="explicitly set paramater values "
                      "[default=%default].")

    parser.add_option("-c", "--checksums", dest="ruffus_checksums_level",
                      type="int",
                      help="set the level of ruffus checksums"
                      "[default=%default].")

    parser.add_option("-t", "--is-test", dest="is_test",
                      action="store_true",
                      help="this is a test run"
                      "[default=%default].")

    parser.add_option("--rabbitmq-exchange", dest="rabbitmq_exchange",
                      type="string",
                      help="RabbitMQ exchange to send log messages to "
                      "[default=%default].")

    parser.add_option("--rabbitmq-host", dest="rabbitmq_host",
                      type="string",
                      help="RabbitMQ host to send log messages to "
                      "[default=%default].")

    parser.add_option("--input-validation", dest="input_validation",
                      action="store_true",
                      help="perform input validation before starting "
                      "[default=%default].")

    parser.set_defaults(
        pipeline_action=None,
        pipeline_format="svg",
        pipeline_targets=[],
        multiprocess=40,
        logfile="pipeline.log",
        dry_run=False,
        force=False,
        log_exceptions=False,
        exceptions_terminate_immediately=False,
        debug=False,
        variables_to_set=[],
        is_test=False,
        ruffus_checksums_level=0,
        rabbitmq_host="saruman",
        rabbitmq_exchange="ruffus_pipelines",
        input_validation=False)

    # parse the argument list handed to this function
    (options, args) = E.Start(parser,
                              argv=args if args else sys.argv,
                              add_cluster_options=True)

    # -i/--terminate is stored in ``options.terminate`` while the call to
    # ruffus below reads ``exceptions_terminate_immediately``: propagate
    # the flag so that the option takes effect.
    if getattr(options, "terminate", None):
        options.exceptions_terminate_immediately = True

    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    E.info("Started in: %s" % PARAMS.get("workingdir"))

    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.
    PARAMS["dryrun"] = options.dry_run
    PARAMS["input_validation"] = options.input_validation

    # use cli_cluster_* keys in PARAMS to ensure highest priority
    # of cluster_* options passed with the command-line
    for cluster_option in ("cluster_memory_default",
                           "cluster_memory_resource",
                           "cluster_num_jobs",
                           "cluster_options",
                           "cluster_parallel_environment",
                           "cluster_priority",
                           "cluster_queue",
                           "cluster_queue_manager"):
        cluster_value = getattr(options, cluster_option)
        if cluster_value is not None:
            PARAMS["cli_" + cluster_option] = cluster_value
            PARAMS[cluster_option] = cluster_value

    PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level

    for variables in options.variables_to_set:
        # split at the first "=" only, the value may contain further "="
        variable, value = variables.split("=", 1)
        PARAMS[variable.strip()] = IOTools.str2val(value.strip())

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    # see inputValidation function in Parameters.py
    if options.input_validation:
        inputValidation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            E.info("\t".join(map(str, requirement)))
        E.info("version check summary: %s" % str(counter))
        E.Stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        startSession()

        # call a single method of the calling module directly
        method_name = options.pipeline_targets[0]
        caller = getCaller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "svg", "plot",
                                     "touch", "regenerate"):

        # set up extra file logger
        handler = logging.FileHandler(filename=options.logfile,
                                      mode="a")
        handler.setFormatter(
            MultiLineFormatter(
                '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s'))
        logger = logging.getLogger()
        logger.addHandler(handler)
        messenger = None

        try:
            if options.pipeline_action == "make":

                # get tasks to be done. This essentially replicates
                # the state information within ruffus.
                stream = io.StringIO()
                pipeline_printout(
                    stream,
                    options.pipeline_targets,
                    verbose=5,
                    checksum_level=options.ruffus_checksums_level)

                messenger = LoggingFilterRabbitMQ(
                    stream.getvalue(),
                    project_name=getProjectName(),
                    pipeline_name=getPipelineName(),
                    host=options.rabbitmq_host,
                    exchange=options.rabbitmq_exchange)

                logger.addFilter(messenger)

                if not options.without_cluster and HAS_DRMAA:
                    # use threading instead of multiprocessing in order to
                    # limit the number of concurrent jobs by using the
                    # GIL
                    #
                    # Note that threading might cause problems with rpy.
                    #
                    # ``task`` is the module-level name; it is only
                    # modified, never rebound, within this function.
                    task.Pool = ThreadPool

                    # create the session proxy
                    startSession()

                E.info(E.GetHeader())
                E.info("code location: %s" % PARAMS["pipeline_scriptsdir"])
                E.info("Working directory is: %s" % PARAMS["workingdir"])

                pipeline_run(
                    options.pipeline_targets,
                    multiprocess=options.multiprocess,
                    logger=logger,
                    verbose=options.loglevel,
                    log_exceptions=options.log_exceptions,
                    exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                    checksum_level=options.ruffus_checksums_level,
                )

                E.info(E.GetFooter())

                closeSession()

            elif options.pipeline_action == "show":
                pipeline_printout(
                    options.stdout,
                    options.pipeline_targets,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "touch":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=True,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "regenerate":
                # NOTE(review): touch_files_only receives the checksum
                # level here - confirm against the ruffus documentation
                # that this is intended.
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=options.ruffus_checksums_level,
                    verbose=options.loglevel)

            elif options.pipeline_action == "svg":
                pipeline_printout_graph(
                    options.stdout.buffer,
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "plot":
                outf, filename = tempfile.mkstemp()
                try:
                    # close (and flush) the graph before the viewer
                    # reads it
                    with os.fdopen(outf, "wb") as graph_stream:
                        pipeline_printout_graph(
                            graph_stream,
                            options.pipeline_format,
                            options.pipeline_targets,
                            checksum_level=options.ruffus_checksums_level)
                    execute("inkscape %s" % filename)
                finally:
                    os.unlink(filename)

        except ruffus_exceptions.RethrownJobError as value:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(value.args))
                for idx, e in enumerate(value.args):
                    # use distinct local names in order not to overwrite
                    # the module-level names ``task`` and ``traceback``
                    task_name, job, error, msg, _job_traceback = e

                    if task_name is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency
                        # msg then contains a RethrownJobJerror
                        msg = str(msg)
                    else:
                        task_name = re.sub(r"__main__\.", "", task_name)
                        job = re.sub(r"\s", "", job)

                    if messenger:
                        messenger.send_error(task_name, job, error, msg)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task_name, error, job, msg))

                E.error("full traceback is in %s" % options.logfile)

                # write full traceback to log file only by removing the stdout
                # handler
                lhStdout = logger.handlers[0]
                logger.removeHandler(lhStdout)
                logger.error("start of error messages")
                logger.error(value)
                logger.error("end of error messages")
                logger.addHandler(lhStdout)

                # raise error
                raise ValueError(
                    "pipeline failed with %i errors" % len(value.args))
            else:
                raise

    elif options.pipeline_action == "dump":
        print(json.dumps(PARAMS))

    elif options.pipeline_action == "printconfig":
        print("Printing out pipeline parameters: ")
        for k in sorted(PARAMS):
            print(k, "=", PARAMS[k])
        printConfigFiles()

    elif options.pipeline_action == "config":
        # locate the pipeline script through the calling frame
        f = sys._getframe(1)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        writeConfigFiles(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clonePipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.Stop()
def main(argv=None):
    """Split a fastq stream into a csfasta and a qual file.

    Fastq records are read from ``options.stdin``. For every record a
    ``>identifier`` header followed by the sequence is written to the
    csfasta file, and the same header followed by the quality values is
    written to the qual file. The two output filenames are built by
    substituting ``csfasta`` and ``qual`` into ``--pattern-identifier``.

    If ``--target-format`` is given, records are converted on the fly
    via ``Fastq.iterate_convert``.

    Parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--target-format", dest="change_format",
                      type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="set quality scores to format "
                      "[default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option("--pattern-identifier", dest="pattern", type="string",
                      help="filename prefix [default=%default].")

    parser.set_defaults(change_format=None,
                        guess_format=None,
                        pattern="%s.gz")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    # the with-block guarantees both outputs are flushed and closed even
    # if reading or converting a record raises
    with IOTools.openFile(options.pattern % "csfasta", "w") as outfile_seq, \
            IOTools.openFile(options.pattern % "qual", "w") as outfile_qual:

        if options.change_format:
            records = Fastq.iterate_convert(options.stdin,
                                            format=options.change_format,
                                            guess=options.guess_format)
        else:
            records = Fastq.iterate(options.stdin)

        for record in records:
            c.input += 1
            outfile_seq.write(">%s\n%s\n" % (record.identifier, record.seq))
            outfile_qual.write(">%s\n%s\n" % (record.identifier, record.quals))
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):
    """Extract read pairs from a bam file into two sorted fastq files.

    Expects exactly three positional arguments: the input bam file and
    the two output fastq files (first and second in pair). ``.gz`` is
    appended to an output name that lacks it, as output is always
    compressed.

    Each read name is written at most once per mate. A read name seen
    for only one mate gets a placeholder record for the other mate
    (``N`` bases, ``B`` qualities, using the first observed read length
    of that mate). Records are staged as tab-separated lines in a
    temporary directory, then sorted by read name and converted to fastq
    by a shell pipeline.

    Parses command line options in sys.argv, unless *argv* is given.
    """
    # local import: only needed to clean up the staging directory
    import shutil

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    assert len(args) == 3, "expected three command line arguments"

    fastqfile1, fastqfile2 = args[1], args[2]

    # only output compressed data
    if not fastqfile1.endswith(".gz"):
        fastqfile1 += ".gz"
    if not fastqfile2.endswith(".gz"):
        fastqfile2 += ".gz"

    samfile = pysam.Samfile(args[0], "rb")
    tmpdir = tempfile.mkdtemp()

    try:
        outtemp1 = os.path.join(tmpdir, "pair1.gz")
        outtemp2 = os.path.join(tmpdir, "pair2.gz")

        E.info('writing fastq files to temporary directory %s' % tmpdir)

        found1, found2 = set(), set()
        read1_qlen, read2_qlen = 0, 0

        c = E.Counter()
        with IOTools.openFile(outtemp1, "w") as outstream1, \
                IOTools.openFile(outtemp2, "w") as outstream2:

            for read in samfile.fetch():
                c.input += 1
                if read.is_read1:
                    if read.qname not in found1:
                        outstream1.write(
                            "\t".join((read.qname, read.seq, read.qual)) +
                            "\n")
                        found1.add(read.qname)
                        if not read1_qlen:
                            read1_qlen = read.qlen
                        c.output1 += 1
                elif read.is_read2:
                    if read.qname not in found2:
                        outstream2.write(
                            "\t".join((read.qname, read.seq, read.qual)) +
                            "\n")
                        found2.add(read.qname)
                        if not read2_qlen:
                            read2_qlen = read.qlen
                        c.output2 += 1

            # pad mates that are missing so both files hold the same names
            for qname in found2.difference(found1):
                outstream1.write(
                    "\t".join((qname, "N" * read1_qlen, "B" * read1_qlen)) +
                    "\n")
                c.extra1 += 1

            for qname in found1.difference(found2):
                outstream2.write(
                    "\t".join((qname, "N" * read2_qlen, "B" * read2_qlen)) +
                    "\n")
                c.extra2 += 1

            E.info("%s" % str(c))

        E.info("sorting fastq files")
        # sort by read name, then expand each tab-separated line into a
        # four-line fastq record
        statement = (
            '''zcat %s | sort -k1,1 | '''
            '''awk '{printf("@%%s\\n%%s\\n+\\n%%s\\n", $1,$2,$3)}' | '''
            '''gzip > %s''')

        E.run(statement % (outtemp1, fastqfile1))
        E.run(statement % (outtemp2, fastqfile2))
    finally:
        # release the bam handle and remove the staging directory, which
        # was previously left behind on every run
        samfile.close()
        shutil.rmtree(tmpdir, ignore_errors=True)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Tabulate orthology degeneracies of predictions against other schemas.

    Reads a list of prediction identifiers (from stdin when
    ``filename_ids`` is ``-``, otherwise from that file), looks up their
    ortholog links to other schemas in ``tablename_orthologs`` and
    writes one row per identifier with one ``gd1:gd2:td1:td2`` column
    per schema. Identifier/schema combinations without a link are
    reported as ``0:0:0:0``.

    Parses command line options in sys.argv, unless *argv* is given.

    NOTE(review): ``parser`` is not created in this function; it is
    presumably a module-level option parser - confirm.
    """

    if argv is None:
        argv = sys.argv

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="schema of master species.")

    parser.set_defaults(
        tablename_orthologs="orthology_pairwise1v5.orthologlinks_first",
        filename_ids="-",
        schemas=None,
        species=None,
    )

    # pass argv through so that an explicitly supplied argv is honoured
    (options, args) = E.Start(parser, argv=argv, add_psql_options=True)

    dbhandle = pgdb.connect(options.psql_connection)

    if options.filename_ids == "-":
        ids, errors = IOTools.ReadList(sys.stdin)
    else:
        with IOTools.openFile(options.filename_ids) as inf:
            ids, errors = IOTools.ReadList(inf)

    result = []
    if ids:
        # Values are bound as query parameters rather than interpolated
        # into the statement, as the identifiers are external input.
        conditions = ["schema1 = %s",
                      "prediction_id1 IN (%s)" % ",".join(["%s"] * len(ids))]
        params = [options.species] + list(ids)

        if options.schemas:
            conditions.append(
                "schema2 IN (%s)" % ",".join(["%s"] * len(options.schemas)))
            params.extend(options.schemas)

        # the table name cannot be bound as a parameter; it is taken from
        # the option defaults
        statement = """SELECT prediction_id1, schema2, prediction_id2,
        gene_id2, gd1, gd2, td1, td2
        FROM %s
        WHERE schema1 != schema2 AND %s
        ORDER BY prediction_id1""" % (options.tablename_orthologs,
                                      " AND ".join(conditions))

        cc = dbhandle.cursor()
        cc.execute(statement, params)
        result = cc.fetchall()
        cc.close()

    if options.schemas:
        schemas = options.schemas
    else:
        # sorted for a reproducible column order
        schemas = sorted(set(row[1] for row in result))

    # compute counts
    degeneracies = {}
    for x in ids:
        degeneracies[x] = {}
        for s in schemas:
            degeneracies[x][s] = (0, 0, 0, 0)

    for (prediction_id1, schema2, prediction_id2, gene_id2,
         gd1, gd2, td1, td2) in result:
        degeneracies[prediction_id1][schema2] = (gd1, gd2, td1, td2)

    # output
    options.stdout.write("%s\t%s\n" % ("prediction_id", "\t".join(schemas)))

    for x in ids:
        options.stdout.write("%s" % x)
        for s in schemas:
            options.stdout.write("\t%s:%s:%s:%s" % degeneracies[x][s])
        options.stdout.write("\n")

    E.Stop()
def intersectionHeatmap(infiles, outfile):
    '''Plot a heatmap of pairwise gene-set overlap between variant files.

    For each file in *infiles* the set of gene names is collected from
    the ``SNPEFF_GENE_NAME`` key of the 8th tab-separated column; lines
    starting with ``#`` are skipped. For each ordered pair of files the
    number of shared genes and the fraction
    ``shared / (shared + symmetric difference)`` are computed. The
    diagonal is filled with the gene count of the file and a fraction
    of 1.0. The table is handed to an R/ggplot2 function which saves
    the heatmap to *outfile*.
    '''

    pandas2ri.activate()

    name2genes = {}
    df = pd.DataFrame(columns=["id_1", "id_2", "intersection", "perc"])

    ix = 0
    for inf in infiles:

        name = P.snip(os.path.basename(inf)).split(".")[0]
        name = name.replace(".", "_")

        with IOTools.openFile(inf, "r") as f:
            genes = set()

            for line in f:
                if line[0] == "#":
                    continue
                values = line.strip().split("\t")
                info = values[7].split(";")

                # reset per line: otherwise a record without a gene name
                # would silently re-use the gene of the previous record
                gene_name = None
                for x in info:
                    if x.split("=")[0] == "SNPEFF_GENE_NAME":
                        gene_name = x.split("=")[1]
                        break

                # if no gene name found, line is skipped
                if gene_name:
                    genes.add(gene_name)

        name2genes[name] = genes
        df.loc[ix] = [name, name, len(genes), 1.0]
        ix += 1

    for pair in itertools.permutations(list(name2genes.keys()), 2):
        id_1, id_2 = pair
        intersection = len(name2genes[id_1].intersection(name2genes[id_2]))
        not_intersecting = len(
            name2genes[id_1].symmetric_difference(name2genes[id_2]))
        total = intersection + not_intersecting
        # two empty gene sets have no overlap; avoid dividing by zero
        if total:
            intersection_perc = float(intersection) / total
        else:
            intersection_perc = 0.0

        df.loc[ix] = [id_1, id_2, intersection, intersection_perc]
        ix += 1

    # `variant` and `outfile` are interpolated into the R code below
    variant = os.path.basename(outfile).replace(
        "overlap_", "").replace("_heatmap.png", "")

    plotIntersectionHeatmap = R('''
    function(df){
    library(ggplot2)
    m_txt = element_text(size=15)
    m_txt_90 = element_text(size=15, angle=90, vjust=0.5, hjust=1)
    l_txt = element_text(size=20)

    p = ggplot(df, aes(id_1, id_2, fill=100*perc)) +
    geom_tile() +
    geom_text(aes(label=intersection), size=3) +
    scale_fill_gradient(name="Intersection (%%)", limits=c(0,100),
                       low="yellow", high="dodgerblue4") +
    theme(axis.text.x = m_txt_90, axis.text.y = m_txt,
          legend.text = m_txt, legend.title = m_txt,
          aspect.ratio=1) +
    xlab("") + ylab("") +
    ggtitle("%(variant)s")

    ggsave("%(outfile)s", width=10, height=10)
    }''' % locals())

    plotIntersectionHeatmap(df)
def filterQuality(infile, qualstr, qualfilter, outfiles):
    '''Split a variant table into rows passing and failing quality filters.

    Columns to filter on and how they should be filtered can be
    specified in the pipeline.ini. Currently only implemented to filter
    numeric columns. "." is assumed to mean pass.

    infile
        tab-separated table; the first two rows are treated as headers
        and always pass.
    qualstr
        comma-separated filters, each of the form
        ``column'operator'score`` where operator is one of
        ``>``, ``>=``, ``<`` or ``<=``.
    qualfilter
        ``"all"`` to keep rows passing every filter, ``"any"`` to keep
        rows passing at least one filter.
    outfiles
        two filenames: passing rows are written unchanged to the first;
        failing rows are written to the second with an extra column
        listing the failed ``column=value`` pairs.

    Raises ValueError for an unknown *qualfilter* or comparison operator.
    '''
    # local import: comparison functions for the operator lookup
    import operator

    comparisons = {">": operator.gt,
                   ">=": operator.ge,
                   "<": operator.lt,
                   "<=": operator.le}

    if qualfilter not in ("all", "any"):
        raise ValueError("unknown quality filter mode %s" % qualfilter)

    with IOTools.openFile(infile) as inf:
        # drop the line ending so that the last column can be matched
        columns = inf.readline().rstrip("\n").split("\t")

    # one set of passing row numbers per filter; a list rather than a
    # dict keyed on column so two filters on one column are both applied
    passing = []
    # row number -> list of "column=value" for each failed filter
    fdict = dict()

    for param in qualstr.split(","):
        param = param.split("'")
        # column to filter on
        col = param[0]
        # string of >, <, >= or <= depending how the column should
        # be filtered
        lessmore = param[1]
        # score to filter by
        score = float(param[2])

        assert col in columns, "column %s not in variant table" % col
        if lessmore not in comparisons:
            raise ValueError(
                "unknown comparison %s for column %s" % (lessmore, col))
        compare = comparisons[lessmore]
        ind = columns.index(col)

        # rows one and two are headers
        iset = set([0, 1])
        with IOTools.openFile(infile) as inf:
            for i, line in enumerate(inf):
                if i <= 1:
                    continue
                value = line.strip().split("\t")[ind]
                if value == "." or compare(float(value), score):
                    iset.add(i)
                else:
                    fdict.setdefault(i, []).append("%s=%s" % (col, value))
        passing.append(iset)

    if qualfilter == "all":
        allqual = set.intersection(*passing)
    else:
        allqual = set.union(*passing)

    with IOTools.openFile(outfiles[0], "w") as out, \
            IOTools.openFile(outfiles[1], "w") as out2, \
            IOTools.openFile(infile) as inf:
        for i, line in enumerate(inf):
            if i in allqual:
                out.write(line)
            else:
                out2.write("%s\t%s\n" % (line.strip(), ",".join(fdict[i])))
def extractEBioinfo(eBio_ids, vcfs, outfile):
    '''Tabulate per-tissue mutation frequencies from cBioPortal.

    Finds the number of mutations identified in previous studies
    (*eBio_ids*) for the mutated genes in the *vcfs*.

    Gene names are collected from the ``SNPEFF_GENE_NAME`` info entry of
    every vcf record whose filter is not ``REJECT``. *eBio_ids* is a
    tab-separated file with tissue, study and table per line. For each
    line the cBioPortal web service is queried in chunks of 250 genes
    and the "total" and "mutations" counts are accumulated per tissue
    and gene. *outfile* receives one row per gene with one
    ``<tissue>_frequency`` column (mutations / total, 4 decimals) per
    tissue.
    '''

    genes = set()

    for vcf in vcfs:
        infile = VCF.VCFFile(IOTools.openFile(vcf))
        for vcf_entry in infile:
            # assumes all vcf entries without "REJECT" are "PASS"
            if vcf_entry.filter != "REJECT":
                info_entries = vcf_entry.info.split(";")
                for entry in info_entries:
                    if "SNPEFF_GENE_NAME" in entry:
                        genes.add(entry.split("=")[1])

    tissue_counts = collections.defaultdict(
        lambda: collections.defaultdict(
            lambda: collections.defaultdict(int)))

    # materialise once instead of rebuilding the list for every chunk
    gene_names = list(genes)
    chunk_size = 250

    E.info("number of genes: %i" % len(gene_names))

    with IOTools.openFile(eBio_ids, "r") as id_file:
        for line in id_file:
            tissue, study, table = line.strip().split("\t")

            n = 0

            for i in range(0, len(gene_names), chunk_size):

                genes_chunk = gene_names[i:i + chunk_size]

                # TS sporadic error when querying with a single gene at
                # a time
                # "urllib2.URLError: <urlopen error [Errno 110]
                # Connection timed out>"
                # max URL length appears to be 8200 characters,
                # try doing 250 genes at a time?
                gene_list = "+".join(genes_chunk)
                n += len(genes_chunk)

                E.info("number of genes processed: %i" % n)

                url = ("http://www.cbioportal.org/webservice.do?"
                       "cmd=getProfileData&"
                       "case_set_id=%(study)s_all&"
                       "genetic_profile_id=%(table)s&"
                       "gene_list=%(gene_list)s" % {
                           "study": study,
                           "table": table,
                           "gene_list": gene_list})

                df = pd.io.parsers.read_csv(
                    url, comment="#", sep="\t", index_col=0)

                for gene in genes_chunk:

                    tmp_df = df[df['COMMON'] == gene]

                    # check dataframe contains data!
                    if tmp_df.shape[0] != 0:
                        # seem to be having issues with gene set
                        # containing duplicates!
                        # --> dataframe with repeated instances of gene
                        # after selection so splice to first row and
                        # recreate dataframe from series
                        if tmp_df.shape[0] > 1:
                            tmp_df = pd.DataFrame(tmp_df.iloc[0]).T

                        tissue_counts[tissue][gene]["total"] += \
                            tmp_df.shape[1] - 2
                        tissue_counts[tissue][gene]["mutations"] += \
                            int(tmp_df.count(1)) - 1

    tissues = list(tissue_counts.keys())

    with IOTools.openFile(outfile, "w") as out:
        out.write("gene\t%s\n" % "\t".join([
            "%s_frequency" % x.replace(" ", "_") for x in tissues]))

        for gene in genes:
            freq_values = []
            for tissue in tissues:
                total = tissue_counts[tissue][gene]["total"]
                mutations = tissue_counts[tissue][gene]["mutations"]
                freq_values.append(
                    round(np.divide(float(mutations), total), 4))

            out.write("%s\t%s\n" % (gene, "\t".join(map(str, freq_values))))
def main(argv=None):
    """Filter or modify bam files.

    Input is read from stdin unless a file was supplied as the input
    stream, or ``--inplace`` is given together with bam files as
    positional arguments. With ``--filter`` the reads are passed through
    ``_bam2bam.filter_bam``; otherwise a chain of per-read modifiers is
    built from the remaining options (``--unset-unmapped-mapq``,
    ``--set-sequence``, ``--strip``, ``--unstrip``, ``--set-nh``,
    ``--keep-first-base``) and applied to every read.

    Output for a named input file is written to a temporary bam file;
    with ``--inplace`` that file replaces the original, which is then
    re-indexed. Output for stdin goes to stdout.

    Parses command line options in sys.argv, unless *argv* is given.

    Raises ValueError for inconsistent options (``--inplace`` without
    files or with stdin, mismatch filtering without ``--reference-bam``,
    ``--unstrip`` without fastq files, duplicate fastq read names).
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--set-nh", dest="set_nh", action="store_true",
                      help="sets the NH flag. The file needs to be "
                      "sorted by readname [%default]")

    parser.add_option("--unset-unmapped-mapq", dest="unset_unmapped_mapq",
                      action="store_true",
                      help="sets the mapping quality of unmapped "
                      "reads to 0 [%default]")

    parser.add_option("--set-sequence", dest="set_sequence",
                      action="store_true",
                      help="sets the sequence to 'A's (a valid base) and "
                      "the quality to 'F's "
                      ",which is defined in all fastq scoring schemes "
                      "[%default]")

    parser.add_option("--strip", dest="strip", type="choice",
                      choices=("sequence", "quality", "match"),
                      help="remove parts of the bam-file. Note that "
                      "stripping the sequence will "
                      "also strip the quality values [%default]")

    parser.add_option("--unstrip", dest="unstrip", action="store_true",
                      help="add sequence and quality into bam file [%default]")

    parser.add_option("--filter", dest="filter", action="append",
                      type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help="filter bam file. The option denotes "
                      "the property that is "
                      "used to determine better match [%default]")

    parser.add_option("--reference-bam", dest="reference_bam", type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force", dest="force", action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace", dest="inplace", action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option("--fastq1", "-1", dest="fastq_pair1", type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired [%default]")

    parser.add_option("--fastq2", "-2", dest="fastq_pair2", type="string",
                      help="fastq file with read information for second "
                      "in pair [%default]")

    parser.add_option("--keep-first-base", dest="keep_first_base",
                      action="store_true",
                      help="keep first base of reads such that gtf2table.py "
                      "will only consider the "
                      "first base in its counts.")

    parser.set_defaults(
        filter=[],
        set_nh=False,
        unset_unmapped_mapq=False,
        output_sam=False,
        reference_bam=None,
        strip=None,
        unstrip=None,
        force=False,
        set_sequence=False,
        inplace=False,
        fastq_pair1=None,
        fastq_pair2=None,
        keep_first_base=False
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    bamfiles = []

    if options.stdin != sys.stdin:
        bamfiles.append(options.stdin.name)

    if options.inplace:
        bamfiles.extend(args)
        if len(bamfiles) == 0:
            raise ValueError(
                "please one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if len(bamfiles) == 0:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)
        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.isEmpty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.Samfile(bamfile, "rb")

        if bamfile == "-":
            if options.output_sam:
                pysam_out = pysam.Samfile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.Samfile("-", "wb", template=pysam_in)
        else:
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.Samfile(tmpfile.name, "wb", template=pysam_in)

        if options.filter:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter:
                remove_mismatches = True

            elif "CM" in options.filter:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.Samfile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter,
                remove_unique="non-unique" in options.filter,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter,
                remove_mismatches=remove_mismatches,
                colour_mismatches=colour_mismatches)

            if pysam_ref is not None:
                pysam_ref.close()

            options.stdlog.write("category\tcounts\n%s\n" % c.asTable())
        else:
            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start
            pre_check_f = lambda x: None

            if options.unset_unmapped_mapq:
                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read
                it = unset_unmapped_mapq(it)

            if options.set_sequence:
                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read
                it = set_sequence(it)

            if options.strip is not None:

                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    # an input without reads has nothing to check
                    if reads and reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    # an input without reads has nothing to check
                    if reads and reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    for read in i:
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip == "sequence":
                    it = strip_sequence(it)
                    pre_check_f = check_sequence
                elif options.strip == "quality":
                    it = strip_quality(it)
                    pre_check_f = check_quality
                elif options.strip == "match":
                    it = strip_match(it)

            if options.unstrip:
                def buildReadDictionary(filename):
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.Fastqfile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if options.set_nh:
                it = _bam2bam.SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            if options.keep_first_base:
                def keep_first_base(i):
                    for read in i:
                        if read.is_reverse:
                            read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        elif not read.is_unmapped:
                            read.cigarstring = '1M'
                        yield read
                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('proccessing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        # nothing will be done with the temporary output,
                        # so do not leave it behind
                        os.unlink(tmpfile.name)
                        continue

            # continue processing till end
            for read in it:
                pysam_out.write(read)

        pysam_in.close()
        pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()