def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")

    outf.write("track\n")

    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)

def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")

    outf.write("motif\n")

    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile, "--allow-empty-file")

    os.unlink(outf.name)

def loadMemeSummary(infiles, outfile):
    '''load a summary of MEME results (method and track) into the database.'''

    outf = P.get_temp_file(".")
    outf.write("method\ttrack\n")

    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        method = re.match("(.+).dir/", infile).groups()[0]
        track = os.path.basename(".".join(infile.split(".")[:-1]))
        outf.write("%s\t%s\n" % (method, track))

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)

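# Illustration (not part of the pipeline): how the loader above derives method
# and track from a result path of the form <method>.dir/<track>.<suffix>.
# The path below is made up.
import os
import re

infile = "meme.dir/liver-R1.meme"

method = re.match("(.+).dir/", infile).groups()[0]
track = os.path.basename(".".join(infile.split(".")[:-1]))
print(method, track)
# meme liver-R1
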
def loadMemeChipSummary(infiles, outfile):
    '''load a summary of MEME-ChIP runs into the database.'''

    outf = P.get_temp_file(".")
    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")

    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        fn = P.snip(os.path.basename(infile), ".memechip")
        track, npeaks, width, masking = fn.split(".")
        outf.write("\t".join(map(str, (track, npeaks, width, masking, fn))) + "\n")

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)

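# Illustration (not part of the pipeline): the <track>.<npeaks>.<width>.<masking>.memechip
# filename convention assumed by loadMemeChipSummary, checked on a made-up path.
import os

infile = "memechip.dir/liver-R1.500.200.dustmasker.memechip"

fn = os.path.basename(infile)[:-len(".memechip")]   # what P.snip(..., ".memechip") returns
track, npeaks, width, masking = fn.split(".")
print(track, npeaks, width, masking)
# liver-R1 500 200 dustmasker
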
def runMAST(infiles, outfile):
    '''run MAST on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.
    '''

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.is_empty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    for motiffile in motiffiles:
        if IOTools.is_empty(motiffile):
            E.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s
               -ev %(mast_evalue)f %(mast_options)s
               >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

        of = IOTools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s
               -ev %(mast_evalue)f %(mast_options)s
               >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run(statement)

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)

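# Illustration (not part of the pipeline): runMAST writes a gzipped concatenation
# of MAST text reports, each preceded by a ':: motif = <name> - foreground ::' or
# '... - background ::' separator line. A downstream step could split the file
# back into per-motif sections roughly as below; iterate_mast_sections is a
# hypothetical helper, not an existing pipeline function.
import gzip
import re


def iterate_mast_sections(filename):
    '''yield (motif, label, lines) tuples from the concatenated MAST output.'''
    header = re.compile(r":: motif = (\S+) - (foreground|background) ::")
    motif, label, lines = None, None, []
    with gzip.open(filename, "rt") as inf:
        for line in inf:
            match = header.match(line)
            if match:
                if motif is not None:
                    yield motif, label, lines
                motif, label = match.groups()
                lines = []
            else:
                lines.append(line)
    if motif is not None:
        yield motif, label, lines
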
def buildNUMTs(infile, outfile):
    '''output set of potential nuclear mitochondrial genes (NUMTs).

    This function works by aligning the mitochondrial chromosome
    against the genome using exonerate_. This can take a while.

    Arguments
    ---------
    infile : string
       Ignored.
    outfile : filename
       Output in :term:`gtf` format with potential NUMTs.
    '''
    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        IOTools.touch_file(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" %
               PARAMS["numts_mitochrom"])
        IOTools.touch_file(outfile)
        return

    tmpfile_mito = P.get_temp_filename(".")

    statement = '''
    cgat index_fasta
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''
    P.run(statement)

    if IOTools.is_empty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        IOTools.touch_file(outfile)
        return

    format = ("qi", "qS", "qab", "qae",
              "ti", "tS", "tab", "tae",
              "s",
              "pi",
              "C")
    format = "\\\\t".join(["%%%s" % x for x in format])

    # collect all results
    min_score = 100

    statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=1
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(tmpfile_mito)s
              --model affine:local
              --score %(min_score)i
              --showalignment no --showsugar no --showcigar no
              --showvulgar no
              --ryo \\"%(format)s\\n\\"
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''
    P.run(statement)

    # convert to gtf
    inf = IOTools.open_file("%s.links.gz" % outfile)
    outf = IOTools.open_file(outfile, "w")

    min_score = PARAMS["numts_score"]

    c = E.Counter()

    for line in inf:
        (query_contig, query_strand, query_start, query_end,
         target_contig, target_strand, target_start, target_end,
         score, pid, alignment) = line[:-1].split("\t")

        c.input += 1
        score = int(score)
        if score < min_score:
            c.skipped += 1
            continue

        if target_strand == "-":
            target_start, target_end = target_end, target_start

        gff = GTF.Entry()
        gff.contig = target_contig
        gff.start, gff.end = int(target_start), int(target_end)
        assert gff.start < gff.end

        gff.strand = target_strand
        gff.score = int(score)
        gff.feature = "numts"
        gff.gene_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        gff.transcript_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        outf.write("%s\n" % str(gff))
        c.output += 1

    inf.close()
    outf.close()

    E.info("filtering numts: %s" % str(c))

    os.unlink(tmpfile_mito)

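# Illustration (not part of the pipeline): what the --ryo template built above
# amounts to. "%%%s" % "qi" yields "%qi" (the doubled %% is a literal percent
# sign), and the doubled backslashes are reduced by the shell quoting inside the
# statement, so exonerate ends up printing one tab-separated line per hit:
# query id/strand/start/end, target id/strand/start/end, raw score, percent
# identity and the alignment column -- the eleven fields unpacked from the
# .links.gz file further down.
fields = ("qi", "qS", "qab", "qae", "ti", "tS", "tab", "tae", "s", "pi", "C")
ryo = "\\\\t".join(["%%%s" % x for x in fields])
print(ryo)
# %qi\\t%qS\\t%qab\\t%qae\\t%ti\\t%tS\\t%tab\\t%tae\\t%s\\t%pi\\t%C
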
def buildPseudogenes(infiles, outfile, dbhandle):
    '''build a set of pseudogenes.

    Transcripts are extracted from the GTF file and designated as
    pseudogenes if:

    * the gene_type or transcript_type contains the phrase
      "pseudo". This information is taken from the database.

    * the feature is 'processed_transcript' and has similarity to
      protein coding genes. Similarity is assessed by aligning the
      transcript and peptide set against each other with exonerate_.

    Pseudogenic transcripts can overlap with protein coding transcripts.

    Arguments
    ---------
    infiles : list
       Filenames of ENSEMBL geneset in :term:`gtf` format and associated
       peptide sequences in :term:`fasta` format.
    outfile : filename
       Output in :term:`gtf` format with inferred or annotated pseudogenes.
    dbhandle : object
       Database handle for extracting transcript biotypes.
    '''

    infile_gtf, infile_peptides_fasta = infiles

    # JJ - there are also 'nontranslated_CDS', but no explanation of these
    if PARAMS["genome"].startswith("dm"):
        E.warn("Ensembl dm genome annotations only contain source"
               " 'pseudogenes' - skipping exonerate step")
        statement = """zcat %(infile_gtf)s
        | awk '$2 ~ /pseudogene/'
        | gzip
        > %(outfile)s"""
        P.run(statement)
        return

    tmpfile1 = P.get_temp_filename(shared=True)

    # collect processed transcripts and save as fasta sequences
    statement = '''
    zcat %(infile_gtf)s
    | awk '$2 ~ /processed/'
    | cgat gff2fasta
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''
    P.run(statement)

    if IOTools.is_empty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        IOTools.touch_file(outfile)
        return

    model = "protein2dna"

    # map processed transcripts against peptide sequences
    statement = '''
    cat %(tmpfile1)s
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=100
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(infile_peptides_fasta)s
              --model %(model)s
              --bestn 1
              --score 200
              --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\"
              --showalignment no --showsugar no --showcigar no --showvulgar no
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''
    P.run(statement)

    os.unlink(tmpfile1)

    inf = IOTools.open_file("%s.links.gz" % outfile)
    best_matches = {}
    for line in inf:
        peptide_id, transcript_id, score = line[:-1].split("\t")
        score = int(score)
        if transcript_id in best_matches and \
           best_matches[transcript_id][0] > score:
            continue
        best_matches[transcript_id] = (score, peptide_id)

    inf.close()

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    cc = dbhandle.cursor()
    known_pseudos = set([x[0] for x in cc.execute(
        """SELECT DISTINCT transcript_id
        FROM transcript_info
        WHERE transcript_biotype like '%pseudo%' OR
              gene_biotype like '%pseudo%' """)])

    E.info("pseudogenes from: processed_transcripts=%i, known_pseudos=%i, "
           "intersection=%i" % (
               (len(new_pseudos),
                len(known_pseudos),
                len(new_pseudos.intersection(known_pseudos)))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    outf = IOTools.open_file(outfile, "w")
    inf = GTF.iterator(IOTools.open_file(infile_gtf))
    for gtf in inf:
        c.input += 1
        if gtf.transcript_id not in all_pseudos:
            continue
        c.output += 1
        outf.write("%s\n" % gtf)

    outf.close()

    E.info("exons: %s" % str(c))

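# Illustration (not part of the pipeline): the best-hit bookkeeping applied to
# the exonerate links, as a standalone helper. Identifiers in the usage comment
# are made up.
def select_best_matches(links):
    '''links: iterable of (peptide_id, transcript_id, score) tuples.
    Return {transcript_id: (score, peptide_id)} keeping the highest-scoring
    peptide per transcript, mirroring the parsing loop above.'''
    best_matches = {}
    for peptide_id, transcript_id, score in links:
        if transcript_id in best_matches and \
           best_matches[transcript_id][0] > score:
            continue
        best_matches[transcript_id] = (score, peptide_id)
    return best_matches

# select_best_matches([("PEP1", "TX1", 250), ("PEP2", "TX1", 400),
#                      ("PEP3", "TX2", 210)])
# -> {'TX1': (400, 'PEP2'), 'TX2': (210, 'PEP3')}
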
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--methods", dest="methods", type="choice",
                      action="append",
                      choices=("filter",
                               "keep-first-base",
                               "set-nh",
                               "set-sequence",
                               "strip-sequence",
                               "strip-quality",
                               "unstrip",
                               "unset-unmapped-mapq",
                               "downsample-single",
                               "downsample-paired",
                               "add-sequence-error"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method", dest="strip_method", type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method", dest="filter_methods",
                      action="append", type="choice",
                      choices=('NM', 'CM',
                               "mapped", "unique", "non-unique",
                               "remove-list", "keep-list",
                               "error-rate", "min-read-length",
                               "min-average-base-quality"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file", dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output", dest="force", action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it is not necessary "
                      "[%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--first-fastq-file", "-1", dest="fastq_pair1",
                      type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--second-fastq-file", "-2", dest="fastq_pair2",
                      type="string",
                      help="fastq file with read information for second "
                      "in pair. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--downsample", dest="downsample", type="int",
                      help="Number of reads to downsample to")

    parser.add_option("--filename-read-list", dest="filename_read_list",
                      type="string",
                      help="Filename with list of reads to filter if "
                      "'keep-list' or 'remove-list' filter method is "
                      "chosen [%default]")

    parser.add_option("--error-rate", dest="error_rate", type="float",
                      help="error rate to use as filter. Reads with an error "
                      "rate higher than the threshold will be removed "
                      "[%default]")

    parser.add_option("--minimum-read-length", dest="minimum_read_length",
                      type="int",
                      help="minimum read length when filtering [%default]")

    parser.add_option("--minimum-average-base-quality",
                      dest="minimum_average_base_quality", type="float",
                      help="minimum average base quality when filtering "
                      "[%default]")

    parser.set_defaults(
        methods=[],
        output_sam=False,
        reference_bam=None,
        filter_methods=[],
        strip_method="all",
        force=False,
        fastq_pair1=None,
        fastq_pair2=None,
        downsample=None,
        random_seed=None,
        filename_read_list=None,
        error_rate=None,
        minimum_read_length=0,
        minimum_average_base_quality=0,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.stdin != sys.stdin:
        bamfile = options.stdin.name
    else:
        bamfile = "-"

    if "remove-list" in options.filter_methods or \
       "keep-list" in options.filter_methods:

        if "remove-list" in options.filter_methods and \
           "keep-list" in options.filter_methods:
            raise ValueError(
                "it is not possible to specify remove-list and keep-list")

        with IOTools.open_file(options.filename_read_list) as inf:
            filter_query_names = set(
                [x.strip() for x in inf.readlines() if not x.startswith("#")])
        E.info("read query_sequence filter list with {} read names".format(
            len(filter_query_names)))

    if "error-rate" in options.filter_methods and not options.error_rate:
        raise ValueError(
            "filtering by error-rate requires --error-rate to be set")

    if "add-sequence-error" in options.methods and not options.error_rate:
        raise ValueError("--add-error-rate requires --error-rate to be set")

    E.info('processing %s' % bamfile)
    if IOTools.is_empty(bamfile):
        E.warn('ignoring empty file %s' % bamfile)
        E.stop()
        return

    # reading bam from stdin does not work with only the "r" tag
    pysam_in = pysam.AlignmentFile(bamfile, "rb")

    if options.stdout != sys.stdout:
        output_bamfile = options.stdout.name
    else:
        output_bamfile = "-"

    if options.output_sam:
        pysam_out = pysam.AlignmentFile(output_bamfile, "wh",
                                        template=pysam_in)
    else:
        pysam_out = pysam.AlignmentFile(output_bamfile, "wb",
                                        template=pysam_in)

    if "filter" in options.methods:

        if "remove-list" in options.filter_methods or \
           "keep-list" in options.filter_methods:

            it = pysam_in.fetch(until_eof=True)
            c = E.Counter()

            if "remove-list" in options.filter_methods:
                for read in it:
                    c.input += 1
                    if read.query_name in filter_query_names:
                        c.skipped += 1
                        continue
                    pysam_out.write(read)
                    c.output += 1
            elif "keep-list" in options.filter_methods:
                for read in it:
                    c.input += 1
                    if read.query_name not in filter_query_names:
                        c.skipped += 1
                        continue
                    pysam_out.write(read)
                    c.output += 1

            E.info("category\tcounts\n%s\n" % c.asTable())
        else:
            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter_methods:
                remove_mismatches = True
            elif "CM" in options.filter_methods:
                remove_mismatches = True
                colour_mismatches = True

            if "min-read-length" in options.filter_methods and \
               options.minimum_read_length == 0:
                raise ValueError(
                    "please specify --minimum-read-length when using "
                    "--filter-method=min-read-length")

            if "min-average-base-quality" in options.filter_methods and \
               options.minimum_average_base_quality == 0:
                raise ValueError(
                    "please specify --min-average-base-quality when "
                    "using --filter-method=min-average-base-quality")

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")
                pysam_ref = pysam.AlignmentFile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = bam2bam_filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter_methods,
                remove_unique="non-unique" in options.filter_methods,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter_methods,
                remove_mismatches=remove_mismatches,
                filter_error_rate=options.error_rate,
                colour_mismatches=colour_mismatches,
                minimum_read_length=options.minimum_read_length,
                minimum_average_base_quality=options.minimum_average_base_quality)

            options.stdlog.write("category\tcounts\n%s\n" % c.asTable())

    else:

        # set up the modifying iterators
        it = pysam_in.fetch(until_eof=True)

        def nop(x):
            return None

        # function to check if processing should start
        pre_check_f = nop

        if "unset-unmapped-mapq" in options.methods:
            def unset_unmapped_mapq(i):
                for read in i:
                    if read.is_unmapped:
                        read.mapq = 0
                    yield read
            it = unset_unmapped_mapq(it)

        if "set-sequence" in options.methods:
            def set_sequence(i):
                for read in i:
                    # can't get at length of unmapped reads
                    if read.is_unmapped:
                        read.seq = "A"
                        read.qual = "F"
                    else:
                        read.seq = "A" * read.inferred_length
                        read.qual = "F" * read.inferred_length
                    yield read
            it = set_sequence(it)

        if "strip-sequence" in options.methods or \
           "strip-quality" in options.methods:

            def strip_sequence(i):
                for read in i:
                    read.seq = None
                    yield read

            def check_sequence(reads):
                if reads[0].seq is None:
                    return 'no sequence present'
                return None

            def strip_quality(i):
                for read in i:
                    read.qual = None
                    yield read

            def check_quality(reads):
                if reads[0].qual is None:
                    return 'no quality information present'
                return None

            def strip_match(i):
                for read in i:
                    try:
                        nm = read.opt('NM')
                    except KeyError:
                        nm = 1
                    if nm == 0:
                        read.seq = None
                    yield read

            if options.strip_method == "all":
                if "strip-sequence" in options.methods:
                    it = strip_sequence(it)
                    pre_check_f = check_sequence
                elif "strip-quality" in options.methods:
                    it = strip_quality(it)
                    pre_check_f = check_quality
            elif options.strip_method == "match":
                it = strip_match(it)

        if "unstrip" in options.methods:
            def buildReadDictionary(filename):
                if not os.path.exists(filename):
                    raise OSError("file not found: %s" % filename)
                fastqfile = pysam.FastxFile(filename)
                fastq2sequence = {}
                for x in fastqfile:
                    if x.name in fastq2sequence:
                        raise ValueError(
                            "read %s duplicate - can not unstrip" % x.name)
                    fastq2sequence[x.name] = (x.sequence, x.quality)
                return fastq2sequence

            if not options.fastq_pair1:
                raise ValueError("please supply fastq file(s) for unstripping")

            fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
            if options.fastq_pair2:
                fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

            def unstrip_unpaired(i):
                for read in i:
                    read.seq, read.qual = fastq2sequence1[read.qname]
                    yield read

            def unstrip_pair(i):
                for read in i:
                    if read.is_read1:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                    else:
                        read.seq, read.qual = fastq2sequence2[read.qname]
                    yield read

            if options.fastq_pair2:
                it = unstrip_pair(it)
            else:
                it = unstrip_unpaired(it)

        if "set-nh" in options.methods:
            it = SetNH(it)

        # keep first base of reads by changing the cigarstring to
        # '1M' and, in reads mapping to the reverse strand,
        # changes the pos to aend - 1
        # Needs to be refactored to make it more general
        # (last base, midpoint, ..)
        if "keep-first-base" in options.methods:
            def keep_first_base(i):
                for read in i:
                    if read.is_reverse:
                        read.pos = read.aend - 1
                        read.cigarstring = '1M'
                    elif not read.is_unmapped:
                        read.cigarstring = '1M'
                    yield read
            it = keep_first_base(it)

        # read first read and check if processing should continue
        # only possible when not working from stdin
        # Refactoring: use cache to also do a pre-check for
        # stdin input.
        if bamfile != "-":
            # get first read for checking pre-conditions
            first_reads = list(pysam_in.head(1))

            msg = pre_check_f(first_reads)
            if msg is not None:
                if options.force:
                    E.warn('processing continues, though: %s' % msg)
                else:
                    E.warn('processing not started: %s' % msg)
                    pysam_in.close()
                    pysam_out.close()
                    E.stop()
                    return

        if "downsample-single" in options.methods:
            if not options.downsample:
                raise ValueError("Please provide downsample size")
            else:
                down = SubsetBam(pysam_in=it,
                                 downsample=options.downsample,
                                 paired_end=None,
                                 single_end=True,
                                 random_seed=options.random_seed)
                it = down.downsample_single()

        if "downsample-paired" in options.methods:
            if not options.downsample:
                raise ValueError("Please provide downsample size")
            else:
                down = SubsetBam(pysam_in=it,
                                 downsample=options.downsample,
                                 paired_end=True,
                                 single_end=None,
                                 random_seed=options.random_seed)
                it = down.downsample_paired()

        if "add-sequence-error" in options.methods:
            def add_sequence_error(i):
                error_rate = options.error_rate
                map_nuc2var = {"A": "CGT",
                               "C": "AGT",
                               "G": "ACT",
                               "T": "ACG"}
                for read in i:
                    sequence = list(read.query_sequence)
                    quals = read.query_qualities
                    npos = int(math.floor(len(sequence) * error_rate))
                    positions = random.sample(range(len(sequence)), npos)
                    for pos in positions:
                        try:
                            alt = map_nuc2var[sequence[pos]]
                        except KeyError:
                            continue
                        sequence[pos] = alt[random.randint(0, len(alt) - 1)]
                    # setting query_sequence resets the qualities, so restore them
                    read.query_sequence = "".join(sequence)
                    read.query_qualities = quals
                    yield read
            it = add_sequence_error(it)

        # continue processing till end
        for read in it:
            pysam_out.write(read)

    pysam_in.close()
    pysam_out.close()

    # write footer and output benchmark information.
    E.stop()

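# Illustration (not part of the script): the method dispatch above is generator
# chaining -- each selected method wraps the read iterator in another generator
# and the final loop drains the chain. A stripped-down sketch of the same
# pattern; "input.bam" and "output.bam" are placeholder paths, and set_nh_to_one
# is a simplified stand-in for SetNH, which is not shown in this section.
def example_generator_chain():
    import pysam

    def unset_unmapped_mapq(reads):
        for read in reads:
            if read.is_unmapped:
                read.mapq = 0
            yield read

    def set_nh_to_one(reads):
        for read in reads:
            read.set_tag("NH", 1)
            yield read

    with pysam.AlignmentFile("input.bam", "rb") as pysam_in, \
         pysam.AlignmentFile("output.bam", "wb", template=pysam_in) as pysam_out:
        it = pysam_in.fetch(until_eof=True)
        it = unset_unmapped_mapq(it)
        it = set_nh_to_one(it)
        for read in it:
            pysam_out.write(read)
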
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within a file are
    merged) before intersection.

    Intervals are renumbered starting from 1.
    '''
    liver, testes = infiles
    liver_name = P.snip(os.path.basename(liver), ".replicated.bed")
    testes_name = P.snip(os.path.basename(testes), ".replicated.bed")
    to_cluster = True

    statement = '''cat %(liver)s %(testes)s
    | mergeBed -i stdin
    | awk 'OFS="\\t" {print $1,$2,$3,"CAPseq"NR}'
    > replicated_intervals/liver.testes.merge.bed;
    echo "Total merged intervals" > %(outfile)s;
    cat replicated_intervals/liver.testes.merge.bed | wc -l >> %(outfile)s;
    echo "Liver & testes" >> %(outfile)s;
    intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -u
    | intersectBed -a stdin -b %(testes)s -u
    > replicated_intervals/liver.testes.shared.bed;
    cat replicated_intervals/liver.testes.shared.bed | wc -l >> %(outfile)s;
    echo "Testes only" >> %(outfile)s;
    intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -v
    > replicated_intervals/%(testes_name)s.liver.testes.unique.bed;
    cat replicated_intervals/%(testes_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;
    echo "Liver only" >> %(outfile)s;
    intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(testes)s -v
    > replicated_intervals/%(liver_name)s.liver.testes.unique.bed;
    cat replicated_intervals/%(liver_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;
    sed -i '{N;s/\\n/\\t/g}' %(outfile)s; '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:
        if IOTools.is_empty(infiles[0]) or IOTools.is_empty(infiles[1]):
            IOTools.touch_file(outfile)
        else:
            statement = '''
            intersectBed -u -a %s -b %s
            | cut -f 1,2,3,4,5
            | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
            > %%(outfile)s ''' % (infiles[0], infiles[1])
            P.run(statement)

    else:

        tmpfile = P.get_temp_filename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.is_empty(fn):
            IOTools.touch_file(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run(statement)

        for fn in infiles[1:]:
            if IOTools.is_empty(fn):
                IOTools.touch_file(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s
            | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp;
            mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run(statement)

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
        P.run(statement)

        os.unlink(tmpfile)

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--methods", dest="methods", type="choice",
                      action="append",
                      choices=("filter",
                               "keep-first-base",
                               "set-nh",
                               "set-sequence",
                               "strip-sequence",
                               "strip-quality",
                               "unstrip",
                               "unset-unmapped-mapq",
                               "downsample-single",
                               "downsample-paired"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method", dest="strip_method", type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method", dest="filter_methods",
                      action="append", type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file", dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output", dest="force", action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it is not necessary "
                      "[%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace", dest="inplace", action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given as arguments. Temporary bam files are "
                      "written to /tmp [%default]")

    parser.add_option("--first-fastq-file", "-1", dest="fastq_pair1",
                      type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--second-fastq-file", "-2", dest="fastq_pair2",
                      type="string",
                      help="fastq file with read information for second "
                      "in pair. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--downsample", dest="downsample", type="int",
                      help="Number of reads to downsample to")

    parser.set_defaults(methods=[],
                        output_sam=False,
                        reference_bam=None,
                        filter_methods=[],
                        strip_method="all",
                        force=False,
                        inplace=False,
                        fastq_pair1=None,
                        fastq_pair2=None,
                        downsample=None,
                        random_seed=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # random.seed(options.random_seed)
    bamfiles = []

    if options.stdin != sys.stdin:
        from_stdin = True
        bamfiles.append(options.stdin.name)
    else:
        from_stdin = False

    if options.inplace:
        bamfiles.extend(args)
        if len(bamfiles) == 0:
            raise ValueError(
                "please supply one or more bam-files as command line "
                "arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if len(bamfiles) == 0:
        bamfiles = ["-"]

    to_stdout = False

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.is_empty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.AlignmentFile(bamfile, "rb")

        if bamfile == "-" or (from_stdin and bamfile == options.stdin.name):
            to_stdout = True
            if options.output_sam:
                pysam_out = pysam.AlignmentFile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.AlignmentFile("-", "wb", template=pysam_in)
        else:
            if IOTools.is_empty(bamfile):
                E.warn('skipping empty file %s' % bamfile)
                continue
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.AlignmentFile(tmpfile.name, "wb",
                                            template=pysam_in)

        if "filter" in options.methods:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter_methods:
                remove_mismatches = True
            elif "CM" in options.filter_methods:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")
                pysam_ref = pysam.AlignmentFile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter_methods,
                remove_unique="non-unique" in options.filter_methods,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter_methods,
                remove_mismatches=remove_mismatches,
                colour_mismatches=colour_mismatches)

            if pysam_ref:
                pysam_ref.close()

            # do not write to stdlog in the middle of a SAM/BAM stdout stream.
            if options.stdlog != options.stdout:
                E.info("category\tcounts\n%s\n" % c.asTable())

        else:

            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start
            pre_check_f = lambda x: None

            if "unset-unmapped-mapq" in options.methods:
                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read
                it = unset_unmapped_mapq(it)

            if "set-sequence" in options.methods:
                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length
                        yield read
                it = set_sequence(it)

            if "strip-sequence" in options.methods or \
               "strip-quality" in options.methods:

                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    if reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    if reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    for read in i:
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip_method == "all":
                    if "strip-sequence" in options.methods:
                        it = strip_sequence(it)
                        pre_check_f = check_sequence
                    elif "strip-quality" in options.methods:
                        it = strip_quality(it)
                        pre_check_f = check_quality
                elif options.strip_method == "match":
                    it = strip_match(it)

            if "unstrip" in options.methods:
                def buildReadDictionary(filename):
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.FastxFile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)
                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")

                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if "set-nh" in options.methods:
                it = SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            # Needs to be refactored to make it more general
            # (last base, midpoint, ..)
            if "keep-first-base" in options.methods:
                def keep_first_base(i):
                    for read in i:
                        if read.is_reverse:
                            read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        elif not read.is_unmapped:
                            read.cigarstring = '1M'
                        yield read
                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            # Refactoring: use cache to also do a pre-check for
            # stdin input.
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('processing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        continue

            if "downsample-single" in options.methods:
                if not options.downsample:
                    raise ValueError("Please provide downsample size")
                else:
                    down = SubsetBam(pysam_in=it,
                                     downsample=options.downsample,
                                     paired_end=None,
                                     single_end=True,
                                     random_seed=options.random_seed)
                    it = down.downsample_single()

            if "downsample-paired" in options.methods:
                if not options.downsample:
                    raise ValueError("Please provide downsample size")
                else:
                    down = SubsetBam(pysam_in=it,
                                     downsample=options.downsample,
                                     paired_end=True,
                                     single_end=None,
                                     random_seed=options.random_seed)
                    it = down.downsample_paired()

            # continue processing till end
            for read in it:
                pysam_out.write(read)

        pysam_in.close()
        pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)

            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)

            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.stop()

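# Illustration (not part of the script): the --inplace branch ends by copying
# the original file's timestamps and mode onto the temporary BAM, moving it over
# the input and re-indexing. The same steps as a standalone helper;
# replace_bam_inplace and the example paths in the usage comment are hypothetical.
def replace_bam_inplace(original_bam, new_bam):
    import os
    import shutil
    import pysam

    # set date and file permissions according to the original file;
    # user and group ownership are not restored, as noted above
    original = os.stat(original_bam)
    os.utime(new_bam, (original.st_atime, original.st_mtime))
    os.chmod(new_bam, original.st_mode)

    # move the new file over the original copy and rebuild the index
    shutil.move(new_bam, original_bam)
    pysam.index(original_bam)

# replace_bam_inplace("sample.bam", "/tmp/ctmp1234.bam")
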