def assembly(args):
    """Run a Trinity assembly and post-process the resulting contigs.

    Trinity is run on the single-end (``--single``) or paired-end
    (``--left``/``--right``) reads, its ``Trinity.fasta`` output is checked,
    the contigs are rewritten to ``assembly/contigs_trinity.fasta`` and a
    length distribution is written to ``assembly/contigs.distrib.txt``.

    Args:
        args: Parsed command-line namespace.  Read here: ``step``,
            ``outputdir``, ``single``, ``mate1``, ``mate2`` (paired-end
            only), ``trinitythreshold``, ``trinitymem``, ``trinitycores``,
            ``verbose`` and ``noclean``.

    Returns:
        The name of the assembly file, ``'assembly/contigs_trinity.fasta'``.
    """
    hp.echostep(args.step)

    # print args, then a blank separator line.  A bare ``print`` expression
    # is a no-op when print is a function, so the empty string is explicit.
    print(args)
    print('')

    # mkdir -p assembly_trinity
    hp.mkdirp(args.outputdir)

    # the two library layouts differ only in how reads are handed to Trinity
    if args.single:
        reads = '--single {args.mate1}'.format(args=args)
    else:
        reads = '--left {args.mate1} --right {args.mate2}'.format(args=args)

    cmd = (
        'Trinity --seqType fq --normalize_reads'
        ' --min_contig_length={args.trinitythreshold}'
        ' --max_memory {args.trinitymem}G'
        ' --CPU {args.trinitycores}'
        ' --output {args.outputdir} {reads}'
    ).format(args=args, reads=reads)

    # use run_long_cmd for programs with verbose output
    hp.run_long_cmd(cmd, args.verbose, 'log.Trinity')
    print('Trinity complete')

    # name of expected output file; exit if no output
    trinity_fasta = args.outputdir + '/Trinity.fasta'
    hp.check_file_exists_and_nonzero(trinity_fasta)

    # mkdir -p
    hp.mkdirp('assembly')

    # rename Trinity contigs and join the sequence portion of the fasta.
    # The helper's return value (number of contigs, per the original comment)
    # was never used, so it is no longer bound to a local.
    contigs = 'assembly/contigs_trinity.fasta'
    hp.fastajoinlines(trinity_fasta, contigs, 'contig')

    # compute simple distribution of contig lengths
    ahp.computedistrib(contigs, 'assembly/contigs.distrib.txt')

    if not int(args.noclean):
        # NOTE(review): this removes the literal directory 'assembly_trinity',
        # not args.outputdir; the two only coincide when the caller uses that
        # name.  Left unchanged because widening an `rm -rf` to a
        # caller-supplied path should be a deliberate decision.
        hp.run_cmd('rm -rf assembly_trinity', args.verbose, 0)

    hp.echostep(args.step, start=0)

    # return the name of assembly file
    return contigs
def blastnp(args):
    """Run one blastn or blastp search and redirect its tabular output.

    The result is written to ``<outputdir>/blast_<sgeid>.result``.

    Args:
        args: Parsed command-line namespace.  Read here: ``step``,
            ``whichblast``, ``fmt``, ``input``, ``db``, ``threads``,
            ``outputdir``, ``sgeid`` and ``verbose``.
    """
    hp.echostep(args.step)

    # blastp is run with the blastp-fast task; anything else gets no extra flag
    extra_flag = '-task blastp-fast' if args.whichblast == 'blastp' else ''

    template = (
        '{args.whichblast} -outfmt "6 {args.fmt}" -query {args.input}'
        ' -db {args.db} -num_threads {args.threads} {flag}'
        ' > {args.outputdir}/blast_{args.sgeid}.result'
    )
    hp.run_cmd(template.format(args=args, flag=extra_flag), args.verbose, 0)

    hp.echostep(args.step, start=0)
def discovery(args):
    """Find ORFs in the input sequences and optionally blastp them.

    ORFs are written to ``<outputdir>/orf.fa``.  When ``args.blast`` is set,
    ``blast_wrapper.py`` is invoked on that file with ``--whichblast blastp``
    and a fixed ``--threshold 100``, writing into ``<outputdir>/blast``.

    Args:
        args: Parsed command-line namespace.  Read here: ``step``,
            ``outputdir``, ``input``, ``threshold``, ``blast``, ``scripts``,
            ``db``, ``id``, ``noclean`` and ``verbose``.
    """
    hp.echostep(args.step)

    # print args, then a blank separator line.  A bare ``print`` expression
    # is a no-op when print is a function, so the empty string is explicit.
    print(args)
    print('')

    # mkdir -p
    hp.mkdirp(args.outputdir)

    # find ORFs, then check that the output exists
    orf_fasta = args.outputdir + '/orf.fa'
    hp.getorf(args.input, orf_fasta, args.threshold)
    hp.check_file_exists_and_nonzero(orf_fasta)

    # if requested, blast the discovered ORFs
    if args.blast:
        blast_dir = args.outputdir + '/blast'
        hp.mkdirp(blast_dir)

        # define command: blastp against args.db
        cmd = (
            '{scripts}/scripts/blast_wrapper.py --scripts {scripts}'
            ' --outputdir {blastdir} -i {orfs} --logsdir {logsdir}'
            ' --whichblast {whichblast} --threshold {threshold}'
            ' --db {db} --id {sampleid} --noclean {noclean}'
        ).format(
            scripts=args.scripts,
            blastdir=blast_dir,
            orfs=orf_fasta,
            logsdir='logs_blast2',
            whichblast='blastp',
            threshold=100,
            db=args.db,
            sampleid=args.id,
            noclean=args.noclean,
        )
        hp.run_cmd(cmd, args.verbose, 1)

    hp.echostep(args.step, start=0)
def makerep(args):
    """Filter BLAST hits by taxid and write contig- and taxon-level reports.

    Hits to human (taxid 9606), hits whose taxid is in the optional blacklist
    (which contains non-pathogens) and lines containing ``PREDICTED`` are
    dropped.  The remaining hits are written, one per line, to a contig
    report that is then sorted with ``sort``; per-taxid statistics (number of
    contigs, longest contig, summed read counts) are written to a taxon
    report, from which an HTML report is generated when a taxid-to-names
    file is available.

    Read counts come from the ``id2reads`` file (column 1 -> column 3).  A
    contig without a known count is written with ``-`` and does not
    contribute to its taxon's sum; a taxon with no known count at all is
    reported with ``-``.

    Args:
        args: Parsed command-line namespace.  Read here: ``step``,
            ``outputdir``, ``blacklist``, ``id2reads``, ``header``, ``input``,
            ``input2``, ``id``, ``contigreport``, ``taxonreport``,
            ``taxid2names``, ``noclean`` and ``verbose``.

    Raises:
        ValueError: If the header file does not list ``staxids``, ``qseqid``
            and ``qlen``.
    """
    hp.echostep(args.step)

    # print args, then a blank separator line.  A bare ``print`` expression
    # is a no-op when print is a function, so the empty string is explicit.
    print(args)
    print('')

    # mkdir -p
    hp.mkdirp(args.outputdir)

    # taxid corresponding to Homo sapiens
    humantaxid = '9606'
    # desired header
    desiredfields = ['qseqid', 'sseqid', 'qlen', 'saccver', 'staxids',
                     'evalue', 'bitscore', 'stitle']
    # taxid blacklist
    filterlist = set()
    # id -> #reads (output of samtools idxstats)
    idx = {}
    # taxid -> {'num', 'longest', 'longestlength', optional 'sum'}
    taxonstats = collections.defaultdict(dict)

    # load blacklist if supplied
    if args.blacklist:
        with open(args.blacklist, 'r') as f:
            # final newline causes an empty element, hence the filter
            filterlist = {i for i in f.read().split('\n') if i}

    # load idx file; this stays best-effort, but no longer swallows
    # KeyboardInterrupt/SystemExit and now reports why loading failed
    try:
        with open(args.id2reads, 'r') as f:
            for line in f:
                cols = line.split()
                # map id to #reads
                idx[cols[0]] = cols[2]
    except Exception as err:
        print('[Warning] Failed to load id2reads file ({})'.format(err))

    # load header (split() already strips surrounding whitespace)
    with open(args.header, 'r') as f:
        header = f.read().split()

    # get indices and names of the desired fields, in file order
    myindicies = []
    myfields = []
    for j, k in enumerate(header):
        if k in desiredfields:
            myindicies.append(j)
            myfields.append(k)

    # positions of taxid, qseqid and qlen within the selected fields
    taxidindex = myfields.index('staxids')
    qseqidindex = myfields.index('qseqid')
    qlenindex = myfields.index('qlen')

    # list of input files; if there's a valid file produced by the ORF
    # discovery step, examine that file also
    myfiles = [args.input]
    if hp.check_path_bool(args.input2):
        myfiles.append(args.input2)

    unsorted_report = args.outputdir + '/blast.topfilter.unsort.txt'

    # write file keyed on contig
    with open(unsorted_report, 'w') as f:
        f.write('sampleid\t' + '\t'.join(myfields) + '\tnum_reads\n')
        # loop through inputs (blast, ORF discovery)
        for myfile in myfiles:
            with open(myfile, 'r') as g:
                for line in g:
                    # don't want lines with predicted (as opposed to real) genes
                    if 'PREDICTED' in line:
                        continue
                    # split once per line rather than once per field
                    cols = line.split('\t')
                    fields = [cols[i].strip() for i in myindicies]
                    taxid = fields[taxidindex]
                    qseqid = fields[qseqidindex]
                    qlen = fields[qlenindex]
                    # bypass human taxids or specifically filtered taxids
                    if taxid == humantaxid or taxid in filterlist:
                        continue
                    # multiple taxids not supported
                    if ';' in taxid:
                        print('[WARNING] semicolon detected: multiple taxids not supported')
                    # get read counts
                    readcounts = idx.get(qseqid, '-')
                    f.write(args.id + '\t' + '\t'.join(fields) + '\t' + readcounts + '\n')

                    stats = taxonstats[taxid]
                    stats['num'] = stats.get('num', 0) + 1
                    # record the id together with the length, so 'longest'
                    # names the longest contig rather than the last one seen
                    contiglen = int(qlen)
                    if 'longest' not in stats or contiglen > stats['longestlength']:
                        stats['longest'] = qseqid
                        stats['longestlength'] = contiglen
                    # keep the sum numeric: storing '-' here made a later
                    # addition or the per-host normalisation raise TypeError
                    if readcounts != '-':
                        stats['sum'] = stats.get('sum', 0) + int(readcounts)

    # sort by staxids then qlen (with bash)
    # careful: the header line is part of the file being sorted
    cmd = 'sort -k5,5n -k6,6nr {args.outputdir}/blast.topfilter.unsort.txt > {args.outputdir}/{args.contigreport}'.format(args=args)
    hp.run_cmd(cmd, args.verbose, 0)

    # get num reads to host
    num_reads_to_host = get_reads_mapped_to_host(args)
    if args.verbose:
        print('Number of reads properly mapped to host: {reads}'.format(reads=num_reads_to_host))

    # write file keyed on taxon
    with open(args.outputdir + '/' + args.taxonreport, 'w') as f:
        f.write('sampleid\ttaxid\tsum_reads\tnum_contigs\tid_longest_contig\tlen_longest_contig\tpathogen_reads/(host_reads/10^6)\n')
        for taxid in taxonstats:
            stats = taxonstats[taxid]
            total = stats.get('sum', '-')
            # pathogen reads per million host reads; needs both numbers
            sumdivhost = '-'
            if num_reads_to_host and total != '-':
                sumdivhost = str(round(1000000 * total / float(num_reads_to_host), 2))
            mytaxonattributes = [
                str(total),
                str(stats['num']),
                stats['longest'],
                str(stats['longestlength']),
                sumdivhost,
            ]
            f.write(args.id + '\t' + taxid + '\t' + '\t'.join(mytaxonattributes) + '\n')

    # generate and write html report
    if args.taxid2names != 'None':
        # best-effort: a failing HTML report must not abort the step
        try:
            makeHTML.generateHTML(args.outputdir + '/' + args.taxonreport, args.taxid2names, args.outputdir)
        except Exception as err:
            print('[ERROR] makeHTML failed. ({})'.format(err))
    else:
        print('[WARNING] missing names.dmp, the file mapping taxids to names. HTML report will not be generated.')

    if not args.noclean:
        os.remove(unsorted_report)

    hp.echostep(args.step, start=0)
def hostsep(args):
    """Separate host reads by mapping with STAR and then Bowtie2.

    Reads are mapped to the host with STAR; the reads STAR leaves unmapped
    are mapped again with Bowtie2; what is still unmapped is length-filtered
    into ``<outputdir>/unmapped_{1,2}.fastq`` and gzipped.  Line counts of
    the input and of both unmapped sets are appended to
    ``<outputdir>/mapping_percent.txt``.  When ``args.gtf`` is set,
    featureCounts is run on the STAR alignment.

    NOTE(review): this file contains a second, later ``def hostsep``; at
    import time the later definition replaces this one.

    Args:
        args: Parsed command-line namespace.  Read here: ``gzip``, ``mate1``,
            ``mate2`` (paired-end only), ``single``, ``outputdir``,
            ``threads``, ``refstar``, ``refbowtie``, ``scripts``,
            ``readlenfilter``, ``gtf``, ``olog``, ``elog``, ``noclean``,
            ``verbose`` and ``step``.
    """
    def shell(template, **extra):
        # fill the template from args (plus extra fields), run via hp.run_cmd
        hp.run_cmd(template.format(args=args, **extra), args.verbose, 0)

    def shell_logged(template):
        # same, but through hp.run_log_cmd with the configured log files
        hp.run_log_cmd(template.format(args=args), args.verbose, args.olog, args.elog)

    # extra STAR flag and gunzip stage, needed only for gzipped input
    if args.gzip:
        starflag, gunzip_pipe = '--readFilesCommand zcat', 'gunzip |'
    else:
        starflag, gunzip_pipe = '', ''

    # SAM flag filter: 4 = read unmapped (single-end);
    # bin(13) = '0b1101' = read paired; read unmapped; mate unmapped
    unmapped_flag = 4 if args.single else 13
    # mate 2 only exists for paired-end input
    mates = ['1'] if args.single else ['1', '2']

    print('Counting input reads')
    shell(
        'cat {args.mate1} | {gunzip_pipe} wc -l | tr "\n" " "'
        ' > {args.outputdir}/mapping_percent.txt',
        gunzip_pipe=gunzip_pipe,
    )
    shell('echo {args.mate1} >> {args.outputdir}/mapping_percent.txt')

    print('STAR mapping commenced')
    # Ioan: --outFilterMultimapNmax 1 would only output uniquely mapping reads;
    # HERE up to 10 multi-hits are reported, which should not modify the
    # downstream featureCounts gene counts (unique mappers only).
    star_reads = '{args.mate1}' if args.single else '{args.mate1} {args.mate2}'
    shell(
        'STAR --runThreadN {args.threads} --genomeDir {args.refstar}'
        ' --readFilesIn ' + star_reads +
        ' --outFileNamePrefix {args.outputdir}/ --outSAMtype BAM Unsorted'
        ' --outFilterMultimapNmax 10 --outSAMunmapped Within {starflag}',
        starflag=starflag,
    )
    print('STAR mapping finished')

    print('find unmapped reads')
    shell('samtools flagstat {args.outputdir}/Aligned.out.bam > {args.outputdir}/mapping_stats.STAR.txt')
    # samtools version compatibility: -o and the .bam suffix must be given
    shell(
        'samtools view -b -f {flag} {args.outputdir}/Aligned.out.bam'
        ' | samtools sort -n -o {args.outputdir}/star_unmapped.bam',
        flag=unmapped_flag,
    )
    shell(
        'samtools view {args.outputdir}/star_unmapped.bam'
        ' | {args.scripts}/scripts/sam2fastq.py {args.outputdir}/star_unmapped {args.single}'
    )
    shell('wc -l {args.outputdir}/star_unmapped_1.fastq >> {args.outputdir}/mapping_percent.txt')

    print('Bowtie2 mapping commenced')
    if args.single:
        bowtie_reads = '-U {args.outputdir}/star_unmapped_1.fastq'
    else:
        bowtie_reads = ('-1 {args.outputdir}/star_unmapped_1.fastq'
                        ' -2 {args.outputdir}/star_unmapped_2.fastq')
    shell_logged(
        'bowtie2 -p {args.threads} -x {args.refbowtie} ' + bowtie_reads +
        ' -S {args.outputdir}/bwt2.sam'
    )
    print('Bowtie2 mapping finished')

    shell('samtools flagstat {args.outputdir}/bwt2.sam > {args.outputdir}/mapping_stats.bwt.txt')

    print('find unmapped reads')
    shell(
        'samtools view -S -b -f {flag} {args.outputdir}/bwt2.sam'
        ' | samtools sort -n -o {args.outputdir}/bwt2_unmapped.bam',
        flag=unmapped_flag,
    )
    shell(
        'samtools view {args.outputdir}/bwt2_unmapped.bam'
        ' | {args.scripts}/scripts/sam2fastq.py {args.outputdir}/bwt2_unmapped {args.single}'
    )
    shell('wc -l {args.outputdir}/bwt2_unmapped_1.fastq >> {args.outputdir}/mapping_percent.txt')

    # rename and zip both mates, or the single mate if unpaired reads
    for mate in mates:
        # Ioan found Trinity chokes if read length <= jellyfish kmer of 25
        hp.fastqfilter(
            '{args.outputdir}/bwt2_unmapped_{i}.fastq'.format(args=args, i=mate),
            '{args.outputdir}/unmapped_{i}.fastq'.format(args=args, i=mate),
            args.readlenfilter,
        )
        shell('gzip {args.outputdir}/unmapped_{i}.fastq', i=mate)

    # if gtf variable set, get gene coverage
    if args.gtf:
        print('featureCounts commenced')
        shell_logged(
            'featureCounts -a {args.gtf} -o {args.outputdir}/host_gene_counts.txt'
            ' {args.outputdir}/Aligned.out.bam'
        )
        print('featureCounts finished')

    if not args.noclean:
        print('clean up')
        shell('rm -rf {args.outputdir}/_STARtmp')
        leftovers = (
            'Aligned.out.bam',
            'Log.*',
            'SJ.out.tab',
            'star_unmapped.bam',
            'star_unmapped_*.fastq',
            'bwt2.sam',
            'bwt2_unmapped.bam',
        )
        for name in leftovers:
            shell('rm {args.outputdir}/{i}', i=name)

    hp.echostep(args.step, start=0)
def getunmapped(args):
    """Extract the unmapped reads from an existing BAM file.

    Unmapped reads are pulled out of ``args.bam``, converted to FASTQ,
    length-filtered into ``<outputdir>/unmapped_{1,2}.fastq`` and gzipped.
    The process exits with status 0 if a filtered FASTQ file has no lines.
    When ``args.gtf`` is set, featureCounts is run on ``args.bam``.

    NOTE(review): this file contains a second, later ``def getunmapped``; at
    import time the later definition replaces this one.

    Args:
        args: Parsed command-line namespace.  Read here: ``bam``,
            ``outputdir``, ``single``, ``scripts``, ``readlenfilter``,
            ``gtf``, ``olog``, ``elog``, ``noclean``, ``verbose`` and
            ``step``.
    """
    def shell(template, **extra):
        # fill the template from args (plus extra fields), run via hp.run_cmd
        hp.run_cmd(template.format(args=args, **extra), args.verbose, 0)

    # SAM flag filter: 4 for single-end, 13 for paired-end input
    unmapped_flag = 4 if args.single else 13
    # mate 2 only exists for paired-end input
    mates = ['1'] if args.single else ['1', '2']

    shell('samtools flagstat {args.bam} > {args.outputdir}/mapping_stats.STAR.txt')

    print('find unmapped reads')
    shell(
        'samtools view -b -f {flag} {args.bam}'
        ' | samtools sort -n -o {args.outputdir}/unmapped.bam',
        flag=unmapped_flag,
    )
    shell(
        'samtools view {args.outputdir}/unmapped.bam'
        ' | {args.scripts}/scripts/sam2fastq.py {args.outputdir}/tmp_unmapped {args.single}'
    )

    # filter short reads, then drop the unfiltered temporary file
    for mate in mates:
        hp.fastqfilter(
            '{args.outputdir}/tmp_unmapped_{i}.fastq'.format(args=args, i=mate),
            '{args.outputdir}/unmapped_{i}.fastq'.format(args=args, i=mate),
            args.readlenfilter,
        )
        shell('rm {args.outputdir}/tmp_unmapped_{i}.fastq', i=mate)

    # if gtf variable set, get gene coverage
    if args.gtf:
        print('featureCounts commenced')
        counts_cmd = (
            'featureCounts -a {args.gtf} -o {args.outputdir}/host_gene_counts.txt'
            ' {args.bam}'
        ).format(args=args)
        hp.run_log_cmd(counts_cmd, args.verbose, args.olog, args.elog)
        print('featureCounts finished')

    # check output not empty, then zip each mate
    for mate in mates:
        probe = 'head {args.outputdir}/unmapped_{i}.fastq | wc -l'.format(args=args, i=mate)
        numlines = hp.run_cmd(probe, args.verbose, 1)
        if numlines == '0':
            print('[WARNING] No unmapped reads. Exiting')
            sys.exit(0)
        shell('gzip {args.outputdir}/unmapped_{i}.fastq', i=mate)

    if not args.noclean:
        print('clean up')
        hp.run_cmd('rm -rf ' + args.outputdir + '/unmapped.bam', args.verbose, 0)

    hp.echostep(args.step, start=0)
def blast(args):
    """Run BLAST on contigs above a length threshold, locally or via qsub.

    A header file is written from ``args.fmt`` first.  Without a cluster
    (``args.nosge``) the filtered contigs go to ``blast_1.fasta`` and one
    ``blast.py`` process is run.  Otherwise the contigs are split over
    several files, ``blast.py`` is submitted as a qsub array job and a
    second, synchronous qsub job holds this script until the array job has
    finished.  Either way the results are then concatenated and filtered
    with ``concat``.

    Exits with status 1 when no contig is above the threshold.

    Args:
        args: Parsed command-line namespace.  Read here: ``step``,
            ``outputdir``, ``noclean``, ``fmt``, ``nosge``, ``input``,
            ``threshold``, ``id``, ``scripts``, ``whichblast``, ``db``,
            ``threads``, ``verbose``, ``logsdir``, ``filelength``, ``bmem``
            and ``btime``.  ``args.noclean`` is overwritten with its ``int``
            value.
    """
    hp.echostep(args.step)

    # print args, then a blank separator line.  A bare ``print`` expression
    # is a no-op when print is a function, so the empty string is explicit.
    print(args)
    print('')

    # mkdir -p
    hp.mkdirp(args.outputdir)

    args.noclean = int(args.noclean)

    # generate header
    with open(args.outputdir + '/header', 'w') as f:
        f.write(args.fmt.replace(' ', '\t') + '\n')

    # the blast.py invocation shared by the local and the qsub code path
    blastcmd = (
        '{args.scripts}/scripts/blast.py --scripts {args.scripts}'
        ' --outputdir {args.outputdir} --whichblast {args.whichblast}'
        ' --db {args.db} --threads {args.threads} --fmt "{args.fmt}"'
    ).format(args=args)

    if args.nosge:
        # no qsub: filter fasta file on contigs above threshold length and
        # hardcode the name blast.py expects (splitting files doesn't make
        # sense if there is no cluster)
        filecount = hp.fastafilter(args.input, args.outputdir + '/blast_1.fasta', args.threshold)
        if filecount == 0:
            print("No contigs above threshold. Exiting")
            sys.exit(1)

        # define log files
        logs_out = args.outputdir + '/' + 'blast_log_' + args.id + '.o'
        logs_err = args.outputdir + '/' + 'blast_log_' + args.id + '.e'

        cmd = '{blastcmd} --sgeid 1 > {o} 2> {e}'.format(blastcmd=blastcmd, o=logs_out, e=logs_err)
        hp.run_cmd(cmd, args.verbose, 0)

        # now filter blast results
        concat(args)
    else:
        # mkdir -p
        hp.mkdirp(args.logsdir)

        # split fasta file on contigs above threshold length
        # (returns number of contigs and file count)
        (numcontigs, filecount) = hp.fastasplit2(args.input, args.outputdir + '/blast', args.threshold, args.filelength)
        if filecount == 0:
            print("No contigs above threshold. Exiting")
            sys.exit(1)
        print("There are " + str(numcontigs) + " contigs above threshold, and " + str(filecount) + " files to blast.")

        # qsub part of command (array job, one task per split file)
        qcmd = 'qsub -S {mypython} -N bc_{args.id} -e {args.logsdir} -o {args.logsdir} -l mem={args.bmem}G,time={args.btime}:: -t 1-{filecount} '.format(mypython=sys.executable, args=args, filecount=filecount)

        if args.verbose:
            print(qcmd + blastcmd)
        message = subprocess.check_output(qcmd + blastcmd, shell=True)
        print(message)
        # get job id
        jid = hp.getjid(message)

        # hold the script up here until all the blast jobs finish:
        # a trivial job held on the array job, submitted with -sync y
        qcmd = 'qsub -V -b y -cwd -o log.out -e log.err -N wait_{args.id} -hold_jid {jid} -sync y echo wait_here'.format(args=args, jid=jid)
        message = subprocess.check_output(qcmd, shell=True)
        print(message)

        # now concatenate and filter blast results
        concat(args)

    hp.echostep(args.step, start=0)
def concat(args):
    """Concatenate BLAST results and logs and apply Ioan's filtering.

    Produces, inside ``args.outputdir``:

    * ``concat.txt``: all BLAST hits;
    * ``above_threshold.fa``: all FASTA entries that were blasted;
    * ``top.concat.txt``: the first (top) hit of every query ID;
    * ``ifilter.concat.txt``: the top hits that survive the filter, i.e.
      whose top hit is not human and for which ``ioanfilter`` does not
      reject any of the next ``topchunk`` hits;
    * ``no_blastn.fa``: the entries without any hit;
    * ``log.blast``: the first 100 lines of every file in ``args.logsdir``.

    Every record in ``ifilter.concat.txt``, including the last one, is
    terminated by a newline.  Result files are processed in sorted order so
    the output is reproducible.

    The columns of the result files are assumed to be, in order: qseqid,
    sseqid, saccver, staxids, pident, nident, length, mismatch, gapopen,
    gaps, qstart, qend, qlen, qframe, qcovs, sstart, send, slen, sframe,
    sstrand, evalue, bitscore, stitle.

    Args:
        args: Parsed command-line namespace.  Read here: ``outputdir``,
            ``logsdir``, ``noclean`` and ``verbose``.
    """
    print('CONCATENATE START')

    # file of all blast hits
    cmd = 'cat {args.outputdir}/*.result > {args.outputdir}/concat.txt'.format(args=args)
    hp.run_cmd(cmd, args.verbose, 0)

    # all fasta entries
    cmd = 'cat {args.outputdir}/*.fasta > {args.outputdir}/above_threshold.fa'.format(args=args)
    hp.run_cmd(cmd, args.verbose, 0)

    # set of all input IDs from the concatenated fasta file
    with open(args.outputdir + '/above_threshold.fa', 'r') as g:
        allids = {i[1:] for i in g.read().split('\n') if i and i[0] == '>'}

    # set of IDs seen so far
    seenids = set()
    # Ioan: top number of BLAST hits to parse through in order to determine
    # whether the top hit can be trusted as truly non-human
    topchunk = 10
    # number of follow-up hits examined for the current query
    minicounter = 0
    # the current query's top hit (stripped), or None before the first hit
    topline = None
    # bitscore of the current query's top hit
    topbitscore = None
    # a filtering boolean (if true, filter out the current query)
    filterbool = False

    # Explicit per-file loops replace fileinput.input(), which silently
    # reads stdin when handed an empty list of files.
    myfiles = sorted(glob.glob(args.outputdir + '/*.result'))

    # `with` closes both outputs even if a malformed line raises
    with open(args.outputdir + '/top.concat.txt', 'w') as tophitsfile, \
            open(args.outputdir + '/ifilter.concat.txt', 'w') as ifilterfile:
        for myfile in myfiles:
            with open(myfile, 'r') as f:
                for line in f:
                    linelist = line.strip().split()
                    # skip blank lines instead of failing on linelist[0]
                    if not linelist:
                        continue
                    myid = linelist[0]

                    if myid not in seenids:
                        # ID not yet seen (i.e., this line is its top hit)
                        tophitsfile.write(line)
                        seenids.add(myid)
                        # flush the previous query's top hit if it survived
                        # (skipped on the very first hit)
                        if topline and not filterbool:
                            ifilterfile.write(topline + '\n')
                        topbitscore = float(linelist[21])
                        topstaxids = linelist[3]
                        topline = line.strip()
                        # reset counter
                        minicounter = 0
                        # if the top hit is human, preemptively filter.
                        # NOTE(review): humantaxid is not defined in this
                        # function; presumably a module-level constant.
                        filterbool = (topstaxids == humantaxid)
                    elif (not filterbool) and minicounter < topchunk:
                        # keep on checking results while the filter flag
                        # hasn't gone high and #lines < topchunk
                        staxids = linelist[3]
                        evalue = float(linelist[20])
                        bitscore = float(linelist[21])
                        filterbool = ioanfilter(staxids, evalue, bitscore, topbitscore)
                        minicounter += 1

        # do last entry; topline is still None when there were no hits at all
        if topline and not filterbool:
            ifilterfile.write(topline + '\n')

    # set of IDs that didn't blast
    noblastids = allids - seenids

    # get fasta file of entries that didn't blast
    hp.fastaidfilter(args.outputdir + '/above_threshold.fa', args.outputdir + '/no_blastn.fa', noblastids)

    if not args.noclean:
        cmd = 'rm {args.outputdir}/*.result {args.outputdir}/*.fasta'.format(args=args)
        hp.run_cmd(cmd, args.verbose, 0)

    print('No blast hits for: ' + ', '.join(sorted(noblastids)))

    # concat blast logs and remove folder
    print('concatenate blast logs')
    cmd = 'head -100 {args.logsdir}/* > {args.outputdir}/log.blast'.format(args=args)
    hp.run_cmd(cmd, args.verbose, 0)

    if not args.noclean:
        shutil.rmtree(args.logsdir)

    print('CONCATENATE END')
def remap(args, contigs):
    """Map the reads back onto the assembled contigs and compute coverage.

    A Bowtie2 index is built from ``contigs``, the reads are aligned to it,
    and the alignment is converted to a sorted, indexed BAM file.  Index
    statistics and an mpileup are then produced and the pileup is
    reformatted.  All outputs are written below ``assembly/``.

    Args:
        args: Parsed command-line namespace.  Read here: ``single``,
            ``mate1``, ``mate2`` (paired-end only), ``verbose`` and
            ``noclean``.
        contigs: Path of the contig FASTA file.  It is used both to build
            the Bowtie2 index and as the mpileup reference.
    """
    hp.echostep('remap')

    hp.mkdirp('assembly/ref_remap')

    refbowtie = "assembly/ref_remap/ref"

    cmd = 'bowtie2-build {} {}'.format(contigs, refbowtie)
    hp.run_cmd(cmd, args.verbose, 0)

    if args.single:
        cmd = 'bowtie2 -p 4 -x {} -U {} -S {}'.format(refbowtie, args.mate1, 'assembly/reads2contigs.sam')
    else:
        cmd = 'bowtie2 -p 4 -x {} -1 {} -2 {} -S {}'.format(refbowtie, args.mate1, args.mate2, 'assembly/reads2contigs.sam')
    hp.run_cmd(cmd, args.verbose, 0)

    # convert to bam
    # samtools version compatibility: need .bam extension
    cmd = 'samtools view -bS assembly/reads2contigs.sam | samtools sort -o assembly/reads2contigs.bam'
    hp.run_cmd(cmd, args.verbose, 0)

    cmd = 'samtools index assembly/reads2contigs.bam'
    hp.run_cmd(cmd, args.verbose, 0)

    # the SAM file is removed regardless of args.noclean
    cmd = 'rm assembly/reads2contigs.sam'
    hp.run_cmd(cmd, args.verbose, 0)

    # BAM index stats
    cmd = 'samtools idxstats assembly/reads2contigs.bam > assembly/reads2contigs.stats.txt'
    hp.run_cmd(cmd, args.verbose, 0)

    # mpileup against the same FASTA the index was built from.  This used to
    # be the literal 'assembly/contigs_trinity.fasta', which silently assumed
    # that `contigs` always had that value.
    cmd = 'samtools mpileup -A -B -d 100000 -L 100000 -f {} assembly/reads2contigs.bam > assembly/reads2contigs.pileup'.format(contigs)
    hp.run_cmd(cmd, args.verbose, 0)

    # format pileup file - i.e., add zeros to uncovered positions
    ahp.formatpileup('assembly/reads2contigs.pileup', 'assembly/reads2contigs.stats.txt', 'assembly/reads2contigs.format.pileup', 'assembly/reads2contigs.entropy')

    if not int(args.noclean):
        cmd = 'rm -r assembly/ref_remap'
        hp.run_cmd(cmd, args.verbose, 0)
        cmd = 'rm assembly/reads2contigs.pileup'
        hp.run_cmd(cmd, args.verbose, 0)

    hp.echostep('remap', start=0)
def hostsep(args):
    """Remove host reads: map with STAR, re-map the leftovers with Bowtie2.

    The reads left unmapped by both aligners are length-filtered into
    ``<outputdir>/unmapped_{1,2}.fastq`` and gzipped.  Line counts of the
    input and of each unmapped set are appended to
    ``<outputdir>/mapping_percent.txt``; flagstat summaries go to
    ``mapping_stats.STAR.txt`` and ``mapping_stats.bwt.txt``.  When
    ``args.gtf`` is set, featureCounts is run on the STAR alignment.

    NOTE(review): an earlier ``def hostsep`` exists in this file; this later
    definition is the one bound to the name at import time.

    Args:
        args: Parsed command-line namespace.  Read here: ``gzip``, ``mate1``,
            ``mate2`` (paired-end only), ``single``, ``outputdir``,
            ``threads``, ``refstar``, ``refbowtie``, ``scripts``,
            ``readlenfilter``, ``gtf``, ``olog``, ``elog``, ``noclean``,
            ``verbose`` and ``step``.
    """
    # values shared by the command templates below
    context = {
        'args': args,
        # input files gzipped: STAR reads through zcat, counting through gunzip
        'starflag': '--readFilesCommand zcat' if args.gzip else '',
        'gunzip_pipe': 'gunzip |' if args.gzip else '',
        # 4 = read unmapped; bin(13) = '0b1101' = read paired; read unmapped;
        # mate unmapped
        'flag': 4 if args.single else 13,
    }

    def render(template, **more):
        # fill a command template from the shared context plus extras
        return template.format(**dict(context, **more))

    if args.single:
        mates = ('1',)
        star_inputs = '{args.mate1}'
        bowtie_inputs = '-U {args.outputdir}/star_unmapped_1.fastq'
    else:
        mates = ('1', '2')
        star_inputs = '{args.mate1} {args.mate2}'
        bowtie_inputs = ('-1 {args.outputdir}/star_unmapped_1.fastq'
                         ' -2 {args.outputdir}/star_unmapped_2.fastq')

    print('Counting input reads')
    count_cmd = render(
        'cat {args.mate1} | {gunzip_pipe} wc -l | tr "\n" " "'
        ' > {args.outputdir}/mapping_percent.txt')
    hp.run_cmd(count_cmd, args.verbose, 0)
    name_cmd = render('echo {args.mate1} >> {args.outputdir}/mapping_percent.txt')
    hp.run_cmd(name_cmd, args.verbose, 0)

    print('STAR mapping commenced')
    # Ioan: --outFilterMultimapNmax 1 would restrict output to uniquely
    # mapping reads; up to 10 multi-hits are allowed here, which should not
    # modify the downstream featureCounts gene counts.
    star_cmd = render(
        'STAR --runThreadN {args.threads} --genomeDir {args.refstar}'
        ' --readFilesIn ' + star_inputs +
        ' --outFileNamePrefix {args.outputdir}/ --outSAMtype BAM Unsorted'
        ' --outFilterMultimapNmax 10 --outSAMunmapped Within {starflag}')
    hp.run_cmd(star_cmd, args.verbose, 0)
    print('STAR mapping finished')

    print('find unmapped reads')
    hp.run_cmd(
        render('samtools flagstat {args.outputdir}/Aligned.out.bam'
               ' > {args.outputdir}/mapping_stats.STAR.txt'),
        args.verbose, 0)
    # samtools version compatibility: -o and the .bam suffix must be given
    hp.run_cmd(
        render('samtools view -b -f {flag} {args.outputdir}/Aligned.out.bam'
               ' | samtools sort -n -o {args.outputdir}/star_unmapped.bam'),
        args.verbose, 0)
    hp.run_cmd(
        render('samtools view {args.outputdir}/star_unmapped.bam'
               ' | {args.scripts}/scripts/sam2fastq.py'
               ' {args.outputdir}/star_unmapped {args.single}'),
        args.verbose, 0)
    hp.run_cmd(
        render('wc -l {args.outputdir}/star_unmapped_1.fastq'
               ' >> {args.outputdir}/mapping_percent.txt'),
        args.verbose, 0)

    print('Bowtie2 mapping commenced')
    bowtie_cmd = render(
        'bowtie2 -p {args.threads} -x {args.refbowtie} ' + bowtie_inputs +
        ' -S {args.outputdir}/bwt2.sam')
    hp.run_log_cmd(bowtie_cmd, args.verbose, args.olog, args.elog)
    print('Bowtie2 mapping finished')

    hp.run_cmd(
        render('samtools flagstat {args.outputdir}/bwt2.sam'
               ' > {args.outputdir}/mapping_stats.bwt.txt'),
        args.verbose, 0)

    print('find unmapped reads')
    hp.run_cmd(
        render('samtools view -S -b -f {flag} {args.outputdir}/bwt2.sam'
               ' | samtools sort -n -o {args.outputdir}/bwt2_unmapped.bam'),
        args.verbose, 0)
    hp.run_cmd(
        render('samtools view {args.outputdir}/bwt2_unmapped.bam'
               ' | {args.scripts}/scripts/sam2fastq.py'
               ' {args.outputdir}/bwt2_unmapped {args.single}'),
        args.verbose, 0)
    hp.run_cmd(
        render('wc -l {args.outputdir}/bwt2_unmapped_1.fastq'
               ' >> {args.outputdir}/mapping_percent.txt'),
        args.verbose, 0)

    # filter short reads, then zip each mate
    for mate in mates:
        # Ioan found Trinity chokes if read length <= jellyfish kmer of 25
        source_fastq = render('{args.outputdir}/bwt2_unmapped_{i}.fastq', i=mate)
        target_fastq = render('{args.outputdir}/unmapped_{i}.fastq', i=mate)
        hp.fastqfilter(source_fastq, target_fastq, args.readlenfilter)
        hp.run_cmd(render('gzip {args.outputdir}/unmapped_{i}.fastq', i=mate),
                   args.verbose, 0)

    # if gtf variable set, get gene coverage
    if args.gtf:
        print('featureCounts commenced')
        counts_cmd = render(
            'featureCounts -a {args.gtf}'
            ' -o {args.outputdir}/host_gene_counts.txt'
            ' {args.outputdir}/Aligned.out.bam')
        hp.run_log_cmd(counts_cmd, args.verbose, args.olog, args.elog)
        print('featureCounts finished')

    if not args.noclean:
        print('clean up')
        hp.run_cmd(render('rm -rf {args.outputdir}/_STARtmp'), args.verbose, 0)
        intermediates = [
            'Aligned.out.bam', 'Log.*', 'SJ.out.tab', 'star_unmapped.bam',
            'star_unmapped_*.fastq', 'bwt2.sam', 'bwt2_unmapped.bam',
        ]
        for pattern in intermediates:
            hp.run_cmd(render('rm {args.outputdir}/{i}', i=pattern),
                       args.verbose, 0)

    hp.echostep(args.step, start=0)
def getunmapped(args):
    """Starting with a .bam file, get the unmapped reads.

    The unmapped reads of ``args.bam`` are converted to FASTQ,
    length-filtered into ``<outputdir>/unmapped_{1,2}.fastq`` and gzipped.
    If a filtered FASTQ file has no lines, a warning is printed and the
    process exits with status 0.  When ``args.gtf`` is set, featureCounts is
    run on ``args.bam``.

    NOTE(review): an earlier ``def getunmapped`` exists in this file; this
    later definition is the one bound to the name at import time.

    Args:
        args: Parsed command-line namespace.  Read here: ``bam``,
            ``outputdir``, ``single``, ``scripts``, ``readlenfilter``,
            ``gtf``, ``olog``, ``elog``, ``noclean``, ``verbose`` and
            ``step``.
    """
    # values shared by the command templates below
    context = {
        'args': args,
        # SAM flag filter: 4 for single-end, 13 for paired-end input
        'flag': 4 if args.single else 13,
    }

    def render(template, **more):
        # fill a command template from the shared context plus extras
        return template.format(**dict(context, **more))

    # mate 2 only exists for paired-end input
    mates = ('1',) if args.single else ('1', '2')

    hp.run_cmd(
        render('samtools flagstat {args.bam}'
               ' > {args.outputdir}/mapping_stats.STAR.txt'),
        args.verbose, 0)

    print('find unmapped reads')
    hp.run_cmd(
        render('samtools view -b -f {flag} {args.bam}'
               ' | samtools sort -n -o {args.outputdir}/unmapped.bam'),
        args.verbose, 0)
    hp.run_cmd(
        render('samtools view {args.outputdir}/unmapped.bam'
               ' | {args.scripts}/scripts/sam2fastq.py'
               ' {args.outputdir}/tmp_unmapped {args.single}'),
        args.verbose, 0)

    # filter short reads and remove the unfiltered temporary files
    for mate in mates:
        raw_fastq = render('{args.outputdir}/tmp_unmapped_{i}.fastq', i=mate)
        kept_fastq = render('{args.outputdir}/unmapped_{i}.fastq', i=mate)
        hp.fastqfilter(raw_fastq, kept_fastq, args.readlenfilter)
        hp.run_cmd(render('rm {args.outputdir}/tmp_unmapped_{i}.fastq', i=mate),
                   args.verbose, 0)

    # if gtf variable set, get gene coverage
    if args.gtf:
        print('featureCounts commenced')
        counts_cmd = render(
            'featureCounts -a {args.gtf}'
            ' -o {args.outputdir}/host_gene_counts.txt {args.bam}')
        hp.run_log_cmd(counts_cmd, args.verbose, args.olog, args.elog)
        print('featureCounts finished')

    # check output not empty, then zip each mate
    for mate in mates:
        numlines = hp.run_cmd(
            render('head {args.outputdir}/unmapped_{i}.fastq | wc -l', i=mate),
            args.verbose, 1)
        if numlines == '0':
            print('[WARNING] No unmapped reads. Exiting')
            sys.exit(0)
        hp.run_cmd(render('gzip {args.outputdir}/unmapped_{i}.fastq', i=mate),
                   args.verbose, 0)

    if not args.noclean:
        print('clean up')
        cleanup_cmd = 'rm -rf ' + args.outputdir + '/' + 'unmapped.bam'
        hp.run_cmd(cleanup_cmd, args.verbose, 0)

    hp.echostep(args.step, start=0)