Ejemplo n.º 1
0
def assembly(args):
    """Run a Trinity assembly and post-process the resulting contigs.

    Steps, in order: create ``args.outputdir``; run Trinity (single-end when
    ``args.single`` is truthy, otherwise paired-end); check that
    ``<outputdir>/Trinity.fasta`` was produced; write the renamed contigs to
    ``assembly/contigs_trinity.fasta``; write a length distribution to
    ``assembly/contigs.distrib.txt``; and, unless ``int(args.noclean)`` is
    non-zero, delete the Trinity output directory.

    Args:
        args: namespace providing ``step``, ``outputdir``, ``single``,
            ``mate1``, ``mate2`` (paired-end only), ``trinitythreshold``,
            ``trinitymem``, ``trinitycores``, ``verbose`` and ``noclean``.

    Returns:
        str: path of the renamed contig file,
        ``'assembly/contigs_trinity.fasta'``.
    """

    hp.echostep(args.step)

    # show the arguments followed by a blank line; print('') emits the blank
    # line whether print is a statement or a function (a bare `print` is a
    # no-op expression when print is a function)
    print(args)
    print('')

    hp.mkdirp(args.outputdir)

    # the two Trinity invocations differ only in how the reads are supplied
    if args.single:
        cmd = 'Trinity --seqType fq --normalize_reads --min_contig_length={args.trinitythreshold} --max_memory {args.trinitymem}G --CPU {args.trinitycores} --output {args.outputdir} --single {args.mate1}'.format(args=args)
    else:
        cmd = 'Trinity --seqType fq --normalize_reads --min_contig_length={args.trinitythreshold} --max_memory {args.trinitymem}G --CPU {args.trinitycores} --output {args.outputdir} --left {args.mate1} --right {args.mate2}'.format(args=args)
    # use run_long_cmd for programs with verbose output
    hp.run_long_cmd(cmd, args.verbose, 'log.Trinity')

    print('Trinity complete')

    # expected Trinity output; the helper is expected to stop the pipeline
    # when the file is missing or empty (TODO confirm against hp)
    trinity_fasta = args.outputdir + '/Trinity.fasta'
    hp.check_file_exists_and_nonzero(trinity_fasta)

    hp.mkdirp('assembly')

    # rename Trinity contigs and join the sequence portion of each record
    # shell equivalent:
    # cat ${outputdir}/Trinity.fasta | awk 'BEGIN{f=0; counter=1}{if ($0~/^>/) {if (f) {printf "\n"; counter++}; print ">contig_"counter; f=1} else printf $0}END{printf "\n"}' > ${output}
    contigs_fasta = 'assembly/contigs_trinity.fasta'
    hp.fastajoinlines(trinity_fasta, contigs_fasta, 'contig')

    # compute simple contig length distribution
    ahp.computedistrib(contigs_fasta, 'assembly/contigs.distrib.txt')

    if not int(args.noclean):
        # remove the directory Trinity actually wrote to, rather than a
        # hard-coded name that only matches the default output directory
        cmd = 'rm -rf {args.outputdir}'.format(args=args)
        hp.run_cmd(cmd, args.verbose, 0)

    hp.echostep(args.step, start=0)

    # return the name of assembly file
    return contigs_fasta
Ejemplo n.º 2
0
def blastnp(args):
    """Run one blastn or blastp search.

    The program named by ``args.whichblast`` is run with ``args.input`` as
    query against ``args.db``; its tabular output (columns ``args.fmt``) is
    redirected to ``<args.outputdir>/blast_<args.sgeid>.result``.

    Args:
        args: namespace providing ``step``, ``whichblast``, ``fmt``,
            ``input``, ``db``, ``threads``, ``outputdir``, ``sgeid`` and
            ``verbose``.
    """

    hp.echostep(args.step)

    # blastp is run with the blastp-fast task; anything else gets no extra flag
    extra_flag = '-task blastp-fast' if args.whichblast == 'blastp' else ''

    template = '{args.whichblast} -outfmt "6 {args.fmt}" -query {args.input} -db {args.db} -num_threads {args.threads} {flag} > {args.outputdir}/blast_{args.sgeid}.result'
    hp.run_cmd(template.format(args=args, flag=extra_flag), args.verbose, 0)

    hp.echostep(args.step, start=0)
Ejemplo n.º 3
0
def discovery(args):
    """Find ORFs in the input and optionally blastp them.

    ORFs are written to ``<args.outputdir>/orf.fa``.  When ``args.blast`` is
    truthy, ``blast_wrapper.py`` is launched on that file with blastp,
    writing into ``<args.outputdir>/blast``.

    Args:
        args: namespace providing ``step``, ``outputdir``, ``input``,
            ``threshold``, ``blast``, ``scripts``, ``db``, ``id``,
            ``noclean`` and ``verbose``.
    """

    hp.echostep(args.step)

    # show the arguments followed by a blank line; print('') emits the blank
    # line whether print is a statement or a function
    print(args)
    print('')

    hp.mkdirp(args.outputdir)

    orf_fasta = args.outputdir + '/orf.fa'
    blast_dir = args.outputdir + '/blast'

    # find ORFs
    hp.getorf(args.input, orf_fasta, args.threshold)

    # check if output exists
    hp.check_file_exists_and_nonzero(orf_fasta)

    # blast the discovered ORFs only if requested
    if args.blast:

        hp.mkdirp(blast_dir)

        # blastp via the wrapper script; the logs directory name and the
        # threshold of 100 are fixed for this step
        cmd = '{}/scripts/blast_wrapper.py --scripts {} --outputdir {} -i {} --logsdir {} --whichblast {} --threshold {} --db {} --id {} --noclean {}'.format(
                  args.scripts,
                  args.scripts,
                  blast_dir,
                  orf_fasta,
                  'logs_blast2',
                  'blastp',
                  100,
                  args.db,
                  args.id,
                  args.noclean,
        )
        hp.run_cmd(cmd, args.verbose, 1)

    hp.echostep(args.step, start=0)
Ejemplo n.º 4
0
def makerep(args):
    """Make the contig-level and taxon-level reports from BLAST results.

    The goal of this function is to filter blast results based on taxid:
    hits to human (taxid 9606) or to any taxid in the optional blacklist
    (which contains non-pathogens) are dropped, as are lines containing
    ``PREDICTED``.

    Outputs, all under ``args.outputdir``:
      * ``blast.topfilter.unsort.txt`` - one row per retained hit (removed at
        the end unless ``args.noclean`` is truthy);
      * ``args.contigreport`` - the same rows sorted with ``sort``;
      * ``args.taxonreport`` - one row per taxid with summed reads, number of
        contigs, the id and length of the longest contig, and reads per
        million host reads;
      * an HTML report via ``makeHTML.generateHTML`` unless
        ``args.taxid2names`` is the string ``'None'``.

    Args:
        args: namespace providing ``step``, ``outputdir``, ``blacklist``,
            ``id2reads``, ``header``, ``input``, ``input2``, ``id``,
            ``contigreport``, ``taxonreport``, ``taxid2names``, ``verbose``
            and ``noclean`` (plus whatever ``get_reads_mapped_to_host``
            reads).
    """

    hp.echostep(args.step)

    # show the arguments followed by a blank line; print('') emits the blank
    # line whether print is a statement or a function
    print(args)
    print('')

    hp.mkdirp(args.outputdir)

    # defaults
    # taxid corresponding to Homo sapiens
    humantaxid = '9606'
    # desired header
    desiredfields = ['qseqid', 'sseqid', 'qlen', 'saccver', 'staxids', 'evalue', 'bitscore', 'stitle']

    # taxid blacklist
    filterlist = set()
    # id 2 #reads dict (output of samtools idxstats)
    idx = {}
    # maps taxon to: 'sum' of reads, 'num' contigs, 'longest' contig id,
    # 'longestlength' of that contig
    taxonstats = collections.defaultdict(dict)

    # load blacklist if supplied
    if args.blacklist:
        with open(args.blacklist, 'r') as f:
            # final newline causes an empty element, hence the filter
            filterlist = set(i for i in f.read().split('\n') if i)

    # load idx file; best effort, the report is still produced without counts
    try:
        with open(args.id2reads, 'r') as f:
            for line in f:
                cols = line.split()
                # map id (column 1) to #reads (column 3)
                idx[cols[0]] = cols[2]
    except Exception:
        print('[Warning] Failed to load id2reads file')

    # load header (whitespace-separated column names)
    with open(args.header, 'r') as f:
        header = f.read().split()

    # positions and names of the desired columns, in file order
    myindicies = []
    myfields = []
    for j, k in enumerate(header):
        if k in desiredfields:
            myindicies.append(j)
            myfields.append(k)

    # positions of taxid, qseqid and qlen within the selected columns
    taxidindex = myfields.index('staxids')
    qseqidindex = myfields.index('qseqid')
    qlenindex = myfields.index('qlen')

    # list of input files
    myfiles = [args.input]
    # if there's a valid file produced by the ORF discovery step, examine that file also
    if hp.check_path_bool(args.input2):
        myfiles.append(args.input2)

    unsorted_report = args.outputdir + '/blast.topfilter.unsort.txt'

    # write file keyed on contig
    with open(unsorted_report, 'w') as f:
        f.write('sampleid\t' + '\t'.join(myfields) + '\tnum_reads\n')
        # loop through inputs (blast, ORF discovery)
        for myfile in myfiles:
            with open(myfile, 'r') as g:
                for line in g:
                    # don't want lines with predicted (as opposed to real) genes
                    if 'PREDICTED' in line:
                        continue
                    # split once, then pick the desired columns
                    columns = line.split('\t')
                    fields = [columns[i].strip() for i in myindicies]
                    taxid = fields[taxidindex]
                    qseqid = fields[qseqidindex]
                    qlen = fields[qlenindex]
                    # bypass human taxids or specifically filtered taxids
                    if taxid == humantaxid or taxid in filterlist:
                        continue
                    # multiple taxids not supported
                    if ';' in taxid:
                        print('[WARNING] semicolon detected: multiple taxids not supported')
                    # read count for this contig, '-' when unknown
                    readcounts = idx.get(qseqid, '-')
                    f.write(args.id + '\t' + '\t'.join(fields) + '\t' + readcounts + '\n')

                    stats = taxonstats[taxid]
                    stats['num'] = stats.get('num', 0) + 1
                    # record id and length together so that 'longest' really
                    # names the longest contig, not merely the last one seen
                    contiglen = int(qlen)
                    if 'longest' not in stats or contiglen > stats['longestlength']:
                        stats['longest'] = qseqid
                        stats['longestlength'] = contiglen
                    # once any contig of a taxon lacks a count the sum is
                    # unknown ('-'); never add an int to the '-' marker
                    if readcounts == '-' or stats.get('sum') == '-':
                        stats['sum'] = '-'
                    else:
                        stats['sum'] = stats.get('sum', 0) + int(readcounts)

    # sort by staxids then qlen (with bash)
    # careful: you're including the header in the file (make sure it's sorted properly)
    cmd = 'sort -k5,5n -k6,6nr {args.outputdir}/blast.topfilter.unsort.txt > {args.outputdir}/{args.contigreport}'.format(args=args)
    hp.run_cmd(cmd, args.verbose, 0)

    # get num reads to host
    num_reads_to_host = get_reads_mapped_to_host(args)
    if args.verbose:
        print('Number of reads properly mapped to host: {reads}'.format(reads=num_reads_to_host))

    # write file keyed on taxon
    with open(args.outputdir + '/' + args.taxonreport, 'w') as f:
        f.write('sampleid\ttaxid\tsum_reads\tnum_contigs\tid_longest_contig\tlen_longest_contig\tpathogen_reads/(host_reads/10^6)\n')
        for taxid in taxonstats:
            stats = taxonstats[taxid]
            # pathogen reads per million host reads; '-' when the host count
            # is zero/missing or the taxon's read sum is unknown
            sumdivhost = '-'
            if num_reads_to_host and stats['sum'] != '-':
                sumdivhost = str(round(1000000 * stats['sum'] / float(num_reads_to_host), 2))
            mytaxonattributes = [str(stats['sum']),
                                 str(stats['num']),
                                 stats['longest'],
                                 str(stats['longestlength']),
                                 sumdivhost]
            f.write(args.id + '\t' + taxid + '\t' + '\t'.join(mytaxonattributes) + '\n')

    # generate and write html report (best effort: a failure is reported
    # but does not abort the step)
    if args.taxid2names != 'None':
        try:
            makeHTML.generateHTML(args.outputdir + '/' + args.taxonreport, args.taxid2names, args.outputdir)
        except Exception as err:
            print('[ERROR] makeHTML failed.')
            print(err)
    else:
        print('[WARNING] missing names.dmp, the file mapping taxids to names. HTML report will not be generated.')

    if not args.noclean:
        os.remove(unsorted_report)

    hp.echostep(args.step, start=0)
Ejemplo n.º 5
0
def hostsep(args):
    """Separate host reads from the input by mapping with STAR, then Bowtie2.

    The input reads are aligned with STAR; reads left unmapped are extracted
    to FASTQ and aligned with Bowtie2; reads still unmapped are extracted,
    length-filtered with ``args.readlenfilter`` and gzipped as
    ``unmapped_<mate>.fastq`` in ``args.outputdir``.  Line counts of the
    input and of both intermediate mate-1 FASTQ files are collected in
    ``mapping_percent.txt``.  If ``args.gtf`` is set, featureCounts is run on
    the STAR alignment.  Intermediate files are deleted unless
    ``args.noclean`` is truthy.

    Args:
        args: namespace providing ``gzip``, ``mate1``, ``mate2``, ``single``,
            ``outputdir``, ``threads``, ``refstar``, ``refbowtie``,
            ``scripts``, ``readlenfilter``, ``gtf``, ``olog``, ``elog``,
            ``noclean``, ``verbose`` and ``step``.
    """

    paired = not args.single

    # gzipped input: STAR reads through zcat, and the line count goes through gunzip
    if args.gzip:
        starflag, gunzip_pipe = '--readFilesCommand zcat', 'gunzip |'
    else:
        starflag, gunzip_pipe = '', ''

    print('Counting input reads')
    count_template = 'cat {args.mate1} | {gunzip_pipe} wc -l | tr "\n" " " > {args.outputdir}/mapping_percent.txt'
    hp.run_cmd(count_template.format(args=args, gunzip_pipe=gunzip_pipe), args.verbose, 0)
    hp.run_cmd('echo {args.mate1} >> {args.outputdir}/mapping_percent.txt'.format(args=args), args.verbose, 0)

    print('STAR mapping commenced')

    # Ioan: STAR option --outFilterMultimapNmax 1 to only output alignments if a read uniquely maps to reference;
    # HERE: allow up to 10 multi-hits reported --> will not affect feature counts downstream, nor Pandora results (b/c human multi-mapping here)
    # This option should not modify the downstream counts (of genes) with featureCounts,
    # which only counts features that are uniquely mapped (per BAM input marking info)
    if paired:
        star_template = 'STAR --runThreadN {args.threads} --genomeDir {args.refstar} --readFilesIn {args.mate1} {args.mate2} --outFileNamePrefix {args.outputdir}/ --outSAMtype BAM Unsorted --outFilterMultimapNmax 10 --outSAMunmapped Within {starflag}'
    else:
        star_template = 'STAR --runThreadN {args.threads} --genomeDir {args.refstar} --readFilesIn {args.mate1} --outFileNamePrefix {args.outputdir}/ --outSAMtype BAM Unsorted --outFilterMultimapNmax 10 --outSAMunmapped Within {starflag}'
    hp.run_cmd(star_template.format(args=args, starflag=starflag), args.verbose, 0)

    print('STAR mapping finished')

    print('find unmapped reads')

    hp.run_cmd('samtools flagstat {args.outputdir}/Aligned.out.bam > {args.outputdir}/mapping_stats.STAR.txt'.format(args=args), args.verbose, 0)

    # paired-end: bin(13) = '0b1101', i.e. SAM flag bits
    # read paired; read unmapped; mate unmapped
    # single-end: the unmapped flag alone is 4
    ## Samtools version compatibility issues: -o flag for output and .bam need to be specified
    if paired:
        star_unmapped_template = 'samtools view -b -f 13 {args.outputdir}/Aligned.out.bam | samtools sort -n -o {args.outputdir}/star_unmapped.bam'
    else:
        star_unmapped_template = 'samtools view -b -f 4 {args.outputdir}/Aligned.out.bam | samtools sort -n -o {args.outputdir}/star_unmapped.bam'
    hp.run_cmd(star_unmapped_template.format(args=args), args.verbose, 0)

    hp.run_cmd('samtools view {args.outputdir}/star_unmapped.bam | {args.scripts}/scripts/sam2fastq.py {args.outputdir}/star_unmapped {args.single}'.format(args=args), args.verbose, 0)

    hp.run_cmd('wc -l {args.outputdir}/star_unmapped_1.fastq >> {args.outputdir}/mapping_percent.txt'.format(args=args), args.verbose, 0)

    print('Bowtie2 mapping commenced')

    if paired:
        bowtie_template = 'bowtie2 -p {args.threads} -x {args.refbowtie} -1 {args.outputdir}/star_unmapped_1.fastq -2 {args.outputdir}/star_unmapped_2.fastq -S {args.outputdir}/bwt2.sam'
    else:
        bowtie_template = 'bowtie2 -p {args.threads} -x {args.refbowtie} -U {args.outputdir}/star_unmapped_1.fastq -S {args.outputdir}/bwt2.sam'
    # Bowtie2 goes through run_log_cmd with the step's out/err logs
    hp.run_log_cmd(bowtie_template.format(args=args), args.verbose, args.olog, args.elog)

    print('Bowtie2 mapping finished')

    hp.run_cmd('samtools flagstat {args.outputdir}/bwt2.sam > {args.outputdir}/mapping_stats.bwt.txt'.format(args=args), args.verbose, 0)

    print('find unmapped reads')

    ## Samtools version compatibility issues: -o flag for output and .bam need to be specified
    if paired:
        bwt_unmapped_template = 'samtools view -S -b -f 13 {args.outputdir}/bwt2.sam | samtools sort -n -o {args.outputdir}/bwt2_unmapped.bam'
    else:
        bwt_unmapped_template = 'samtools view -S -b -f 4 {args.outputdir}/bwt2.sam | samtools sort -n -o {args.outputdir}/bwt2_unmapped.bam'
    hp.run_cmd(bwt_unmapped_template.format(args=args), args.verbose, 0)

    hp.run_cmd('samtools view {args.outputdir}/bwt2_unmapped.bam | {args.scripts}/scripts/sam2fastq.py {args.outputdir}/bwt2_unmapped {args.single}'.format(args=args), args.verbose, 0)

    hp.run_cmd('wc -l {args.outputdir}/bwt2_unmapped_1.fastq >> {args.outputdir}/mapping_percent.txt'.format(args=args), args.verbose, 0)

    ## rename and zip both mates, or only mate 1 if unpaired reads
    ## Ioan: filter short reads
    mates = ['1', '2'] if paired else ['1']
    for mate in mates:
        # Ioan found Trinity chokes if read length <= jellyfish kmer of 25
        raw_fastq = '{args.outputdir}/bwt2_unmapped_{i}.fastq'.format(args=args, i=mate)
        kept_fastq = '{args.outputdir}/unmapped_{i}.fastq'.format(args=args, i=mate)
        hp.fastqfilter(raw_fastq, kept_fastq, args.readlenfilter)
        ## zipping the files
        hp.run_cmd('gzip {args.outputdir}/unmapped_{i}.fastq'.format(args=args, i=mate), args.verbose, 0)

    # if gtf variable set, get gene coverage
    if args.gtf:
        print('featureCounts commenced')
        counts_cmd = 'featureCounts -a {args.gtf} -o {args.outputdir}/host_gene_counts.txt {args.outputdir}/Aligned.out.bam'.format(args=args)
        hp.run_log_cmd(counts_cmd, args.verbose, args.olog, args.elog)
        print('featureCounts finished')

    # TO DO: make this code more compact
    if not args.noclean:
        print('clean up')
        hp.run_cmd('rm -rf {args.outputdir}/_STARtmp'.format(args=args), args.verbose, 0)
        leftovers = ['Aligned.out.bam', 'Log.*', 'SJ.out.tab', 'star_unmapped.bam', 'star_unmapped_*.fastq', 'bwt2.sam', 'bwt2_unmapped.bam']
        for leftover in leftovers:
            hp.run_cmd('rm {args.outputdir}/{i}'.format(args=args, i=leftover), args.verbose, 0)

    hp.echostep(args.step, start=0)
Ejemplo n.º 6
0
def getunmapped(args):
    """Extract the unmapped reads from an existing BAM file.

    Unmapped reads of ``args.bam`` are name-sorted into ``unmapped.bam``,
    converted to FASTQ, length-filtered with ``args.readlenfilter`` and
    gzipped as ``unmapped_<mate>.fastq`` in ``args.outputdir``.  If
    ``args.gtf`` is set, featureCounts is run on ``args.bam``.  The process
    exits with status 0 when the line count reported for a filtered FASTQ
    equals ``'0'``.

    Args:
        args: namespace providing ``bam``, ``outputdir``, ``single``,
            ``scripts``, ``readlenfilter``, ``gtf``, ``olog``, ``elog``,
            ``noclean``, ``verbose`` and ``step``.
    """

    # fix violations of DRY (modify args variable)

    hp.run_cmd('samtools flagstat {args.bam} > {args.outputdir}/mapping_stats.STAR.txt'.format(args=args), args.verbose, 0)

    print('find unmapped reads')

    # SAM flag 4 selects unmapped single-end reads, 13 unmapped pairs
    extract_template = (
        'samtools view -b -f 4 {args.bam} | samtools sort -n -o {args.outputdir}/unmapped.bam'
        if args.single else
        'samtools view -b -f 13 {args.bam} | samtools sort -n -o {args.outputdir}/unmapped.bam'
    )
    hp.run_cmd(extract_template.format(args=args), args.verbose, 0)

    hp.run_cmd('samtools view {args.outputdir}/unmapped.bam | {args.scripts}/scripts/sam2fastq.py {args.outputdir}/tmp_unmapped {args.single}'.format(args=args), args.verbose, 0)

    # mate 1 is always processed, mate 2 only for paired reads
    mates = ('1',) if args.single else ('1', '2')

    ## filter short reads, then drop the unfiltered temporary file
    for mate in mates:
        tmp_fastq = '{args.outputdir}/tmp_unmapped_{i}.fastq'.format(args=args, i=mate)
        kept_fastq = '{args.outputdir}/unmapped_{i}.fastq'.format(args=args, i=mate)
        hp.fastqfilter(tmp_fastq, kept_fastq, args.readlenfilter)

        hp.run_cmd('rm {args.outputdir}/tmp_unmapped_{i}.fastq'.format(args=args, i=mate), args.verbose, 0)

    # if gtf variable set, get gene coverage
    if args.gtf:
        print('featureCounts commenced')
        counts_cmd = 'featureCounts -a {args.gtf} -o {args.outputdir}/host_gene_counts.txt {args.bam}'.format(args=args)
        hp.run_log_cmd(counts_cmd, args.verbose, args.olog, args.elog)
        print('featureCounts finished')

    # check output not empty, then zip each mate file
    for mate in mates:
        numlines = hp.run_cmd('head {args.outputdir}/unmapped_{i}.fastq | wc -l'.format(args=args, i=mate), args.verbose, 1)
        if numlines == '0':
            print('[WARNING] No unmapped reads. Exiting')
            sys.exit(0)
        hp.run_cmd('gzip {args.outputdir}/unmapped_{i}.fastq'.format(args=args, i=mate), args.verbose, 0)

    if not args.noclean:
        print('clean up')
        hp.run_cmd('rm -rf ' + args.outputdir + '/unmapped.bam', args.verbose, 0)

    hp.echostep(args.step, start=0)
Ejemplo n.º 7
0
def blast(args):
    """Run BLAST on the input contigs, locally or as an SGE array job.

    A ``header`` file with the tab-separated ``args.fmt`` column names is
    written to ``args.outputdir``.  With ``args.nosge`` set, contigs at or
    above the length threshold are written to ``blast_1.fasta`` and
    ``blast.py`` is run once in-process order; otherwise the contigs are
    split over several files, ``blast.py`` is submitted with ``qsub`` as an
    array job, and a second ``qsub -sync y`` job holds this script until the
    array job has finished.  In both cases ``concat(args)`` is then called.
    Exits with status 1 when no contig passes the threshold.

    Side effect: ``args.noclean`` is replaced by its ``int`` value.

    Args:
        args: namespace providing ``step``, ``outputdir``, ``noclean``,
            ``fmt``, ``nosge``, ``input``, ``threshold``, ``id``,
            ``scripts``, ``whichblast``, ``db``, ``threads``, ``verbose``
            and, for the qsub path, ``logsdir``, ``filelength``, ``bmem``
            and ``btime``.
    """

    hp.echostep(args.step)

    # show the arguments followed by a blank line; print('') emits the blank
    # line whether print is a statement or a function
    print(args)
    print('')

    hp.mkdirp(args.outputdir)

    # normalised here because concat() tests it for truthiness
    args.noclean = int(args.noclean)

    # generate header
    with open(args.outputdir + '/header', 'w') as f:
        f.write(args.fmt.replace(' ', '\t') + '\n')

    # if no qsub
    if args.nosge:
        # filter fasta file on contigs above threshold length and hardcode name for blast.py
        # (splitting files doesnt make sense if no cluster)
        filecount = hp.fastafilter(args.input, args.outputdir + '/blast_1.fasta', args.threshold)

        if filecount == 0:
            print("No contigs above threshold. Exiting")
            sys.exit(1)

        # define log files
        logs_out = args.outputdir + '/' + 'blast_log_' + args.id + '.o'
        logs_err = args.outputdir + '/' + 'blast_log_' + args.id + '.e'
        # define command
        cmd = '{args.scripts}/scripts/blast.py --scripts {args.scripts} --outputdir {args.outputdir} --whichblast {args.whichblast} --db {args.db} --threads {args.threads} --fmt "{args.fmt}" --sgeid 1 > {o} 2> {e}'.format(args=args, o=logs_out, e=logs_err)
        hp.run_cmd(cmd, args.verbose, 0)

        # now filter blast results
        concat(args)

    # if qsub
    else:
        hp.mkdirp(args.logsdir)

        # split fasta file on contigs above threshold length (and return number of contigs, file count)
        (numcontigs, filecount) = hp.fastasplit2(args.input, args.outputdir + '/blast', args.threshold, args.filelength)

        if filecount == 0:
            print("No contigs above threshold. Exiting")
            sys.exit(1)
        else:
            print("There are " + str(numcontigs) + " contigs above threshold, and " + str(filecount) + " files to blast.")

        # qsub part of command (array job, one task per split file)
        qcmd = 'qsub -S {mypython} -N bc_{args.id} -e {args.logsdir} -o {args.logsdir} -l mem={args.bmem}G,time={args.btime}:: -t 1-{filecount} '.format(mypython=sys.executable, args=args, filecount=filecount)
        # regular part of command
        cmd = '{args.scripts}/scripts/blast.py --scripts {args.scripts} --outputdir {args.outputdir} --whichblast {args.whichblast} --db {args.db} --threads {args.threads} --fmt "{args.fmt}"'.format(args=args)
        if args.verbose:
            print(qcmd + cmd)
        # universal_newlines=True makes check_output return text rather than
        # bytes on Python 3, so the message prints cleanly and can be parsed
        message = subprocess.check_output(qcmd + cmd, shell=True, universal_newlines=True)
        print(message)
        # get job id
        jid = hp.getjid(message)

        # hold the script up here, until all the blast jobs finish:
        # a trivial job held on the array job id, submitted with -sync y
        qcmd = 'qsub -V -b y -cwd -o log.out -e log.err -N wait_{args.id} -hold_jid {jid} -sync y echo wait_here'.format(args=args, jid=jid)
        message = subprocess.check_output(qcmd, shell=True, universal_newlines=True)
        print(message)

        # now concatenate and filter blast results
        concat(args)

    hp.echostep(args.step, start=0)
Ejemplo n.º 8
0
def _iter_result_lines(paths):
    """Yield every line of every file in *paths*, in the given order."""
    for path in paths:
        with open(path, 'r') as handle:
            for line in handle:
                yield line


def concat(args):
    """
    Concatenate blast files and logs, so as not to leave many files messily scattered about.
    Also, implement Ioan's filtering.

    Files written under ``args.outputdir``:
      * ``concat.txt`` - all ``*.result`` files concatenated;
      * ``above_threshold.fa`` - all ``*.fasta`` files concatenated;
      * ``top.concat.txt`` - the first hit seen for each query id;
      * ``ifilter.concat.txt`` - the top hits that survive the filter: a top
        hit is dropped when its taxid equals the module-level ``humantaxid``
        or when ``ioanfilter`` returns true for any of the next ``topchunk``
        hits of the same query;
      * ``no_blastn.fa`` - entries of ``above_threshold.fa`` whose id never
        appeared in a result file;
      * ``log.blast`` - the first 100 lines of each file in ``args.logsdir``.

    Unless ``args.noclean`` is truthy, the ``*.result`` / ``*.fasta`` files
    and ``args.logsdir`` are removed.

    Args:
        args: namespace providing ``outputdir``, ``logsdir``, ``noclean``
            and ``verbose``.
    """

    print('CONCATENATE START')

    # file of all blast hits
    cmd = 'cat {args.outputdir}/*.result > {args.outputdir}/concat.txt'.format(args=args)
    hp.run_cmd(cmd, args.verbose, 0)
    # all fasta entries
    cmd = 'cat {args.outputdir}/*.fasta > {args.outputdir}/above_threshold.fa'.format(args=args)
    hp.run_cmd(cmd, args.verbose, 0)

    # set of all input IDs from the concatenated fasta file
    # (header line minus the leading '>')
    with open(args.outputdir + '/above_threshold.fa', 'r') as g:
        allids = set(line[1:].rstrip('\n') for line in g if line.startswith('>'))

    # set of IDs seen so far
    seenids = set()

    # Ioan: Top number of BLAST hits to parse through in order to determine whether top hit can be trusted as truly non-human
    topchunk = 10
    # number of non-top hits examined for the current query
    minicounter = 0
    # the current top hit (stripped), None until the first hit is read
    topline = None
    # bit score of the current top hit
    topbitscore = None
    # a filtering boolean (if true, filter out line)
    filterbool = False

    # column positions below assume fmt is:
    # qseqid, sseqid, saccver, staxids, pident, nident, length, mismatch, gapopen, gaps, qstart, qend, qlen, qframe, qcovs, sstart, send, slen, sframe, sstrand, evalue, bitscore, stitle
    myfiles = glob.glob(args.outputdir + '/*.result')

    # the files are opened explicitly (rather than via fileinput) so that an
    # empty file list yields no lines instead of falling back to stdin
    with open(args.outputdir + '/top.concat.txt', 'w') as tophitsfile, \
            open(args.outputdir + '/ifilter.concat.txt', 'w') as ifilterfile:
        for line in _iter_result_lines(myfiles):
            linelist = line.strip().split()
            # tolerate blank lines instead of failing on linelist[0]
            if not linelist:
                continue
            myid = linelist[0]
            # ID not yet seen (i.e., is top hit)
            if myid not in seenids:
                tophitsfile.write(line)
                seenids.add(myid)

                # flush the previous query's top hit if it survived the filter
                # (nothing to flush on the very first hit)
                if topline is not None and not filterbool:
                    ifilterfile.write(topline + '\n')

                topbitscore = float(linelist[21])
                topstaxids = linelist[3]
                topline = line.strip()

                # reset counter
                minicounter = 0
                # reset boolean (if tophit human, preemptively filter)
                filterbool = (topstaxids == humantaxid)
            # keep on checking results if filter flag hasn't gone high and #lines < topchunk
            elif (not filterbool) and minicounter < topchunk:
                staxids = linelist[3]
                evalue = float(linelist[20])
                bitscore = float(linelist[21])
                filterbool = ioanfilter(staxids, evalue, bitscore, topbitscore)
                minicounter += 1

        # flush the last query; guarded so that an empty result set does not
        # try to write None, and newline-terminated like every other entry
        if topline is not None and not filterbool:
            ifilterfile.write(topline + '\n')

    # set of IDs that didn't blast
    noblastids = allids - seenids

    # get fasta file of entries that didn't blast
    hp.fastaidfilter(args.outputdir + '/above_threshold.fa', args.outputdir + '/no_blastn.fa', noblastids)

    if not args.noclean:
        cmd = 'rm {args.outputdir}/*.result {args.outputdir}/*.fasta'.format(args=args)
        hp.run_cmd(cmd, args.verbose, 0)

    print('No blast hits for: ' + ', '.join(noblastids))

    # concat blast logs and remove folder
    print('concatenate blast logs')
    cmd = 'head -100 {args.logsdir}/* > {args.outputdir}/log.blast'.format(args=args)
    hp.run_cmd(cmd, args.verbose, 0)

    # NOTE(review): rmtree raises if args.logsdir does not exist - confirm
    # every caller creates it before calling concat
    if not args.noclean:
        shutil.rmtree(args.logsdir)

    print('CONCATENATE END')
Ejemplo n.º 9
0
def remap(args, contigs):
    """Map the reads back onto the assembled contigs.

    A Bowtie2 index is built from *contigs* under ``assembly/ref_remap``, the
    reads are aligned to it, and the alignment is converted to a sorted,
    indexed BAM (``assembly/reads2contigs.bam``).  ``samtools idxstats``
    output goes to ``assembly/reads2contigs.stats.txt`` and a pileup is
    post-processed by ``ahp.formatpileup`` into
    ``assembly/reads2contigs.format.pileup`` and
    ``assembly/reads2contigs.entropy``.  Unless ``int(args.noclean)`` is
    non-zero, the index directory and the raw pileup are removed.

    Args:
        args: namespace providing ``single``, ``mate1``, ``mate2``
            (paired-end only), ``verbose`` and ``noclean``.
        contigs: path of the contig FASTA file used as reference.
    """

    hp.echostep('remap')

    hp.mkdirp('assembly/ref_remap')

    refbowtie = "assembly/ref_remap/ref"

    cmd = 'bowtie2-build {} {}'.format(contigs, refbowtie)
    hp.run_cmd(cmd, args.verbose, 0)

    # Bowtie2 is run with a fixed 4 threads here
    if args.single:
        cmd = 'bowtie2 -p 4 -x {} -U {} -S {}'.format(refbowtie, args.mate1, 'assembly/reads2contigs.sam')
    else:
        cmd = 'bowtie2 -p 4 -x {} -1 {} -2 {} -S {}'.format(refbowtie, args.mate1, args.mate2, 'assembly/reads2contigs.sam')
    hp.run_cmd(cmd, args.verbose, 0)

    # convert to bam
    ## samtools version compatibility: need .bam extension
    cmd = 'samtools view -bS assembly/reads2contigs.sam | samtools sort -o assembly/reads2contigs.bam'
    hp.run_cmd(cmd, args.verbose, 0)

    cmd = 'samtools index assembly/reads2contigs.bam'
    hp.run_cmd(cmd, args.verbose, 0)

    cmd = 'rm assembly/reads2contigs.sam'
    hp.run_cmd(cmd, args.verbose, 0)

    # BAM index stats
    cmd = 'samtools idxstats assembly/reads2contigs.bam > assembly/reads2contigs.stats.txt'
    hp.run_cmd(cmd, args.verbose, 0)

    # mpileup; the reference must be the same FASTA the index was built
    # from, so use the contigs argument rather than a hard-coded path
    cmd = 'samtools mpileup -A -B -d 100000 -L 100000 -f {} assembly/reads2contigs.bam > assembly/reads2contigs.pileup'.format(contigs)
    hp.run_cmd(cmd, args.verbose, 0)

    # format pileup file - i.e., add zeros to uncovered positions
    ahp.formatpileup('assembly/reads2contigs.pileup', 'assembly/reads2contigs.stats.txt', 'assembly/reads2contigs.format.pileup', 'assembly/reads2contigs.entropy')

    if not int(args.noclean):
        cmd = 'rm -r assembly/ref_remap'
        hp.run_cmd(cmd, args.verbose, 0)
        cmd = 'rm assembly/reads2contigs.pileup'
        hp.run_cmd(cmd, args.verbose, 0)

    hp.echostep('remap', start=0)
Ejemplo n.º 10
0
def hostsep(args):
    """Separate host reads: STAR mapping followed by Bowtie2 mapping.

    Reads that STAR leaves unmapped are converted to FASTQ and mapped with
    Bowtie2; reads still unmapped afterwards are length-filtered with
    ``args.readlenfilter`` and gzipped as ``unmapped_<mate>.fastq`` in
    ``args.outputdir``.  Line counts of the input and of the two
    intermediate mate-1 FASTQ files are gathered in ``mapping_percent.txt``.
    featureCounts runs on the STAR alignment when ``args.gtf`` is set, and
    intermediate files are removed unless ``args.noclean`` is truthy.

    Args:
        args: namespace providing ``gzip``, ``mate1``, ``mate2``, ``single``,
            ``outputdir``, ``threads``, ``refstar``, ``refbowtie``,
            ``scripts``, ``readlenfilter``, ``gtf``, ``olog``, ``elog``,
            ``noclean``, ``verbose`` and ``step``.
    """

    def shell(template, **extra):
        # fill the template from args (plus any extra fields) and run it
        hp.run_cmd(template.format(args=args, **extra), args.verbose, 0)

    def logged_shell(template):
        # same, but through run_log_cmd with the step's out/err logs
        hp.run_log_cmd(template.format(args=args), args.verbose, args.olog,
                       args.elog)

    # for gzipped input STAR reads through zcat and the count uses gunzip
    gzipped = bool(args.gzip)
    starflag = '--readFilesCommand zcat' if gzipped else ''
    gunzip_pipe = 'gunzip |' if gzipped else ''

    print('Counting input reads')
    shell(
        'cat {args.mate1} | {gunzip_pipe} wc -l | tr "\n" " " > {args.outputdir}/mapping_percent.txt',
        gunzip_pipe=gunzip_pipe)
    shell('echo {args.mate1} >> {args.outputdir}/mapping_percent.txt')

    print('STAR mapping commenced')

    # Ioan: STAR option --outFilterMultimapNmax 1 to only output alignments if a read uniquely maps to reference;
    # HERE: allow up to 10 multi-hits reported --> will not affect feature counts downstream, nor Pandora results (b/c human multi-mapping here)
    # This option should not modify the downstream counts (of genes) with featureCounts,
    # which only counts features that are uniquely mapped (per BAM input marking info)
    if args.single:
        shell(
            'STAR --runThreadN {args.threads} --genomeDir {args.refstar} --readFilesIn {args.mate1} --outFileNamePrefix {args.outputdir}/ --outSAMtype BAM Unsorted --outFilterMultimapNmax 10 --outSAMunmapped Within {starflag}',
            starflag=starflag)
    else:
        shell(
            'STAR --runThreadN {args.threads} --genomeDir {args.refstar} --readFilesIn {args.mate1} {args.mate2} --outFileNamePrefix {args.outputdir}/ --outSAMtype BAM Unsorted --outFilterMultimapNmax 10 --outSAMunmapped Within {starflag}',
            starflag=starflag)

    print('STAR mapping finished')

    print('find unmapped reads')

    shell(
        'samtools flagstat {args.outputdir}/Aligned.out.bam > {args.outputdir}/mapping_stats.STAR.txt'
    )

    # single-end: the unmapped flag is 4
    # paired-end: bin(13) = '0b1101', i.e. SAM flag bits
    # read paired; read unmapped; mate unmapped
    ## Samtools version compatibility issues: -o flag for output and .bam need to be specified
    if args.single:
        shell(
            'samtools view -b -f 4 {args.outputdir}/Aligned.out.bam | samtools sort -n -o {args.outputdir}/star_unmapped.bam'
        )
    else:
        shell(
            'samtools view -b -f 13 {args.outputdir}/Aligned.out.bam | samtools sort -n -o {args.outputdir}/star_unmapped.bam'
        )

    shell(
        'samtools view {args.outputdir}/star_unmapped.bam | {args.scripts}/scripts/sam2fastq.py {args.outputdir}/star_unmapped {args.single}'
    )

    shell(
        'wc -l {args.outputdir}/star_unmapped_1.fastq >> {args.outputdir}/mapping_percent.txt'
    )

    print('Bowtie2 mapping commenced')

    if args.single:
        logged_shell(
            'bowtie2 -p {args.threads} -x {args.refbowtie} -U {args.outputdir}/star_unmapped_1.fastq -S {args.outputdir}/bwt2.sam'
        )
    else:
        logged_shell(
            'bowtie2 -p {args.threads} -x {args.refbowtie} -1 {args.outputdir}/star_unmapped_1.fastq -2 {args.outputdir}/star_unmapped_2.fastq -S {args.outputdir}/bwt2.sam'
        )

    print('Bowtie2 mapping finished')

    shell(
        'samtools flagstat {args.outputdir}/bwt2.sam > {args.outputdir}/mapping_stats.bwt.txt'
    )

    print('find unmapped reads')

    ## Samtools version compatibility issues: -o flag for output and .bam need to be specified
    if args.single:
        shell(
            'samtools view -S -b -f 4 {args.outputdir}/bwt2.sam | samtools sort -n -o {args.outputdir}/bwt2_unmapped.bam'
        )
    else:
        shell(
            'samtools view -S -b -f 13 {args.outputdir}/bwt2.sam | samtools sort -n -o {args.outputdir}/bwt2_unmapped.bam'
        )

    shell(
        'samtools view {args.outputdir}/bwt2_unmapped.bam | {args.scripts}/scripts/sam2fastq.py {args.outputdir}/bwt2_unmapped {args.single}'
    )

    shell(
        'wc -l {args.outputdir}/bwt2_unmapped_1.fastq >> {args.outputdir}/mapping_percent.txt'
    )

    ## rename and zip both mates, or only mate 1 if unpaired reads
    ## Ioan: filter short reads
    for mate in ('1', '2'):
        if mate == '2' and args.single:
            continue
        # Ioan found Trinity chokes if read length <= jellyfish kmer of 25
        source_fastq = '{args.outputdir}/bwt2_unmapped_{i}.fastq'.format(
            args=args, i=mate)
        target_fastq = '{args.outputdir}/unmapped_{i}.fastq'.format(
            args=args, i=mate)
        hp.fastqfilter(source_fastq, target_fastq, args.readlenfilter)
        ## zipping the files
        shell('gzip {args.outputdir}/unmapped_{i}.fastq', i=mate)

    # if gtf variable set, get gene coverage
    if args.gtf:
        print('featureCounts commenced')
        logged_shell(
            'featureCounts -a {args.gtf} -o {args.outputdir}/host_gene_counts.txt {args.outputdir}/Aligned.out.bam'
        )
        print('featureCounts finished')

    # TO DO: make this code more compact
    if not args.noclean:
        print('clean up')
        shell('rm -rf {args.outputdir}/_STARtmp')
        for pattern in (
                'Aligned.out.bam', 'Log.*', 'SJ.out.tab', 'star_unmapped.bam',
                'star_unmapped_*.fastq', 'bwt2.sam', 'bwt2_unmapped.bam'):
            shell('rm {args.outputdir}/{i}', i=pattern)

    hp.echostep(args.step, start=0)
Ejemplo n.º 11
0
def getunmapped(args):
    """Extract the unmapped reads of an existing .bam file as gzipped fastq.

    The work is done by shell commands run through the ``hp`` helpers:

    1. ``samtools flagstat`` on ``args.bam`` is written to
       ``mapping_stats.STAR.txt`` in ``args.outputdir``.
    2. Reads are selected with ``samtools view -f 4`` (single-end) or
       ``-f 13`` (paired-end), name-sorted into ``unmapped.bam`` and piped
       through ``sam2fastq.py`` to produce ``tmp_unmapped_<mate>.fastq``.
    3. Each mate file is passed through ``hp.fastqfilter`` with
       ``args.readlenfilter`` to give ``unmapped_<mate>.fastq``; the
       temporary file is then removed.
    4. If ``args.gtf`` is set, ``featureCounts`` is run on ``args.bam``.
    5. Each filtered fastq is checked for content and gzipped.
    6. Unless ``args.noclean`` is set, ``unmapped.bam`` is deleted.

    Args:
        args: namespace-like object; the attributes read here are ``bam``,
            ``outputdir``, ``scripts``, ``single``, ``readlenfilter``,
            ``gtf``, ``olog``, ``elog``, ``verbose``, ``noclean`` and
            ``step``.

    Note:
        Calls ``sys.exit(0)`` when the line count reported for a filtered
        fastq equals the string ``'0'``.
    """

    # TODO: fix violations of DRY (modify args variable)

    # overall mapping statistics of the input bam
    flagstat_cmd = 'samtools flagstat {args.bam} > {args.outputdir}/mapping_stats.STAR.txt'.format(
        args=args)
    hp.run_cmd(flagstat_cmd, args.verbose, 0)

    print('find unmapped reads')

    # SAM flag 4 = read unmapped; 13 = 1 + 4 + 8 = paired, read unmapped and
    # mate unmapped. The mate suffixes to process follow from the same choice.
    if args.single:
        mates = ['1']
        extract_cmd = 'samtools view -b -f 4 {args.bam} | samtools sort -n -o {args.outputdir}/unmapped.bam'.format(
            args=args)
    else:
        mates = ['1', '2']
        extract_cmd = 'samtools view -b -f 13 {args.bam} | samtools sort -n -o {args.outputdir}/unmapped.bam'.format(
            args=args)
    hp.run_cmd(extract_cmd, args.verbose, 0)

    # convert the name-sorted unmapped reads to fastq
    tofastq_cmd = 'samtools view {args.outputdir}/unmapped.bam | {args.scripts}/scripts/sam2fastq.py {args.outputdir}/tmp_unmapped {args.single}'.format(
        args=args)
    hp.run_cmd(tofastq_cmd, args.verbose, 0)

    # filter short reads, then drop the unfiltered temporary fastq
    for mate in mates:
        raw_fastq = '{args.outputdir}/tmp_unmapped_{i}.fastq'.format(
            args=args, i=mate)
        kept_fastq = '{args.outputdir}/unmapped_{i}.fastq'.format(
            args=args, i=mate)
        hp.fastqfilter(raw_fastq, kept_fastq, args.readlenfilter)

        remove_cmd = 'rm {args.outputdir}/tmp_unmapped_{i}.fastq'.format(
            args=args, i=mate)
        hp.run_cmd(remove_cmd, args.verbose, 0)

    # gene coverage is only computed when a gtf annotation was supplied
    if args.gtf:
        print('featureCounts commenced')
        counts_cmd = 'featureCounts -a {args.gtf} -o {args.outputdir}/host_gene_counts.txt {args.bam}'.format(
            args=args)
        hp.run_log_cmd(counts_cmd, args.verbose, args.olog, args.elog)
        print('featureCounts finished')

    # make sure each mate file has content before compressing it
    for mate in mates:
        count_cmd = 'head {args.outputdir}/unmapped_{i}.fastq | wc -l'.format(
            args=args, i=mate)
        line_count = hp.run_cmd(count_cmd, args.verbose, 1)
        if line_count == '0':
            print('[WARNING] No unmapped reads. Exiting')
            sys.exit(0)

        gzip_cmd = 'gzip {args.outputdir}/unmapped_{i}.fastq'.format(
            args=args, i=mate)
        hp.run_cmd(gzip_cmd, args.verbose, 0)

    # remove the intermediate bam unless the user asked to keep it
    if not args.noclean:
        print('clean up')
        cleanup_cmd = 'rm -rf ' + args.outputdir + '/' + 'unmapped.bam'
        hp.run_cmd(cleanup_cmd, args.verbose, 0)

    hp.echostep(args.step, start=0)