Esempio n. 1
0
def runDiamond(input, query, cpus, output):
    '''
    Run a two-step DIAMOND search by way of lib.runSubprocess4.

    Step one calls `diamond makedb` on `query` to create a database named
    'diamond'; step two calls `diamond blastx` with `input` as the query
    against that database, writing tabular hits to 'diamond.matches.tab'.

    input  -- file handed to `diamond blastx -q`
    query  -- file handed to `diamond makedb --in`
    cpus   -- thread count used for both DIAMOND invocations
    output -- forwarded unchanged as the second argument of
              lib.runSubprocess4 (presumably the directory the commands
              run in -- confirm against lib)
    '''
    threads = str(cpus)
    db_name = 'diamond'
    # columns of the tabular (-f 6) report, in the order they are written
    report_columns = [
        'sseqid', 'slen', 'sstart', 'send', 'qseqid', 'qlen', 'qstart',
        'qend', 'pident', 'length', 'evalue', 'score', 'qcovhsp', 'qframe'
    ]

    # step 1: create DB of protein sequences
    makedb = ['diamond', 'makedb', '--threads', threads]
    makedb += ['--in', query, '--db', db_name]
    lib.runSubprocess4(makedb, output, lib.log)

    # step 2: search the freshly built database
    blastx = ['diamond', 'blastx', '--threads', threads]
    blastx += ['-q', input, '--db', db_name, '-o', 'diamond.matches.tab']
    blastx += ['-e', '1e-10', '-k', '0', '--more-sensitive', '-f', '6']
    blastx.extend(report_columns)
    lib.runSubprocess4(blastx, output, lib.log)
Esempio n. 2
0
def removeAntiSense(input, readTuple, output):
    '''
    function will map reads to the input transcripts, determine strandedness, and then filter
    out transcripts that were assembled in antisense orientation. idea here is that the antisense
    transcripts, while potentially valid, aren't going to help update the gene models and perhaps
    could hurt the annotation effort?

    input     -- FASTA file of assembled transcripts
    readTuple -- indexable of read files: [0] and [1] are passed to the aligner
                 as -1/-2 (paired), [2] is passed as -U (unpaired). The rapmap
                 branch uses only [0] and [1].
    output    -- path of the filtered FASTA file that is written

    Uses the module-level names args (cpus, max_intronlen, stranded), tmpdir,
    parentdir, TRINITY, choose_aligner, lib and SeqIO. An existing BAM file in
    tmpdir for the chosen aligner is reused instead of re-aligning.

    Raises ValueError if choose_aligner() returns an aligner this function
    does not know how to run.
    '''
    lib.log.info("Running anti-sense filtering of Trinity transcripts")
    # use half the number of threads (rounded up) for bam compression threads
    bamthreads = (args.cpus + 1) // 2
    # bash wrapper that receives: aligner command string, thread count, BAM path
    sam2bam = os.path.join(parentdir, 'util', 'sam2bam.sh')
    aligner = choose_aligner()
    if aligner == 'hisat2':
        bamfile = os.path.join(tmpdir, 'hisat2.transcripts.coordSorted.bam')
        if not os.path.isfile(bamfile):
            index = os.path.join(tmpdir, 'hisat2.transcripts')
            lib.log.info("Building Hisat2 index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            lib.runSubprocess4(['hisat2-build', input, index], '.', lib.log)

            #now launch the aligner
            lib.log.info("Aligning reads to trinity transcripts with Hisat2")
            hisat2cmd = [
                'hisat2', '-p',
                str(args.cpus), '-k', '50', '--max-intronlen',
                str(args.max_intronlen), '-x', index
            ]
            if readTuple[2]:
                hisat2cmd += ['-U', readTuple[2]]
            if readTuple[0] and readTuple[1]:
                hisat2cmd += ['-1', readTuple[0], '-2', readTuple[1]]
            cmd = [sam2bam, " ".join(hisat2cmd), str(bamthreads), bamfile]
            lib.runSubprocess4(cmd, '.', lib.log)

    elif aligner == 'bowtie2':
        bamfile = os.path.join(tmpdir, 'bowtie2.transcripts.coordSorted.bam')
        if not os.path.isfile(bamfile):
            index = os.path.join(tmpdir, 'bowtie2.transcripts')
            lib.log.info("Building Bowtie2 index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            lib.runSubprocess4(['bowtie2-build', input, index], '.', lib.log)

            #now launch the aligner
            lib.log.info("Aligning reads to trinity transcripts with Bowtie2")
            bowtie2cmd = [
                'bowtie2', '-p',
                str(args.cpus), '-k', '50', '--local', '--no-unal', '-x',
                index
            ]
            if readTuple[2]:
                bowtie2cmd += ['-U', readTuple[2]]
            if readTuple[0] and readTuple[1]:
                bowtie2cmd += ['-1', readTuple[0], '-2', readTuple[1]]
            cmd = [sam2bam, " ".join(bowtie2cmd), str(bamthreads), bamfile]
            lib.runSubprocess4(cmd, '.', lib.log)

    elif aligner == 'rapmap':
        bamfile = os.path.join(tmpdir, 'rapmap.transcripts.coordSorted.bam')
        if not os.path.isfile(bamfile):
            index = os.path.join(tmpdir, 'rapmap_index')
            lib.log.info("Building RapMap index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            cmd = ['rapmap', 'quasiindex', '-t', input, '-i', index]
            lib.runSubprocess4(cmd, '.', lib.log)

            #now launch the aligner; only the paired reads are given to rapmap
            lib.log.info("Aligning reads to trinity transcripts with RapMap")
            rapmapcmd = [
                'rapmap', 'quasimap', '-t',
                str(args.cpus), '-i', index, '-1', readTuple[0], '-2',
                readTuple[1]
            ]
            cmd = [sam2bam, " ".join(rapmapcmd), str(bamthreads), bamfile]
            # NOTE(review): this branch calls lib.runSubprocess while the other
            # aligners call lib.runSubprocess4 -- kept as found, confirm intent
            lib.runSubprocess(cmd, '.', lib.log)

    else:
        # without this the function would fail further down with an
        # unbound-variable error that hides the real cause
        raise ValueError(
            "Unsupported aligner returned by choose_aligner(): %s" % aligner)

    #now run Trinity examine strandeness tool
    lib.log.info("Examining strand specificity")
    cmd = [
        os.path.join(TRINITY, 'util', 'misc', 'examine_strand_specificity.pl'),
        bamfile,
        os.path.join(tmpdir, 'strand_specific')
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    #parse output dat file and get list of transcripts to remove
    removeList = []
    # plain 'r' already gives universal newlines on Python 3; the old 'rU'
    # mode raises ValueError on Python 3.11+
    with open(os.path.join(tmpdir, 'strand_specific.dat'), 'r') as infile:
        for line in infile:
            line = line.rstrip('\n')
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            if len(cols) < 5:
                #blank or truncated line, there is no ratio column to inspect
                continue
            negative = cols[4].startswith('-')
            if args.stranded == 'RF':  #then we want to keep negative ratios in cols[4]
                if not negative:
                    removeList.append(cols[0])
            elif args.stranded == 'FR':  #keep + values
                if negative:
                    removeList.append(cols[0])

    #now parse the input fasta file removing records in list
    removeSet = set(removeList)  #constant-time lookup for every record
    with open(output, 'w') as outfile:
        with open(input, 'r') as infile:
            for record in SeqIO.parse(infile, 'fasta'):
                if record.id not in removeSet:
                    outfile.write(">%s\n%s\n" %
                                  (record.description, str(record.seq)))
    lib.log.info("Removing %i antisense transcripts" % (len(removeList)))
Esempio n. 3
0
def runTrinityGG(genome, readTuple, output):
    '''
    function will run genome guided Trinity. First step will be to run hisat2 to align reads
    to the genome, then pass that BAM file to Trinity to generate assemblies

    genome    -- genome FASTA file that the hisat2 index is built from
    readTuple -- indexable of read files: [0] and [1] are passed to hisat2 as
                 -1/-2 (paired), [2] is passed as -U (unpaired)
    output    -- forwarded as the last argument of lib.runSubprocess5 when the
                 per-cluster assemblies are aggregated (presumably the file the
                 aggregated transcripts are written to -- confirm against lib)

    Uses the module-level names args (cpus, max_intronlen, stranded, memory,
    jaccard_clip), tmpdir, parentdir, TRINITY, safe_run, find_files and lib.
    Strand-specific options are only added when args.stranded is not 'no' and
    no unpaired reads (readTuple[2]) were supplied.
    '''
    #build hisat2 index of the genome
    lib.log.info("Starting Trinity genome guided")
    lib.log.info("Building Hisat2 genome index")
    hisat2index = os.path.join(tmpdir, 'hisat2.genome')
    cmd = ['hisat2-build', genome, hisat2index]
    lib.runSubprocess4(cmd, '.', lib.log)

    #align reads using hisat2
    lib.log.info("Aligning reads to genome using Hisat2")
    hisat2bam = os.path.join(tmpdir, 'hisat2.coordSorted.bam')
    #use half the number of threads (rounded up) for bam compression threads
    bamthreads = (args.cpus + 1) // 2
    strandSpecific = args.stranded != 'no' and not readTuple[2]
    hisat2cmd = [
        'hisat2', '-p',
        str(args.cpus), '--max-intronlen',
        str(args.max_intronlen), '--dta', '-x', hisat2index
    ]
    if strandSpecific:
        hisat2cmd += ['--rna-strandness', args.stranded]
    if readTuple[0] and readTuple[1]:
        hisat2cmd += ['-1', readTuple[0], '-2', readTuple[1]]
    if readTuple[2]:
        hisat2cmd += ['-U', readTuple[2]]

    #use bash wrapper for samtools piping for SAM -> BAM -> sortedBAM
    cmd = [
        os.path.join(parentdir, 'util', 'sam2bam.sh'), " ".join(hisat2cmd),
        str(bamthreads), hisat2bam
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    #now launch Trinity genome guided
    trinityDir = os.path.join(tmpdir, 'trinity_gg')
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info(
        "Clustering of reads from BAM and preparing assembly commands")
    cmd = ['Trinity']
    if strandSpecific:
        cmd += ['--SS_lib_type', args.stranded]
    cmd += [
        '--no_distributed_trinity_exec', '--genome_guided_bam', hisat2bam,
        '--genome_guided_max_intron',
        str(args.max_intronlen), '--CPU',
        str(args.cpus), '--max_memory', args.memory, '--output', trinityDir
    ]
    if args.jaccard_clip:
        cmd.append('--jaccard_clip')
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)
    commands = os.path.join(trinityDir, 'trinity_GG.cmds')

    #this will create all the Trinity commands, will now run these in parallel using multiprocessing in Python (seems to be much faster than Parafly on my system)
    file_list = []
    # plain 'r' already gives universal newlines on Python 3; the old 'rU'
    # mode raises ValueError on Python 3.11+
    with open(commands, 'r') as cmdFile:
        for line in cmdFile:
            line = line.rstrip('\n')
            line = line.replace(
                '--no_distributed_trinity_exec',
                '')  #don't think this should be appended to every command....
            line = line.replace('"', '')  #don't need these double quotes
            file_list.append(line)
    # leave one CPU free, but never ask for zero workers on a single-CPU run
    # (assumes lib.runMultiProgress cannot work with 0 -- confirm against lib)
    workers = max(1, args.cpus - 1)
    lib.log.info("Assembling " + "{0:,}".format(len(file_list)) +
                 " Trinity clusters using %i CPUs" % workers)
    lib.runMultiProgress(safe_run, file_list, workers)

    #collect the per-cluster output files into a list for the aggregator
    outputfiles = os.path.join(trinityDir, 'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(trinityDir, '*inity.fasta'):
            fileout.write('%s\n' % filename)
    #now grab them all using Trinity script
    cmd = [
        os.path.join(TRINITY, 'util', 'support_scripts',
                     'GG_partitioned_trinity_aggregator.pl'), 'Trinity_GG'
    ]
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)