Ejemplo n.º 1
0
def concatenateReads(input, output):
    '''
    Join several FASTQ files, in the order given, into a single file.

    input is a list of FASTQ paths. Comma separated file lists could not be
    made to work through the subprocess helpers, so the files are merged up
    front instead. The merge is delegated to the system `cat` so that gzipped
    files are concatenated correctly. output is handed to
    lib.runSubprocess2 as its destination argument.
    '''
    cat_command = ['cat'] + input
    lib.runSubprocess2(cat_command, '.', lib.log, output)
Ejemplo n.º 2
0
def runAugustus(Input):
    '''
    Run AUGUSTUS on one scaffold, or on one part of a scaffold.

    Input is a scaffold name, optionally followed by a '_part...' suffix. The
    text before '_part' selects the FASTA file <tmpdir>/<scaffold>.fa that is
    given to augustus. When Input is a key of the module-level `ranges`
    mapping, the first two items of its value are used as
    --predictionStart / --predictionEnd. When args.hints is set, the
    module-level `extrinsic` option and a --hintsfile option are added.

    The command is run through lib.runSubprocess2 with
    <tmpdir>/<Input>.augustus.gff3 passed as its output argument.
    '''
    # str.split returns the whole string when the separator is absent, so
    # this covers names both with and without a '_part' suffix.
    scaffold = Input.split('_part')[0]
    aug_out = os.path.join(tmpdir, Input + '.augustus.gff3')

    # Options are appended in the same final order the command always had:
    # species, prediction window, hints, fixed options, FASTA file.
    core_cmd = ['augustus', '--species=' + args.species]
    if Input in ranges:
        window = ranges[Input]
        core_cmd += [
            '--predictionStart=' + str(window[0]),
            '--predictionEnd=' + str(window[1]),
        ]
    if args.hints:
        # Built only inside the guard: concatenating unconditionally raised
        # TypeError when args.hints was None.
        core_cmd += [extrinsic, '--hintsfile=' + args.hints]
    core_cmd += [
        '--softmasking=1', '--gff3=on', '--UTR=off',
        '--stopCodonExcludedFromCDS=False',
        os.path.join(tmpdir, scaffold + '.fa'),
    ]
    lib.runSubprocess2(core_cmd, '.', lib.log, aug_out)
Ejemplo n.º 3
0
def runNormalization(readTuple, memory):
    '''
    Wrapper for Trinity in silico read normalization.

    Paired-end and single-end reads have to be normalized in separate runs,
    so up to two commands are launched, each built from the same base
    options.

    readTuple: (left, right, single) read files; an entry that is empty or
        None means that kind of read is absent.
    memory: value passed to the script's --JM option.

    Returns (left_norm, right_norm, single_norm): paths under
    <tmpdir>/normalize for the runs that were launched, None for the others.
    '''
    left_norm, right_norm, single_norm = (None, ) * 3
    norm_dir = os.path.join(tmpdir, 'normalize')
    SENormalLog = os.path.join(tmpdir, 'trinity_normalization.SE.log')
    PENormalLog = os.path.join(tmpdir, 'trinity_normalization.PE.log')

    # Options shared by both runs; the strand option is only added for
    # stranded libraries.
    base_cmd = [
        os.path.join(TRINITY, 'util', 'insilico_read_normalization.pl'),
        '--PARALLEL_STATS', '--JM', memory, '--max_cov',
        str(args.coverage), '--seqType', 'fq', '--output', norm_dir,
        '--CPU', str(args.cpus)
    ]
    if args.stranded != 'no':
        base_cmd += ['--SS_lib_type', args.stranded]

    if readTuple[2]:
        # Single reads present: normalize just those reads. A separate list
        # is built so '--single' cannot leak into the paired-end command.
        single_cmd = base_cmd + ['--single', readTuple[2]]
        lib.runSubprocess2(single_cmd, '.', lib.log, SENormalLog)
        single_norm = os.path.join(norm_dir, 'single.norm.fq')

    if readTuple[0] and readTuple[1]:
        paired_cmd = base_cmd + [
            '--pairs_together', '--left', readTuple[0], '--right', readTuple[1]
        ]
        left_norm = os.path.join(norm_dir, 'left.norm.fq')
        right_norm = os.path.join(norm_dir, 'right.norm.fq')
        lib.runSubprocess2(paired_cmd, '.', lib.log, PENormalLog)

    return left_norm, right_norm, single_norm
Ejemplo n.º 4
0
def runKallisto(input, fasta, readTuple, stranded, cpus, output):
    '''
    Quantify PASA gene models with Kallisto and write a per-transcript table.

    Transcripts are extracted from the PASA GFF3, indexed with kallisto, and
    the reads are pseudoaligned against that index. The resulting abundance
    table is rewritten with the gene ID and location of every transcript so
    the best scoring gene model can be identified for each locus.

    input: GFF3 output from PASA compare.
    fasta: genome FASTA handed to the transcript extraction script.
    readTuple: (left, right, single) read files; these should be the adapter
        cleaned, non-normalized Illumina reads. Paired reads are used when
        both are present; single reads are used only when no pair is given.
    stranded: 'RF' or 'FR' adds the matching kallisto strand option to a
        paired-end run; any other value adds none.
    cpus: thread count passed to `kallisto quant -t`.
    output: path of the TSV to write (mRNA-ID, gene-ID, Location, TPM).
    '''
    lib.log.info(
        "Using Kallisto TPM data to determine which PASA gene models to select at each locus"
    )
    # Convert the GFF3 to transcripts.
    folder = os.path.join(tmpdir, 'getBestModel')
    if not os.path.exists(folder):
        os.makedirs(folder)
    PASAtranscripts = os.path.join(folder, 'transcripts.fa')
    index_prefix = os.path.join(folder, 'bestModel')
    cmd = [
        os.path.join(PASA, 'misc_utilities', 'gff3_file_to_proteins.pl'),
        input, fasta, 'cDNA'
    ]
    lib.log.info("Building Kallisto index")
    lib.runSubprocess2(cmd, '.', lib.log, PASAtranscripts)

    # Generate the kallisto index.
    cmd = ['kallisto', 'index', '-i', index_prefix, PASAtranscripts]
    lib.runSubprocess(cmd, '.', lib.log)

    # Base command for mapping the reads to the index.
    cmd = [
        'kallisto', 'quant', '-i', index_prefix, '-o',
        os.path.join(folder, 'kallisto'), '--plaintext', '-t',
        str(cpus)
    ]
    # Parse the strand information.
    strandcmd = {
        'RF': ['--rf-stranded'],
        'FR': ['--fr-stranded'],
    }.get(stranded, [])
    # Adapt the command to single or paired-end input. If both kinds are
    # present only the pairs are used.
    if readTuple[2] and not readTuple[0] and not readTuple[1]:
        # Single-end run: fixed estimates of fragment length (200) and
        # SD (20) are passed; this could be made an option.
        cmd = cmd + ['--single', '-l', '200', '-s', '20', readTuple[2]]
    elif readTuple[0] and readTuple[1]:
        cmd = cmd + strandcmd + [readTuple[0], readTuple[1]]
    lib.log.info("Mapping reads using pseudoalignment in Kallisto")
    lib.runSubprocess(cmd, '.', lib.log)

    # Map each mRNA ID to its gene so the locus of every transcript is known.
    # The mRNA ID is unique and the transcript FASTA headers carry
    # "mRNAID geneID ... location". Plain 'r' is used because the 'U' mode
    # flag is rejected by Python 3.11+.
    mRNADict = {}
    with open(PASAtranscripts, 'r') as transin:
        for line in transin:
            if not line.startswith('>'):
                continue
            cols = line.rstrip().replace('>', '').split(' ')
            # The first header seen for an mRNA ID wins.
            mRNADict.setdefault(cols[0], (cols[1], cols[-1]))

    # Some PASA models have an incomplete CDS and are wrong; collect their
    # IDs (a '#PROT' line ending in a tab) so they can be ignored.
    ignore = []
    with open(input, 'r') as infile:
        for line in infile:
            if line.startswith('#PROT') and line.endswith('\t\n'):
                ignore.append(line.split(' ')[1])
    if ignore:
        lib.log.debug("Ignoring %i incomplete PASA models: %s" %
                      (len(ignore), ','.join(ignore)))
    # The list keeps file order for the log message; the set gives constant
    # time lookups in the loop below.
    ignored_ids = set(ignore)

    # Write the new TSV with mRNA ID, gene ID, location and TPM.
    abundance = os.path.join(folder, 'kallisto', 'abundance.tsv')
    with open(output, 'w') as outfile:
        outfile.write("#mRNA-ID\tgene-ID\tLocation\tTPM\n")
        with open(abundance, 'r') as infile:
            for line in infile:
                # Skip the header row. This check was misspelled 'targed_id'
                # and therefore never matched.
                if line.startswith('target_id'):
                    continue
                cols = line.rstrip().split('\t')
                mRNAID = cols[0]
                if mRNAID in ignored_ids:
                    continue
                if mRNAID in mRNADict:
                    geneID, location = mRNADict[mRNAID]
                    outfile.write('%s\t%s\t%s\t%s\n' %
                                  (mRNAID, geneID, location, cols[4]))
Ejemplo n.º 5
0
def runTrinityGG(genome, readTuple, output):
    '''
    Run genome-guided Trinity.

    The reads are first aligned to the genome with hisat2 and the coordinate
    sorted BAM is passed to Trinity. Trinity is launched with
    --no_distributed_trinity_exec so that it only writes its per-cluster
    assembly commands; those commands are then run in parallel from Python
    and the resulting assemblies are aggregated into `output`.

    genome: genome FASTA used to build the hisat2 index.
    readTuple: (left, right, single) read files; strand options are only
        applied when args.stranded is not 'no' and no single reads are given.
    output: destination passed to the final aggregation step.
    '''
    lib.log.info("Starting Trinity genome guided")
    lib.log.info("Building Hisat2 genome index")
    hisat2index = os.path.join(tmpdir, 'hisat2.genome')
    cmd = ['hisat2-build', genome, hisat2index]
    lib.runSubprocess4(cmd, '.', lib.log)

    # Align reads using hisat2.
    lib.log.info("Aligning reads to genome using Hisat2")
    hisat2bam = os.path.join(tmpdir, 'hisat2.coordSorted.bam')
    # Use half the threads, rounded up, for BAM compression.
    bamthreads = (args.cpus + 1) // 2
    # Strand options are dropped as soon as single reads are part of the run.
    use_strand = args.stranded != 'no' and not readTuple[2]

    hisat2cmd = [
        'hisat2', '-p',
        str(args.cpus), '--max-intronlen',
        str(args.max_intronlen), '--dta', '-x', hisat2index
    ]
    if use_strand:
        hisat2cmd += ['--rna-strandness', args.stranded]
    if readTuple[0] and readTuple[1]:
        hisat2cmd += ['-1', readTuple[0], '-2', readTuple[1]]
    if readTuple[2]:
        hisat2cmd += ['-U', readTuple[2]]

    # A bash wrapper does the samtools piping SAM -> BAM -> sorted BAM; it
    # receives the whole hisat2 command as one space-joined argument.
    cmd = [
        os.path.join(parentdir, 'util', 'sam2bam.sh'), " ".join(hisat2cmd),
        str(bamthreads), hisat2bam
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    # Now launch Trinity genome guided.
    trinity_dir = os.path.join(tmpdir, 'trinity_gg')
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info(
        "Clustering of reads from BAM and preparing assembly commands")
    cmd = ['Trinity']
    if use_strand:
        cmd += ['--SS_lib_type', args.stranded]
    cmd += [
        '--no_distributed_trinity_exec', '--genome_guided_bam', hisat2bam,
        '--genome_guided_max_intron',
        str(args.max_intronlen), '--CPU',
        str(args.cpus), '--max_memory', args.memory, '--output', trinity_dir
    ]
    if args.jaccard_clip:
        cmd += ['--jaccard_clip']
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)
    commands = os.path.join(trinity_dir, 'trinity_GG.cmds')

    # Trinity has now written all of its assembly commands. Run them in
    # parallel from Python. Plain 'r' is used because the 'U' mode flag is
    # rejected by Python 3.11+.
    file_list = []
    with open(commands, 'r') as cmdFile:
        for line in cmdFile:
            line = line.replace('\n', '')
            # This flag should not be carried into every command, and the
            # double quotes are not needed.
            line = line.replace('--no_distributed_trinity_exec', '')
            line = line.replace('"', '')
            file_list.append(line)
    # One CPU is held back, but never go below one worker: with
    # args.cpus == 1 the previous 'cpus - 1' asked for zero workers.
    workers = max(1, args.cpus - 1)
    lib.log.info("Assembling " + "{0:,}".format(len(file_list)) +
                 " Trinity clusters using %i CPUs" % workers)
    lib.runMultiProgress(safe_run, file_list, workers)

    # Collect the output files.
    outputfiles = os.path.join(trinity_dir, 'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(trinity_dir, '*inity.fasta'):
            fileout.write('%s\n' % filename)
    # Now grab them all using the Trinity aggregation script.
    cmd = [
        os.path.join(TRINITY, 'util', 'support_scripts',
                     'GG_partitioned_trinity_aggregator.pl'), 'Trinity_GG'
    ]
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)
Ejemplo n.º 6
0
                                      'iprscan' + str(os.getpid()))
            os.makedirs(IPROUT)
            #now split XML file
            splitter = os.path.join(parentdir, 'util', 'prepare_ind_xml.pl')
            cmd = [splitter, args.iprscan, IPROUT]
            lib.runSubprocess(cmd, '.', lib.log)

    #now collect the results from InterProscan, then start to reformat results
    lib.log.info(
        "InterProScan has finished, now pulling out annotations from results")
    IPR_terms = os.path.join(outputdir, 'annotate_misc',
                             'annotations.iprscan.txt')
    if not os.path.isfile(IPR_terms):
        IPR2TSV = os.path.join(parentdir, 'util', 'ipr2tsv.py')
        cmd = [sys.executable, IPR2TSV, IPROUT]
        lib.runSubprocess2(cmd, '.', lib.log, IPR_terms)
    GO_terms = os.path.join(outputdir, 'annotate_misc', 'annotations.GO.txt')
    if not os.path.isfile(GO_terms):
        IPR2GO = os.path.join(parentdir, 'util', 'ipr2go.py')
        OBO = os.path.join(parentdir, 'DB', 'go.obo')
        cmd = [sys.executable, IPR2GO, OBO, IPROUT]
        lib.runSubprocess2(cmd, '.', lib.log, GO_terms)

#check if antiSMASH data is given, if so parse and reformat for annotations and cluster textual output
if args.antismash:
    AntiSmashFolder = os.path.join(outputdir, 'annotate_misc', 'antismash')
    AntiSmashBed = os.path.join(AntiSmashFolder, 'clusters.bed')
    GFF2clusters = os.path.join(AntiSmashFolder, 'secmet.clusters.txt')
    AntiSmash_annotations = os.path.join(outputdir, 'annotate_misc',
                                         'annotations.antismash.txt')
    Cluster_annotations = os.path.join(outputdir, 'annotate_misc',
Ejemplo n.º 7
0
                file = os.path.join(go_folder, file)
                with open(file) as input:
                    pop.write(input.read())

    #now loop through each genome comparing to population
    for f in os.listdir(go_folder):
        if f.startswith('associations'):
            continue
        if f.startswith('population'):
            continue
        file = os.path.join(go_folder, f)
        base = f.replace('.txt', '')
        goa_out = os.path.join(args.out, 'go_enrichment', base+'.go.enrichment.txt')
        if not lib.checkannotations(goa_out):
            cmd = ['find_enrichment.py', '--obo', os.path.join(parentdir, 'DB', 'go.obo'), '--pval', '0.001', '--alpha', '0.001', '--method', 'fdr', file, os.path.join(go_folder, 'population.txt'), os.path.join(go_folder, 'associations.txt')]
            lib.runSubprocess2(cmd, '.', lib.log, goa_out)

    #load into pandas and write to html
    with open(os.path.join(args.out, 'go.html'), 'w') as output:
        pd.set_option('display.max_colwidth', -1)
        pd.options.mode.chained_assignment = None #turn off warning
        output.write(lib.HEADER)
        output.write(lib.GO)
        for f in os.listdir(os.path.join(args.out, 'go_enrichment')):
            if f.endswith('go.enrichment.txt'):
                file = os.path.join(args.out, 'go_enrichment', f)
                base = os.path.basename(file)
                name = base.split('.go_enrichment.txt')[0]
                #check goatools output, return is a tuple with True/False and header line #
                goresult = lib.checkgoatools(file)
                output.write('<h4 class="sub-header" align="left">GO Enrichment: '+name+'</h4>')
# Destination paths of the final result files, all inside ResultsFolder and
# prefixed with baseOUTPUT.
final_proteins = os.path.join(ResultsFolder, baseOUTPUT+'.proteins.fa')
final_transcripts = os.path.join(ResultsFolder, baseOUTPUT+'.transcripts.fa')
final_fasta = os.path.join(ResultsFolder, baseOUTPUT+'.scaffolds.fa')
final_annotation = os.path.join(ResultsFolder, baseOUTPUT+'.annotations.txt')
# Move the genome.* files out of annotate_misc/gag to their final names.
# final_gbk is defined outside this excerpt.
os.rename(os.path.join(outputdir, 'annotate_misc', 'gag', 'genome.gbf'), final_gbk)
os.rename(os.path.join(outputdir, 'annotate_misc', 'gag', 'genome.gff'), os.path.join(ResultsFolder, baseOUTPUT+'.gff3'))
os.rename(os.path.join(outputdir, 'annotate_misc', 'gag', 'genome.tbl'), os.path.join(ResultsFolder, baseOUTPUT+'.tbl'))
os.rename(os.path.join(outputdir, 'annotate_misc', 'gag', 'genome.sqn'), os.path.join(ResultsFolder, baseOUTPUT+'.sqn'))
# presumably writes the protein, transcript and scaffold FASTA files from the
# GenBank file -- confirm against lib.gb2output
lib.gb2output(final_gbk, final_proteins, final_transcripts, final_fasta)

# Write the AGP output so all files end up in the correct directory.
# ResultsFolder is passed in the position where other calls pass '.'
# (presumably the working directory -- confirm against lib.runSubprocess2),
# which is why the scaffolds FASTA is given by bare file name.
lib.log.info("Creating AGP file and corresponding contigs file")
# NOTE(review): the variable is named agp2fasta but points at fasta2agp.pl.
agp2fasta = os.path.join(parentdir, 'util', 'fasta2agp.pl')
AGP = os.path.join(ResultsFolder, baseOUTPUT+'.agp')
cmd = ['perl', agp2fasta, baseOUTPUT+'.scaffolds.fa']
lib.runSubprocess2(cmd, ResultsFolder, lib.log, AGP)

#write secondary metabolite clusters output using the final genome in gbk format
if lib.checkannotations(antismash_input): 
    lib.log.info("Cross referencing SM cluster hits with MIBiG database")
    #do a blast best hit search against MIBiG database for cluster annotation, but looping through gene cluster hits
    AllProts = []
    for k, v in lib.dictClusters.items():
        for i in v:
            if not i in AllProts:
                AllProts.append(i)
    AllProts = set(AllProts)
    mibig_fasta = os.path.join(AntiSmashFolder, 'smcluster.proteins.fasta')
    mibig_blast = os.path.join(AntiSmashFolder, 'smcluster.MIBiG.blast.txt')
    mibig_db = os.path.join(parentdir, 'DB', 'MIBiG')
    with open(mibig_fasta, 'w') as output:
Ejemplo n.º 9
0
def runPhobiusLocal(Input):
    '''
    Run phobius.pl in short output mode on one FASTA file.

    The output name is the last '/'-separated component of Input, cut at the
    first '.fa', with '.phobius' appended, placed in TMPDIR. That path is
    handed to lib.runSubprocess2 as its output argument, and TMPDIR is
    passed as its second argument.
    '''
    filename = Input.rsplit('/', 1)[-1]
    stem = filename.partition('.fa')[0]
    phobius_out = os.path.join(TMPDIR, stem + '.phobius')
    lib.runSubprocess2(['phobius.pl', '-short', Input], TMPDIR, lib.log,
                       phobius_out)
Ejemplo n.º 10
0
def runPASAtrain(genome, transcripts, stranded, intronlen, cpus, dbname,
                 output):
    '''
    Run PASA alignment assembly and choose gene models for training.

    genome: genome FASTA.
    transcripts: transcript FASTA to align.
    stranded: any value other than 'no' adds
        --transcribed_is_aligned_orient to the PASA command.
    intronlen: value for --MAX_INTRON_LENGTH.
    cpus: available CPUs; PASA gets half of them, but never fewer than 2.
    dbname: PASA database name; '-' is replaced by '_'.
    output: path the final training GFF3 is copied to.

    The PASA alignment step is skipped when
    <tmpdir>/pasa/<db>.assemblies.fasta already exists.
    '''
    # Floor division keeps the value an int: true division on Python 3
    # produced e.g. '4.0' for --CPU. The max() also stops 3 CPUs from
    # dropping below the minimum of 2 used for small CPU counts.
    pasa_cpus = max(2, cpus // 2)

    # Create the working folder.
    folder = os.path.join(tmpdir, 'pasa')
    if not os.path.isdir(folder):
        os.makedirs(folder)

    pasaDBname = dbname.replace('-', '_')
    # File names relative to `folder`, which is passed as the second
    # argument of the lib.runSubprocess* calls below.
    assemblies_fasta = pasaDBname + '.assemblies.fasta'
    assemblies_gff3 = pasaDBname + '.pasa_assemblies.gff3'
    gene2transcripts = 'pasa.gene2transcripts.tsv'

    # Write the alignment config from the PASA template, filling in the
    # database name. Plain 'r' is used because the 'U' mode flag is rejected
    # by Python 3.11+.
    alignConfig = os.path.join(folder, 'alignAssembly.txt')
    template = os.path.join(PASA, 'pasa_conf',
                            'pasa.alignAssembly.Template.txt')
    with open(alignConfig, 'w') as config1, open(template, 'r') as template1:
        for line in template1:
            config1.write(line.replace('<__MYSQLDB__>', pasaDBname))

    if not os.path.isfile(os.path.join(folder, assemblies_fasta)):
        # Run the first PASA step; note this will dump any database with the
        # same name.
        lib.log.info(
            "Running PASA alignment step using {:,} transcripts".format(
                lib.countfasta(transcripts)))
        cmd = [
            os.path.join(PASA, 'scripts', 'Launch_PASA_pipeline.pl'), '-c',
            os.path.abspath(alignConfig), '-r', '-C', '-R', '-g',
            os.path.abspath(genome), '--ALIGNERS', 'blat,gmap', '-t',
            os.path.abspath(transcripts), '--stringent_alignment_overlap',
            args.pasa_alignment_overlap, '--TRANSDECODER',
            '--MAX_INTRON_LENGTH',
            str(intronlen), '--CPU',
            str(pasa_cpus)
        ]
        if stranded != 'no':
            cmd = cmd + ['--transcribed_is_aligned_orient']
        lib.runSubprocess(cmd, folder, lib.log)
    else:
        lib.log.info('Existing PASA assemblies found {:}'.format(
            os.path.join(folder, assemblies_fasta)))

    # Generate the gene-to-transcripts TSV.
    numLoci = getPASAtranscripts2genes(
        os.path.join(folder, assemblies_gff3),
        os.path.join(folder, gene2transcripts))
    numTranscripts = lib.countfasta(os.path.join(folder, assemblies_fasta))
    lib.log.info(
        "Assigned {:,} transcipts to {:,} loci using {:}% overlap threshold".
        format(numTranscripts, numLoci, args.pasa_alignment_overlap))

    lib.log.info("Getting PASA models for training with TransDecoder")
    pasa_training_gff = os.path.join(
        folder, assemblies_fasta + '.transdecoder.genome.gff3')
    if lib.which('TransDecoder.LongOrfs') and lib.which(
            'TransDecoder.Predict'):
        # Standalone TransDecoder is available: predict ORFs on the
        # assemblies, then convert them to genome coordinates.
        cmd = [
            'TransDecoder.LongOrfs', '-t', assemblies_fasta,
            '--gene_trans_map', gene2transcripts
        ]
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = [
            'TransDecoder.Predict', '-t', assemblies_fasta,
            '--single_best_only'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = [
            os.path.join(PASA, 'pasa-plugins', 'transdecoder',
                         'cdna_alignment_orf_to_genome_orf.pl'),
            assemblies_fasta + '.transdecoder.gff3', assemblies_gff3,
            assemblies_fasta
        ]
        lib.runSubprocess2(cmd, folder, lib.log, pasa_training_gff)
    else:
        # Fall back to the PASA training-set script.
        # NOTE(review): this branch relies on the script itself creating
        # pasa_training_gff inside `folder` -- confirm.
        cmd = [
            os.path.join(PASA, 'scripts', 'pasa_asmbls_to_training_set.dbi'),
            '--pasa_transcripts_fasta', assemblies_fasta,
            '--pasa_transcripts_gff3', assemblies_gff3
        ]
        lib.runSubprocess(cmd, folder, lib.log)

    # Grab the final result.
    shutil.copyfile(pasa_training_gff, output)