def Fzip_inplace(input, cpus):
    '''
    Compress a file in place as fast as possible, preferring parallel
    compressors when available: pigz -> bgzip -> gzip.
    '''
    if lib.which('pigz'):
        cmd = ['pigz', '-f', '-p', str(cpus), input]
    elif lib.which('bgzip'):
        cmd = ['bgzip', '-f', '-@', str(cpus), input]
    else:
        cmd = ['gzip', '-f', input]
    # Prefer the project's logging-aware runner; if that helper is not in
    # scope (NameError), fall back to a plain subprocess call.
    try:
        lib.runSubprocess(cmd, '.', lib.log)
    except NameError:
        subprocess.call(cmd)
def choose_aligner():
    '''
    Pick the aligner used to map reads to the Trinity transcripts when
    determining transcript orientation. Preference: rapmap -> bowtie2 ->
    hisat2. hisat2 is the fallback of last resort; probably not ideal for
    this task, but should work okay.
    '''
    for candidate in ('rapmap', 'bowtie2'):
        if lib.which(candidate):
            return candidate
    return 'hisat2'
# Predict secreted and transmembrane proteins with Phobius; the helper script
# runs locally if phobius is installed, otherwise it uses the remote service
# (hence the email argument).
phobius_out = os.path.join(outputdir, 'annotate_misc', 'phobius.results.txt')
phobiusLog = os.path.join(outputdir, 'logfiles', 'phobius.log')
lib.log.info("Predicting secreted and transmembrane proteins using Phobius")
if not lib.checkannotations(phobius_out):
    subprocess.call([
        os.path.join(parentdir, 'util', 'phobius-multiproc.py'),
        '-i', Proteins, '-o', phobius_out, '-e', args.email, '-l', phobiusLog
    ])
# SignalP must be installed manually, so check for it first; if present, run
# it and combine with Phobius, otherwise fall back to Phobius-only parsing
# (less accurate secretome prediction).
signalp_out = os.path.join(outputdir, 'annotate_misc', 'signalp.results.txt')
secreted_out = os.path.join(outputdir, 'annotate_misc', 'annotations.secretome.txt')
membrane_out = os.path.join(outputdir, 'annotate_misc', 'annotations.transmembrane.txt')
if lib.which('signalp'):
    lib.log.info("Predicting secreted proteins with SignalP")
    if not lib.checkannotations(signalp_out):
        lib.signalP(Proteins, os.path.join(outputdir, 'annotate_misc'), signalp_out)
    lib.parsePhobiusSignalP(phobius_out, signalp_out, membrane_out, secreted_out)
else:
    lib.log.info(
        "SignalP not installed, secretome prediction less accurate using only Phobius"
    )
    lib.parsePhobiusSignalP(phobius_out, False, membrane_out, secreted_out)
num_secreted = lib.line_count(secreted_out)
num_mem = lib.line_count(membrane_out)
# fixed log-message typo: "transmembane" -> "transmembrane"
lib.log.info('{0:,}'.format(num_secreted) + ' secretome and ' +
             '{0:,}'.format(num_mem) + ' transmembrane annotations added')
# NOTE(review): this chunk's newlines were mangled away and it is TRUNCATED
# mid-block at the trailing "if not lib.checkannotations(signalp_out):" --
# the SignalP branch body lies outside this view, so the code is left
# byte-identical rather than reconstructed.
# Flow as written: (1) annotate proteins against the BUSCO OGS models via
# lib.runBUSCO unless results already exist; (2) Phobius TM/secretome
# prediction -- a pre-computed file may be supplied via args.phobius, else
# run local phobius.pl if installed, else skip with a hint to use
# "funannotate remote -m phobius"; (3) start of the SignalP step (truncated).
#run BUSCO OGS search busco_out = os.path.join(outputdir, 'annotate_misc', 'annotations.busco.txt') lib.log.info("Annotating proteins with BUSCO %s models" % args.busco_db) buscoDB = os.path.join(parentdir, 'DB', args.busco_db) if not lib.checkannotations(busco_out): lib.runBUSCO(Proteins, buscoDB, args.cpus, os.path.join(outputdir, 'annotate_misc'), busco_out) num_annotations = lib.line_count(busco_out) lib.log.info('{0:,}'.format(num_annotations) + ' annotations added') #run Phobius if local is installed, otherwise use funannotate remote phobius_out = os.path.join(outputdir, 'annotate_misc', 'phobius.results.txt') phobiusLog = os.path.join(outputdir, 'logfiles', 'phobius.log') if args.phobius: phobius_out = args.phobius else: if lib.which('phobius.pl'): if not lib.checkannotations(phobius_out): lib.log.info("Predicting secreted and transmembrane proteins using Phobius") subprocess.call([os.path.join(parentdir, 'util', 'phobius-multiproc.py'), '-i', Proteins, '-o', phobius_out, '-l', phobiusLog]) else: if lib.checkannotations(phobius_out): lib.log.info("Found phobius pre-computed results") else: lib.log.info("Skipping phobius predictions, try funannotate remote -m phobius") #run signalP if installed, have to manually install, so test if exists first, then run it if it does, parse results signalp_out = os.path.join(outputdir, 'annotate_misc', 'signalp.results.txt') secreted_out = os.path.join(outputdir, 'annotate_misc', 'annotations.secretome.txt') membrane_out = os.path.join(outputdir, 'annotate_misc', 'annotations.transmembrane.txt') if lib.which('signalp'): lib.log.info("Predicting secreted proteins with SignalP") if not lib.checkannotations(signalp_out):
# NOTE(review): mangled fragment from the phobius-multiproc driver script; it
# starts mid-script (cmd_args is defined outside this view) and is TRUNCATED
# inside the final "with open(args.out, 'w')" block -- the per-protein result
# parsing loop is missing -- so the code is left byte-identical.
# Flow as written: split the input FASTA into per-protein files under a
# PID-suffixed tmpdir, run Phobius over them via lib.runMultiProgress --
# locally at cpu_count() workers when phobius.pl is installed, otherwise
# remotely capped at 29 concurrent jobs -- then collect *.phobius results and
# begin writing the tab-separated output header.
# NOTE(review): the loop variable "file" shadows the Python 2 builtin;
# harmless on Python 3 but worth renaming when this chunk is restored.
lib.log.debug(cmd_args) #create tmpdir to store fasta files and output files TMPDIR = 'phobius_' + str(os.getpid()) #split fasta lib.splitFASTA(args.input, TMPDIR) #now get list of files in tmpdir proteins = [] for file in os.listdir(TMPDIR): if file.endswith('.fa'): proteins.append(file) #now run the script if lib.which('phobius.pl'): lib.runMultiProgress(runPhobiusLocal, proteins, multiprocessing.cpu_count()) else: lib.runMultiProgress(runPhobiusRemote, proteins, 29) #max is 30 jobs at a time #collect all results phobius = [] for file in os.listdir(TMPDIR): if file.endswith('.phobius'): phobius.append(os.path.join(TMPDIR,file)) #write output TMdomain = 0 SigPep = 0 with open(args.out, 'w') as output: output.write("%s\t%s\t%s\t%s\n" % ('ID', 'TM', 'SP', 'Prediction'))
# NOTE(review): mangled fragment of an annotation pipeline; it starts
# mid-script (eggnog_out is defined outside this view) and is TRUNCATED at the
# trailing "if not os.path.exists(i):" inside the InterProScan setup, so the
# code is left byte-identical rather than reconstructed.
# Flow as written: (1) EggNog 4.5 HMM annotation via lib.runEggNog at an
# e-value cutoff of 1e-10, skipped when results exist; (2) BUSCO OGS
# annotation via lib.runBUSCO; (3) SignalP if installed (it must be installed
# manually, hence the lib.which check), otherwise logged as skipped;
# (4) start of InterProScan directory setup (truncated).
lib.log.info("Annotating proteins with EggNog 4.5 database") if not lib.checkannotations(eggnog_out): lib.runEggNog(Proteins, os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm'), os.path.join(parentdir, 'DB', args.eggnog_db+'.annotations.tsv'), args.cpus, 1e-10, os.path.join(outputdir, 'annotate_misc'), eggnog_out) num_annotations = lib.line_count(eggnog_out) lib.log.info('{0:,}'.format(num_annotations) + ' annotations added') #run BUSCO OGS search busco_out = os.path.join(outputdir, 'annotate_misc', 'annotations.busco.txt') lib.log.info("Annotating proteins with BUSCO %s models" % args.busco_db) buscoDB = os.path.join(parentdir, 'DB', args.busco_db) if not lib.checkannotations(busco_out): lib.runBUSCO(Proteins, buscoDB, args.cpus, os.path.join(outputdir, 'annotate_misc'), busco_out) num_annotations = lib.line_count(busco_out) lib.log.info('{0:,}'.format(num_annotations) + ' annotations added') #run signalP if installed, have to manually install, so test if exists first, then run it if it does signalp_out = os.path.join(outputdir, 'annotate_misc', 'annotations.signalp.txt') if lib.which('signalp'): lib.log.info("Predicting secreted proteins with SignalP") if not lib.checkannotations(signalp_out): lib.signalP(Proteins, os.path.join(outputdir, 'annotate_misc'), signalp_out) num_annotations = lib.line_count(signalp_out) lib.log.info('{0:,}'.format(num_annotations) + ' annotations added') else: lib.log.info("SignalP not installed, skipping") if not args.skip_iprscan: if not args.iprscan: #run interpro scan IPROUT = os.path.join(outputdir, 'annotate_misc', 'iprscan') PROTS = os.path.join(outputdir, 'annotate_misc', 'protein_tmp') for i in IPROUT,PROTS: if not os.path.exists(i):
def runPASAtrain(genome, transcripts, stranded, intronlen, cpus, dbname, output):
    '''
    Run the PASA alignment/assembly pipeline and extract the best gene models
    for ab initio trainer input via TransDecoder.

    Parameters:
        genome      -- path to the genome FASTA
        transcripts -- path to the transcript FASTA
        stranded    -- strandedness; any value other than 'no' adds
                       --transcribed_is_aligned_orient to the PASA command
        intronlen   -- value for PASA --MAX_INTRON_LENGTH
        cpus        -- total CPUs available; PASA is given half (minimum 2)
        dbname      -- PASA database name; '-' is replaced with '_'
        output      -- destination path for the TransDecoder genome GFF3

    Relies on module globals: tmpdir, PASA, args, lib, shutil, and the
    sibling helper getPASAtranscripts2genes.
    '''
    # Integer division: the original used '/', which yields a float on
    # Python 3 and produces an invalid --CPU argument (e.g. '2.5').
    if cpus > 2:
        pasa_cpus = cpus // 2
    else:
        pasa_cpus = 2
    # workspace for PASA + TransDecoder
    folder = os.path.join(tmpdir, 'pasa')
    if not os.path.isdir(folder):
        os.makedirs(folder)
    # NOTE(review): these log paths are defined but not referenced below.
    pasa_log = os.path.join(folder, 'pasa.log')
    transdecoder_log = os.path.join(folder, 'transdecoder.log')
    # write the alignAssembly config from the bundled template, substituting
    # the database name ('rU' mode was removed in Python 3.11; plain 'r'
    # already performs universal-newline translation on Python 3)
    alignConfig = os.path.join(folder, 'alignAssembly.txt')
    pasaDBname = dbname.replace('-', '_')
    with open(alignConfig, 'w') as config1:
        with open(os.path.join(PASA, 'pasa_conf',
                               'pasa.alignAssembly.Template.txt'), 'r') as template1:
            for line in template1:
                config1.write(line.replace('<__MYSQLDB__>', pasaDBname))
    assemblies = os.path.join(folder, pasaDBname + '.assemblies.fasta')
    if not os.path.isfile(assemblies):
        # first PASA step; NOTE this drops any existing database with the same name
        lib.log.info("Running PASA alignment step using {:,} transcripts".format(
            lib.countfasta(transcripts)))
        cmd = [
            os.path.join(PASA, 'scripts', 'Launch_PASA_pipeline.pl'),
            '-c', os.path.abspath(alignConfig), '-r', '-C', '-R',
            '-g', os.path.abspath(genome),
            '--ALIGNERS', 'blat,gmap',
            '-t', os.path.abspath(transcripts),
            '--stringent_alignment_overlap', args.pasa_alignment_overlap,
            '--TRANSDECODER', '--MAX_INTRON_LENGTH', str(intronlen),
            '--CPU', str(pasa_cpus)
        ]
        if stranded != 'no':
            cmd = cmd + ['--transcribed_is_aligned_orient']
        lib.runSubprocess(cmd, folder, lib.log)
    else:
        lib.log.info('Existing PASA assemblies found {:}'.format(assemblies))
    # generate gene -> transcripts TSV
    numLoci = getPASAtranscripts2genes(
        os.path.join(folder, pasaDBname + '.pasa_assemblies.gff3'),
        os.path.join(folder, 'pasa.gene2transcripts.tsv'))
    numTranscripts = lib.countfasta(assemblies)
    # NOTE(review): this message was split by the formatting mangle and
    # misspelled "transcipts"; reconstructed as one string with the typo fixed.
    lib.log.info("Assigned {:,} transcripts to {:,} loci using {:}% overlap threshold".format(
        numTranscripts, numLoci, args.pasa_alignment_overlap))
    lib.log.info("Getting PASA models for training with TransDecoder")
    pasa_training_gff = os.path.join(
        folder, pasaDBname + '.assemblies.fasta.transdecoder.genome.gff3')
    if lib.which('TransDecoder.LongOrfs') and lib.which('TransDecoder.Predict'):
        # standalone TransDecoder: LongOrfs -> Predict -> map ORFs to genome
        cmd = ['TransDecoder.LongOrfs', '-t', pasaDBname + '.assemblies.fasta',
               '--gene_trans_map', 'pasa.gene2transcripts.tsv']
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = ['TransDecoder.Predict', '-t', pasaDBname + '.assemblies.fasta',
               '--single_best_only']
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = [os.path.join(PASA, 'pasa-plugins', 'transdecoder',
                            'cdna_alignment_orf_to_genome_orf.pl'),
               pasaDBname + '.assemblies.fasta.transdecoder.gff3',
               pasaDBname + '.pasa_assemblies.gff3',
               pasaDBname + '.assemblies.fasta']
        lib.runSubprocess2(cmd, folder, lib.log, pasa_training_gff)
    else:
        # fall back to PASA's bundled training-set script
        cmd = [os.path.join(PASA, 'scripts', 'pasa_asmbls_to_training_set.dbi'),
               '--pasa_transcripts_fasta', pasaDBname + '.assemblies.fasta',
               '--pasa_transcripts_gff3', pasaDBname + '.pasa_assemblies.gff3']
        lib.runSubprocess(cmd, folder, lib.log)
    # deliver the TransDecoder genome GFF3 to the caller-requested location
    shutil.copyfile(pasa_training_gff, output)