def runPhobiusRemote(Input):
    """Run Phobius remotely on a protein FASTA file and collect the result.

    Submits *Input* to the EBI Phobius service via the bundled
    ``phobius-remote.pl`` helper, waits briefly so all result files land on
    disk, renames the ``.out.txt`` report to ``.phobius`` and deletes the
    echoed ``.sequence.txt`` file.  Uses module-level ``TMPDIR``,
    ``parentdir``, ``args`` and ``lib``.
    """
    # derive the output stem from the file name (strip path and '.fa*' suffix)
    stem = Input.split('/')[-1].split('.fa')[0]
    out_prefix = os.path.join(TMPDIR, stem)
    cmd = [
        'perl', os.path.join(parentdir, 'phobius-remote.pl'),
        '--email', args.email,
        '-f', 'short',
        '--outfile', stem,
        Input,
    ]
    lib.runSubprocess(cmd, TMPDIR, lib.log)
    # make sure there is time for all files to show up
    time.sleep(1)
    os.rename(out_prefix + '.out.txt', out_prefix + '.phobius')
    os.remove(out_prefix + '.sequence.txt')
def runtblastn(input, query, cpus, output, maxhits):
    """Build a dust-masked BLAST database from *input* and run tblastn.

    Pipeline: dustmasker -> makeblastdb (soft-masked nucleotide DB named
    'genome') -> tblastn of *query* against it, writing tabular (outfmt 6)
    hits to 'filter.tblastn.tab'.  All commands run in *output* as their
    working directory via ``lib.runSubprocess``; ``args.maxintron`` comes
    from the module-level parsed arguments.
    """
    # start by formatting blast db/dustmasker filtered format
    dust_cmd = [
        'dustmasker', '-in', input, '-infmt', 'fasta', '-parse_seqids',
        '-outfmt', 'maskinfo_asn1_bin', '-out', 'genome_dust.asnb',
    ]
    makedb_cmd = [
        'makeblastdb', '-in', input, '-dbtype', 'nucl', '-parse_seqids',
        '-mask_data', 'genome_dust.asnb', '-out', 'genome',
    ]
    tblastn_cmd = [
        'tblastn',
        '-num_threads', str(cpus),
        '-db', 'genome',
        '-query', query,
        '-max_target_seqs', str(maxhits),
        '-db_soft_mask', '11',
        '-threshold', '999',
        '-max_intron_length', str(args.maxintron),
        '-evalue', '1e-10',
        '-outfmt', '6',
        '-out', 'filter.tblastn.tab',
    ]
    # run the three steps in order, all inside the output directory
    for step in (dust_cmd, makedb_cmd, tblastn_cmd):
        lib.runSubprocess(step, output, lib.log)
] cmd += [ os.path.join(outputDir, 'evm.out'), os.path.join(outputDir, 'evm.out.log') ] file_list.append(cmd) # run runMultiProgress lib.runMultiProgress(safe_run, file_list, num_workers, progress=args.progress) # now combine the paritions cmd4 = [ perl, Combine, '--partitions', os.path.basename(partitions), '--output_file_name', 'evm.out' ] lib.runSubprocess(cmd4, tmpdir, lib.log) # now convert to GFF3 cmd5 = [ perl, Convert, '--partitions', os.path.basename(partitions), '--output', 'evm.out', '--genome', os.path.abspath(args.fasta) ] lib.runSubprocess(cmd5, tmpdir, lib.log) # now concatenate all GFF3 files together for a genome then lib.log.info("Converting to GFF3 and collecting all EVM results") with open(args.out, 'w') as out: for root, dirs, files in os.walk(tmpdir): for file in files: if file == 'evm.out.gff3':
logfile = input + '.log' with open(logfile, 'w') as output: subprocess.call([perl, Execute, input], stdout=output, stderr=output) def safe_run(*args, **kwargs): """Call run(), catch exceptions.""" try: worker(*args, **kwargs) except Exception as e: print("error: %s run(*%r, **%r)" % (e, args, kwargs)) # split partitions #lib.log.info("Setting up EVM partitions") lib.runSubprocess(cmd1, tmpdir, lib.log) #subprocess.call(cmd1, cwd = tmpdir, stdout = FNULL, stderr = FNULL) # check output lib.checkinputs(os.path.join(tmpdir, 'partitions_list.out')) # generate commands #lib.log.info("Generating EVM command list") commands = os.path.join(tmpdir, 'commands.list') with open(commands, 'w') as output: subprocess.call(cmd2, cwd=tmpdir, stdout=output, stderr=FNULL) # count total lines num_lines = sum(1 for line in open(commands)) # strange thing happens if you try to run with more cpus than commands if num_lines < cpus: x = num_lines
def runTrinityGG(genome, readTuple, longReads, shortBAM, output, args=False):
    '''
    Run genome-guided Trinity.

    First aligns reads to *genome* with hisat2 (producing *shortBAM*, skipped
    if it already exists), then hands the sorted BAM to Trinity to cluster
    reads and generate per-cluster assembly commands, runs those commands in
    parallel, and finally aggregates all cluster FASTAs into *output*.

    Parameters:
        genome    -- genome FASTA to index/align against
        readTuple -- (forward, reverse, single) read files; any may be falsy
        longReads -- optional long-read FASTA passed to Trinity --long_reads
        shortBAM  -- path for (or to reuse) the sorted hisat2 BAM
        output    -- final aggregated transcript FASTA
        args      -- parsed CLI namespace (cpus, max_intronlen, stranded,
                     jaccard_clip, memory)

    Relies on module-level ``tmpdir``, ``parentdir``, ``TRINITY``, ``lib``,
    ``safe_run`` and ``find_files``.
    '''
    if not lib.checkannotations(shortBAM):
        # build hisat2 index, using exons and splice sites
        lib.log.info("Building Hisat2 genome index")
        cmd = ['hisat2-build', '-p', str(args.cpus), genome,
               os.path.join(tmpdir, 'hisat2.genome')]
        lib.runSubprocess4(cmd, '.', lib.log)
        # align reads using hisat2
        lib.log.info("Aligning reads to genome using Hisat2")
        # use bash wrapper for samtools piping for SAM -> BAM -> sortedBAM;
        # give roughly half the CPUs (rounded up) to BAM compression.
        # (was `(args.cpus + 2 // 2) // 2`, which by operator precedence is
        # the same value but obscured)
        bamthreads = (args.cpus + 1) // 2
        if args.stranded != 'no' and not readTuple[2]:
            hisat2cmd = ['hisat2', '-p', str(args.cpus),
                         '--max-intronlen', str(args.max_intronlen),
                         '--dta', '-x', os.path.join(tmpdir, 'hisat2.genome'),
                         '--rna-strandness', args.stranded]
        else:
            hisat2cmd = ['hisat2', '-p', str(args.cpus),
                         '--max-intronlen', str(args.max_intronlen),
                         '--dta', '-x', os.path.join(tmpdir, 'hisat2.genome')]
        if readTuple[0] and readTuple[1]:
            hisat2cmd = hisat2cmd + ['-1', readTuple[0], '-2', readTuple[1]]
        if readTuple[2]:
            hisat2cmd = hisat2cmd + ['-U', readTuple[2]]
        cmd = [os.path.join(parentdir, 'sam2bam.sh'),
               " ".join(hisat2cmd), str(bamthreads), shortBAM]
        lib.runSubprocess(cmd, '.', lib.log)
    else:
        # fixed typo in log message ('Existig' -> 'Existing')
        lib.log.info('Existing Hisat2 alignments found: {:}'.format(shortBAM))

    # now launch Trinity genome guided
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info(
        "Clustering of reads from BAM and preparing assembly commands")
    jaccard_clip = []
    if args.jaccard_clip:
        jaccard_clip = ['--jaccard_clip']
    if args.stranded != 'no':
        cmd = ['Trinity', '--SS_lib_type', args.stranded,
               '--no_distributed_trinity_exec',
               '--genome_guided_bam', shortBAM,
               '--genome_guided_max_intron', str(args.max_intronlen),
               '--CPU', str(args.cpus), '--max_memory', args.memory,
               '--output', os.path.join(tmpdir, 'trinity_gg')]
    else:
        cmd = ['Trinity', '--no_distributed_trinity_exec',
               '--genome_guided_bam', shortBAM,
               '--genome_guided_max_intron', str(args.max_intronlen),
               '--CPU', str(args.cpus), '--max_memory', args.memory,
               '--output', os.path.join(tmpdir, 'trinity_gg')]
    cmd = cmd + jaccard_clip
    if longReads and lib.checkannotations(longReads):
        cmd = cmd + ['--long_reads', os.path.realpath(longReads)]
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)

    commands = os.path.join(tmpdir, 'trinity_gg', 'trinity_GG.cmds')
    # this will create all the Trinity commands, will now run these in
    # parallel using multiprocessing in Python (seems to be much faster than
    # Parafly on my system)
    file_list = []
    with open(commands, 'r') as cmdFile:
        for line in cmdFile:
            line = line.replace('\n', '')
            # don't think this should be appended to every command....
            line = line.replace('--no_distributed_trinity_exec', '')
            line = line.replace('"', '')  # don't need these double quotes
            file_list.append(line)
    lib.log.info("Assembling " + "{0:,}".format(len(file_list)) +
                 " Trinity clusters using %i CPUs" % (args.cpus - 1))
    lib.runMultiProgress(safe_run, file_list, args.cpus - 1)

    # collect output files and clean
    outputfiles = os.path.join(
        tmpdir, 'trinity_gg', 'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(os.path.join(tmpdir, 'trinity_gg'),
                                   '*inity.fasta'):
            fileout.write('%s\n' % filename)
    # now grab them all using Trinity aggregation script
    cmd = ['perl', os.path.abspath(os.path.join(
        TRINITY, 'util', 'support_scripts',
        'GG_partitioned_trinity_aggregator.pl')), 'Trinity_GG']
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)
    lib.log.info('{:,} transcripts derived from Trinity'.format(
        lib.countfasta(output)))
def main(args):
    """Screen a proteome with BUSCO and add complete models to the outgroups DB.

    Parses CLI options, runs the bundled BUSCO2 wrapper on the input proteome,
    keeps single-copy 'Complete' hits, and writes the renamed sequences to
    ``<FUNANNOTATE_DB>/outgroups/<species>.<busco_db>_buscos.fa``.

    Parameters:
        args -- list of command-line arguments (e.g. ``sys.argv[1:]``)

    Exits non-zero when the database is unconfigured, the BUSCO lineage is
    missing, or the BUSCO run produces no results table.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)
    parser = argparse.ArgumentParser(
        prog='funannotate-predict.py',
        usage="%(prog)s [options] -i genome.fasta",
        description='''Script that adds a proteome to the outgroups.''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input', required=True,
                        help='Proteome in FASTA format')
    parser.add_argument('-s', '--species', required=True,
                        help='Species name "binomial in quotes"')
    parser.add_argument(
        '-b', '--busco_db', default='dikarya', choices=[
            'fungi', 'microsporidia', 'dikarya', 'ascomycota',
            'pezizomycotina', 'eurotiomycetes', 'sordariomycetes',
            'saccharomycetes', 'saccharomycetales', 'basidiomycota',
            'eukaryota', 'protists', 'alveolata_stramenophiles', 'metazoa',
            'nematoda', 'arthropoda', 'insecta', 'endopterygota',
            'hymenoptera', 'diptera', 'vertebrata', 'actinopterygii',
            'tetrapoda', 'aves', 'mammalia', 'euarchontoglires',
            'laurasiatheria', 'embryophyta'
        ], help='BUSCO database to use')
    parser.add_argument('-c', '--cpus', default=2, type=int,
                        help='Number of CPUs to use')
    parser.add_argument('-d', '--database',
                        help='Path to funannotate database, $FUNANNOTATE_DB')
    args = parser.parse_args(args)

    if args.database:
        FUNDB = args.database
    else:
        try:
            FUNDB = os.environ["FUNANNOTATE_DB"]
        except KeyError:
            lib.log.error(
                'Funannotate database not properly configured, run funannotate setup.'
            )
            sys.exit(1)

    parentdir = os.path.dirname(__file__)  # single-arg os.path.join was a no-op
    # get base name, e.g. 'genus_species.dikarya'
    species = args.species.replace(' ', '_').lower() + '.' + args.busco_db
    OUTGROUPS = os.path.join(FUNDB, 'outgroups')

    # create log file
    log_name = species + '-add2outgroups.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()
    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running %s" % version)

    # check buscos, download if necessary
    if not os.path.isdir(os.path.join(FUNDB, args.busco_db)):
        lib.log.error(
            "%s busco database is missing, install with funannotate setup -b %s"
            % (args.busco_db, args.busco_db))
        sys.exit(1)
    ProtCount = lib.countfasta(args.input)
    lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')

    # convert to proteins and screen with busco
    lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db)
    BUSCODB = os.path.join(FUNDB, args.busco_db)
    BUSCO = os.path.join(parentdir, 'aux_scripts', 'funannotate-BUSCO2.py')
    cmd = [
        sys.executable, BUSCO, '-i', os.path.abspath(args.input), '-m',
        'proteins', '--lineage', BUSCODB, '-o', species, '--cpu',
        str(args.cpus), '-f'
    ]
    lib.runSubprocess(cmd, '.', lib.log)
    # check that it ran correctly
    busco_results = os.path.join('run_' + species,
                                 'full_table_' + species + '.tsv')
    if not lib.checkannotations(busco_results):
        lib.log.error("BUSCO failed, check logfile")
        sys.exit(1)

    # map BUSCO model id -> our sequence id, keeping only single 'Complete' hits
    nameChange = {}
    # 'rU' mode was removed in Python 3.11; plain 'r' already does universal
    # newlines.  Also avoid shadowing the builtin `input`.
    with open(busco_results, 'r') as infile:
        for line in infile:
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            if cols[1] == 'Complete':
                if not cols[2] in nameChange:
                    nameChange[cols[2]] = cols[0]
                else:
                    lib.log.error(
                        "Duplicate ID found: %s %s. Removing from results"
                        % (cols[2], cols[0]))
                    del nameChange[cols[2]]

    # output counts
    lib.log.info('{0:,}'.format(len(nameChange)) + ' BUSCO models found')

    # index the proteome for parsing
    SeqRecords = SeqIO.to_dict(SeqIO.parse(args.input, 'fasta'))

    # setup output proteome, renaming each record to its BUSCO model id
    busco_out = os.path.join(OUTGROUPS, species + '_buscos.fa')
    with open(busco_out, 'w') as outhandle:
        for k, v in list(nameChange.items()):
            rec = SeqRecords[k]
            outhandle.write('>%s\n%s\n' % (v, rec.seq))
    lib.log.info("Results written to: %s" % busco_out)

    # clean up your mess (best-effort: skip dirs that are absent)
    shutil.rmtree('run_' + species, ignore_errors=True)
    shutil.rmtree('tmp', ignore_errors=True)