def showAll(dir):
    """Print a table summarizing trained ab initio predictors per species.

    Scans each subdirectory of *dir* for an ``info.json`` file (a trained
    species), and prints one row per species with the source of each
    predictor's training data plus the Augustus training date.

    Fixes over the previous version:
      * a predictor key missing from info.json previously appended nothing,
        silently shifting every following column — now 'None' is appended;
      * an empty list under a predictor key raised IndexError;
      * a missing/empty 'augustus' entry raised KeyError/IndexError when
        looking up the date — now shown as 'None'.
    """
    Table = []
    TableHeader = ['Species', 'Augustus', 'GeneMark', 'Snap',
                   'GlimmerHMM', 'CodingQuarry', 'Date']
    for f in os.listdir(dir):
        ff = os.path.join(dir, f)
        info = os.path.join(ff, 'info.json')
        if os.path.isdir(ff) and lib.checkannotations(info):
            with open(info) as infile:
                data = json.load(infile)
            sources = [f]
            for x in ['augustus', 'genemark', 'snap',
                      'glimmerhmm', 'codingquarry']:
                # always append exactly one value per tool so the row
                # stays aligned with TableHeader
                if x in data and data[x] and len(data[x][0]) >= 1:
                    sourceFile = data[x][0]['source']
                    if ': ' in sourceFile:
                        sourceFile = sourceFile.split(':')[0]
                    sources.append(sourceFile)
                else:
                    sources.append('None')
            # date of Augustus training; not guaranteed to be present
            try:
                sources.append(data['augustus'][0]['date'])
            except (KeyError, IndexError):
                sources.append('None')
            Table.append(sources)
    Table = natsorted(Table, key=lambda x: x[0])
    Table.insert(0, TableHeader)
    lib.print_table(Table, max_col_width=40)
def runGOenrichment(input):
    """Run goatools ``find_enrichment.py`` on one GO-term list.

    Output is written next to the other results as
    ``<basename>.go.enrichment.txt`` in ``args.out``; the run is skipped
    when that file already exists.  stdout/stderr are discarded (FNULL).
    """
    base = os.path.basename(input).replace('.txt', '')
    goa_out = os.path.join(args.out, base + '.go.enrichment.txt')
    # already computed on a previous run -- nothing to do
    if lib.checkannotations(goa_out):
        return
    cmd = [
        'find_enrichment.py',
        '--obo', os.path.join(FUNDB, 'go.obo'),
        '--pval', '0.001',
        '--alpha', '0.001',
        '--method', 'fdr',
        '--outfile', goa_out,
        input,
        os.path.join(args.input, 'population.txt'),
        os.path.join(args.input, 'associations.txt'),
    ]
    subprocess.call(cmd, stdout=FNULL, stderr=FNULL)
def speciesAvailable(dir):
    """Return a dict mapping species name -> parsed info.json contents.

    Each immediate subdirectory of *dir* that contains a non-empty
    ``info.json`` is treated as a trained species.
    """
    found = {}
    for name in os.listdir(dir):
        species_dir = os.path.join(dir, name)
        if not os.path.isdir(species_dir):
            continue
        info_file = os.path.join(species_dir, 'info.json')
        if not lib.checkannotations(info_file):
            continue
        with open(info_file) as fh:
            found[name] = json.load(fh)
    return found
def runGOenrichment(input):
    """Run goatools ``find_enrichment.py`` on one GO-term list, with logging.

    Results go to ``<basename>.go.enrichment.txt`` and the invoked command
    plus all of its stdout/stderr go to ``<basename>.go.enrichment.log``,
    both under ``args.out``.  Skipped when the result file already exists.

    Fix: the log file was previously opened twice ('w' to write the command
    header, then reopened 'a' for the subprocess); it is now opened once,
    and flushed before the child inherits the handle so the header is
    guaranteed to precede the subprocess output.
    """
    basename = os.path.basename(input).replace('.txt', '')
    goa_out = os.path.join(args.out, basename + '.go.enrichment.txt')
    go_log = os.path.join(args.out, basename + '.go.enrichment.log')
    if not lib.checkannotations(goa_out):
        cmd = [
            'find_enrichment.py',
            '--obo', os.path.join(FUNDB, 'go.obo'),
            '--pval', '0.001', '--alpha', '0.001',
            '--method', 'fdr',
            '--outfile', goa_out,
            input,
            os.path.join(args.input, 'population.txt'),
            os.path.join(args.input, 'associations.txt')
        ]
        with open(go_log, 'w') as outfile:
            outfile.write('{}\n'.format(' '.join(cmd)))
            # flush Python's buffer before the child writes to the same fd,
            # otherwise the header can end up after the subprocess output
            outfile.flush()
            subprocess.call(cmd, stdout=outfile, stderr=outfile)
# Run Augustus over scaffold chunks in parallel, concatenate the per-chunk
# GFF3 files, then merge overlapping predictions with Augustus'
# join_aug_pred.pl script.
# Never ask for more workers than there are chunks.
if args.cpus > len(scaffolds):
    num = len(scaffolds)
else:
    num = args.cpus
lib.log.debug("Running Augustus on %i chunks, using %i CPUs" %
              (len(scaffolds), num))
# runAugustus is expected to write <chunk>.augustus.gff3 into tmpdir for
# each entry of scaffolds (defined elsewhere in this file)
lib.runMultiProgress(runAugustus, scaffolds, num)
lib.log.debug("Augustus prediction is finished, now concatenating results")
# simple concatenation; join_aug_pred.pl below handles chunk boundaries
with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'w') as output:
    for file in scaffolds:
        file = os.path.join(tmpdir, file + '.augustus.gff3')
        with open(file) as input:
            output.write(input.read())
if lib.checkannotations(os.path.join(tmpdir, 'augustus_all.gff3')):
    lib.log.debug('Augustus finished, now joining results')
    # prefer join_aug_pred.pl from PATH, fall back to the AUGUSTUS install
    if lib.which_path('join_aug_pred.pl'):
        join_script = 'join_aug_pred.pl'
    else:
        join_script = os.path.join(AUGUSTUS_BASE, 'scripts',
                                   'join_aug_pred.pl')
    # this shell-style string is only logged for reproducibility; the
    # actual call below uses explicit stdin/stdout handles instead of a shell
    cmd = '{:} < {:} > {:}'.format(join_script,
                                   os.path.join(tmpdir, 'augustus_all.gff3'),
                                   args.out)
    lib.log.debug(cmd)
    with open(args.out, 'w') as finalout:
        with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'r') as infile:
            subprocess.call([join_script], stdin=infile, stdout=finalout)
def main(args):
    """Entry point for funannotate-remote: run remote functional-annotation
    searches (Phobius, antiSMASH webserver) against a predicted genome.

    Input is either a funannotate results folder (-i) or a GenBank file
    (-g) plus an output folder (-o).  Results land under
    <outputdir>/annotate_misc for downstream auto-detection.

    NOTE(review): this function uses the Python 2 print statement; the
    surrounding file appears to be py2-era code.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)
    parser = argparse.ArgumentParser(
        prog='funannotate-remote.py',
        description=
        '''Script that adds functional annotation to a genome using remote searches.''',
        epilog="""Written by Jon Palmer (2016-2017) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input',
                        help='Folder from funannotate predict.')
    parser.add_argument('-g', '--genbank',
                        help='Annotated genome in GenBank format')
    parser.add_argument('-m', '--methods', required=True, nargs='+',
                        choices=['all', 'phobius', 'antismash'],
                        help='Method to run')
    parser.add_argument('-o', '--out', help='Basename of output files')
    parser.add_argument('-e', '--email', required=True,
                        help='Email address for IPRSCAN server')
    parser.add_argument('--force', action='store_true',
                        help='Over-write output folder')
    parser.add_argument('-a', '--antismash', default='fungi',
                        choices=['fungi', 'plants'],
                        help='antiSMASH server')
    args = parser.parse_args(args)

    # paths to helper scripts, published as globals for use elsewhere
    global parentdir, RUNIPRSCAN, XMLCombine
    parentdir = os.path.join(os.path.dirname(__file__))
    RUNIPRSCAN = os.path.join(parentdir, 'aux_scripts', 'runIPRscan.py')
    XMLCombine = os.path.join(parentdir, 'aux_scripts', 'xmlcombine.py')

    # create log file
    log_name = 'funannotate-remote.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print "-------------------------------------------------------"
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running %s" % version)

    # need to do some checks here of the input
    genbank = ''
    Proteins = ''
    tablefile = ''
    Fastafile = ''
    if not args.input:
        # did not parse folder of funannotate results, so need either gb + gff or fasta + proteins, + gff and also need to have args.out for output folder
        if not args.out:
            lib.log.error(
                "If you are not providing funannotate predict input folder, then you need to provide an output folder (--out)"
            )
            sys.exit(1)
        else:
            outputdir = args.out
            # create outputdir and subdirs
            if not os.path.isdir(outputdir):
                os.makedirs(outputdir)
                os.makedirs(os.path.join(outputdir, 'annotate_misc'))
                os.makedirs(os.path.join(outputdir, 'annotate_results'))
                os.makedirs(os.path.join(outputdir, 'logfiles'))
        if not args.genbank:
            lib.log.error(
                "You did not specifiy the apropriate input files, either: \n1) Funannotate input \n2) GenBank"
            )
            sys.exit(1)
        else:
            # create output directories
            if not os.path.isdir(outputdir):
                os.makedirs(outputdir)
                os.makedirs(os.path.join(outputdir, 'annotate_misc'))
                os.makedirs(os.path.join(outputdir, 'annotate_results'))
                os.makedirs(os.path.join(outputdir, 'logfiles'))
            else:
                # reuse an existing folder but make sure the subdirs exist
                lib.log.error("Output directory %s already exists" %
                              (outputdir))
                if not os.path.isdir(os.path.join(outputdir, 'annotate_misc')):
                    os.makedirs(os.path.join(outputdir, 'annotate_misc'))
                if not os.path.isdir(
                        os.path.join(outputdir, 'annotate_results')):
                    os.makedirs(os.path.join(outputdir, 'annotate_results'))
                if not os.path.isdir(os.path.join(outputdir, 'logfiles')):
                    os.makedirs(os.path.join(outputdir, 'logfiles'))
            genbank = args.genbank
            # derive working FASTA/GFF files from the annotated GenBank
            Scaffolds = os.path.join(outputdir, 'annotate_misc',
                                     'genome.scaffolds.fasta')
            Proteins = os.path.join(outputdir, 'annotate_misc',
                                    'genome.proteins.fasta')
            Transcripts = os.path.join(outputdir, 'annotate_misc',
                                       'genome.transcripts.fasta')
            GFF = os.path.join(outputdir, 'annotate_misc', 'genome.gff3')
            lib.log.info("Checking GenBank file for annotation")
            if not lib.checkGenBank(genbank):
                lib.log.error("Found no annotation in GenBank file, exiting")
                sys.exit(1)
            lib.gb2allout(genbank, GFF, Proteins, Transcripts, Scaffolds)
    else:
        # should be a folder, with funannotate files, thus store results there, no need to create output folder
        if not os.path.isdir(args.input):
            lib.log.error("%s directory does not exist" % args.input)
            sys.exit(1)
        # funannotate results should be here; prefer update over predict
        if os.path.isdir(os.path.join(args.input, 'update_results')):
            inputdir = os.path.join(args.input, 'update_results')
            outputdir = args.input
        elif os.path.isdir(os.path.join(args.input, 'predict_results')):
            inputdir = os.path.join(args.input, 'predict_results')
            outputdir = args.input
        else:
            # here user specified the predict_results folder, or it is a custom folder
            inputdir = os.path.join(args.input)
        # get files that you need
        for file in os.listdir(inputdir):
            if file.endswith('.gbk'):
                genbank = os.path.join(inputdir, file)
            elif file.endswith('.tbl'):
                tablefile = os.path.join(inputdir, file)
            elif file.endswith('.scaffolds.fa'):
                Fastafile = os.path.join(inputdir, file)
        # now create the files from genbank input file for consistency in gene naming, etc
        if not genbank:
            lib.log.error(
                "Properly formatted 'funannotate predict' files do no exist in this directory"
            )
            sys.exit(1)
        else:
            # if user gave predict_results folder, then set output to up one directory
            if 'predict_results' in inputdir or 'update_results' in inputdir:
                outputdir = lib.get_parent_dir(inputdir)
            else:
                if not args.out:
                    outputdir = inputdir  # output the results in the input directory
                else:
                    outputdir = args.out
                    if not os.path.isdir(outputdir):
                        os.makedirs(outputdir)
            # create output directories
            if not os.path.isdir(os.path.join(outputdir, 'annotate_misc')):
                os.makedirs(os.path.join(outputdir, 'annotate_misc'))
                os.makedirs(os.path.join(outputdir, 'annotate_results'))
            else:
                lib.log.error(
                    "Output directory %s already exists, will use any existing data. If this is not what you want, exit, and provide a unique name for output folder"
                    % (outputdir))
            lib.log.info("Parsing input files")
            Scaffolds = os.path.join(outputdir, 'annotate_misc',
                                     'genome.scaffolds.fasta')
            Proteins = os.path.join(outputdir, 'annotate_misc',
                                    'genome.proteins.fasta')
            Transcripts = os.path.join(outputdir, 'annotate_misc',
                                       'genome.mrna-transcripts.fasta')
            CDSTranscripts = os.path.join(outputdir, 'annotate_misc',
                                          'genome.cds-transcripts.fasta')
            GFF = os.path.join(outputdir, 'annotate_misc', 'genome.gff3')
            # prefer tbl+fasta when both are present (richer CDS output)
            if tablefile and Fastafile:
                lib.log.debug("Generating files from %s" % tablefile)
                lib.tbl2allout(tablefile, Fastafile, GFF, Proteins,
                               Transcripts, CDSTranscripts, Scaffolds)
            else:
                lib.log.debug("Generating files from %s" % genbank)
                lib.gb2allout(genbank, GFF, Proteins, Transcripts, Scaffolds)
    # make sure logfiles directory is present, will need later
    if not os.path.isdir(os.path.join(outputdir, 'logfiles')):
        os.makedirs(os.path.join(outputdir, 'logfiles'))
    # get absolute path for all input so there are no problems later, not using Transcripts yet could be error? so take out here
    Proteins = os.path.abspath(Proteins)
    genbank = os.path.abspath(genbank)
    if 'phobius' in args.methods or 'all' in args.methods:
        # run Phobius to predict secreted proteins and membrane, default is local if installed, otherwise remote
        phobius_out = os.path.join(outputdir, 'annotate_misc',
                                   'phobius.results.txt')
        phobiusLog = os.path.join(outputdir, 'logfiles', 'phobius.log')
        lib.log.info(
            "Predicting secreted and transmembrane proteins using Phobius")
        if not lib.checkannotations(phobius_out):
            # the email flag enables the remote EBI fallback in the helper
            if args.email:
                subprocess.call([
                    os.path.join(parentdir, 'aux_scripts',
                                 'phobius-multiproc.py'), '-i', Proteins,
                    '-o', phobius_out, '-e',
                    str(args.email), '-l', phobiusLog
                ])
            else:
                subprocess.call([
                    os.path.join(parentdir, 'aux_scripts',
                                 'phobius-multiproc.py'), '-i', Proteins,
                    '-o', phobius_out, '-l', phobiusLog
                ])
    if 'antismash' in args.methods or 'all' in args.methods:
        # choose server + job parameters for the antiSMASH REST API
        if args.antismash == 'fungi':
            base_address = "https://fungismash.secondarymetabolites.org"
            job_parameters = {
                'email': args.email,
                'ncbi': '',
                'smcogs': 'on',
                'knownclusterblast': 'on',
                'activesitefinder': 'on',
                'subclusterblast': 'on',
                'jobtype': 'antismash5',
                'hmmdetection_strictness': 'relaxed'
            }
        elif args.antismash == 'plants':
            base_address = "https://plantismash.secondarymetabolites.org"
            job_parameters = {
                'email': args.email,
                'knownclusterblast': 'on',
                'subclusterblast': 'on'
            }
        version = requests.get(base_address + "/api/v1.0/version")
        as_vers = version.json()['antismash_generation']
        tax = version.json()['taxon']
        as_status = requests.get(base_address + "/api/v1.0/stats")
        queue = as_status.json()['queue_length']
        running = as_status.json()['running']
        lib.log.info("Connecting to antiSMASH %s v%s webserver" %
                     (tax, as_vers))
        lib.log.info("Queue Length: %s; Jobs Running: %s" % (queue, running))
        lib.log.info("PLEASE to not abuse the webserver, be considerate!")
        # be a good citizen: refuse to pile onto a long queue unless forced
        if int(queue) > 10 and not args.force:
            lib.log.error(
                "There are more than 10 antiSMASH jobs in queue, use --force to submit anyway"
            )
            sys.exit(1)
        job_files = {'seq': open(genbank, 'rb')}
        lib.log.info("Uploading %s to webserver" % genbank)
        postjob = requests.post(base_address + "/api/v1.0/submit",
                                files=job_files,
                                data=job_parameters)
        jobid = postjob.json()['id']
        # now we can query the job every so often, not sure what is reasonable here, start with 2 minutes?
        lib.log.info("Waiting for results from job: %s" % jobid)
        while True:
            job_status = requests.get(base_address + "/api/v1.0/status/" +
                                      jobid)
            if job_status.json()['status'] == 'done':
                break
            time.sleep(60)  # check every minute
        result_url = job_status.json()['result_url']
        base_url = result_url.replace('index.html', '')
        lib.log.info("antiSMASH v%s job finished" % (as_vers))
        lib.log.debug("%s" % job_status.json())
        # need to retrieve results, have to find link, seems like this might be first scaffold name?
        # after asking Kai Blin - there is no "easy" way to identify the output name, however, think I can grab the html file and parse it
        job_html = requests.get(base_address + result_url)
        link = None
        for line in job_html.iter_lines():
            if 'Download all results' in line:
                cols = line.split('a href="')
                for x in cols:
                    if '.zip' in x:
                        link = x.split('"')[0]
        if not link:
            lib.log.error('Error parsing output zip file from antismash')
            sys.exit(1)
        baselink = link.replace('.zip', '')
        download_url = base_address + base_url + link
        # download() is a helper defined elsewhere in this file
        download(download_url, 'antiSMASH.zip')
        # now unzip and move folder
        zipref = zipfile.ZipFile('antiSMASH.zip', 'r')
        zipref.extractall(os.path.join(outputdir, jobid))
        zipref.close()
        os.remove('antiSMASH.zip')
        lib.log.info("Results folder: %s/%s" % (outputdir, jobid))
        # now grab the GBK files from folder as you will need just that for annotation, place in annotate_misc folder for auto-detection
        anti_GBK = os.path.join(outputdir, jobid, os.path.basename(genbank))
        final = os.path.join(outputdir, 'annotate_misc',
                             'antiSMASH.results.gbk')
        shutil.copyfile(anti_GBK, final)
        lib.log.info("Results GBK: %s" % final)
    lib.log.info("Remote searches complete")
    # move logfile
    if os.path.isfile(log_name):
        shutil.copyfile(log_name,
                        os.path.join(outputdir, 'logfiles', log_name))
        os.remove(log_name)
def runExonerate(input):
    """Align one protein against one genomic region with exonerate p2g.

    *input* is a packed string 'ProtID:::ScaffID:::start:::end'.  The
    protein is pulled from the global protein_dict, the scaffold region
    (plus a 3 kb cushion) from tmpdir/scaffolds/<ScaffID>.fa, and exonerate
    output is written to tmpdir.  Tiny outputs (<500 bytes, i.e. no hits)
    are removed.  Failed inputs are moved to tmpdir/failed for inspection.
    """
    s = input.split(':::')
    ProtID = s[0]
    ScaffID = s[1]
    ScaffStart = int(s[2])
    ScaffEnd = int(s[3])
    # get the protein model
    # pid in the filename avoids collisions between worker processes
    query = os.path.join(tmpdir, ProtID + '.' + str(os.getpid()) + '.fa')
    with open(query, 'w') as output:
        SeqIO.write(protein_dict[ProtID], output, 'fasta')
    # now get the genome region, use different variable names for SeqRecords to avoid collision
    scaffold = os.path.join(
        tmpdir, ScaffID + '.' + ProtID + '.' + str(ScaffStart) + '-' +
        str(ScaffEnd) + '.fa')
    with open(scaffold, 'w') as output2:
        with open(os.path.join(tmpdir, 'scaffolds', ScaffID + '.fa'),
                  'rU') as fullscaff:
            for header, Sequence in SimpleFastaParser(fullscaff):
                # grab a 3 kb cushion on either side of hit region, careful of scaffold ends
                start = ScaffStart - 3000
                if start < 1:
                    start = 1
                end = ScaffEnd + 3000
                if end > len(Sequence):
                    end = len(Sequence)
                output2.write('>%s\n%s\n' % (header, Sequence[start:end]))
    # NOTE(review): exoname (and the error log below) rely on the for-loop
    # variables (start, header, Sequence) leaking out of the loop above;
    # an empty scaffold file would raise NameError here.
    exoname = ProtID + '.' + ScaffID + '__' + str(start) + '__'
    # check that input files are created and valid
    exonerate_out = os.path.join(tmpdir, 'exonerate.' + exoname + '.out')
    ryo = "AveragePercentIdentity: %pi\n"
    cmd = [
        'exonerate', '--model', 'p2g', '--showvulgar', 'no',
        '--showalignment', 'no', '--showquerygff', 'no', '--showtargetgff',
        'yes', '--maxintron',
        str(args.maxintron), '--percent', '80', '--ryo', ryo, query, scaffold
    ]
    if lib.checkannotations(query) and lib.checkannotations(scaffold):
        # run exonerate, capture errors
        with open(exonerate_out, 'w') as output3:
            proc = subprocess.Popen(cmd,
                                    stdout=output3,
                                    stderr=subprocess.PIPE)
            stderr = proc.communicate()
            # exonerate signals problems on stderr; keep failing inputs
            # around under tmpdir/failed for debugging
            if 'WARNING' in stderr[1]:
                lib.log.debug('Error in input:{:}'.format(input))
                lib.log.debug(
                    '%s, Len=%i, %i-%i; %i-%i' %
                    (header, len(Sequence), ScaffStart, ScaffEnd, start, end))
                os.rename(query,
                          os.path.join(tmpdir, 'failed',
                                       os.path.basename(query)))
                os.rename(
                    scaffold,
                    os.path.join(tmpdir, 'failed',
                                 os.path.basename(scaffold)))
            else:
                # success: the temporary query/region FASTAs are no longer needed
                for y in [query, scaffold]:
                    try:
                        lib.SafeRemove(y)
                    except OSError:
                        lib.log.debug("Error removing %s" % (y))
        # check filesize of exonerate output, no hits still have some output data in them, should be safe dropping anything smaller than 500 bytes
        if lib.getSize(exonerate_out) < 500:
            os.remove(exonerate_out)
    else:
        lib.log.debug('Error in query or scaffold:{:}'.format(input))
        lib.SafeRemove(query)
        lib.SafeRemove(scaffold)
def runTrinityGG(genome, readTuple, longReads, shortBAM, output, args=False):
    '''
    function will run genome guided Trinity. First step will be to run hisat2 to align reads
    to the genome, then pass that BAM file to Trinity to generate assemblies

    Parameters
    ----------
    genome : str
        path to the genome FASTA to align against
    readTuple : tuple
        (forward, reverse, single) read files; any element may be falsy
    longReads : str
        optional long-read FASTA passed to Trinity --long_reads
    shortBAM : str
        path of the coordinate-sorted BAM produced by hisat2 (reused if present)
    output : str
        final aggregated Trinity transcript FASTA
    args : argparse.Namespace
        pipeline options (cpus, stranded, max_intronlen, memory, jaccard_clip)
    '''
    if not lib.checkannotations(shortBAM):
        # build hisat2 index, using exons and splice sites
        lib.log.info("Building Hisat2 genome index")
        cmd = ['hisat2-build', '-p', str(args.cpus), genome,
               os.path.join(tmpdir, 'hisat2.genome')]
        lib.runSubprocess4(cmd, '.', lib.log)
        # align reads using hisat2
        lib.log.info("Aligning reads to genome using Hisat2")
        # use bash wrapper for samtools piping for SAM -> BAM -> sortedBAM
        # use half number of threads for bam compression threads
        # NOTE(review): 2 // 2 binds before +, so this is (cpus + 1) // 2,
        # not (cpus + 2) // 2 — confirm which rounding was intended
        bamthreads = (args.cpus + 2 // 2) // 2
        # strandedness flag only applies to paired-end libraries here
        if args.stranded != 'no' and not readTuple[2]:
            hisat2cmd = ['hisat2', '-p', str(args.cpus),
                         '--max-intronlen', str(args.max_intronlen),
                         '--dta', '-x', os.path.join(tmpdir, 'hisat2.genome'),
                         '--rna-strandness', args.stranded]
        else:
            hisat2cmd = ['hisat2', '-p', str(args.cpus),
                         '--max-intronlen', str(args.max_intronlen),
                         '--dta', '-x', os.path.join(tmpdir, 'hisat2.genome')]
        if readTuple[0] and readTuple[1]:
            hisat2cmd = hisat2cmd + ['-1', readTuple[0], '-2', readTuple[1]]
        if readTuple[2]:
            hisat2cmd = hisat2cmd + ['-U', readTuple[2]]
        cmd = [os.path.join(parentdir, 'sam2bam.sh'), " ".join(
            hisat2cmd), str(bamthreads), shortBAM]
        lib.runSubprocess(cmd, '.', lib.log)
    else:
        # NOTE(review): 'Existig' typo in this log message (runtime string,
        # left untouched here)
        lib.log.info('Existig Hisat2 alignments found: {:}'.format(shortBAM))
    # now launch Trinity genome guided
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info(
        "Clustering of reads from BAM and preparing assembly commands")
    jaccard_clip = []
    if args.jaccard_clip:
        jaccard_clip = ['--jaccard_clip']
    # --no_distributed_trinity_exec: first pass only clusters reads and
    # emits the per-cluster commands, which are then run in parallel below
    if args.stranded != 'no':
        cmd = ['Trinity', '--SS_lib_type', args.stranded,
               '--no_distributed_trinity_exec',
               '--genome_guided_bam', shortBAM,
               '--genome_guided_max_intron', str(args.max_intronlen),
               '--CPU', str(args.cpus), '--max_memory', args.memory,
               '--output', os.path.join(tmpdir, 'trinity_gg')]
    else:
        cmd = ['Trinity', '--no_distributed_trinity_exec',
               '--genome_guided_bam', shortBAM,
               '--genome_guided_max_intron', str(args.max_intronlen),
               '--CPU', str(args.cpus), '--max_memory', args.memory,
               '--output', os.path.join(tmpdir, 'trinity_gg')]
    cmd = cmd + jaccard_clip
    if longReads and lib.checkannotations(longReads):
        cmd = cmd + ['--long_reads', os.path.realpath(longReads)]
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)
    commands = os.path.join(tmpdir, 'trinity_gg', 'trinity_GG.cmds')
    # this will create all the Trinity commands, will now run these in parallel using multiprocessing
    # in Python (seems to be much faster than Parafly on my system)
    file_list = []
    with open(commands, 'r') as cmdFile:
        for line in cmdFile:
            line = line.replace('\n', '')
            # don't think this should be appended to every command....
            line = line.replace('--no_distributed_trinity_exec', '')
            line = line.replace('"', '')  # don't need these double quotes
            file_list.append(line)
    lib.log.info("Assembling "+"{0:,}".format(len(file_list)) +
                 " Trinity clusters using %i CPUs" % (args.cpus-1))
    lib.runMultiProgress(safe_run, file_list, args.cpus-1)
    # collected output files and clean
    outputfiles = os.path.join(
        tmpdir, 'trinity_gg', 'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(os.path.join(tmpdir, 'trinity_gg'),
                                   '*inity.fasta'):
            fileout.write('%s\n' % filename)
    # now grab them all using Trinity script
    cmd = ['perl', os.path.abspath(os.path.join(
        TRINITY, 'util', 'support_scripts',
        'GG_partitioned_trinity_aggregator.pl')), 'Trinity_GG']
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)
    lib.log.info('{:,} transcripts derived from Trinity'.format(
        lib.countfasta(output)))
def RepeatModelMask(input, cpus, tmpdir, output, repeatlib, species, debug):
    """Build a RepeatModeler library for *input*, then soft-mask with RepeatMasker.

    Parameters
    ----------
    input : str
        genome FASTA to mask
    cpus : int
        worker count passed to RepeatModeler/RepeatMasker (-pa)
    tmpdir : str
        scratch directory; RepeatModeler/ and RepeatMasker/ are created inside
    output : str
        destination path for the soft-masked genome
    repeatlib : str
        where to copy the classified RepeatModeler library, if one is produced
    species : str
        RepeatMasker -species value, used only when RepeatModeler yields no
        models.  NOTE(review): the caller in this file passes
        args.repeatmasker_species here, which is None on the repeatmodeler
        path — confirm the fallback branch ever receives a usable value.
    debug : str
        log file that captures all subprocess stdout/stderr (appended)
    """
    lib.log.info("Loading sequences and soft-masking genome")
    outdir = os.path.join(tmpdir, 'RepeatModeler')
    input = os.path.abspath(input)
    output = os.path.abspath(output)
    # lets run RepeatModeler here to get repeat library
    if os.path.exists(outdir):
        shutil.rmtree(outdir)
    os.makedirs(outdir)
    lib.log.info("Soft-masking: building RepeatModeler database")
    with open(debug, 'a') as debug_log:
        subprocess.call(
            ['BuildDatabase', '-engine', 'ncbi', '-name', 'Repeats', input],
            cwd=outdir, stdout=debug_log, stderr=debug_log)
    lib.log.info("Soft-masking: generating repeat library using RepeatModeler")
    with open(debug, 'a') as debug_log:
        subprocess.call(
            ['RepeatModeler', '-database', 'Repeats', '-pa', str(cpus)],
            cwd=outdir, stdout=debug_log, stderr=debug_log)
    # find name of folder
    # RepeatModeler writes into an RM_<pid>.<date> directory; if several
    # exist, the last one listed wins
    RP_folder = '.'
    for i in os.listdir(outdir):
        if i.startswith('RM_'):
            RP_folder = i
    library = os.path.abspath(repeatlib)
    # only produced when RepeatModeler found and classified repeat families
    if lib.checkannotations(
            os.path.join(outdir, RP_folder, 'consensi.fa.classified')):
        shutil.copyfile(
            os.path.join(outdir, RP_folder, 'consensi.fa.classified'),
            library)
    # now soft-mask the genome for gene predictors
    outdir2 = os.path.join(tmpdir, 'RepeatMasker')
    if os.path.isdir(outdir2):
        shutil.rmtree(outdir2)
    os.makedirs(outdir2)
    if not os.path.isfile(library):
        # no custom library produced -> fall back to RepeatMasker's own DB
        lib.log.info(
            "Soft-masking: running RepeatMasker with default library (RepeatModeler found 0 models)"
        )
        with open(debug, 'a') as debug_log:
            subprocess.call([
                'RepeatMasker', '-e', 'ncbi', '-gff', '-species', species,
                '-pa', str(cpus), '-xsmall', '-dir', '.', input
            ], cwd=outdir2, stdout=debug_log, stderr=debug_log)
    else:
        lib.log.info("Soft-masking: running RepeatMasker with custom library")
        with open(debug, 'a') as debug_log:
            subprocess.call([
                'RepeatMasker', '-e', 'ncbi', '-gff', '-lib', library,
                '-pa', str(cpus), '-xsmall', '-dir', '.', input
            ], cwd=outdir2, stdout=debug_log, stderr=debug_log)
    # RepeatMasker names its soft-masked FASTA <input>.masked; copy it out
    for file in os.listdir(outdir2):
        if file.endswith('.masked'):
            shutil.copyfile(os.path.join(outdir2, file), output)
def main(args):
    """Entry point for funannotate-mask: soft-mask repeats in a genome FASTA.

    Masking method is tantan (default), RepeatMasker with a species or
    pre-computed library, or full RepeatModeler + RepeatMasker.  Prints
    summary masking statistics at the end.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)
    parser = argparse.ArgumentParser(
        prog='funannotate-mask.py',
        description='''Wrapper for RepeatModeler/RepeatMasker''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input', required=True,
                        help='genome assembly FASTA format')
    parser.add_argument('-o', '--out', required=True,
                        help='Output softmasked FASTA file')
    parser.add_argument('--debug', action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('-m', '--method', default='tantan',
                        choices=['repeatmodeler', 'repeatmasker', 'tantan'],
                        help='Method to mask repeats with')
    parser.add_argument('-s', '--repeatmasker_species',
                        help='RepeatMasker species, will skip repeatmodeler')
    parser.add_argument(
        '-l', '--repeatmodeler_lib',
        help='Pre-computed RepeatModeler (or other) repetitive elements')
    parser.add_argument('--cpus', default=2, type=int,
                        help='Number of CPUs to use')
    args = parser.parse_args(args)
    # create log file for Repeats(capture stderr)
    log_name = 'funannotate-mask.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()
    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running funanotate v{:}".format(version))
    repeats = None
    tmpdir = None
    if args.method == 'tantan':
        programs = ['tantan']
        lib.CheckDependencies(programs)
        lib.log.info('Soft-masking simple repeats with tantan')
        runTanTan(args.input, args.out)
    else:
        programs = ['RepeatMasker']
        if args.method == 'repeatmodeler':
            programs += ['BuildDatabase', 'RepeatModeler']
        lib.CheckDependencies(programs)
        # create tmpdir
        # uuid keeps parallel runs in the same cwd from colliding
        pid = uuid.uuid4()
        tmpdir = 'mask_' + str(pid)
        os.makedirs(tmpdir)
        # parse options which dictates how repeatmodeler/masker are run
        if not args.repeatmodeler_lib:  # no fasta file given, so
            if not args.repeatmasker_species:  # no species given, so run entire repeatmodler + repeat masker
                repeats = 'repeatmodeler-library.' + str(pid) + '.fasta'
                # NOTE(review): args.repeatmasker_species is None on this
                # branch; RepeatModelMask uses it only in its no-models
                # fallback — confirm that fallback behaves with None
                RepeatModelMask(args.input, args.cpus, tmpdir, args.out,
                                repeats, args.repeatmasker_species, log_name)
            else:
                RepeatMaskSpecies(args.input, args.repeatmasker_species,
                                  args.cpus, tmpdir, args.out, log_name)
        else:
            if lib.checkannotations(args.repeatmodeler_lib):
                RepeatMask(args.input, args.repeatmodeler_lib, args.cpus,
                           tmpdir, args.out, log_name)
            else:
                lib.log.error(
                    'ERROR: repeat library is not a valid file: {:}'.format(
                        args.repeatmodeler_lib))
                sys.exit(1)
    # output some stats on %reads masked.
    scaffolds = 0
    maskedSize = 0
    GenomeLength = 0
    with open(args.out, 'r') as input:
        for rec, Seq in SimpleFastaParser(input):
            scaffolds += 1
            GenomeLength += len(Seq)
            # soft-masked bases are lowercase by convention
            maskedSize += lib.n_lower_chars(Seq)
    percentMask = maskedSize / float(GenomeLength)
    lib.log.info(
        'Repeat soft-masking finished: \nMasked genome: {:}\nnum scaffolds: {:,}\nassembly size: {:,} bp\nmasked repeats: {:,} bp ({:.2f}%)'
        .format(os.path.abspath(args.out), scaffolds, GenomeLength,
                maskedSize, percentMask * 100))
    if repeats:
        lib.log.info('RepeatModeler library: {:}'.format(repeats))
    # clean up
    if not args.debug:
        if tmpdir:
            lib.SafeRemove(tmpdir)
    print("-------------------------------------------------------")
def main(args):
    """Entry point: convert a tbl + FASTA annotation pair into GenBank via tbl2asn.

    Stages the inputs into a temp folder, runs tbl2asn (via the runtbl2asn
    helper defined elsewhere in this file), checks the validator output for
    NCBI errors, and copies the resulting .gbf out as <basename>.gbk.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)
    parser = argparse.ArgumentParser(
        prog='gbk2parts.py',
        description='''Script to convert GBK file to its components.''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--tbl', required=True,
                        help='Genome annotation in tbl format')
    parser.add_argument('-f', '--fasta', required=True,
                        help='Genome in FASTA format')
    parser.add_argument(
        '-s', '--species', required=True,
        help=
        'Species name (e.g. "Aspergillus fumigatus") use quotes if there is a space'
    )
    parser.add_argument('--isolate', help='Isolate name (e.g. Af293)')
    parser.add_argument('--strain', help='Strain name (e.g. CEA10)')
    parser.add_argument(
        '-t', '--tbl2asn',
        help='Custom parameters for tbl2asn, example: linkage and gap info')
    parser.add_argument('--sbt', help='tbl2asn template file')
    parser.add_argument('-o', '--output', help='Output basename')
    args = parser.parse_args(args)
    parentdir = os.path.dirname(lib.__file__)
    # see if organism/species/isolate was passed at command line
    organism = None
    if args.species:
        organism = args.species
    else:
        # fall back to deriving the name from the tbl filename
        organism = os.path.basename(args.tbl).split('.t')[0]
    if args.strain:
        organism_name = organism + '_' + args.strain
    elif args.isolate:
        organism_name = organism + '_' + args.isolate
    else:
        organism_name = organism
    organism_name = organism_name.replace(' ', '_')
    if args.output:
        outputname = args.output
    else:
        outputname = organism_name
    # create tmp folder to run tbl2asn from
    # make tmp folder
    tmp = outputname + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)
    # now move files into proper location
    if not lib.checkannotations(args.fasta):
        print(('FASTA genome file not found: {:}'.format(args.fasta)))
        sys.exit(1)
    if not lib.checkannotations(args.tbl):
        print(('TBL annotations file not found: {:}'.format(args.tbl)))
        sys.exit(1)
    # tbl2asn expects genome.fsa / genome.tbl names in its working dir
    shutil.copyfile(args.fasta, os.path.join(tmp, 'genome.fsa'))
    shutil.copyfile(args.tbl, os.path.join(tmp, 'genome.tbl'))
    # now we can run tbl2asn
    if args.sbt:
        SBT = args.sbt
    else:
        SBT = os.path.join(parentdir, 'config', 'test.sbt')
    discrep = outputname + '.discrepency.txt'
    version = 1
    runtbl2asn(tmp, SBT, discrep, organism, args.isolate, args.strain,
               args.tbl2asn, version)
    # check the output for errors for NCBI
    final_fixes = os.path.join(tmp, 'models-need-fixing.txt')
    prefix = locustagGB(os.path.join(tmp, 'genome.gbf'))
    errors = ncbiCheckErrors(os.path.join(tmp, 'errorsummary.val'),
                             os.path.join(tmp, 'genome.val'), prefix,
                             final_fixes)
    # get output files
    gbkout = outputname + '.gbk'
    shutil.copyfile(os.path.join(tmp, 'genome.gbf'), gbkout)
    # keep the temp folder around when the validator reported errors
    if errors < 1:
        lib.SafeRemove(tmp)
def main(args):
    """Entry point: screen a proteome for BUSCO models and add it to the
    funannotate outgroups database.

    Runs the bundled funannotate-BUSCO2.py on the input proteome, collects
    the 'Complete' single-copy models (dropping duplicated IDs), and writes
    them as <species>.<busco_db>_buscos.fa into $FUNANNOTATE_DB/outgroups.
    """
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)
    parser = argparse.ArgumentParser(
        prog='funannotate-predict.py',
        usage="%(prog)s [options] -i genome.fasta",
        description='''Script that adds a proteome to the outgroups.''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input', required=True,
                        help='Proteome in FASTA format')
    parser.add_argument('-s', '--species', required=True,
                        help='Species name "binomial in quotes"')
    parser.add_argument(
        '-b', '--busco_db', default='dikarya',
        choices=[
            'fungi', 'microsporidia', 'dikarya', 'ascomycota',
            'pezizomycotina', 'eurotiomycetes', 'sordariomycetes',
            'saccharomycetes', 'saccharomycetales', 'basidiomycota',
            'eukaryota', 'protists', 'alveolata_stramenophiles', 'metazoa',
            'nematoda', 'arthropoda', 'insecta', 'endopterygota',
            'hymenoptera', 'diptera', 'vertebrata', 'actinopterygii',
            'tetrapoda', 'aves', 'mammalia', 'euarchontoglires',
            'laurasiatheria', 'embryophyta'
        ],
        help='BUSCO database to use')
    parser.add_argument('-c', '--cpus', default=2, type=int,
                        help='Number of CPUs to use')
    parser.add_argument('-d', '--database',
                        help='Path to funannotate database, $FUNANNOTATE_DB')
    args = parser.parse_args(args)
    # resolve the funannotate database: CLI flag wins over the env var
    if args.database:
        FUNDB = args.database
    else:
        try:
            FUNDB = os.environ["FUNANNOTATE_DB"]
        except KeyError:
            lib.log.error(
                'Funannotate database not properly configured, run funannotate setup.'
            )
            sys.exit(1)
    parentdir = os.path.join(os.path.dirname(__file__))
    # get base name
    species = args.species.replace(' ', '_').lower() + '.' + args.busco_db
    OUTGROUPS = os.path.join(FUNDB, 'outgroups')
    # create log file
    log_name = species + '-add2outgroups.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()
    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running %s" % version)
    # check buscos, download if necessary
    if not os.path.isdir(os.path.join(FUNDB, args.busco_db)):
        lib.log.error(
            "%s busco database is missing, install with funannotate setup -b %s"
            % (args.busco_db, args.busco_db))
        sys.exit(1)
    ProtCount = lib.countfasta(args.input)
    lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')
    # convert to proteins and screen with busco
    lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db)
    BUSCODB = os.path.join(FUNDB, args.busco_db)
    BUSCO = os.path.join(parentdir, 'aux_scripts', 'funannotate-BUSCO2.py')
    cmd = [
        sys.executable, BUSCO, '-i',
        os.path.abspath(args.input), '-m', 'proteins', '--lineage', BUSCODB,
        '-o', species, '--cpu',
        str(args.cpus), '-f'
    ]
    lib.runSubprocess(cmd, '.', lib.log)
    # check that it ran correctly
    # BUSCO writes its summary into run_<name>/full_table_<name>.tsv
    busco_results = os.path.join('run_' + species,
                                 'full_table_' + species + '.tsv')
    if not lib.checkannotations(busco_results):
        lib.log.error("BUSCO failed, check logfile")
        sys.exit(1)
    # map BUSCO model ID -> protein ID, keeping only unambiguous 'Complete' hits
    nameChange = {}
    with open(busco_results, 'rU') as input:
        for line in input:
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            if cols[1] == 'Complete':
                if not cols[2] in nameChange:
                    nameChange[cols[2]] = cols[0]
                else:
                    # duplicated protein hit: drop it entirely rather than
                    # guess which BUSCO model it belongs to
                    lib.log.error(
                        "Duplicate ID found: %s %s. Removing from results" %
                        (cols[2], cols[0]))
                    del nameChange[cols[2]]
    # output counts
    lib.log.info('{0:,}'.format(len(nameChange)) + ' BUSCO models found')
    # index the proteome for parsing
    SeqRecords = SeqIO.to_dict(SeqIO.parse(args.input, 'fasta'))
    # setup output proteome
    # sequences are written under their BUSCO model name (v), not the
    # original protein ID (k)
    busco_out = os.path.join(OUTGROUPS, species + '_buscos.fa')
    with open(busco_out, 'w') as output:
        for k, v in list(nameChange.items()):
            rec = SeqRecords[k]
            output.write('>%s\n%s\n' % (v, rec.seq))
    lib.log.info("Results written to: %s" % busco_out)
    # clean up your mess
    shutil.rmtree('run_' + species)
    shutil.rmtree('tmp')
default=1, type=int, help='location of HMM database') parser.add_argument('-o', '--out', required=True, help='output file') args = parser.parse_args() global FUNDB, FNULL FUNDB = args.db FNULL = open(os.devnull, 'w') # now loop through each genome comparing to population file_list = [] for f in os.listdir(args.input): if f.startswith('associations'): continue if f.startswith('population'): continue file = os.path.join(args.input, f) if lib.checkannotations(file): file_list.append(file) else: print(' WARNING: skipping {} as no GO terms'.format(f)) # run over multiple CPUs if len(file_list) > args.cpus: procs = args.cpus else: procs = len(file_list) lib.runMultiNoProgress(GO_safe_run, file_list, procs)