def runtbl2asn(folder, template, discrepency, organism, isolate, strain,
               parameters, version):
    '''
    Build the NCBI tbl2asn command line, run it, and return it as a string.

    folder      -- directory handed to tbl2asn via -p; must already exist
    template    -- template file handed to tbl2asn via -t
    discrepency -- path handed to tbl2asn via -Z
    organism    -- organism name (required); placed in the -j source qualifiers
    isolate     -- optional isolate, appended to the -j qualifiers when truthy
    strain      -- optional strain, appended to the -j qualifiers when truthy
    parameters  -- optional string of extra arguments, split on single spaces
    version     -- value handed to tbl2asn via -N (converted with str())

    Prints a message and exits with status 1 when folder is not a directory
    or when organism is empty. Returns the executed command joined by spaces.
    '''
    # the funannotate version string is embedded in the -y comment
    annotator = lib.get_version()

    # validate the inputs before doing any work
    if not os.path.isdir(folder):
        print(("tbl2asn error: %s is not a directory, exiting" % folder))
        sys.exit(1)
    if not organism:
        print("tbl2asn error: organism not specified")
        sys.exit(1)

    # source qualifiers for the -j flag: organism first, then optional tags
    qualifiers = ["[organism=" + organism + "]"]
    if isolate:
        qualifiers.append("[isolate=" + isolate + "]")
    if strain:
        qualifiers.append("[strain=" + strain + "]")
    meta = " ".join(qualifiers)

    cmd = [
        'tbl2asn',
        '-y', '"Annotated using ' + annotator + '"',
        '-N', str(version),
        '-p', folder,
        '-t', template,
        '-M', 'n',
        '-Z', discrepency,
        '-j', '"' + meta + '"',
        '-V', 'b',
        '-c', 'fx',
        '-T',
        '-a', 'r10u',
    ]

    # user supplied extras go at the very end of the command
    if parameters:
        cmd.extend(parameters.split(' '))

    runSubprocess(cmd, '.')
    return ' '.join(cmd)
def main(args):
    '''
    Entry point for "funannotate fix": regenerate annotation outputs from a
    GenBank file plus an (edited) NCBI tbl file.

    args -- list of command line arguments, parsed with argparse.

    Steps, in order: set up logging, stage the tbl file (optionally filtered
    through lib.tblfilter when --drop is given) and the genome FASTA in
    <basedir>/tbl2asn, archive loose files when basedir contains '_results',
    run aux_scripts/tbl2asn_parallel.py in a subprocess, copy its outputs next
    to basedir, report NCBI validation errors, and delete the staging folder.
    '''
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='fix',
        usage="%(prog)s [options] -i genome.GBK -t genome.tbl",
        description='''Script will update annotation of a Genbank file with new tbl.''',
        epilog="""Written by Jon Palmer (2017) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input', required=True,
                        help='Genome in GBK format')
    parser.add_argument('-t', '--tbl', required=True,
                        help='Genome annotation in NCBI tbl format')
    parser.add_argument('-d', '--drop',
                        help='List of locus_tag to remove/drop from annotation')
    parser.add_argument('-o', '--out', help='Basename of output files')
    parser.add_argument('--tbl2asn', default='-l paired-ends',
                        help='Parameters for tbl2asn, linkage and gap info')
    args = parser.parse_args(args)

    parentdir = os.path.dirname(__file__)

    # start from a fresh log file
    log_name = 'funannotate-fix.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and the command issued at runtime
    lib.setupLogging(log_name)
    lib.log.debug(" ".join(sys.argv) + '\n')
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # report the funannotate version
    fun_version = lib.get_version()
    lib.log.info("Running %s" % fun_version)

    # output location: --out if given, otherwise the folder holding the tbl
    basedir = args.out if args.out else (os.path.dirname(args.tbl) or '.')
    workdir = os.path.join(basedir, 'tbl2asn')
    work_tbl = os.path.join(workdir, 'genome.tbl')
    work_fsa = os.path.join(workdir, 'genome.fsa')
    for needed in (basedir, workdir):
        if not os.path.isdir(needed):
            os.makedirs(needed)

    # stage the annotation: filter it when --drop was passed, else plain copy
    if args.drop:
        lib.tblfilter(args.tbl, args.drop, work_tbl)
    else:
        shutil.copyfile(args.tbl, work_tbl)

    # pull the meta information out of the GenBank file
    (organism, strain, isolate, accession, WGS_accession, gb_gi,
     gbk_version) = lib.getGBKinfo(args.input)
    locustag, genenum, justify = lib.getGBKLocusTag(args.input)

    # strain takes priority over isolate when naming the outputs
    if strain:
        organism_name = organism + '_' + strain
    elif isolate:
        organism_name = organism + '_' + isolate
    else:
        organism_name = organism
    organism_name = organism_name.replace(' ', '_')

    # extract fasta file from genbank file
    lib.log.info('Extracting genome sequence and parsing meta information')
    contigs, genes, trnas = lib.countGenBank(args.input)
    lib.log.info(
        '{:,} contigs containing {:,} protein coding genes and {:,} tRNA genes'
        .format(contigs, genes, trnas))
    lib.gb2dna(args.input, work_fsa)

    # a *_results folder is assumed to hold earlier funannotate output;
    # move its loose files out of the way before writing new ones
    if '_results' in basedir:
        archivedir = os.path.join(basedir, 'archive_' + str(os.getpid()))
        lib.log.info('Found pre-existing funannotate files, archiving to %s' %
                     archivedir)
        os.makedirs(archivedir)
        # files whose names contain any of these are left in place
        untouched = ('pasa-reannotation', 'WGS_accession', 'ncbi.p2g',
                     '.parameters.json')
        for entry in os.listdir(basedir):
            if any(tag in entry for tag in untouched):
                continue
            current = os.path.join(basedir, entry)
            if os.path.isfile(current):
                os.rename(current, os.path.join(archivedir, entry))

    # now we can run tbl2asn
    SBT = os.path.join(parentdir, 'config', 'test.sbt')
    discrep = os.path.join(basedir, organism_name + '.discrepency.txt')
    gbk_version = gbk_version or 1
    lib.log.info('Converting to GenBank format')
    # have to run as subprocess because of multiprocessing issues
    cmd = [
        sys.executable,
        os.path.join(parentdir, 'aux_scripts', 'tbl2asn_parallel.py'),
        '-i', work_tbl,
        '-f', work_fsa,
        '-o', workdir,
        '--sbt', SBT,
        '-d', discrep,
        '-s', organism,
        '-t', args.tbl2asn,
        '-v', str(gbk_version),
        '-c', '4',
    ]
    if isolate:
        cmd += ['--isolate', isolate]
    if strain:
        cmd += ['--strain', strain]
    lib.log.debug(' '.join(cmd))
    subprocess.call(cmd)

    # now get GBK files from folder
    lib.log.info('Generating output files.')

    def outfile(suffix):
        # final outputs share the <basedir>/<organism_name> prefix
        return os.path.join(basedir, organism_name + suffix)

    final_fasta = outfile('.scaffolds.fa')
    final_gff = outfile('.gff3')
    final_gbk = outfile('.gbk')
    final_tbl = outfile('.tbl')
    final_proteins = outfile('.proteins.fa')
    final_transcripts = outfile('.mrna-transcripts.fa')
    final_cds_transcripts = outfile('.cds-transcripts.fa')
    final_validation = outfile('.validation.txt')
    final_error = outfile('.error.summary.txt')
    final_fixes = outfile('.models-need-fixing.txt')

    # retrieve files/reorganize
    for produced, destination in (('genome.gbf', final_gbk),
                                  ('genome.tbl', final_tbl),
                                  ('genome.val', final_validation),
                                  ('errorsummary.val', final_error)):
        shutil.copyfile(os.path.join(workdir, produced), destination)
    lib.tbl2allout(final_tbl, work_fsa, final_gff, final_proteins,
                   final_transcripts, final_cds_transcripts, final_fasta)
    errors = lib.ncbiCheckErrors(final_error, final_validation, locustag,
                                 final_fixes)
    if errors > 0:
        lib.log.info(
            "Manually edit the tbl file %s, then run:\n\nfunannotate fix -i %s -t %s\n"
            % (final_tbl, final_gbk, final_tbl))
    else:
        contigs, genes, trnas = lib.countGenBank(final_gbk)
        lib.log.info(
            'Output genome consists of: {:,} contigs containing {:,} protein coding genes and {:,} tRNA genes'
            .format(contigs, genes, trnas))

    # clean up
    shutil.rmtree(workdir)
def main(args):
    '''
    Entry point for "funannotate remote": run remote functional annotation
    searches (Phobius and/or antiSMASH) for an annotated genome.

    args -- list of command line arguments, parsed with argparse.

    Input is either a funannotate results folder (-i) or a GenBank file (-g)
    together with an output folder (-o). Results are written below
    <outputdir>/annotate_misc and the log file is moved to
    <outputdir>/logfiles when finished. Exits with status 1 on invalid input,
    when the antiSMASH queue is longer than 10 jobs (unless --force), or when
    the antiSMASH result archive link cannot be located.

    Sets the module globals parentdir, RUNIPRSCAN and XMLCombine.
    '''
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-remote.py',
        description='''Script that adds functional annotation to a genome using remote searches.''',
        epilog="""Written by Jon Palmer (2016-2017) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input',
                        help='Folder from funannotate predict.')
    parser.add_argument('-g', '--genbank',
                        help='Annotated genome in GenBank format')
    parser.add_argument('-m', '--methods', required=True, nargs='+',
                        choices=['all', 'phobius', 'antismash'],
                        help='Method to run')
    parser.add_argument('-o', '--out', help='Basename of output files')
    parser.add_argument('-e', '--email', required=True,
                        help='Email address for IPRSCAN server')
    parser.add_argument('--force', action='store_true',
                        help='Over-write output folder')
    parser.add_argument('-a', '--antismash', default='fungi',
                        choices=['fungi', 'plants'], help='antiSMASH server')
    args = parser.parse_args(args)

    global parentdir, RUNIPRSCAN, XMLCombine
    parentdir = os.path.join(os.path.dirname(__file__))
    RUNIPRSCAN = os.path.join(parentdir, 'aux_scripts', 'runIPRscan.py')
    XMLCombine = os.path.join(parentdir, 'aux_scripts', 'xmlcombine.py')

    # create log file
    log_name = 'funannotate-remote.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    # print() call form: valid on Python 3 and consistent with the other
    # sub-commands (the bare print statement is a SyntaxError on Python 3)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running %s" % version)

    # need to do some checks here of the input
    genbank = ''
    Proteins = ''
    tablefile = ''
    Fastafile = ''
    subfolders = ('annotate_misc', 'annotate_results', 'logfiles')
    if not args.input:
        # no funannotate folder was given, so a GenBank file and an explicit
        # output folder (--out) are both required
        if not args.out:
            lib.log.error(
                "If you are not providing funannotate predict input folder, then you need to provide an output folder (--out)"
            )
            sys.exit(1)
        outputdir = args.out
        # remember whether the folder was already there *before* creating it,
        # so the "already exists" message is only logged when it is true
        preexisting = os.path.isdir(outputdir)
        if not preexisting:
            os.makedirs(outputdir)
            for sub in subfolders:
                os.makedirs(os.path.join(outputdir, sub))
        if not args.genbank:
            lib.log.error(
                "You did not specifiy the apropriate input files, either: \n1) Funannotate input \n2) GenBank"
            )
            sys.exit(1)
        if preexisting:
            lib.log.error("Output directory %s already exists" % (outputdir))
            for sub in subfolders:
                if not os.path.isdir(os.path.join(outputdir, sub)):
                    os.makedirs(os.path.join(outputdir, sub))
        genbank = args.genbank
        Scaffolds = os.path.join(outputdir, 'annotate_misc',
                                 'genome.scaffolds.fasta')
        Proteins = os.path.join(outputdir, 'annotate_misc',
                                'genome.proteins.fasta')
        Transcripts = os.path.join(outputdir, 'annotate_misc',
                                   'genome.transcripts.fasta')
        GFF = os.path.join(outputdir, 'annotate_misc', 'genome.gff3')
        lib.log.info("Checking GenBank file for annotation")
        if not lib.checkGenBank(genbank):
            lib.log.error("Found no annotation in GenBank file, exiting")
            sys.exit(1)
        lib.gb2allout(genbank, GFF, Proteins, Transcripts, Scaffolds)
    else:
        # should be a folder, with funannotate files, thus store results
        # there, no need to create output folder
        if not os.path.isdir(args.input):
            lib.log.error("%s directory does not exist" % args.input)
            sys.exit(1)
        # funannotate results should be here; update_results is preferred
        # over predict_results
        if os.path.isdir(os.path.join(args.input, 'update_results')):
            inputdir = os.path.join(args.input, 'update_results')
        elif os.path.isdir(os.path.join(args.input, 'predict_results')):
            inputdir = os.path.join(args.input, 'predict_results')
        else:
            # here user specified the predict_results folder, or it is a
            # custom folder
            inputdir = os.path.join(args.input)

        # get files that you need
        for fname in os.listdir(inputdir):
            if fname.endswith('.gbk'):
                genbank = os.path.join(inputdir, fname)
            elif fname.endswith('.tbl'):
                tablefile = os.path.join(inputdir, fname)
            elif fname.endswith('.scaffolds.fa'):
                Fastafile = os.path.join(inputdir, fname)

        # now create the files from genbank input file for consistency in
        # gene naming, etc
        if not genbank:
            lib.log.error(
                "Properly formatted 'funannotate predict' files do no exist in this directory"
            )
            sys.exit(1)

        # if user gave predict_results folder, then set output to up one
        # directory
        if 'predict_results' in inputdir or 'update_results' in inputdir:
            outputdir = lib.get_parent_dir(inputdir)
        elif not args.out:
            # output the results in the input directory
            outputdir = inputdir
        else:
            outputdir = args.out
            if not os.path.isdir(outputdir):
                os.makedirs(outputdir)
        # create output directories
        if not os.path.isdir(os.path.join(outputdir, 'annotate_misc')):
            os.makedirs(os.path.join(outputdir, 'annotate_misc'))
            os.makedirs(os.path.join(outputdir, 'annotate_results'))
        else:
            lib.log.error(
                "Output directory %s already exists, will use any existing data. If this is not what you want, exit, and provide a unique name for output folder"
                % (outputdir))
        lib.log.info("Parsing input files")
        Scaffolds = os.path.join(outputdir, 'annotate_misc',
                                 'genome.scaffolds.fasta')
        Proteins = os.path.join(outputdir, 'annotate_misc',
                                'genome.proteins.fasta')
        Transcripts = os.path.join(outputdir, 'annotate_misc',
                                   'genome.mrna-transcripts.fasta')
        CDSTranscripts = os.path.join(outputdir, 'annotate_misc',
                                      'genome.cds-transcripts.fasta')
        GFF = os.path.join(outputdir, 'annotate_misc', 'genome.gff3')
        if tablefile and Fastafile:
            lib.log.debug("Generating files from %s" % tablefile)
            lib.tbl2allout(tablefile, Fastafile, GFF, Proteins, Transcripts,
                           CDSTranscripts, Scaffolds)
        else:
            lib.log.debug("Generating files from %s" % genbank)
            lib.gb2allout(genbank, GFF, Proteins, Transcripts, Scaffolds)

    # make sure logfiles directory is present, will need later
    if not os.path.isdir(os.path.join(outputdir, 'logfiles')):
        os.makedirs(os.path.join(outputdir, 'logfiles'))

    # get absolute path for all input so there are no problems later
    # (Transcripts is not used below, so it is left as is)
    Proteins = os.path.abspath(Proteins)
    genbank = os.path.abspath(genbank)

    if 'phobius' in args.methods or 'all' in args.methods:
        # run Phobius to predict secreted proteins and membrane, default is
        # local if installed, otherwise remote
        phobius_out = os.path.join(outputdir, 'annotate_misc',
                                   'phobius.results.txt')
        phobiusLog = os.path.join(outputdir, 'logfiles', 'phobius.log')
        lib.log.info(
            "Predicting secreted and transmembrane proteins using Phobius")
        if not lib.checkannotations(phobius_out):
            phobius_cmd = [
                os.path.join(parentdir, 'aux_scripts', 'phobius-multiproc.py'),
                '-i', Proteins, '-o', phobius_out
            ]
            if args.email:
                phobius_cmd += ['-e', str(args.email)]
            phobius_cmd += ['-l', phobiusLog]
            subprocess.call(phobius_cmd)

    if 'antismash' in args.methods or 'all' in args.methods:
        if args.antismash == 'fungi':
            base_address = "https://fungismash.secondarymetabolites.org"
            job_parameters = {
                'email': args.email,
                'ncbi': '',
                'smcogs': 'on',
                'knownclusterblast': 'on',
                'activesitefinder': 'on',
                'subclusterblast': 'on',
                'jobtype': 'antismash5',
                'hmmdetection_strictness': 'relaxed'
            }
        elif args.antismash == 'plants':
            base_address = "https://plantismash.secondarymetabolites.org"
            job_parameters = {
                'email': args.email,
                'knownclusterblast': 'on',
                'subclusterblast': 'on'
            }
        server_info = requests.get(base_address + "/api/v1.0/version").json()
        as_vers = server_info['antismash_generation']
        tax = server_info['taxon']
        server_stats = requests.get(base_address + "/api/v1.0/stats").json()
        queue = server_stats['queue_length']
        running = server_stats['running']
        lib.log.info("Connecting to antiSMASH %s v%s webserver" %
                     (tax, as_vers))
        lib.log.info("Queue Length: %s; Jobs Running: %s" % (queue, running))
        lib.log.info("PLEASE to not abuse the webserver, be considerate!")
        if int(queue) > 10 and not args.force:
            lib.log.error(
                "There are more than 10 antiSMASH jobs in queue, use --force to submit anyway"
            )
            sys.exit(1)
        lib.log.info("Uploading %s to webserver" % genbank)
        # context manager so the upload handle is closed once the request
        # has been sent
        with open(genbank, 'rb') as seq_handle:
            postjob = requests.post(base_address + "/api/v1.0/submit",
                                    files={'seq': seq_handle},
                                    data=job_parameters)
        jobid = postjob.json()['id']
        # poll the job status once a minute until it reports 'done'
        lib.log.info("Waiting for results from job: %s" % jobid)
        while True:
            job_status = requests.get(base_address + "/api/v1.0/status/" +
                                      jobid)
            if job_status.json()['status'] == 'done':
                break
            time.sleep(60)  # check every minute
        result_url = job_status.json()['result_url']
        base_url = result_url.replace('index.html', '')
        lib.log.info("antiSMASH v%s job finished" % (as_vers))
        lib.log.debug("%s" % job_status.json())
        # there is no direct way to learn the name of the result archive, so
        # fetch the result page and look for the "Download all results" link
        job_html = requests.get(base_address + result_url)
        link = None
        for raw_line in job_html.iter_lines():
            # iter_lines() yields bytes on Python 3; decode so the substring
            # tests below compare text with text
            if isinstance(raw_line, bytes):
                line = raw_line.decode('utf-8', 'replace')
            else:
                line = raw_line
            if 'Download all results' in line:
                for piece in line.split('a href="'):
                    if '.zip' in piece:
                        link = piece.split('"')[0]
        if not link:
            lib.log.error('Error parsing output zip file from antismash')
            sys.exit(1)
        download_url = base_address + base_url + link
        download(download_url, 'antiSMASH.zip')
        # now unzip and move folder
        with zipfile.ZipFile('antiSMASH.zip', 'r') as zipref:
            zipref.extractall(os.path.join(outputdir, jobid))
        os.remove('antiSMASH.zip')
        lib.log.info("Results folder: %s/%s" % (outputdir, jobid))
        # now grab the GBK file from the folder as only that is needed for
        # annotation, place in annotate_misc folder for auto-detection
        anti_GBK = os.path.join(outputdir, jobid, os.path.basename(genbank))
        final = os.path.join(outputdir, 'annotate_misc',
                             'antiSMASH.results.gbk')
        shutil.copyfile(anti_GBK, final)
        lib.log.info("Results GBK: %s" % final)

    lib.log.info("Remote searches complete")
    # move logfile
    if os.path.isfile(log_name):
        shutil.copyfile(log_name, os.path.join(outputdir, 'logfiles',
                                               log_name))
        os.remove(log_name)
def main(args):
    '''
    Entry point for "funannotate check": report which Python packages, Perl
    modules, environment variables and external programs are available.

    args -- accepted for a uniform sub-command interface; the function itself
            only looks for '--show-versions' in sys.argv.

    Sets the module global show (True when '--show-versions' was passed).
    Everything is reported with print(); nothing is returned.
    '''
    funannotate_perl = [
        'Getopt::Long', 'Pod::Usage', 'File::Basename', 'threads',
        'threads::shared', 'Thread::Queue', 'Carp', 'Data::Dumper', 'YAML',
        'Hash::Merge', 'Logger::Simple', 'Parallel::ForkManager', 'DBI',
        'Text::Soundex', 'Scalar::Util::Numeric', 'Tie::File', 'POSIX',
        'Storable', 'Clone', 'Bio::Perl', 'DBD::mysql', 'JSON',
        'LWP::UserAgent', 'DB_File', 'URI::Escape', 'File::Which',
        'DBD::SQLite'
    ]
    funannotate_python = [
        'numpy', 'pandas', 'matplotlib', 'scipy', 'scikit-learn', 'psutil',
        'natsort', 'goatools', 'seaborn', 'biopython', 'requests'
    ]
    # external programs, grouped by the flag used to query their version
    programs1 = ['tblastn', 'makeblastdb', 'java', 'trimmomatic']  # -version
    programs2 = [
        'exonerate', 'bedtools', 'bamtools', 'augustus', 'samtools', 'gmap',
        'hisat2', 'Trinity', 'tbl2asn', 'emapper.py', 'minimap2', 'mafft',
        'trimal', 'stringtie', 'salmon', 'proteinortho', 'tantan'
    ]  # --version
    programs3 = []  # -v
    programs4 = ['diamond', 'ete3', 'kallisto']  # version
    programs5 = [
        'gmes_petap.pl', 'blat', 'pslCDnaFilter', 'fasta', 'CodingQuarry',
        'snap', 'glimmerhmm'
    ]  # no version option at all
    programs6 = ['hmmsearch', 'hmmscan', 'tRNAscan-SE']  # -h
    programs7 = ['signalp']  # -V

    PyVers = sys.version.split(' ')[0]
    PerlVers = perlVersion()
    PyDeps = {}
    PerlDeps = {}
    ExtDeps = {}

    # collect results in the dictionaries above so they can be printed sorted
    print("-------------------------------------------------------")
    print("Checking dependencies for %s" % lib.get_version())
    print("-------------------------------------------------------")
    global show
    show = '--show-versions' in sys.argv
    if not show:
        print(
            "To print all dependencies and versions: funannotate check --show-versions\n"
        )

    # ---- python packages ----
    print('You are running Python v %s. Now checking python packages...' %
          PyVers)
    for package in funannotate_python:
        if package not in PyDeps:
            PyDeps[package] = checkPyModule(package)
    missing = []
    for name, found in natsorted(PyDeps.items()):
        if found:
            if show:
                print(name + ': ' + found)
        else:
            missing.append(name)
    if missing:
        for name in missing:
            print(
                ' ERROR: %s not installed, pip install %s or conda install %s'
                % (name, name, name))
    else:
        print("All %i python packages installed" % len(funannotate_python))
    print("\n")

    # ---- perl modules ----
    for module in funannotate_perl:
        if module not in PerlDeps:
            PerlDeps[module] = checkPerlModule(module)
    missing = []
    print('You are running Perl v %s. Now checking perl modules...' %
          PerlVers)
    for name, found in natsorted(PerlDeps.items()):
        if found:
            if show:
                print(name + ': ' + found)
        else:
            missing.append(name)
    if missing:
        for name in missing:
            print(' ERROR: %s not installed, install with cpanm %s ' %
                  (name, name))
    else:
        print("All %i Perl modules installed" % len(funannotate_perl))
    print("\n")

    # ---- environment variables ----
    variables = [
        'FUNANNOTATE_DB', 'PASAHOME', 'TRINITYHOME', 'EVM_HOME',
        'AUGUSTUS_CONFIG_PATH', 'GENEMARK_PATH'
    ]
    print('Checking Environmental Variables...')
    missing = []
    for var in variables:
        if var in os.environ:
            if show:
                print('$%s=%s' % (var, os.environ[var]))
        elif var == 'TRINITYHOME' and 'TRINITY_HOME' in os.environ:
            # TRINITY_HOME is accepted as an alternative spelling
            if show:
                print('$%s=%s' % ('TRINITY_HOME', os.environ['TRINITY_HOME']))
        else:
            missing.append(var)
    if missing:
        for name in missing:
            print('\tERROR: %s not set. export %s=/path/to/dir' % (name, name))
    else:
        print("All %i environmental variables are set" % (len(variables)))
    print("-------------------------------------------------------")

    # ---- external programs ----
    # the PASA launcher lives under $PASAHOME, so it can only be checked when
    # that variable is set
    if 'PASAHOME' not in missing:
        LAUNCHPASA = os.path.join(os.environ['PASAHOME'],
                                  'Launch_PASA_pipeline.pl')
        programs2.append(LAUNCHPASA)
    print('Checking external dependencies...')
    checkers = [
        (programs1, check_version1),
        (programs2, check_version2),
        (programs3, check_version3),
        (programs4, check_version4),
        (programs5, check_version5),
        (programs6, check_version6),
        (programs7, check_version7),
    ]
    for group, checker in checkers:
        for prog in group:
            if prog not in ExtDeps:
                ExtDeps[prog] = checker(prog)
    missing = []
    for name, found in natsorted(ExtDeps.items()):
        # output starting with 'dyld:' is counted as not installed
        if not found or found.startswith('dyld:'):
            missing.append(name)
        elif show:
            label = 'PASA' if 'Launch_PASA_pipeline.pl' in name else name
            print(label + ': ' + found)
    if missing:
        for name in missing:
            print('\tERROR: %s not installed' % (name))
    else:
        print("All %i external dependencies are installed\n" % (len(ExtDeps)))
def main(args):
    '''
    Entry point for "funannotate mask": soft-mask repeats in a genome
    assembly and report how much of the assembly was masked.

    args -- list of command line arguments, parsed with argparse.

    With --method tantan the assembly is masked by runTanTan(); otherwise a
    temporary folder 'mask_<uuid>' is created and RepeatModelMask(),
    RepeatMaskSpecies() or RepeatMask() is used, depending on whether a
    repeat library (-l) or a RepeatMasker species (-s) was supplied. Exits
    with status 1 when the supplied repeat library is not a valid file. The
    temporary folder is removed at the end unless --debug was passed.
    '''
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-mask.py',
        description='''Wrapper for RepeatModeler/RepeatMasker''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input', required=True,
                        help='genome assembly FASTA format')
    parser.add_argument('-o', '--out', required=True,
                        help='Output softmasked FASTA file')
    parser.add_argument('--debug', action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('-m', '--method', default='tantan',
                        choices=['repeatmodeler', 'repeatmasker', 'tantan'],
                        help='Method to mask repeats with')
    parser.add_argument('-s', '--repeatmasker_species',
                        help='RepeatMasker species, will skip repeatmodeler')
    parser.add_argument(
        '-l', '--repeatmodeler_lib',
        help='Pre-computed RepeatModeler (or other) repetitive elements')
    parser.add_argument('--cpus', default=2, type=int,
                        help='Number of CPUs to use')
    args = parser.parse_args(args)

    # create log file for Repeats (capture stderr)
    log_name = 'funannotate-mask.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running funanotate v{:}".format(version))

    repeats = None
    tmpdir = None
    if args.method == 'tantan':
        programs = ['tantan']
        lib.CheckDependencies(programs)
        lib.log.info('Soft-masking simple repeats with tantan')
        runTanTan(args.input, args.out)
    else:
        programs = ['RepeatMasker']
        if args.method == 'repeatmodeler':
            programs += ['BuildDatabase', 'RepeatModeler']
        lib.CheckDependencies(programs)

        # validate a user supplied library before creating the temporary
        # folder, so a bad path does not leave an empty folder behind
        if args.repeatmodeler_lib and not lib.checkannotations(
                args.repeatmodeler_lib):
            lib.log.error(
                'ERROR: repeat library is not a valid file: {:}'.format(
                    args.repeatmodeler_lib))
            sys.exit(1)

        # create tmpdir
        pid = uuid.uuid4()
        tmpdir = 'mask_' + str(pid)
        os.makedirs(tmpdir)

        # the options dictate how repeatmodeler/masker are run
        if args.repeatmodeler_lib:
            RepeatMask(args.input, args.repeatmodeler_lib, args.cpus, tmpdir,
                       args.out, log_name)
        elif args.repeatmasker_species:
            RepeatMaskSpecies(args.input, args.repeatmasker_species,
                              args.cpus, tmpdir, args.out, log_name)
        else:
            # no library and no species given, so run the entire
            # repeatmodeler + repeatmasker workflow
            repeats = 'repeatmodeler-library.' + str(pid) + '.fasta'
            RepeatModelMask(args.input, args.cpus, tmpdir, args.out, repeats,
                            args.repeatmasker_species, log_name)

    # output some stats on the fraction of the assembly that was masked
    scaffolds = 0
    maskedSize = 0
    GenomeLength = 0
    with open(args.out, 'r') as masked_fasta:
        for rec, Seq in SimpleFastaParser(masked_fasta):
            scaffolds += 1
            GenomeLength += len(Seq)
            maskedSize += lib.n_lower_chars(Seq)

    # an empty output file has no length, avoid dividing by zero
    if GenomeLength > 0:
        percentMask = maskedSize / float(GenomeLength)
    else:
        percentMask = 0.0
    lib.log.info(
        'Repeat soft-masking finished: \nMasked genome: {:}\nnum scaffolds: {:,}\nassembly size: {:,} bp\nmasked repeats: {:,} bp ({:.2f}%)'
        .format(os.path.abspath(args.out), scaffolds, GenomeLength,
                maskedSize, percentMask * 100))
    if repeats:
        lib.log.info('RepeatModeler library: {:}'.format(repeats))

    # clean up
    if not args.debug:
        if tmpdir:
            lib.SafeRemove(tmpdir)
    print("-------------------------------------------------------")
def runtbl2asn_parallel(folder, template, discrepency, organism, isolate,
                        strain, parameters, version, cpus):
    '''
    Run NCBI tbl2asn on every sub-folder of folder in parallel and merge the
    per-folder outputs back into folder.

    folder      -- directory holding the input, either directly or split
                   into sub-folders; must exist
    template    -- template file handed to tbl2asn via -t
    discrepency -- path of the combined discrepancy report to write
    organism    -- organism name (required); placed in the -j qualifiers
    isolate     -- optional isolate, appended to the -j qualifiers when truthy
    strain      -- optional strain, appended to the -j qualifiers when truthy
    parameters  -- optional string of extra arguments, split on single spaces
    version     -- value handed to tbl2asn via -N (converted with str())
    cpus        -- size of the multiprocessing pool

    Logs an error and exits with status 1 when folder is not a directory or
    organism is empty. Merged results are appended to genome.val,
    errorsummary.val and genome.gbf in folder and to discrepency; .tbl and
    .sqn files are copied into folder.
    '''
    val_out = os.path.join(folder, 'genome.val')
    summary_out = os.path.join(folder, 'errorsummary.val')
    gbf_out = os.path.join(folder, 'genome.gbf')

    # make sure output that will be appended to is not there
    for stale in [val_out, summary_out, gbf_out, discrepency]:
        lib.SafeRemove(stale)

    # get funannotate version
    fun_version = lib.get_version()

    # input should be a folder
    if not os.path.isdir(folder):
        lib.log.error("tbl2asn error: %s is not a directory, exiting" % folder)
        sys.exit(1)

    # based on organism, isolate, strain, construct meta info for -j flag
    if not organism:
        lib.log.error("tbl2asn error: organism not specified")
        sys.exit(1)
    meta = "[organism=" + organism + "]"
    if isolate:
        meta = meta + " " + "[isolate=" + isolate + "]"
    if strain:
        meta = meta + " " + "[strain=" + strain + "]"
    cmd = [
        'tbl2asn', '-y', '"Annotated using ' + fun_version + '"', '-N',
        str(version), '-t', template, '-M', 'n', '-j', '"' + meta + '"', '-V',
        'b', '-c', 'f', '-T', '-a', 'r10u'
    ]

    # check for custom parameters
    if parameters:
        cmd = cmd + parameters.split(' ')

    # run tbl2asn once per sub-folder; when there are no sub-folders run it
    # once on the folder itself
    multiple = [
        os.path.join(folder, entry) for entry in os.listdir(folder)
        if os.path.isdir(os.path.join(folder, entry))
    ]
    if len(multiple) == 0:
        multiple.append(folder)
    p = multiprocessing.Pool(cpus)
    for target in multiple:
        p.apply_async(tbl2asn_safe_run, (cmd, target))
    p.close()
    p.join()

    # now collect the results and merge them in the main folder
    with open(val_out, 'a') as validation, \
            open(discrepency, 'a') as discrep, \
            open(summary_out, 'a') as summary, \
            open(gbf_out, 'a') as genbank:
        for dirName, subdirList, fileList in os.walk(folder, topdown=False):
            # only leaf directories hold tbl2asn results
            if len(subdirList) > 0:
                continue
            for f in fileList:
                source = os.path.join(dirName, f)
                # sink is the open handle to append to, or None for a copy
                if f == 'errorsummary.val':
                    destination, sink = summary_out, summary
                elif f.endswith('.val'):
                    destination, sink = val_out, validation
                elif f.endswith('.gbf'):
                    destination, sink = gbf_out, genbank
                elif f.endswith('.tbl') or f.endswith('.sqn'):
                    destination, sink = os.path.join(folder, f), None
                elif f == 'discrepency.report.txt':
                    destination, sink = discrepency, discrep
                else:
                    continue
                # when folder has no sub-folders the walk visits folder
                # itself; a file must never be appended to or copied onto
                # itself (copyfile raises on identical source/destination)
                if os.path.abspath(source) == os.path.abspath(destination):
                    continue
                if sink is None:
                    shutil.copyfile(source, destination)
                else:
                    with open(source) as infile:
                        sink.write(infile.read())
def main(args):
    '''
    Entry point for adding a proteome to the funannotate outgroups: run BUSCO
    on the proteome and write the complete, single-copy hits to
    <FUNANNOTATE_DB>/outgroups/<species>.<busco_db>_buscos.fa.

    args -- list of command line arguments, parsed with argparse.

    Exits with status 1 when no database location is available, when the
    requested BUSCO database folder is missing, or when BUSCO produced no
    result table. Protein IDs assigned to more than one complete BUSCO model
    are dropped from the output. The BUSCO run folder (and a 'tmp' folder in
    the working directory, when present) is removed at the end.
    '''
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-predict.py',
        usage="%(prog)s [options] -i genome.fasta",
        description='''Script that adds a proteome to the outgroups.''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input', required=True,
                        help='Proteome in FASTA format')
    parser.add_argument('-s', '--species', required=True,
                        help='Species name "binomial in quotes"')
    parser.add_argument(
        '-b', '--busco_db', default='dikarya',
        choices=[
            'fungi', 'microsporidia', 'dikarya', 'ascomycota',
            'pezizomycotina', 'eurotiomycetes', 'sordariomycetes',
            'saccharomycetes', 'saccharomycetales', 'basidiomycota',
            'eukaryota', 'protists', 'alveolata_stramenophiles', 'metazoa',
            'nematoda', 'arthropoda', 'insecta', 'endopterygota',
            'hymenoptera', 'diptera', 'vertebrata', 'actinopterygii',
            'tetrapoda', 'aves', 'mammalia', 'euarchontoglires',
            'laurasiatheria', 'embryophyta'
        ],
        help='BUSCO database to use')
    parser.add_argument('-c', '--cpus', default=2, type=int,
                        help='Number of CPUs to use')
    parser.add_argument('-d', '--database',
                        help='Path to funannotate database, $FUNANNOTATE_DB')
    args = parser.parse_args(args)

    if args.database:
        FUNDB = args.database
    else:
        try:
            FUNDB = os.environ["FUNANNOTATE_DB"]
        except KeyError:
            # NOTE(review): lib.log is used here before lib.setupLogging() is
            # called below - confirm the logger is usable at this point
            lib.log.error(
                'Funannotate database not properly configured, run funannotate setup.'
            )
            sys.exit(1)

    parentdir = os.path.join(os.path.dirname(__file__))

    # get base name
    species = args.species.replace(' ', '_').lower() + '.' + args.busco_db
    OUTGROUPS = os.path.join(FUNDB, 'outgroups')

    # create log file
    log_name = species + '-add2outgroups.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running %s" % version)

    # the busco database has to be installed already
    if not os.path.isdir(os.path.join(FUNDB, args.busco_db)):
        lib.log.error(
            "%s busco database is missing, install with funannotate setup -b %s"
            % (args.busco_db, args.busco_db))
        sys.exit(1)

    ProtCount = lib.countfasta(args.input)
    lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')

    # screen the proteins with busco
    lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db)
    BUSCODB = os.path.join(FUNDB, args.busco_db)
    BUSCO = os.path.join(parentdir, 'aux_scripts', 'funannotate-BUSCO2.py')
    cmd = [
        sys.executable, BUSCO, '-i',
        os.path.abspath(args.input), '-m', 'proteins', '--lineage', BUSCODB,
        '-o', species, '--cpu',
        str(args.cpus), '-f'
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    # check that it ran correctly
    busco_results = os.path.join('run_' + species,
                                 'full_table_' + species + '.tsv')
    if not lib.checkannotations(busco_results):
        lib.log.error("BUSCO failed, check logfile")
        sys.exit(1)

    # map protein ID -> BUSCO ID for complete models; IDs hit by more than
    # one model are remembered in duplicates so a third hit cannot sneak the
    # ID back in after it was removed
    nameChange = {}
    duplicates = set()
    # plain 'r': the 'U' mode flag was removed in Python 3.11
    with open(busco_results, 'r') as table:
        for line in table:
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            # skip blank or truncated lines that carry no protein column
            if len(cols) < 3:
                continue
            if cols[1] != 'Complete':
                continue
            busco_id, protein_id = cols[0], cols[2]
            if protein_id in nameChange or protein_id in duplicates:
                lib.log.error(
                    "Duplicate ID found: %s %s. Removing from results" %
                    (protein_id, busco_id))
                nameChange.pop(protein_id, None)
                duplicates.add(protein_id)
            else:
                nameChange[protein_id] = busco_id

    # output counts
    lib.log.info('{0:,}'.format(len(nameChange)) + ' BUSCO models found')

    # index the proteome for parsing
    SeqRecords = SeqIO.to_dict(SeqIO.parse(args.input, 'fasta'))

    # setup output proteome
    busco_out = os.path.join(OUTGROUPS, species + '_buscos.fa')
    with open(busco_out, 'w') as output:
        for k, v in nameChange.items():
            rec = SeqRecords[k]
            output.write('>%s\n%s\n' % (v, rec.seq))
    lib.log.info("Results written to: %s" % busco_out)

    # clean up; the results are already written, so a missing scratch
    # folder should not turn a successful run into a crash
    for leftover in ('run_' + species, 'tmp'):
        if os.path.isdir(leftover):
            shutil.rmtree(leftover)