def runExonerate(input):
    '''
    Run an exonerate protein2genome alignment for one protein/scaffold hit.

    input is a ':::' delimited string of the form
    ProtID:::ScaffID:::ScaffStart:::ScaffEnd (start/end are parsed as ints).

    The function reads the module-level names tmpdir, protein_dict and args.
    It writes the protein and a padded slice of the scaffold into tmpdir,
    runs exonerate on the pair and stores the result in
    tmpdir/exonerate.<ProtID>.<ScaffID>__<start>__.out, where <start> is the
    slice start embedded in the file name.

    Inputs that make exonerate emit a WARNING are moved to tmpdir/failed,
    otherwise the temporary inputs are removed. Outputs smaller than
    500 bytes are deleted. Returns None.
    '''
    fields = input.split(':::')
    ProtID = fields[0]
    ScaffID = fields[1]
    ScaffStart = int(fields[2])
    ScaffEnd = int(fields[3])

    # get the protein model
    query = os.path.join(tmpdir, ProtID + '.' + str(os.getpid()) + '.fa')
    with open(query, 'w') as output:
        SeqIO.write(protein_dict[ProtID], output, 'fasta')

    # grab a 3 kb cushion on either side of hit region, careful of scaffold
    # ends. The left bound does not depend on the sequence, so compute it up
    # front; that also keeps `start` defined if the scaffold file is empty.
    start = ScaffStart - 3000
    if start < 1:
        start = 1
    # fallbacks used only for the debug message when no record was read
    end = ScaffEnd + 3000
    header = None
    Sequence = ''

    # now get the genome region, use different variable names for SeqRecords
    # to avoid collision
    scaffold = os.path.join(
        tmpdir,
        ScaffID + '.' + ProtID + '.' + str(ScaffStart) + '-' + str(ScaffEnd) +
        '.fa')
    scaffold_source = os.path.join(tmpdir, 'scaffolds', ScaffID + '.fa')
    with open(scaffold, 'w') as output2:
        # plain 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+
        # and text mode already translates newlines
        with open(scaffold_source, 'r') as fullscaff:
            for header, Sequence in SimpleFastaParser(fullscaff):
                end = ScaffEnd + 3000
                if end > len(Sequence):
                    end = len(Sequence)
                output2.write('>%s\n%s\n' % (header, Sequence[start:end]))

    exoname = ProtID + '.' + ScaffID + '__' + str(start) + '__'
    exonerate_out = os.path.join(tmpdir, 'exonerate.' + exoname + '.out')
    ryo = "AveragePercentIdentity: %pi\n"
    cmd = [
        'exonerate', '--model', 'p2g', '--showvulgar', 'no',
        '--showalignment', 'no', '--showquerygff', 'no', '--showtargetgff',
        'yes', '--maxintron', str(args.maxintron), '--percent', '80',
        '--ryo', ryo, query, scaffold
    ]

    # check that input files are created and valid
    if not (lib.checkannotations(query) and lib.checkannotations(scaffold)):
        lib.log.debug('Error in query or scaffold:{:}'.format(input))
        lib.SafeRemove(query)
        lib.SafeRemove(scaffold)
        return

    # run exonerate, capture errors
    with open(exonerate_out, 'w') as output3:
        # universal_newlines=True makes the captured stderr a text string, so
        # the substring test below works on Python 3 as well as Python 2
        proc = subprocess.Popen(cmd, stdout=output3, stderr=subprocess.PIPE,
                                universal_newlines=True)
        stderr = proc.communicate()
        if 'WARNING' in stderr[1]:
            lib.log.debug('Error in input:{:}'.format(input))
            lib.log.debug(
                '%s, Len=%i, %i-%i; %i-%i' %
                (header, len(Sequence), ScaffStart, ScaffEnd, start, end))
            # keep the offending inputs for inspection
            os.rename(query,
                      os.path.join(tmpdir, 'failed', os.path.basename(query)))
            os.rename(
                scaffold,
                os.path.join(tmpdir, 'failed', os.path.basename(scaffold)))
        else:
            for y in [query, scaffold]:
                try:
                    lib.SafeRemove(y)
                except OSError:
                    lib.log.debug("Error removing %s" % (y))

    # check filesize of exonerate output, no hits still have some output data
    # in them, should be safe dropping anything smaller than 500 bytes
    if lib.getSize(exonerate_out) < 500:
        os.remove(exonerate_out)
cols[4] = str(int(cols[4]) + offset) output.write('\t'.join(cols)) # convert to GFF3 using ExoConverter from EVM with open(args.out, 'w') as output: subprocess.call([ExoConverter, exonerate_raw], stdout=output, stderr=FNULL) # output some quick summary of exonerate alignments that you found Found = lib.countGFFgenes(exonerate_raw) lib.log.info('Exonerate finished: found {:,} alignments'.format(Found)) # check for saving output of tblastn if args.tblastn_out: shutil.copyfile(BlastResult, args.tblastn_out) # finally clean-up your mess if failed is empty if args.debug: try: os.rmdir(os.path.join(tmpdir, 'failed')) empty = True except OSError: empty = False if empty: lib.SafeRemove(tmpdir) else: lib.log.error("Failed exonerate alignments found, see files in %s" % os.path.join(tmpdir, 'failed')) else: if os.path.isdir(tmpdir): lib.SafeRemove(tmpdir)
def main(args):
    '''
    Command-line entry point for soft-masking a genome assembly.

    args is the list of command-line tokens handed to argparse. Depending on
    --method the genome is masked with tantan or with the
    RepeatModeler/RepeatMasker helpers; afterwards the masked FASTA given by
    --out is read back to log scaffold count, assembly size and the
    soft-masked fraction. Exits with status 1 if a supplied repeat library is
    not a valid file. Returns None.
    '''
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='funannotate-mask.py',
        description='''Wrapper for RepeatModeler/RepeatMasker''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input', required=True,
                        help='genome assembly FASTA format')
    parser.add_argument('-o', '--out', required=True,
                        help='Output softmasked FASTA file')
    parser.add_argument('--debug', action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('-m', '--method', default='tantan',
                        choices=['repeatmodeler', 'repeatmasker', 'tantan'],
                        help='Method to mask repeats with')
    parser.add_argument('-s', '--repeatmasker_species',
                        help='RepeatMasker species, will skip repeatmodeler')
    parser.add_argument(
        '-l', '--repeatmodeler_lib',
        help='Pre-computed RepeatModeler (or other) repetitive elements')
    parser.add_argument('--cpus', default=2, type=int,
                        help='Number of CPUs to use')
    args = parser.parse_args(args)

    # create log file for Repeats(capture stderr)
    log_name = 'funannotate-mask.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    # initialize script, log system info and cmd issue at runtime
    lib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    lib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    lib.SystemInfo()

    # get version of funannotate
    version = lib.get_version()
    lib.log.info("Running funanotate v{:}".format(version))

    repeats = None
    tmpdir = None
    if args.method == 'tantan':
        programs = ['tantan']
        lib.CheckDependencies(programs)
        lib.log.info('Soft-masking simple repeats with tantan')
        runTanTan(args.input, args.out)
    else:
        programs = ['RepeatMasker']
        if args.method == 'repeatmodeler':
            programs += ['BuildDatabase', 'RepeatModeler']
        lib.CheckDependencies(programs)

        # validate a user-supplied library before any scratch directory is
        # created, so a bad path does not leave an empty mask_* folder behind
        if args.repeatmodeler_lib and not lib.checkannotations(
                args.repeatmodeler_lib):
            lib.log.error(
                'ERROR: repeat library is not a valid file: {:}'.format(
                    args.repeatmodeler_lib))
            sys.exit(1)

        # create tmpdir
        pid = uuid.uuid4()
        tmpdir = 'mask_' + str(pid)
        os.makedirs(tmpdir)

        # parse options which dictates how repeatmodeler/masker are run
        if args.repeatmodeler_lib:
            # a repeat library was given, mask with it directly
            RepeatMask(args.input, args.repeatmodeler_lib, args.cpus, tmpdir,
                       args.out, log_name)
        elif args.repeatmasker_species:
            # species given, so skip repeatmodeler
            RepeatMaskSpecies(args.input, args.repeatmasker_species,
                              args.cpus, tmpdir, args.out, log_name)
        else:
            # no library and no species given, so run entire
            # repeatmodeler + repeatmasker
            repeats = 'repeatmodeler-library.' + str(pid) + '.fasta'
            RepeatModelMask(args.input, args.cpus, tmpdir, args.out, repeats,
                            args.repeatmasker_species, log_name)

    # output some stats on the fraction of the genome that was masked
    scaffolds = 0
    maskedSize = 0
    GenomeLength = 0
    with open(args.out, 'r') as input:
        for rec, Seq in SimpleFastaParser(input):
            scaffolds += 1
            GenomeLength += len(Seq)
            maskedSize += lib.n_lower_chars(Seq)

    # an empty output would otherwise raise ZeroDivisionError here
    if GenomeLength > 0:
        percentMask = maskedSize / float(GenomeLength)
    else:
        percentMask = 0.0
    lib.log.info(
        'Repeat soft-masking finished: \nMasked genome: {:}\nnum scaffolds: {:,}\nassembly size: {:,} bp\nmasked repeats: {:,} bp ({:.2f}%)'
        .format(os.path.abspath(args.out), scaffolds, GenomeLength,
                maskedSize, percentMask * 100))
    if repeats:
        lib.log.info('RepeatModeler library: {:}'.format(repeats))

    # clean up
    if not args.debug:
        if tmpdir:
            lib.SafeRemove(tmpdir)
    print("-------------------------------------------------------")
def main(args):
    '''
    Command-line entry point: run tbl2asn on a tbl annotation plus FASTA
    genome and write <output basename>.gbk.

    args is the list of command-line tokens handed to argparse. The inputs are
    copied into a '<basename>_tmp' working folder as genome.fsa / genome.tbl,
    tbl2asn is run there, the validation output is checked, and the resulting
    genome.gbf is copied out. The working folder is removed only when the
    error check reports fewer than one error. Exits with status 1 when either
    input file fails lib.checkannotations.
    '''
    # setup menu with argparse
    class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
        def __init__(self, prog):
            super(MyFormatter, self).__init__(prog, max_help_position=48)

    parser = argparse.ArgumentParser(
        prog='gbk2parts.py',
        description='''Script to convert GBK file to its components.''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--tbl', required=True,
                        help='Genome annotation in tbl format')
    parser.add_argument('-f', '--fasta', required=True,
                        help='Genome in FASTA format')
    parser.add_argument(
        '-s', '--species', required=True,
        help='Species name (e.g. "Aspergillus fumigatus") use quotes if there is a space')
    parser.add_argument('--isolate', help='Isolate name (e.g. Af293)')
    parser.add_argument('--strain', help='Strain name (e.g. CEA10)')
    parser.add_argument(
        '-t', '--tbl2asn',
        help='Custom parameters for tbl2asn, example: linkage and gap info')
    parser.add_argument('--sbt', help='tbl2asn template file')
    parser.add_argument('-o', '--output', help='Output basename')
    args = parser.parse_args(args)

    parentdir = os.path.dirname(lib.__file__)

    # organism comes from --species, falling back to the tbl file name
    organism = args.species or os.path.basename(args.tbl).split('.t')[0]

    # strain takes precedence over isolate when naming the output
    qualifier = args.strain or args.isolate
    if qualifier:
        organism_name = organism + '_' + qualifier
    else:
        organism_name = organism
    organism_name = organism_name.replace(' ', '_')
    outputname = args.output or organism_name

    # make tmp folder to run tbl2asn from
    tmp = outputname + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    # both inputs must be valid before they are moved into place
    required_inputs = (
        (args.fasta, 'FASTA genome file not found: {:}'),
        (args.tbl, 'TBL annotations file not found: {:}'),
    )
    for path, message in required_inputs:
        if not lib.checkannotations(path):
            print(message.format(path))
            sys.exit(1)
    shutil.copyfile(args.fasta, os.path.join(tmp, 'genome.fsa'))
    shutil.copyfile(args.tbl, os.path.join(tmp, 'genome.tbl'))

    # now we can run tbl2asn, using the bundled template unless one is given
    SBT = args.sbt or os.path.join(parentdir, 'config', 'test.sbt')
    discrep = outputname + '.discrepency.txt'
    version = 1
    runtbl2asn(tmp, SBT, discrep, organism, args.isolate, args.strain,
               args.tbl2asn, version)

    # check the output for errors for NCBI
    genbank_file = os.path.join(tmp, 'genome.gbf')
    final_fixes = os.path.join(tmp, 'models-need-fixing.txt')
    prefix = locustagGB(genbank_file)
    errors = ncbiCheckErrors(os.path.join(tmp, 'errorsummary.val'),
                             os.path.join(tmp, 'genome.val'), prefix,
                             final_fixes)

    # get output files
    gbkout = outputname + '.gbk'
    shutil.copyfile(genbank_file, gbkout)
    if errors < 1:
        lib.SafeRemove(tmp)
def runtbl2asn_parallel(folder, template, discrepency, organism, isolate,
                        strain, parameters, version, cpus):
    '''
    function to run NCBI tbl2asn in parallel and merge the results

    tbl2asn is run (through tbl2asn_safe_run) once for every sub-directory of
    folder, or on folder itself when it has no sub-directories, using a pool
    of cpus worker processes. Afterwards the per-directory outputs are merged
    into folder: errorsummary.val, *.val and *.gbf files are appended to
    folder/errorsummary.val, folder/genome.val and folder/genome.gbf, *.tbl
    and *.sqn files are copied into folder, and discrepency.report.txt files
    are appended to the discrepency path.

    template is passed to -t, version to -N, organism/isolate/strain build the
    -j metadata string, and parameters (space separated) is appended to the
    command. Exits with status 1 if folder is not a directory or organism is
    empty. Returns None.
    '''
    def same_path(first, second):
        # True when both names point at the same location on disk
        return os.path.abspath(first) == os.path.abspath(second)

    merged_val = os.path.join(folder, 'genome.val')
    merged_summary = os.path.join(folder, 'errorsummary.val')
    merged_gbf = os.path.join(folder, 'genome.gbf')

    # make sure output that will be appended to is not there
    for stale in [merged_val, merged_summary, merged_gbf, discrepency]:
        lib.SafeRemove(stale)
    # get funannotate version
    fun_version = lib.get_version()
    # input should be a folder
    if not os.path.isdir(folder):
        lib.log.error("tbl2asn error: %s is not a directory, exiting" % folder)
        sys.exit(1)
    # based on organism, isolate, strain, construct meta info for -j flag
    if not organism:
        lib.log.error("tbl2asn error: organism not specified")
        sys.exit(1)
    meta = "[organism=" + organism + "]"
    if isolate:
        meta = meta + " " + "[isolate=" + isolate + "]"
    if strain:
        meta = meta + " " + "[strain=" + strain + "]"
    cmd = [
        'tbl2asn', '-y', '"Annotated using ' + fun_version + '"', '-N',
        str(version), '-t', template, '-M', 'n', '-j', '"' + meta + '"',
        '-V', 'b', '-c', 'f', '-T', '-a', 'r10u'
    ]
    # check for custom parameters
    if parameters:
        cmd = cmd + parameters.split(' ')

    # check for folders in the input folder, if present, run tbl2asn on each
    # folder and then combine
    multiple = [
        os.path.join(folder, entry) for entry in os.listdir(folder)
        if os.path.isdir(os.path.join(folder, entry))
    ]
    if not multiple:
        multiple.append(folder)

    p = multiprocessing.Pool(cpus)
    # the async handles are collected but their return values are never read
    results = [p.apply_async(tbl2asn_safe_run, (cmd, i)) for i in multiple]
    p.close()
    p.join()

    # now collect the results and merge them in the main folder
    with open(merged_val, 'a') as validation, \
            open(discrepency, 'a') as discrep, \
            open(merged_summary, 'a') as summary, \
            open(merged_gbf, 'a') as genbank:
        for dirName, subdirList, fileList in os.walk(folder, topdown=False):
            # only leaf directories hold tbl2asn output to merge
            if len(subdirList) > 0:
                continue
            for f in fileList:
                source = os.path.join(dirName, f)
                if f == 'errorsummary.val':
                    sink, sink_path = summary, merged_summary
                elif f.endswith('.val'):
                    sink, sink_path = validation, merged_val
                elif f.endswith('.gbf'):
                    sink, sink_path = genbank, merged_gbf
                elif f.endswith('.tbl') or f.endswith('.sqn'):
                    destination = os.path.join(folder, f)
                    # when tbl2asn ran directly in folder the file is already
                    # in place; copying it onto itself raises SameFileError
                    if not same_path(source, destination):
                        shutil.copyfile(source, destination)
                    continue
                elif f == 'discrepency.report.txt':
                    sink, sink_path = discrep, discrepency
                else:
                    continue
                # never append a merged file onto itself (happens when folder
                # has no sub-directories and is walked as the only leaf)
                if same_path(source, sink_path):
                    continue
                with open(source) as infile:
                    sink.write(infile.read())
def split_tbl2asn(folder):
    '''
    function to chunk the genome and annotation files into parts if
    > 10,000 contigs to conform to NCBI recommendations and avoid the 2GB
    threshold of sequin files

    folder must contain genome.fsa and genome.tbl. When the genome has fewer
    than 10,000 sequences and is smaller than 100 Mb, both files are copied
    unchanged into folder/1. Otherwise the sequences are dealt out into
    numbered sub-folders (folder/1, folder/2, ...) as genome<N>.fsa, and each
    '>Feature' block of the tbl file is appended to the genome<N>.tbl of the
    sub-folder holding its sequence. Existing numbered sub-folders are
    removed first. Returns None.
    '''
    max_contigs = 10000
    max_bases = int(100e6)
    fasta_path = os.path.join(folder, 'genome.fsa')
    tbl_path = os.path.join(folder, 'genome.tbl')

    def fresh_subfolder(number):
        # (re)create folder/<number> empty and return its path
        target = os.path.join(folder, str(number))
        if os.path.isdir(target):
            lib.SafeRemove(target)
        os.makedirs(target)
        return target

    numSeqs = 0
    genomeSize = 0
    with open(fasta_path, 'r') as fastain:
        for Header, Seq in SimpleFastaParser(fastain):
            numSeqs += 1
            genomeSize += len(Seq)

    # if less than 10,000 contigs and less than 100 MB, then don't split and
    # just run it
    if numSeqs < max_contigs and genomeSize < max_bases:
        # move to subfolder for multiprocessing to work correctly
        single = fresh_subfolder(1)
        shutil.copyfile(fasta_path, os.path.join(single, 'genome.fsa'))
        shutil.copyfile(tbl_path, os.path.join(single, 'genome.tbl'))
        return

    # rounded_up = -(-numerator // denominator) #nice trick to round up
    # Honour BOTH limits: a genome that is over 100 MB and also has more than
    # 10,000 contigs needs enough chunks to satisfy the stricter of the two.
    size_chunks = -(-genomeSize // max_bases)  # split into 100 MB chunks
    count_chunks = -(-numSeqs // max_contigs)
    chunks = max(size_chunks, count_chunks)

    with open(fasta_path, 'r') as fastain:
        Records = list(SimpleFastaParser(fastain))
    # sort the fasta tuples by size
    Records.sort(key=lambda x: len(x[1]), reverse=True)
    # shuffle them into lists like dealing playing cards then all chunks have
    # similar sizes
    sliced_records = list_slice(Records, chunks)

    # loop through and add headers to dictionary for tbl splitting lookup
    headers = {}
    for number, chunk in enumerate(sliced_records, 1):
        subfolder = fresh_subfolder(number)
        chunk_fasta = os.path.join(subfolder, 'genome' + str(number) + '.fsa')
        with open(chunk_fasta, 'w') as outfile:
            for title, sequence in chunk:
                outfile.write('>{:}\n{:}\n'.format(title, sequence))
                headers[title] = number

    # now parse tbl file and split in same way as fasta files
    with open(tbl_path, 'r') as tblin:
        for contig in lib.readBlocks(tblin, '>Feature'):
            # the sequence ID is the last space-separated token of the first
            # line of the block
            ID = contig[0].split(' ')[-1].rstrip()
            filenum = headers.get(ID)
            if not filenum:
                # block does not belong to any sequence that was written out
                continue
            chunk_tbl = os.path.join(folder, str(filenum),
                                     'genome' + str(filenum) + '.tbl')
            with open(chunk_tbl, 'a') as tblout:
                tblout.write(''.join(contig))