else: single_norm = None #setup reads and check if normalization worked norm_reads = (left_norm, right_norm, single_norm) lib.log.debug(norm_reads) for read in norm_reads: if read: if not os.path.isfile(read): lib.log.error("Read normalization failed, %s does not exist." % read) sys.exit(1) #now run Trinity with trimmomatic and read normalization trinity_transcripts = os.path.join(tmpdir, 'trinity.fasta') if not lib.checkannotations(trinity_transcripts): if args.trinity: shutil.copyfile(os.path.abspath(args.trinity), trinity_transcripts) else: #run trinity genome guided runTrinityGG(genome, norm_reads, trinity_transcripts) else: lib.log.info( "Existing Trinity results found: {:}".format(trinity_transcripts)) #clip polyA tails #polyAclip(trinity_tmp, trinity_transcripts) #run SeqClean to clip polyA tails and remove low quality seqs. cleanTranscripts = os.path.join(tmpdir, 'trinity.fasta.clean') runSeqClean(trinity_transcripts, tmpdir)
def runExonerate(input):
    """Run exonerate (protein2genome) for one protein / scaffold-region pair.

    Args:
        input: ':::'-delimited string whose first four fields are the protein
            ID, the scaffold ID, and the integer start and end of the hit on
            that scaffold. (The name shadows the builtin; it is kept so
            existing callers are unaffected.)

    Side effects:
        Uses ``tmpdir``, ``protein_dict`` and ``args`` from the enclosing
        scope. Writes the protein and the padded scaffold region as FASTA
        files in ``tmpdir``, runs ``exonerate`` on them and stores its stdout
        in ``tmpdir/exonerate.<ProtID>.<ScaffID>__<start>__.out``. The input
        FASTA files are removed afterwards, or moved to ``tmpdir/failed`` when
        exonerate prints a WARNING on stderr. Output smaller than 500 bytes is
        deleted.

    Returns:
        None.
    """
    fields = input.split(':::')
    ProtID = fields[0]
    ScaffID = fields[1]
    ScaffStart = int(fields[2])
    ScaffEnd = int(fields[3])

    #get the protein model; the PID keeps parallel workers from colliding
    query = os.path.join(tmpdir, ProtID + '.' + str(os.getpid()) + '.fa')
    with open(query, 'w') as output:
        SeqIO.write(protein_dict[ProtID], output, 'fasta')

    #now get the genome region, use different variable names for SeqRecords to avoid collision
    scaffold = os.path.join(
        tmpdir,
        ScaffID + '.' + ProtID + '.' + str(ScaffStart) + '-' + str(ScaffEnd) + '.fa')
    #pre-bind so an empty scaffold file is detected instead of raising NameError below
    header = Sequence = start = end = None
    #plain 'r': the 'U' flag is rejected by Python 3.11+, and text mode on
    #Python 3 already applies universal newlines
    with open(scaffold, 'w') as output2:
        with open(os.path.join(tmpdir, 'scaffolds', ScaffID + '.fa'), 'r') as fullscaff:
            for header, Sequence in SimpleFastaParser(fullscaff):
                #grab a 3 kb cushion on either side of hit region, careful of scaffold ends
                start = max(ScaffStart - 3000, 1)
                end = min(ScaffEnd + 3000, len(Sequence))
                #NOTE(review): with start clamped to 1 this slice omits index 0;
                #left unchanged because start is embedded in the output name and
                #is presumably used downstream as a coordinate offset - confirm
                output2.write('>%s\n%s\n' % (header, Sequence[start:end]))

    if header is None:
        #no record was read from the scaffold file, nothing to align
        lib.log.debug('Error in query or scaffold:{:}'.format(input))
        lib.SafeRemove(query)
        lib.SafeRemove(scaffold)
        return

    exoname = ProtID + '.' + ScaffID + '__' + str(start) + '__'
    exonerate_out = os.path.join(tmpdir, 'exonerate.' + exoname + '.out')
    ryo = "AveragePercentIdentity: %pi\n"
    cmd = [
        'exonerate', '--model', 'p2g', '--showvulgar', 'no',
        '--showalignment', 'no', '--showquerygff', 'no',
        '--showtargetgff', 'yes', '--maxintron', str(args.maxintron),
        '--percent', '80', '--ryo', ryo, query, scaffold
    ]

    #check that input files are created and valid
    if not (lib.checkannotations(query) and lib.checkannotations(scaffold)):
        lib.log.debug('Error in query or scaffold:{:}'.format(input))
        lib.SafeRemove(query)
        lib.SafeRemove(scaffold)
        return

    #run exonerate, capture errors
    with open(exonerate_out, 'w') as output3:
        proc = subprocess.Popen(cmd, stdout=output3, stderr=subprocess.PIPE)
        _, errors = proc.communicate()

    #communicate() returns bytes on Python 3; a bytes literal compares
    #correctly there and is a plain str on Python 2
    if b'WARNING' in errors:
        lib.log.debug('Error in input:{:}'.format(input))
        lib.log.debug(
            '%s, Len=%i, %i-%i; %i-%i' %
            (header, len(Sequence), ScaffStart, ScaffEnd, start, end))
        #keep the offending inputs for inspection
        os.rename(query, os.path.join(tmpdir, 'failed', os.path.basename(query)))
        os.rename(scaffold, os.path.join(tmpdir, 'failed', os.path.basename(scaffold)))
    else:
        for y in [query, scaffold]:
            try:
                lib.SafeRemove(y)
            except OSError:
                lib.log.debug("Error removing %s" % (y))

    #check filesize of exonerate output, no hits still have some output data in them,
    #should be safe dropping anything smaller than 500 bytes
    if lib.getSize(exonerate_out) < 500:
        os.remove(exonerate_out)
#make sure logfiles directory is present, will need later
logfiles_dir = os.path.join(outputdir, 'logfiles')
if not os.path.isdir(logfiles_dir):
    os.makedirs(logfiles_dir)

#get absolute path for all input so there are no problems later, not using Transcripts yet could be error? so take out here
Proteins = os.path.abspath(Proteins)
genbank = os.path.abspath(genbank)

if any(method in args.methods for method in ('phobius', 'all')):
    #run Phobius to predict secreted proteins and membrane
    #(original note: default is local if installed, otherwise remote)
    phobius_out = os.path.join(outputdir, 'annotate_misc', 'phobius.results.txt')
    phobiusLog = os.path.join(outputdir, 'logfiles', 'phobius.log')
    lib.log.info(
        "Predicting secreted and transmembrane proteins using Phobius")
    #only run when no previous result file is found
    if not lib.checkannotations(phobius_out):
        phobius_cmd = [
            os.path.join(parentdir, 'util', 'phobius-multiproc.py'),
            '-i', Proteins,
            '-o', phobius_out,
        ]
        #the e-mail option is only passed when one was supplied
        if args.email:
            phobius_cmd.extend(['-e', str(args.email)])
        phobius_cmd.extend(['-l', phobiusLog])
        subprocess.call(phobius_cmd)

if any(method in args.methods for method in ('interproscan', 'all')):
    IPRCombined = os.path.join(outputdir, 'annotate_misc', 'iprscan.xml')
    #run interpro scan
else: organism = args.species if not args.isolate: isolate = '???' else: isolate = args.isolate ############################################################################ #start workflow here ProtCount = lib.countfasta(Proteins) lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded') #run PFAM-A search lib.log.info("Running HMMer search of PFAM domains") pfam_results = os.path.join(outputdir, 'annotate_misc', 'annotations.pfam.txt') if not lib.checkannotations(pfam_results): lib.PFAMsearch(Proteins, args.cpus, 1e-50, os.path.join(outputdir, 'annotate_misc'), pfam_results) num_annotations = lib.line_count(pfam_results) lib.log.info('{0:,}'.format(num_annotations) + ' annotations added') #run SwissProt Blast search lib.log.info("Running Blastp search of UniProt DB") blast_out = os.path.join(outputdir, 'annotate_misc', 'annotations.swissprot.txt') if not lib.checkannotations(blast_out): lib.SwissProtBlast(Proteins, args.cpus, 1e-5, os.path.join(outputdir, 'annotate_misc'), blast_out) num_annotations = lib.line_count(blast_out) lib.log.info('{0:,}'.format(num_annotations) + ' annotations added') #run MEROPS Blast search lib.log.info("Running Blastp search of MEROPS protease DB")
else: organism_name = organism organism_name = organism_name.replace(' ', '_') if args.output: outputname = args.output else: outputname = organism_name #create tmp folder to run tbl2asn from #make tmp folder tmp = outputname + '_tmp' if not os.path.exists(tmp): os.makedirs(tmp) #now move files into proper location if not lib.checkannotations(args.fasta): print('FASTA genome file not found: {:}'.format(args.fasta)) sys.exit(1) if not lib.checkannotations(args.tbl): print('TBL annotations file not found: {:}'.format(args.tbl)) sys.exit(1) shutil.copyfile(args.fasta, os.path.join(tmp, 'genome.fsa')) shutil.copyfile(args.tbl, os.path.join(tmp, 'genome.tbl')) #now we can run tbl2asn if args.sbt: SBT = args.sbt else: SBT = os.path.join(parentdir, 'lib', 'test.sbt') discrep = outputname + '.discrepency.txt' version = 1
#never use more workers than there are chunks to process
num = min(args.cpus, len(scaffolds))
lib.log.debug("Running Augustus on %i chunks, using %i CPUs" %
              (len(scaffolds), num))
lib.runMultiProgress(runAugustus, scaffolds, num)
lib.log.debug("Augustus prediction is finished, now concatenating results")

#concatenate the per-chunk predictions in the order of `scaffolds`
augustus_all = os.path.join(tmpdir, 'augustus_all.gff3')
with open(augustus_all, 'w') as output:
    for chunk in scaffolds:
        chunk_gff = os.path.join(tmpdir, chunk + '.augustus.gff3')
        with open(chunk_gff) as chunk_handle:
            output.write(chunk_handle.read())

if lib.checkannotations(augustus_all):
    lib.log.debug('Augustus finished, now joining results')
    #prefer join_aug_pred.pl from PATH, fall back to the copy under AUGUSTUS_BASE
    if lib.which_path('join_aug_pred.pl'):
        join_script = 'join_aug_pred.pl'
    else:
        join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl')
    #this string is only logged; the redirection itself is done with file handles below
    cmd = '{:} < {:} > {:}'.format(join_script, augustus_all, args.out)
    lib.log.debug(cmd)
    #plain 'r': the 'U' flag is rejected by Python 3.11+
    with open(args.out, 'w') as finalout:
        with open(augustus_all, 'r') as infile:
            retcode = subprocess.call([join_script], stdin=infile, stdout=finalout)
    if retcode != 0:
        #surface a failed join instead of silently dropping the exit status
        lib.log.error('{:} exited with status {:}'.format(join_script, retcode))
#append every file in go_folder except the associations file to the open `pop` handle
#NOTE(review): if `pop` is itself a file inside go_folder it is also matched here - confirm
for entry in os.listdir(go_folder):
    if not entry.startswith('associations'):
        with open(os.path.join(go_folder, entry)) as handle:
            pop.write(handle.read())

#now loop through each genome comparing to population
for f in os.listdir(go_folder):
    if f.startswith(('associations', 'population')):
        continue
    file = os.path.join(go_folder, f)
    base = f.replace('.txt', '')
    goa_out = os.path.join(args.out, 'go_enrichment', base + '.go.enrichment.txt')
    #skip genomes that already have an enrichment result
    if not lib.checkannotations(goa_out):
        cmd = ['find_enrichment.py',
               '--obo', os.path.join(parentdir, 'DB', 'go.obo'),
               '--pval', '0.001',
               '--alpha', '0.001',
               '--method', 'fdr',
               file,
               os.path.join(go_folder, 'population.txt'),
               os.path.join(go_folder, 'associations.txt')]
        lib.runSubprocess2(cmd, '.', lib.log, goa_out)

#load into pandas and write to html
with open(os.path.join(args.out, 'go.html'), 'w') as output:
    #NOTE(review): newer pandas deprecates -1 here in favour of None; left
    #unchanged because the installed pandas version is not visible from here
    pd.set_option('display.max_colwidth', -1)
    pd.options.mode.chained_assignment = None  #turn off warning
    output.write(lib.HEADER)
    output.write(lib.GO)
    for f in os.listdir(os.path.join(args.out, 'go_enrichment')):
        if f.endswith('go.enrichment.txt'):
            file = os.path.join(args.out, 'go_enrichment', f)
            base = os.path.basename(file)
            #strip the suffix the files were written with above ('.go.enrichment.txt');
            #the previous '.go_enrichment.txt' never matched, so name kept the suffix
            name = base.split('.go.enrichment.txt')[0]
            #check goatools output, return is a tuple with True/False and header line #
#fetch the requested BUSCO lineage if it is not already in the local DB folder
BUSCODB = os.path.join(parentdir, 'DB', args.busco_db)
if not os.path.isdir(BUSCODB):
    lib.download_buscos(args.busco_db)

ProtCount = lib.countfasta(args.input)
lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')

#convert to proteins and screen with busco
lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db)
BUSCO = os.path.join(parentdir, 'util', 'funannotate-BUSCO2.py')
cmd = [sys.executable, BUSCO,
       '-i', os.path.abspath(args.input),
       '-m', 'proteins',
       '--lineage', BUSCODB,
       '-o', species,
       '--cpu', str(args.cpus),
       '-f']
lib.runSubprocess(cmd, '.', lib.log)

#check that it ran correctly
busco_results = os.path.join('run_' + species, 'full_table_' + species + '.tsv')
if not lib.checkannotations(busco_results):
    lib.log.error("BUSCO failed, check logfile")
    sys.exit(1)

#map column 3 -> column 1 for every 'Complete' row, dropping any column-3 ID
#that occurs more than once
#(presumably column 3 is the sequence ID and column 1 the BUSCO ID - confirm
#against the BUSCO full_table format)
nameChange = {}
#IDs already rejected as duplicates; without this a third occurrence would be
#re-added after the second one deleted the entry
duplicated = set()
#plain 'r': the 'U' flag is rejected by Python 3.11+
with open(busco_results, 'r') as busco_table:
    for line in busco_table:
        if line.startswith('#'):
            continue
        cols = line.split('\t')
        #tolerate blank or short rows instead of raising IndexError
        if len(cols) < 2 or cols[1] != 'Complete':
            continue
        seq_id = cols[2]
        if seq_id in nameChange or seq_id in duplicated:
            lib.log.error("Duplicate ID found: %s %s. Removing from results" %
                          (seq_id, cols[0]))
            nameChange.pop(seq_id, None)
            duplicated.add(seq_id)
        else:
            nameChange[seq_id] = cols[0]
strain = '???' else: strain = args.strain ############################################################################ #start workflow here ProtCount = lib.countfasta(Proteins) lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded') if ProtCount < 1: lib.log.error("There are no gene models in this genbank file") sys.exit(1) #run PFAM-A search lib.log.info("Running HMMer search of PFAM domains") pfam_results = os.path.join(outputdir, 'annotate_misc', 'annotations.pfam.txt') if not lib.checkannotations(pfam_results): lib.PFAMsearch(Proteins, args.cpus, 1e-50, os.path.join(outputdir, 'annotate_misc'), pfam_results) num_annotations = lib.line_count(pfam_results) lib.log.info('{0:,}'.format(num_annotations) + ' annotations added') #run SwissProt Blast search lib.log.info("Running Blastp search of UniProt DB") blast_out = os.path.join(outputdir, 'annotate_misc', 'annotations.swissprot.txt') if not lib.checkannotations(blast_out): lib.SwissProtBlast(Proteins, args.cpus, 1e-5, os.path.join(outputdir, 'annotate_misc'), blast_out) num_annotations = lib.line_count(blast_out) lib.log.info('{0:,}'.format(num_annotations) + ' annotations added') #run MEROPS Blast search lib.log.info("Running Blastp search of MEROPS protease DB") blast_out = os.path.join(outputdir, 'annotate_misc', 'annotations.merops.txt')
#create tmpdir, named after the PID of this process
pid = os.getpid()
tmpdir = 'mask_' + str(pid)
os.makedirs(tmpdir)
repeats = None

#parse options which dictates how repeatmodeler/masker are run
if args.repeatmodeler_lib:
    #a repeat library was given, it must be a valid file
    if not lib.checkannotations(args.repeatmodeler_lib):
        lib.log.error('ERROR: repeat library is not a valid file: {:}'.format(
            args.repeatmodeler_lib))
        sys.exit(1)
    lib.RepeatMask(args.input, args.repeatmodeler_lib, args.cpus, tmpdir,
                   args.out, log_name)
elif args.repeatmasker_species:
    #no fasta file given, but a species was
    lib.RepeatMaskSpecies(args.input, args.repeatmasker_species, args.cpus,
                          tmpdir, args.out, log_name)
else:
    #no fasta file and no species given, so run entire repeatmodeler + repeat masker
    repeats = 'repeatmodeler-library.' + str(pid) + '.fasta'
    lib.RepeatModelMask(args.input, args.cpus, tmpdir, args.out, repeats,
                        log_name)

#output some stats on %reads masked.
scaffolds = 0
maskedSize = 0
GenomeLength = 0
#plain 'r': the 'U' flag is rejected by Python 3.11+; the handle is no longer
#called `input` so the builtin stays usable
with open(args.out, 'r') as masked_fasta:
    for rec, Seq in SimpleFastaParser(masked_fasta):
        scaffolds += 1
        GenomeLength += len(Seq)
else: organism = args.species if not args.isolate: isolate = '???' else: isolate = args.isolate ############################################################################ #start workflow here ProtCount = lib.countfasta(Proteins) lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded') #run PFAM-A search lib.log.info("Running HMMer search of PFAM domains") pfam_results = os.path.join(outputdir, 'annotate_misc', 'annotations.pfam.txt') if not lib.checkannotations(pfam_results): lib.PFAMsearch(Proteins, args.cpus, 1e-50, os.path.join(outputdir, 'annotate_misc'), pfam_results) num_annotations = lib.line_count(pfam_results) lib.log.info('{0:,}'.format(num_annotations) + ' annotations added') #run SwissProt Blast search lib.log.info("Running Blastp search of UniProt DB") blast_out = os.path.join(outputdir, 'annotate_misc', 'annotations.swissprot.txt') if not lib.checkannotations(blast_out): lib.SwissProtBlast(Proteins, args.cpus, 1e-5, os.path.join(outputdir, 'annotate_misc'), blast_out) num_annotations = lib.line_count(blast_out) lib.log.info('{0:,}'.format(num_annotations) + ' annotations added') #run MEROPS Blast search lib.log.info("Running Blastp search of MEROPS protease DB") blast_out = os.path.join(outputdir, 'annotate_misc', 'annotations.merops.txt') if not lib.checkannotations(blast_out): lib.MEROPSBlast(Proteins, args.cpus, 1e-5, os.path.join(outputdir, 'annotate_misc'), blast_out)
single_norm = None #setup reads and check if normalization worked norm_reads = (left_norm, right_norm, single_norm) lib.log.debug(norm_reads) for read in norm_reads: if read: if not os.path.isfile(read): lib.log.error("Read normalization failed, %s does not exist." % read) sys.exit(1) #now run Trinity with trimmomatic and read normalization trinity_transcripts = os.path.join(tmpdir, 'trinity.fasta') trinity_tmp = os.path.join(tmpdir, 'trinity.tmp') if not lib.checkannotations(trinity_tmp): if args.trinity: shutil.copyfile(os.path.abspath(args.trinity), trinity_tmp) else: #run trinity genome guided runTrinityGG(genome, norm_reads, trinity_tmp) else: lib.log.info("Existing Trinity results found {:}".format(trinity_tmp)) #clip polyA tails polyAclip(trinity_tmp, trinity_transcripts) if not lib.checkannotations(trinity_tmp): lib.log.error("TRINITY step failed, check logfile, exiting") sys.exit(1)