def runTrimmomaticSE(reads):
    '''
    Adapter- and quality-trim single-end reads with the Trimmomatic jar that
    ships in the Trinity plugins directory.

    reads: path to the single-end FASTQ file passed to Trimmomatic.

    Output is written to <tmpdir>/trimmomatic/trimmed_single.fastq and then
    handed to Fzip_inplace (presumably compressed in place, given the .gz
    path that is returned -- confirm against Fzip_inplace).

    Returns the path <tmpdir>/trimmomatic/trimmed_single.fastq.gz
    '''
    workdir = os.path.join(tmpdir, 'trimmomatic')
    if not os.path.isdir(workdir):
        os.makedirs(workdir)
    lib.log.info("Adapter and Quality trimming SE reads with Trimmomatic")

    trimmed_fastq = os.path.join(workdir, 'trimmed_single.fastq')
    trimmomatic_home = os.path.join(TRINITY, 'trinity-plugins',
                                    'Trimmomatic-0.36')
    jar_path = os.path.join(trimmomatic_home, 'trimmomatic.jar')
    adapter_fasta = os.path.join(trimmomatic_home, 'adapters',
                                 'TruSeq3-SE.fa')

    cmd = ['java', '-jar', jar_path, 'SE']
    cmd += ['-threads', str(args.cpus), '-phred33']
    cmd += [reads, trimmed_fastq]
    # trimming steps are applied by Trimmomatic in the order listed
    cmd += [
        'ILLUMINACLIP:' + adapter_fasta + ':2:30:10',
        'SLIDINGWINDOW:4:5',
        'LEADING:5',
        'TRAILING:5',
        'MINLEN:25',
    ]
    lib.runSubprocess(cmd, '.', lib.log)
    Fzip_inplace(trimmed_fastq, args.cpus)
    return os.path.join(workdir, 'trimmed_single.fastq.gz')
def runTrimmomaticPE(left, right):
    '''
    Adapter- and quality-trim paired-end reads with the Trimmomatic jar that
    ships in the Trinity plugins directory.

    left, right: paths to the forward and reverse FASTQ files.

    Four FASTQ files (paired and unpaired for each mate) are written to
    <tmpdir>/trimmomatic and each is handed to Fzip_inplace (presumably
    compressed in place, given the .gz paths returned -- confirm).

    Returns a tuple (trimmed_left.fastq.gz, trimmed_right.fastq.gz) of paths
    inside <tmpdir>/trimmomatic; the unpaired files are not returned.
    '''
    workdir = os.path.join(tmpdir, 'trimmomatic')
    if not os.path.isdir(workdir):
        os.makedirs(workdir)
    lib.log.info("Adapter and Quality trimming PE reads with Trimmomatic")

    # order matters: left paired, left unpaired, right paired, right unpaired
    output_names = (
        'trimmed_left.fastq',
        'trimmed_left.unpaired.fastq',
        'trimmed_right.fastq',
        'trimmed_right.unpaired.fastq',
    )
    output_paths = [os.path.join(workdir, name) for name in output_names]

    trimmomatic_home = os.path.join(TRINITY, 'trinity-plugins',
                                    'Trimmomatic-0.36')
    jar_path = os.path.join(trimmomatic_home, 'trimmomatic.jar')
    adapter_fasta = os.path.join(trimmomatic_home, 'adapters',
                                 'TruSeq3-PE.fa')

    cmd = ['java', '-jar', jar_path, 'PE']
    cmd += ['-threads', str(args.cpus), '-phred33']
    cmd += [left, right]
    cmd += output_paths
    cmd += [
        'ILLUMINACLIP:' + adapter_fasta + ':2:30:10',
        'SLIDINGWINDOW:4:5',
        'LEADING:5',
        'TRAILING:5',
        'MINLEN:25',
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    for fastq in output_paths:
        Fzip_inplace(fastq, args.cpus)

    gz_left = os.path.join(workdir, 'trimmed_left.fastq.gz')
    gz_right = os.path.join(workdir, 'trimmed_right.fastq.gz')
    return gz_left, gz_right
def pfamDB(info, force=False):
    '''
    Make sure the Pfam-A HMM database exists under FUNDB and report its
    version.

    info:  mapping of database name -> metadata tuple; the 'pfam' entry is
           (re)written when a download happens and is read back for logging.
    force: when True, download and rebuild even if Pfam-A.hmm already exists.

    A download is triggered when Pfam-A.hmm is missing, when force is set, or
    when args.update is set and check4newDB('pfam-log', info) returns a true
    value. The info['pfam'] tuple is
    ('hmmer3', hmm path, version, date, number of families, checksum), where
    the checksum is calcmd5() of the still-compressed version file.

    Returns None. If no download happens and info has no 'pfam' entry, the
    final unpacking of info.get('pfam') fails.
    '''
    hmm = os.path.join(FUNDB, 'Pfam-A.hmm')
    familyinfo = os.path.join(FUNDB, 'Pfam-A.clans.tsv')
    versionfile = os.path.join(FUNDB, 'Pfam.version')

    if os.path.isfile(hmm) and args.update and not force:
        if check4newDB('pfam-log', info):
            force = True

    if not os.path.isfile(hmm) or force:
        lib.log.info('Downloading Pfam database')
        download(lib.DBURL.get('pfam'), hmm + '.gz')
        subprocess.call(['gunzip', '-f', 'Pfam-A.hmm.gz'], cwd=FUNDB)
        download(lib.DBURL.get('pfam-tsv'), familyinfo + '.gz')
        subprocess.call(['gunzip', '-f', 'Pfam-A.clans.tsv.gz'], cwd=FUNDB)
        download(lib.DBURL.get('pfam-log'), versionfile + '.gz')
        # checksum must be taken before gunzip replaces the .gz file
        md5 = calcmd5(versionfile + '.gz')
        subprocess.call(['gunzip', '-f', 'Pfam.version.gz'], cwd=FUNDB)

        num_records = 0
        pfamdate = ''
        pfamvers = ''
        # plain 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+
        with open(versionfile, 'r') as version_handle:
            for line in version_handle:
                value = line.split(': ')[-1].rstrip()
                if line.startswith('Pfam release'):
                    pfamvers = value
                if line.startswith('Pfam-A families'):
                    num_records = int(value)
                if line.startswith('Date'):
                    pfamdate = value

        lib.log.info('Creating Pfam HMM database')
        # -f lets hmmpress overwrite index files left by a previous build,
        # which is always the situation on a forced/updated download
        cmd = ['hmmpress', '-f', 'Pfam-A.hmm']
        lib.runSubprocess(cmd, FUNDB, lib.log)
        info['pfam'] = ('hmmer3', hmm, pfamvers, pfamdate, num_records, md5)

    _dbtype, _path, version, date, records, _checksum = info.get('pfam')
    lib.log.info('Pfam Database: version={:} date={:} records={:,}'.format(
        version, date, records))
def dbCANDB(info, force=False):
    '''
    Make sure the dbCAN HMM database exists under FUNDB and report its
    version.

    info:  mapping of database name -> metadata tuple; the 'dbCAN' entry is
           (re)written when a download happens and is read back for logging.
    force: when True, download and rebuild even if dbCAN.hmm already exists.

    A download is triggered when dbCAN.hmm is missing, when force is set, or
    when args.update is set and check4newDB('dbCAN', info) returns a true
    value. The raw HMM file is downloaded to dbCAN.tmp, copied to dbCAN.hmm
    with the '.hmm' suffix stripped from every NAME line, and removed again
    once the database has been pressed. Version and date come from the first
    two lines of the changelog; the date is expected as MM/DD/YYYY.

    Returns None. If no download happens and info has no 'dbCAN' entry, the
    final unpacking of info.get('dbCAN') fails.
    '''
    hmm = os.path.join(FUNDB, 'dbCAN.hmm')
    familyinfo = os.path.join(FUNDB, 'dbCAN-fam-HMMs.txt')
    versionfile = os.path.join(FUNDB, 'dbCAN.changelog.txt')
    rawhmm = os.path.join(FUNDB, 'dbCAN.tmp')

    if os.path.isfile(hmm) and args.update and not force:
        if check4newDB('dbCAN', info):
            force = True

    if not os.path.isfile(hmm) or force:
        lib.log.info('Downloading dbCAN database')
        download(lib.DBURL.get('dbCAN'), rawhmm)
        md5 = calcmd5(rawhmm)
        download(lib.DBURL.get('dbCAN-tsv'), familyinfo)
        download(lib.DBURL.get('dbCAN-log'), versionfile)

        num_records = 0
        # plain 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+
        with open(hmm, 'w') as out:
            with open(rawhmm, 'r') as raw_handle:
                for line in raw_handle:
                    if line.startswith('NAME'):
                        num_records += 1
                        line = line.replace('.hmm\n', '\n')
                    out.write(line)

        with open(versionfile, 'r') as infile:
            # range() rather than xrange() so this also runs on Python 3
            head = [next(infile) for _ in range(2)]
        dbdate = head[1].replace('# ', '').rstrip()
        dbvers = head[0].split(' ')[-1].rstrip()
        dbdate = datetime.datetime.strptime(
            dbdate, "%m/%d/%Y").strftime("%Y-%m-%d")

        lib.log.info('Creating dbCAN HMM database')
        # -f lets hmmpress overwrite index files left by a previous build,
        # which is always the situation on a forced/updated download
        cmd = ['hmmpress', '-f', 'dbCAN.hmm']
        lib.runSubprocess(cmd, FUNDB, lib.log)
        info['dbCAN'] = ('hmmer3', hmm, dbvers, dbdate, num_records, md5)
        os.remove(rawhmm)

    _dbtype, _path, version, date, records, _checksum = info.get('dbCAN')
    lib.log.info('dbCAN Database: version={:} date={:} records={:,}'.format(
        version, date, records))
def runtblastn(input, query, cpus, output, maxhits):
    '''
    Build a dust-masked nucleotide BLAST database from a genome and search it
    with tblastn.

    input:   genome FASTA given to dustmasker and makeblastdb.
    query:   protein FASTA used as the tblastn query.
    cpus:    value for tblastn -num_threads.
    output:  passed as the second argument of lib.runSubprocess for all three
             commands (presumably the working directory, since every file the
             commands create is named relatively -- confirm).
    maxhits: value for tblastn -max_target_seqs.

    Files named by the commands: genome_dust.asnb, the 'genome' BLAST
    database and filter.tblastn.tab. Returns None.
    '''
    mask_file = 'genome_dust.asnb'
    db_name = 'genome'

    # step 1: low-complexity mask information in binary ASN.1 form
    dust_cmd = ['dustmasker', '-in', input, '-infmt', 'fasta']
    dust_cmd += ['-parse_seqids', '-outfmt', 'maskinfo_asn1_bin']
    dust_cmd += ['-out', mask_file]
    lib.runSubprocess(dust_cmd, output, lib.log)

    # step 2: nucleotide database carrying the mask data
    makedb_cmd = ['makeblastdb', '-in', input, '-dbtype', 'nucl']
    makedb_cmd += ['-parse_seqids', '-mask_data', mask_file]
    makedb_cmd += ['-out', db_name]
    lib.runSubprocess(makedb_cmd, output, lib.log)

    # step 3: protein-vs-translated-genome search, soft-masked, tabular output
    search_cmd = ['tblastn', '-num_threads', str(cpus)]
    search_cmd += ['-db', db_name, '-query', query]
    search_cmd += ['-max_target_seqs', str(maxhits)]
    search_cmd += ['-db_soft_mask', '11', '-threshold', '999']
    search_cmd += ['-max_intron_length', str(args.maxintron)]
    search_cmd += ['-evalue', '1e-10', '-outfmt', '6']
    search_cmd += ['-out', 'filter.tblastn.tab']
    lib.runSubprocess(search_cmd, output, lib.log)
logfile = input + '.log' with open(logfile, 'w') as output: subprocess.call([perl, Execute, input], stdout=output, stderr=output) def safe_run(*args, **kwargs): """Call run(), catch exceptions.""" try: worker(*args, **kwargs) except Exception as e: print("error: %s run(*%r, **%r)" % (e, args, kwargs)) #split partitions lib.log.info("Setting up EVM partitions") lib.runSubprocess(cmd1, tmpdir, lib.log) #subprocess.call(cmd1, cwd = tmpdir, stdout = FNULL, stderr = FNULL) #check output lib.checkinputs(os.path.join(tmpdir, 'partitions_list.out')) #generate commands lib.log.info("Generating EVM command list") commands = os.path.join(tmpdir, 'commands.list') with open(commands, 'w') as output: subprocess.call(cmd2, cwd=tmpdir, stdout=output, stderr=FNULL) #count total lines num_lines = sum(1 for line in open(commands)) #strange thing happens if you try to run with more cpus than commands if num_lines < cpus: x = num_lines
def runKallisto(input, fasta, readTuple, stranded, cpus, output):
    '''
    Quantify PASA gene models with Kallisto and write a per-transcript TPM
    table that can be used to pick the best model at each locus.

    input:     GFF3 from PASA; also scanned for '#PROT' lines.
    fasta:     genome FASTA handed to gff3_file_to_proteins.pl.
    readTuple: (left, right, single) read paths; falsy entries mean absent.
               Single-end mode is used only when the single entry is set and
               both paired entries are not; paired mode needs both mates. If
               neither condition holds kallisto is run without read files.
    stranded:  'RF' or 'FR' add the matching kallisto strand flag (paired mode
               only); any other value adds nothing.
    cpus:      value for kallisto quant -t.
    output:    path of the TSV written, columns
               mRNA-ID, gene-ID, Location, TPM.

    Intermediate files go to <tmpdir>/getBestModel. Returns None.
    '''
    lib.log.info(
        "Using Kallisto TPM data to determine which PASA gene models to select at each locus"
    )
    folder = os.path.join(tmpdir, 'getBestModel')
    if not os.path.exists(folder):
        os.makedirs(folder)

    # convert GFF3 to transcript sequences
    PASAtranscripts = os.path.join(folder, 'transcripts.fa')
    cmd = [
        os.path.join(PASA, 'misc_utilities', 'gff3_file_to_proteins.pl'),
        input, fasta, 'cDNA'
    ]
    lib.runSubprocess2(cmd, '.', lib.log, PASAtranscripts)

    # generate kallisto index
    lib.log.info("Building Kallisto index")
    index_path = os.path.join(folder, 'bestModel')
    cmd = ['kallisto', 'index', '-i', index_path, PASAtranscripts]
    lib.runSubprocess(cmd, '.', lib.log)

    # base quantification command
    cmd = [
        'kallisto', 'quant', '-i', index_path, '-o',
        os.path.join(folder, 'kallisto'), '--plaintext', '-t',
        str(cpus)
    ]
    if stranded == 'RF':
        strandcmd = ['--rf-stranded']
    elif stranded == 'FR':
        strandcmd = ['--fr-stranded']
    else:
        strandcmd = []

    left, right, single = readTuple[0], readTuple[1], readTuple[2]
    if single and not left and not right:
        # single-end: fragment length mean (200) and SD (20) are hard-coded
        # estimates; the strand flag is not applied in this mode
        cmd = cmd + ['--single', '-l', '200', '-s', '20', single]
    elif left and right:
        cmd = cmd + strandcmd + [left, right]
    lib.log.info("Mapping reads using pseudoalignment in Kallisto")
    lib.runSubprocess(cmd, '.', lib.log)

    # map each mRNA ID to (geneID, location) from the transcript FASTA headers;
    # the first header seen for an ID wins
    mRNADict = {}
    # plain 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+
    with open(PASAtranscripts, 'r') as transin:
        for line in transin:
            if not line.startswith('>'):
                continue
            cols = line.rstrip().replace('>', '').split(' ')
            if cols[0] not in mRNADict:
                mRNADict[cols[0]] = (cols[1], cols[-1])

    # '#PROT' lines ending in a tab are treated as incomplete models to skip
    ignore = []
    with open(input, 'r') as infile:
        for line in infile:
            if line.startswith('#PROT') and line.endswith('\t\n'):
                ignore.append(line.split(' ')[1])
    if len(ignore) > 0:
        lib.log.debug("Ignoring %i incomplete PASA models: %s" %
                      (len(ignore), ','.join(ignore)))
    ignored = set(ignore)  # constant-time lookups in the loop below

    # write mRNAID geneID location TPM
    abundance = os.path.join(folder, 'kallisto', 'abundance.tsv')
    with open(output, 'w') as outfile:
        outfile.write("#mRNA-ID\tgene-ID\tLocation\tTPM\n")
        with open(abundance, 'r') as infile:
            for line in infile:
                # header row of abundance.tsv (was misspelled 'targed_id')
                if line.startswith('target_id'):
                    continue
                cols = line.rstrip().split('\t')
                if cols[0] in ignored:
                    continue
                if cols[0] in mRNADict:
                    geneID, location = mRNADict[cols[0]]
                    # fifth column of abundance.tsv is used as the TPM value
                    outfile.write('%s\t%s\t%s\t%s\n' %
                                  (cols[0], geneID, location, cols[4]))
def runPASAtrain(genome, transcripts, cleaned_transcripts, stranded,
                 intronlen, cpus, dbname, output):
    '''
    Run the PASA alignment/assembly pipeline and extract gene models for
    training.

    genome:              genome FASTA (-g).
    transcripts:         un-cleaned transcripts (-u).
    cleaned_transcripts: cleaned transcripts (-t); also counted for the log.
    stranded:            anything other than 'no' adds
                         --transcribed_is_aligned_orient.
    intronlen:           value for --MAX_INTRON_LENGTH.
    cpus:                PASA gets half of this (integer division) when it is
                         greater than 2, otherwise 2.
    dbname:              PASA database name; '-' is replaced by '_'. With
                         args.pasa_db == 'sqlite' the absolute path inside the
                         work folder is used in the config.
    output:              destination the training GFF3 is copied to.

    Work happens in <tmpdir>/pasa. The alignment step is skipped when
    <dbname>.assemblies.fasta already exists there. Returns None.
    '''
    # floor division keeps --CPU an integer on Python 3 as well as Python 2
    if cpus > 2:
        pasa_cpus = cpus // 2
    else:
        pasa_cpus = 2

    folder = os.path.join(tmpdir, 'pasa')
    if not os.path.isdir(folder):
        os.makedirs(folder)

    # write the alignment config from the PASA template
    alignConfig = os.path.join(folder, 'alignAssembly.txt')
    pasaDBname = dbname.replace('-', '_')
    if args.pasa_db == 'sqlite':
        pasaDBname_path = os.path.abspath(os.path.join(folder, pasaDBname))
    else:
        pasaDBname_path = pasaDBname
    template_path = os.path.join(PASA, 'pasa_conf',
                                 'pasa.alignAssembly.Template.txt')
    # plain 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+
    with open(alignConfig, 'w') as config1:
        with open(template_path, 'r') as template1:
            for line in template1:
                # both placeholder spellings are substituted
                line = line.replace('<__DATABASE__>', pasaDBname_path)
                line = line.replace('<__MYSQLDB__>', pasaDBname_path)
                config1.write(line)

    assemblies = os.path.join(folder, pasaDBname + '.assemblies.fasta')
    if not os.path.isfile(assemblies):
        # note this will dump any database with same name
        lib.log.info(
            "Running PASA alignment step using {:,} transcripts".format(
                lib.countfasta(cleaned_transcripts)))
        cmd = [
            LAUNCHPASA, '-c',
            os.path.abspath(alignConfig), '-r', '-C', '-R', '-g',
            os.path.abspath(genome), '--ALIGNERS', 'blat,gmap', '-T', '-t',
            os.path.abspath(cleaned_transcripts), '-u',
            os.path.abspath(transcripts), '--stringent_alignment_overlap',
            args.pasa_alignment_overlap, '--TRANSDECODER', '--ALT_SPLICE',
            '--MAX_INTRON_LENGTH',
            str(intronlen), '--CPU',
            str(pasa_cpus)
        ]
        if stranded != 'no':
            cmd = cmd + ['--transcribed_is_aligned_orient']
        lib.runSubprocess(cmd, folder, lib.log)
    else:
        lib.log.info('Existing PASA assemblies found: {:}'.format(assemblies))

    # generate TSV gene-transcripts; a set keeps the distinct-locus count
    # linear in the number of described assemblies
    loci = set()
    numTranscripts = 0
    described = os.path.join(folder,
                             pasaDBname + '.pasa_assemblies_described.txt')
    with open(os.path.join(folder, 'pasa.gene2transcripts.tsv'),
              'w') as gene2transcripts:
        with open(described, 'r') as description:
            for line in description:
                if line.startswith('#'):
                    continue
                cols = line.split('\t')
                gene2transcripts.write('g_%s\t%s\n' % (cols[1], cols[2]))
                numTranscripts += 1
                loci.add(cols[1])
    lib.log.info("PASA assigned {:,} transcipts to {:,} loci (genes)".format(
        numTranscripts, len(loci)))

    lib.log.info("Getting PASA models for training with TransDecoder")
    pasa_training_gff = os.path.join(
        folder, pasaDBname + '.assemblies.fasta.transdecoder.genome.gff3')
    cmd = [
        os.path.join(PASA, 'scripts', 'pasa_asmbls_to_training_set.dbi'),
        '--pasa_transcripts_fasta', pasaDBname + '.assemblies.fasta',
        '--pasa_transcripts_gff3', pasaDBname + '.pasa_assemblies.gff3'
    ]
    lib.runSubprocess(cmd, folder, lib.log)

    # grab final result
    shutil.copyfile(pasa_training_gff, output)
    lib.log.info(
        'PASA finished. PASAweb accessible via: localhost:port/cgi-bin/index.cgi?db=%s'
        % pasaDBname_path)
def removeAntiSense(input, readTuple, output):
    '''
    Drop transcripts that were assembled in antisense orientation.

    Reads are aligned back to the transcripts with the aligner returned by
    choose_aligner() ('hisat2', 'bowtie2' or 'rapmap'), Trinity's
    examine_strand_specificity.pl is run on the resulting BAM, and transcripts
    whose ratio column (fifth column of strand_specific.dat) has the wrong
    sign for args.stranded are left out of the output FASTA.

    input:     transcript FASTA to filter.
    readTuple: (left, right, single) read paths; falsy entries mean absent.
               The rapmap branch always uses the left/right entries.
    output:    path of the filtered FASTA that is written.

    Indexing and alignment are skipped when the coordinate-sorted BAM for the
    chosen aligner already exists in tmpdir. With args.stranded other than
    'RF' or 'FR' nothing is removed. Returns None.

    Raises ValueError when choose_aligner() returns an unsupported name.
    '''
    lib.log.info("Running anti-sense filtering of Trinity transcripts")
    # half the CPUs, rounded up, for BAM compression; same value as the
    # former (args.cpus + 2 // 2) // 2, where 2 // 2 binds first
    bamthreads = (args.cpus + 1) // 2
    sam2bam = os.path.join(parentdir, 'util', 'sam2bam.sh')
    left, right, single = readTuple[0], readTuple[1], readTuple[2]

    aligner = choose_aligner()
    if aligner == 'hisat2':
        bowtie2bam = os.path.join(tmpdir, 'hisat2.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building Hisat2 index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            index = os.path.join(tmpdir, 'hisat2.transcripts')
            lib.runSubprocess4(['hisat2-build', input, index], '.', lib.log)
            # now launch the aligner
            lib.log.info("Aligning reads to trinity transcripts with Hisat2")
            hisat2cmd = [
                'hisat2', '-p',
                str(args.cpus), '-k', '50', '--max-intronlen',
                str(args.max_intronlen), '-x', index
            ]
            if single:
                hisat2cmd = hisat2cmd + ['-U', single]
            if left and right:
                hisat2cmd = hisat2cmd + ['-1', left, '-2', right]
            # sam2bam.sh receives the whole aligner command as one argument
            cmd = [sam2bam, " ".join(hisat2cmd), str(bamthreads), bowtie2bam]
            lib.runSubprocess4(cmd, '.', lib.log)
    elif aligner == 'bowtie2':
        bowtie2bam = os.path.join(tmpdir,
                                  'bowtie2.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building Bowtie2 index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            index = os.path.join(tmpdir, 'bowtie2.transcripts')
            lib.runSubprocess4(['bowtie2-build', input, index], '.', lib.log)
            lib.log.info("Aligning reads to trinity transcripts with Bowtie2")
            bowtie2cmd = [
                'bowtie2', '-p',
                str(args.cpus), '-k', '50', '--local', '--no-unal', '-x',
                index
            ]
            if single:
                bowtie2cmd = bowtie2cmd + ['-U', single]
            if left and right:
                bowtie2cmd = bowtie2cmd + ['-1', left, '-2', right]
            cmd = [sam2bam, " ".join(bowtie2cmd), str(bamthreads), bowtie2bam]
            lib.runSubprocess4(cmd, '.', lib.log)
    elif aligner == 'rapmap':
        bowtie2bam = os.path.join(tmpdir, 'rapmap.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building RapMap index of " +
                         "{0:,}".format(lib.countfasta(input)) +
                         " trinity transcripts")
            index = os.path.join(tmpdir, 'rapmap_index')
            cmd = ['rapmap', 'quasiindex', '-t', input, '-i', index]
            lib.runSubprocess4(cmd, '.', lib.log)
            lib.log.info("Aligning reads to trinity transcripts with RapMap")
            rapmapcmd = [
                'rapmap', 'quasimap', '-t',
                str(args.cpus), '-i', index, '-1', left, '-2', right
            ]
            cmd = [sam2bam, " ".join(rapmapcmd), str(bamthreads), bowtie2bam]
            lib.runSubprocess(cmd, '.', lib.log)
    else:
        # previously this fell through to a NameError on bowtie2bam
        raise ValueError('unsupported aligner: %r' % (aligner, ))

    # now run Trinity examine strandeness tool
    lib.log.info("Examining strand specificity")
    cmd = [
        os.path.join(TRINITY, 'util', 'misc',
                     'examine_strand_specificity.pl'), bowtie2bam,
        os.path.join(tmpdir, 'strand_specific')
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    # parse output dat file and get list of transcripts to remove
    removeList = []
    # plain 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+
    with open(os.path.join(tmpdir, 'strand_specific.dat'), 'r') as infile:
        for line in infile:
            line = line.replace('\n', '')
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            negative = cols[4].startswith('-')
            if args.stranded == 'RF':
                # keep negative ratios
                if not negative:
                    removeList.append(cols[0])
            elif args.stranded == 'FR':
                # keep positive ratios
                if negative:
                    removeList.append(cols[0])

    # set lookup so filtering is linear in the number of FASTA records
    to_remove = set(removeList)
    with open(output, 'w') as outfile:
        with open(input, 'r') as infile:
            for record in SeqIO.parse(infile, 'fasta'):
                if record.id not in to_remove:
                    outfile.write(">%s\n%s\n" %
                                  (record.description, str(record.seq)))
    lib.log.info("Removing %i antisense transcripts" % (len(removeList)))
def runTrinityGG(genome, readTuple, output):
    '''
    Run genome-guided Trinity: align the reads to the genome with Hisat2,
    let Trinity partition the BAM into per-cluster assembly commands, run
    those commands in parallel from Python, then aggregate the assemblies.

    genome:    genome FASTA used to build the Hisat2 index.
    readTuple: (left, right, single) read paths; falsy entries mean absent.
    output:    final argument of lib.runSubprocess5 for the aggregator
               (presumably the file the aggregated FASTA is written to --
               confirm against lib.runSubprocess5).

    Strand options (--rna-strandness for Hisat2, --SS_lib_type for Trinity)
    are only added when args.stranded is not 'no' and no single-end reads are
    supplied. Intermediate files go to tmpdir. Returns None.
    '''
    lib.log.info("Starting Trinity genome guided")
    lib.log.info("Building Hisat2 genome index")
    hisat2index = os.path.join(tmpdir, 'hisat2.genome')
    lib.runSubprocess4(['hisat2-build', genome, hisat2index], '.', lib.log)

    # align reads using hisat2
    lib.log.info("Aligning reads to genome using Hisat2")
    hisat2bam = os.path.join(tmpdir, 'hisat2.coordSorted.bam')
    # half the CPUs, rounded up, for BAM compression; same value as the
    # former (args.cpus + 2 // 2) // 2, where 2 // 2 binds first
    bamthreads = (args.cpus + 1) // 2
    left, right, single = readTuple[0], readTuple[1], readTuple[2]
    use_strand = args.stranded != 'no' and not single

    hisat2cmd = [
        'hisat2', '-p',
        str(args.cpus), '--max-intronlen',
        str(args.max_intronlen), '--dta', '-x', hisat2index
    ]
    if use_strand:
        hisat2cmd = hisat2cmd + ['--rna-strandness', args.stranded]
    if left and right:
        hisat2cmd = hisat2cmd + ['-1', left, '-2', right]
    if single:
        hisat2cmd = hisat2cmd + ['-U', single]
    # bash wrapper pipes SAM -> BAM -> sorted BAM; it receives the whole
    # aligner command as one argument
    cmd = [
        os.path.join(parentdir, 'util', 'sam2bam.sh'), " ".join(hisat2cmd),
        str(bamthreads), hisat2bam
    ]
    lib.runSubprocess(cmd, '.', lib.log)

    # now launch Trinity genome guided
    TrinityLog = os.path.join(tmpdir, 'Trinity-gg.log')
    lib.log.info("Running genome-guided Trinity, logfile: %s" % TrinityLog)
    lib.log.info(
        "Clustering of reads from BAM and preparing assembly commands")
    trinity_out = os.path.join(tmpdir, 'trinity_gg')
    cmd = ['Trinity']
    if use_strand:
        cmd = cmd + ['--SS_lib_type', args.stranded]
    cmd = cmd + [
        '--no_distributed_trinity_exec', '--genome_guided_bam', hisat2bam,
        '--genome_guided_max_intron',
        str(args.max_intronlen), '--CPU',
        str(args.cpus), '--max_memory', args.memory, '--output', trinity_out
    ]
    if args.jaccard_clip:
        cmd = cmd + ['--jaccard_clip']
    lib.runSubprocess2(cmd, '.', lib.log, TrinityLog)

    # Trinity only wrote the per-cluster commands; run them here in parallel
    commands = os.path.join(trinity_out, 'trinity_GG.cmds')
    file_list = []
    # plain 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+
    with open(commands, 'r') as cmdFile:
        for line in cmdFile:
            line = line.replace('\n', '')
            # the flag must not be carried into the individual commands
            line = line.replace('--no_distributed_trinity_exec', '')
            line = line.replace('"', '')  # don't need these double quotes
            file_list.append(line)
    # leave one CPU free, but never ask for fewer than one worker
    # (args.cpus == 1 used to request zero)
    workers = max(1, args.cpus - 1)
    lib.log.info("Assembling " + "{0:,}".format(len(file_list)) +
                 " Trinity clusters using %i CPUs" % workers)
    lib.runMultiProgress(safe_run, file_list, workers)

    # collect the per-cluster assemblies
    outputfiles = os.path.join(trinity_out, 'trinity_output_files.txt')
    with open(outputfiles, 'w') as fileout:
        for filename in find_files(trinity_out, '*inity.fasta'):
            fileout.write('%s\n' % filename)
    # now grab them all using Trinity script
    cmd = [
        os.path.join(TRINITY, 'util', 'support_scripts',
                     'GG_partitioned_trinity_aggregator.pl'), 'Trinity_GG'
    ]
    lib.runSubprocess5(cmd, '.', lib.log, outputfiles, output)
sys.exit(1) #check EggNog database, download if necessary. if not args.eggnog_db in lib.Nogs: lib.log.error("%s is not a valid EggNog group, options are:\n%s" % (args.eggnog_db, ', '.join(lib.Nogs))) sys.exit(1) if not os.path.isfile( os.path.join(parentdir, 'DB', args.eggnog_db + '_4.5.hmm')): lib.log.error("%s EggNog DB not found, trying to download and format..." % args.eggnog_db) cmd = [ os.path.join(parentdir, 'util', 'getEggNog.sh'), args.eggnog_db, os.path.join(parentdir, 'DB') ] lib.runSubprocess(cmd, '.', lib.log) if not os.path.isfile( os.path.join(parentdir, 'DB', args.eggnog_db + '_4.5.hmm')): lib.log.error("Downloading failed, exiting") sys.exit(1) else: lib.log.error("%s downloaded and formatted, moving on." % args.eggnog_db) #check buscos, download if necessary if not os.path.isdir(os.path.join(parentdir, 'DB', args.busco_db)): lib.download_buscos(args.busco_db) #need to do some checks here of the input genbank = '' Scaffolds = ''
scoCount = 0 if len(args.input) > 1: if not args.proteinortho: lib.log.info("Running orthologous clustering tool, ProteinOrtho5. This may take awhile...") #setup protein ortho inputs, some are a bit strange in the sense that they use equals signs #generate list of files based on input order for consistency filelist = [] for i in scinames: name = i+'.faa' filelist.append(name) #setup command cmd = ['proteinortho5.pl', '-project=funannotate', '-synteny', '-cpus='+str(args.cpus), '-singles', '-selfblast'] cmd2 = cmd + filelist if not os.path.isfile(os.path.join(args.out, 'protortho', 'funannotate.poff')): lib.runSubprocess(cmd2, protortho, lib.log) else: shutil.copyfile(args.proteinortho, os.path.join(args.out, 'protortho', 'funannotate.poff')) #open poff in pandas to parse "easier" for stats, orthologs, etc df = pd.read_csv(os.path.join(args.out, 'protortho', 'funannotate.poff'), sep='\t', header=0) df.rename(columns=lambda x: x.replace('.faa', ''), inplace=True) #reorder table to it matches up with busco list of dicts newhead = [df.columns.values[0], df.columns.values[1], df.columns.values[2]] newhead += scinames try: df = df[newhead] except KeyError: #means they were not found, likely need to then drop isolate name (I hope that catches them all) newhead = [i.rsplit('_',1)[0] for i in newhead] for x in newhead: if not x in df.columns.values:
def runSeqClean(input, folder):
    '''
    Run the PASA seqclean utility on a transcript FASTA file.

    input:  transcript file; only its basename is given to seqclean.
    folder: passed as the second argument of lib.runSubprocess (presumably
            the working directory, so input is expected to live there --
            confirm against lib.runSubprocess).
    '''
    seqclean_bin = os.path.join(PASA, 'bin', 'seqclean')
    fasta_name = os.path.basename(input)
    lib.runSubprocess([seqclean_bin, fasta_name], folder, lib.log)
def runPASAtrain(genome, transcripts, stranded, intronlen, cpus, dbname,
                 output):
    '''
    Run the PASA alignment/assembly pipeline and extract gene models for
    training with TransDecoder.

    genome:      genome FASTA (-g).
    transcripts: transcripts to align (-t); also counted for the log.
    stranded:    anything other than 'no' adds
                 --transcribed_is_aligned_orient.
    intronlen:   value for --MAX_INTRON_LENGTH.
    cpus:        PASA gets half of this (integer division) when it is greater
                 than 2, otherwise 2.
    dbname:      PASA database name; '-' is replaced by '_'.
    output:      destination the training GFF3 is copied to.

    Work happens in <tmpdir>/pasa. The alignment step is skipped when
    <dbname>.assemblies.fasta already exists there. When both
    TransDecoder.LongOrfs and TransDecoder.Predict are found by lib.which
    they are run directly and the ORFs are projected onto the genome;
    otherwise PASA's pasa_asmbls_to_training_set.dbi is used. Returns None.
    '''
    # floor division keeps --CPU an integer on Python 3 as well as Python 2
    if cpus > 2:
        pasa_cpus = cpus // 2
    else:
        pasa_cpus = 2

    folder = os.path.join(tmpdir, 'pasa')
    if not os.path.isdir(folder):
        os.makedirs(folder)

    # write the alignment config from the PASA template
    alignConfig = os.path.join(folder, 'alignAssembly.txt')
    pasaDBname = dbname.replace('-', '_')
    template_path = os.path.join(PASA, 'pasa_conf',
                                 'pasa.alignAssembly.Template.txt')
    # plain 'r' instead of 'rU': the 'U' flag is rejected by Python 3.11+
    with open(alignConfig, 'w') as config1:
        with open(template_path, 'r') as template1:
            for line in template1:
                config1.write(line.replace('<__MYSQLDB__>', pasaDBname))

    assemblies_name = pasaDBname + '.assemblies.fasta'
    assemblies_gff_name = pasaDBname + '.pasa_assemblies.gff3'
    assemblies = os.path.join(folder, assemblies_name)
    if not os.path.isfile(assemblies):
        # note this will dump any database with same name
        lib.log.info(
            "Running PASA alignment step using {:,} transcripts".format(
                lib.countfasta(transcripts)))
        cmd = [
            os.path.join(PASA, 'scripts', 'Launch_PASA_pipeline.pl'), '-c',
            os.path.abspath(alignConfig), '-r', '-C', '-R', '-g',
            os.path.abspath(genome), '--ALIGNERS', 'blat,gmap', '-t',
            os.path.abspath(transcripts), '--stringent_alignment_overlap',
            args.pasa_alignment_overlap, '--TRANSDECODER',
            '--MAX_INTRON_LENGTH',
            str(intronlen), '--CPU',
            str(pasa_cpus)
        ]
        if stranded != 'no':
            cmd = cmd + ['--transcribed_is_aligned_orient']
        lib.runSubprocess(cmd, folder, lib.log)
    else:
        lib.log.info('Existing PASA assemblies found {:}'.format(assemblies))

    # generate TSV gene-transcripts
    numLoci = getPASAtranscripts2genes(
        os.path.join(folder, assemblies_gff_name),
        os.path.join(folder, 'pasa.gene2transcripts.tsv'))
    numTranscripts = lib.countfasta(assemblies)
    lib.log.info(
        "Assigned {:,} transcipts to {:,} loci using {:}% overlap threshold".
        format(numTranscripts, numLoci, args.pasa_alignment_overlap))

    lib.log.info("Getting PASA models for training with TransDecoder")
    pasa_training_gff = os.path.join(
        folder, pasaDBname + '.assemblies.fasta.transdecoder.genome.gff3')
    if lib.which('TransDecoder.LongOrfs') and lib.which(
            'TransDecoder.Predict'):
        cmd = [
            'TransDecoder.LongOrfs', '-t', assemblies_name,
            '--gene_trans_map', 'pasa.gene2transcripts.tsv'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = [
            'TransDecoder.Predict', '-t', assemblies_name,
            '--single_best_only'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
        # project transcript-level ORFs back onto genome coordinates
        cmd = [
            os.path.join(PASA, 'pasa-plugins', 'transdecoder',
                         'cdna_alignment_orf_to_genome_orf.pl'),
            pasaDBname + '.assemblies.fasta.transdecoder.gff3',
            assemblies_gff_name, assemblies_name
        ]
        lib.runSubprocess2(cmd, folder, lib.log, pasa_training_gff)
    else:
        cmd = [
            os.path.join(PASA, 'scripts', 'pasa_asmbls_to_training_set.dbi'),
            '--pasa_transcripts_fasta', assemblies_name,
            '--pasa_transcripts_gff3', assemblies_gff_name
        ]
        lib.runSubprocess(cmd, folder, lib.log)

    # grab final result
    shutil.copyfile(pasa_training_gff, output)