def meropsDB(info, force=False):
    '''
    make sure the MEROPS library is downloaded and formatted for diamond,
    store the database description in info['merops'] whenever it is rebuilt,
    and log the version, date and number of records found in info['merops']
    '''
    merops_lib = os.path.join(FUNDB, 'merops_scan.lib')
    reformatted = os.path.join(FUNDB, 'merops.formatted.fa')
    dmnd = os.path.join(FUNDB, 'merops.dmnd')

    #an existing download is only refreshed when an update was requested
    #and check4newDB reports that it should be
    if os.path.isfile(merops_lib) and args.update and not force:
        if check4newDB('merops', info):
            force = True

    if force or not os.path.isfile(merops_lib):
        lib.log.info('Downloading Merops database')
        download(lib.DBURL.get('merops'), merops_lib)
        md5 = calcmd5(merops_lib)

        #headers are rewritten as "<first token> <text between 1st and 2nd '#'>"
        #sequence lines are copied through untouched
        with open(reformatted, 'w') as filtout, open(merops_lib, 'rU') as infile:
            for row in infile:
                if not row.startswith('>'):
                    filtout.write(row)
                    continue
                header = row.rstrip()
                seq_id = header.split()[0]
                family = header.split('#')[1]
                filtout.write('{:} {:}\n'.format(seq_id, family))

        lib.log.info('Building diamond database')
        makedb = ['diamond', 'makedb', '--in', 'merops.formatted.fa', '--db', 'merops']
        lib.runSubprocess(makedb, FUNDB, lib.log)
        total = lib.countfasta(reformatted)
        info['merops'] = ('diamond', dmnd, '12.0', '2017-10-04', total, md5)

    #report whatever is recorded for merops, freshly built or not
    _fmt, _path, version, date, records, _md5 = info.get('merops')
    lib.log.info('MEROPS Database: version={:} date={:} records={:,}'.format(version, date, records))
def uniprotDB(info, force=False):
    '''
    download swissprot/uniprot database, format for diamond, and output date of database

    info:  dictionary of database descriptions; info['uniprot'] is written as
           ('diamond', diamond db path, release, release date, number of
           records, md5 of the release-date file) whenever the database is rebuilt
    force: rebuild even if the fasta file is already present

    a rebuild also happens when the fasta file is missing, or when args.update
    is set and check4newDB('uniprot-release', info) returns a true value.
    the script exits with status 1 if gunzip fails.
    '''
    fasta = os.path.join(FUNDB, 'uniprot_sprot.fasta')
    database = os.path.join(FUNDB, 'uniprot.dmnd')
    versionfile = os.path.join(FUNDB, 'uniprot.release-date.txt')
    if os.path.isfile(fasta) and args.update and not force:
        if check4newDB('uniprot-release', info):
            force = True
    if not os.path.isfile(fasta) or force:
        lib.log.info('Downloading UniProtKB/SwissProt database')
        download(lib.DBURL.get('uniprot'), fasta+'.gz')
        #stop here if decompression fails, otherwise diamond would be run
        #on a stale or missing fasta file
        if subprocess.call(['gunzip', '-f', 'uniprot_sprot.fasta.gz'], cwd=FUNDB) != 0:
            lib.log.error('gunzip failed on %s' % (fasta+'.gz'))
            raise SystemExit(1)
        download(lib.DBURL.get('uniprot-release'), versionfile)
        md5 = calcmd5(versionfile)
        #release and date stay empty if the Swiss-Prot release line is not found
        unidate = ''
        univers = ''
        with open(versionfile, 'rU') as infile:
            for line in infile:
                if line.startswith('UniProtKB/Swiss-Prot Release'):
                    #line looks like "<text> <release> of <dd-Mon-yyyy>"
                    rest, datepart = line.split(' of ')
                    unidate = datetime.datetime.strptime(datepart.rstrip(), "%d-%b-%Y").strftime("%Y-%m-%d")
                    univers = rest.split(' ')[-1]
        lib.log.info('Building diamond database')
        cmd = ['diamond', 'makedb', '--in', 'uniprot_sprot.fasta', '--db', 'uniprot']
        lib.runSubprocess(cmd, FUNDB, lib.log)
        num_records = lib.countfasta(fasta)
        info['uniprot'] = ('diamond', database, univers, unidate, num_records, md5)
    #report whatever is recorded for uniprot, freshly built or not
    _fmt, _path, version, date, records, _md5 = info.get('uniprot')
    lib.log.info('UniProtKB Database: version={:} date={:} records={:,}'.format(version, date, records))
def repeatDB(info, force=False):
    '''
    download the funannotate repeat protein database, clean up the fasta
    headers, format for diamond, and log what is installed

    info:  dictionary of database descriptions; info['repeats'] is written as
           ('diamond', diamond db path, '1.0', today, number of records,
           md5 of the downloaded tarball) whenever the database is rebuilt
    force: rebuild even if the fasta file is already present

    a rebuild also happens when the fasta file is missing, or when args.update
    is set and check4newDB('repeats', info) returns a true value.
    the script exits with status 1 if the tarball cannot be extracted.
    '''
    fasta = os.path.join(FUNDB, 'funannotate.repeat.proteins.fa')
    filtered = os.path.join(FUNDB, 'funannotate.repeats.reformat.fa')
    database = os.path.join(FUNDB, 'repeats.dmnd')
    if os.path.isfile(fasta) and args.update and not force:
        if check4newDB('repeats', info):
            force = True
    if not os.path.isfile(fasta) or force:
        lib.log.info('Downloading Repeat database')
        download(lib.DBURL.get('repeats'), fasta+'.tar.gz')
        md5 = calcmd5(fasta+'.tar.gz')
        #stop here if extraction fails, otherwise a stale or missing fasta
        #file would be reformatted and indexed
        if subprocess.call(['tar', '-zxf', 'funannotate.repeat.proteins.fa.tar.gz'], cwd=FUNDB) != 0:
            lib.log.error('tar failed to extract %s' % (fasta+'.tar.gz'))
            raise SystemExit(1)
        with open(filtered, 'w') as out:
            with open(fasta, 'rU') as infile:
                for line in infile:
                    #this repeat fasta file has messed up headers....
                    if line.startswith('>'):
                        line = line.replace('#', '_')
                        line = line.replace('/', '-')
                        line = line.replace('&', '')
                    out.write(line)
        lib.log.info('Building diamond database')
        #NOTE(review): '-parse_seqids' looks like a makeblastdb option rather
        #than a diamond one -- confirm diamond accepts it before relying on it
        cmd = ['diamond', 'makedb', '--in', 'funannotate.repeats.reformat.fa', '--db', 'repeats', '-parse_seqids']
        lib.runSubprocess(cmd, FUNDB, lib.log)
        num_records = lib.countfasta(filtered)
        info['repeats'] = ('diamond', database, '1.0', today, num_records, md5)
    #report whatever is recorded for repeats, freshly built or not
    _fmt, _path, version, date, records, _md5 = info.get('repeats')
    lib.log.info('Repeat Database: version={:} date={:} records={:,}'.format(version, date, records))
def runPASAtrain(genome, transcripts, cleaned_transcripts, stranded, intronlen, cpus, dbname, output):
    '''
    function will run PASA align assembly and then choose best gene models for training

    genome:              genome fasta, passed to PASA with -g
    transcripts:         transcript fasta, passed to PASA with -u
    cleaned_transcripts: transcript fasta, passed to PASA with -t (together with -T)
    stranded:            any value other than 'no' adds --transcribed_is_aligned_orient
    intronlen:           value for --MAX_INTRON_LENGTH
    cpus:                PASA gets half of these (integer division) when cpus > 2, otherwise 2
    dbname:              PASA database name, '-' is replaced with '_'
    output:              path the training GFF3 is copied to

    the PASA alignment step is skipped when <dbname>.assemblies.fasta already
    exists in the pasa folder under tmpdir
    '''
    #floor division so --CPU stays an integer under python3 as well
    if cpus > 2:
        pasa_cpus = cpus // 2
    else:
        pasa_cpus = 2
    #create tmpdir
    folder = os.path.join(tmpdir, 'pasa')
    if not os.path.isdir(folder):
        os.makedirs(folder)
    #get config files and edit
    alignConfig = os.path.join(folder, 'alignAssembly.txt')
    pasaDBname = dbname.replace('-', '_')
    #sqlite databases are referenced by absolute path, anything else by name
    if args.pasa_db == 'sqlite':
        pasaDBname_path = os.path.abspath(os.path.join(folder, pasaDBname))
    else:
        pasaDBname_path = pasaDBname
    template = os.path.join(PASA, 'pasa_conf', 'pasa.alignAssembly.Template.txt')
    with open(alignConfig, 'w') as config1:
        with open(template, 'rU') as template1:
            for line in template1:
                #both placeholder spellings are filled in
                line = line.replace('<__DATABASE__>', pasaDBname_path)
                line = line.replace('<__MYSQLDB__>', pasaDBname_path)
                config1.write(line)
    assemblies = os.path.join(folder, pasaDBname+'.assemblies.fasta')
    if not os.path.isfile(assemblies):
        #now run first PASA step, note this will dump any database with same name
        lib.log.info("Running PASA alignment step using {:,} transcripts".format(lib.countfasta(cleaned_transcripts)))
        cmd = [LAUNCHPASA, '-c', os.path.abspath(alignConfig), '-r', '-C', '-R',
               '-g', os.path.abspath(genome), '--ALIGNERS', 'blat,gmap', '-T',
               '-t', os.path.abspath(cleaned_transcripts),
               '-u', os.path.abspath(transcripts),
               #str() because every argv element has to be a string
               '--stringent_alignment_overlap', str(args.pasa_alignment_overlap),
               '--TRANSDECODER', '--ALT_SPLICE',
               '--MAX_INTRON_LENGTH', str(intronlen), '--CPU', str(pasa_cpus)]
        if stranded != 'no':
            cmd = cmd + ['--transcribed_is_aligned_orient']
        lib.runSubprocess(cmd, folder, lib.log)
    else:
        lib.log.info('Existing PASA assemblies found: {:}'.format(assemblies))
    #generate TSV gene-transcripts
    loci = set()  #set gives constant time de-duplication of locus ids
    numTranscripts = 0
    with open(os.path.join(folder, 'pasa.gene2transcripts.tsv'), 'w') as gene2transcripts:
        with open(os.path.join(folder, pasaDBname+'.pasa_assemblies_described.txt'), 'rU') as description:
            for line in description:
                #skip comments and blank lines
                if line.startswith('#') or not line.strip():
                    continue
                #drop the newline first so it cannot end up inside the last column
                cols = line.rstrip('\n').split('\t')
                gene2transcripts.write('g_%s\t%s\n' % (cols[1], cols[2]))
                numTranscripts += 1
                loci.add(cols[1])
    lib.log.info("PASA assigned {:,} transcipts to {:,} loci (genes)".format(numTranscripts, len(loci)))
    lib.log.info("Getting PASA models for training with TransDecoder")
    pasa_training_gff = os.path.join(folder, pasaDBname+'.assemblies.fasta.transdecoder.genome.gff3')
    #file names are relative because the command is run from inside folder
    cmd = [os.path.join(PASA, 'scripts', 'pasa_asmbls_to_training_set.dbi'),
           '--pasa_transcripts_fasta', pasaDBname+'.assemblies.fasta',
           '--pasa_transcripts_gff3', pasaDBname+'.pasa_assemblies.gff3']
    lib.runSubprocess(cmd, folder, lib.log)
    #grab final result
    shutil.copyfile(pasa_training_gff, output)
    lib.log.info('PASA finished. PASAweb accessible via: localhost:port/cgi-bin/index.cgi?db=%s' % pasaDBname_path)
def mibigDB(info, force=False):
    '''
    make sure the MiBIG fasta file is downloaded and formatted for diamond,
    store the database description in info['mibig'] whenever it is rebuilt,
    and log the version, date and number of records found in info['mibig']
    '''
    dbdir = args.database
    mibig_fa = os.path.join(dbdir, 'mibig.fa')
    dmnd = os.path.join(dbdir, 'mibig.dmnd')

    if force or not os.path.isfile(mibig_fa):
        lib.log.info('Downloading MiBIG Secondary Metabolism database')
        source = URL.get('mibig')
        download(source, mibig_fa)
        #version is the text after the last '_' in the file name, minus '.fasta'
        tail = os.path.basename(source).split('_')[-1]
        version = tail.replace('.fasta', '')
        lib.log.info('Building diamond database')
        makedb = ['diamond', 'makedb', '--in', 'mibig.fa', '--db', 'mibig']
        lib.runSubprocess(makedb, dbdir, lib.log)
        total = lib.countfasta(mibig_fa)
        info['mibig'] = ('diamond', dmnd, version, today, total)

    #report whatever is recorded for mibig, freshly built or not
    _fmt, _path, version, date, records = info.get('mibig')
    lib.log.info('MiBIG Database: version={:} date={:} records={:,}'.format(version, date, records))
os.path.basename(query))) os.rename(scaffold, os.path.join(tmpdir, 'failed', os.path.basename(scaffold))) else: for y in [query, scaffold]: try: os.remove(y) except OSError: lib.log.debug("Error removing %s" % (y)) #check filesize of exonerate output, no hits still have some output data in them, should be safe dropping anything smaller than 500 bytes if lib.getSize(exonerate_out) < 500: os.remove(exonerate_out) #count number of proteins to look for total = lib.countfasta(args.proteins) lib.log.info('Using {0:,}'.format(total) + ' proteins as queries') #make tmpdir tmpdir = 'p2g_' + str(os.getpid()) if not os.path.isdir(tmpdir): os.makedirs(tmpdir) os.makedirs(os.path.join(tmpdir, 'failed')) os.makedirs(os.path.join(tmpdir, 'scaffolds')) if args.filter == 'tblastn': lib.log.debug("BLAST v%s; Exonerate v%s" % (blast_version, exo_version)) #check for tblastn input if args.tblastn: lib.log.info("Using pre-calculated tBLASTN result") BlastResult = args.tblastn
def removeAntiSense(input, readTuple, output):
    '''
    function will map reads to the input transcripts, determine strandedness, and then filter
    out transcripts that were assembled in antisense orientation. idea here is that the antisense
    transcripts, while potentially valid, aren't going to help update the gene models and perhaps
    could hurt the annotation effort?

    input:     fasta file of transcripts
    readTuple: indexable of read files; [0] and [1] are given to the aligner as -1/-2,
               [2] as -U (hisat2 and bowtie2 only, false values are left out)
    output:    fasta file that receives the transcripts that are kept

    an existing <aligner>.transcripts.coordSorted.bam in tmpdir is re-used instead of
    re-running the index and alignment steps. the script exits with status 1 if
    choose_aligner() returns a name that is not handled here.
    '''
    lib.log.info("Running anti-sense filtering of Trinity transcripts")
    #half the threads, rounded up, for bam compression
    #(same value the former "(args.cpus + 2 // 2) // 2" evaluated to)
    bamthreads = (args.cpus + 1) // 2
    sam2bam = os.path.join(parentdir, 'util', 'sam2bam.sh')
    aligner = choose_aligner()
    if aligner == 'hisat2':
        bowtie2bam = os.path.join(tmpdir, 'hisat2.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building Hisat2 index of " + "{0:,}".format(lib.countfasta(input)) + " trinity transcripts")
            cmd = ['hisat2-build', input, os.path.join(tmpdir, 'hisat2.transcripts')]
            lib.runSubprocess4(cmd, '.', lib.log)
            #now launch the aligner
            lib.log.info("Aligning reads to trinity transcripts with Hisat2")
            hisat2cmd = ['hisat2', '-p', str(args.cpus), '-k', '50',
                         '--max-intronlen', str(args.max_intronlen),
                         '-x', os.path.join(tmpdir, 'hisat2.transcripts')]
            if readTuple[2]:
                hisat2cmd = hisat2cmd + ['-U', readTuple[2]]
            if readTuple[0] and readTuple[1]:
                hisat2cmd = hisat2cmd + ['-1', readTuple[0], '-2', readTuple[1]]
            #the aligner command is handed to sam2bam.sh as one string
            cmd = [sam2bam, " ".join(hisat2cmd), str(bamthreads), bowtie2bam]
            lib.runSubprocess4(cmd, '.', lib.log)
    elif aligner == 'bowtie2':
        #using bowtie2
        bowtie2bam = os.path.join(tmpdir, 'bowtie2.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building Bowtie2 index of " + "{0:,}".format(lib.countfasta(input)) + " trinity transcripts")
            cmd = ['bowtie2-build', input, os.path.join(tmpdir, 'bowtie2.transcripts')]
            lib.runSubprocess4(cmd, '.', lib.log)
            #now launch the subprocess commands in order
            lib.log.info("Aligning reads to trinity transcripts with Bowtie2")
            bowtie2cmd = ['bowtie2', '-p', str(args.cpus), '-k', '50', '--local', '--no-unal',
                          '-x', os.path.join(tmpdir, 'bowtie2.transcripts')]
            if readTuple[2]:
                bowtie2cmd = bowtie2cmd + ['-U', readTuple[2]]
            if readTuple[0] and readTuple[1]:
                bowtie2cmd = bowtie2cmd + ['-1', readTuple[0], '-2', readTuple[1]]
            cmd = [sam2bam, " ".join(bowtie2cmd), str(bamthreads), bowtie2bam]
            lib.runSubprocess4(cmd, '.', lib.log)
    elif aligner == 'rapmap':
        #using rapmap; this branch always passes readTuple[0] and readTuple[1]
        bowtie2bam = os.path.join(tmpdir, 'rapmap.transcripts.coordSorted.bam')
        if not os.path.isfile(bowtie2bam):
            lib.log.info("Building RapMap index of " + "{0:,}".format(lib.countfasta(input)) + " trinity transcripts")
            cmd = ['rapmap', 'quasiindex', '-t', input, '-i', os.path.join(tmpdir, 'rapmap_index')]
            lib.runSubprocess4(cmd, '.', lib.log)
            #now launch the subprocess commands in order
            lib.log.info("Aligning reads to trinity transcripts with RapMap")
            rapmapcmd = ['rapmap', 'quasimap', '-t', str(args.cpus),
                         '-i', os.path.join(tmpdir, 'rapmap_index'),
                         '-1', readTuple[0], '-2', readTuple[1]]
            cmd = [sam2bam, " ".join(rapmapcmd), str(bamthreads), bowtie2bam]
            lib.runSubprocess(cmd, '.', lib.log)
    else:
        #without this the code below would fail with a NameError on bowtie2bam
        lib.log.error("Unsupported aligner returned by choose_aligner(): %s" % aligner)
        raise SystemExit(1)
    #now run Trinity examine strandeness tool
    lib.log.info("Examining strand specificity")
    cmd = [os.path.join(TRINITY, 'util', 'misc', 'examine_strand_specificity.pl'),
           bowtie2bam, os.path.join(tmpdir, 'strand_specific')]
    lib.runSubprocess(cmd, '.', lib.log)
    #parse output dat file and get list of transcripts to remove
    removeList = []
    with open(os.path.join(tmpdir, 'strand_specific.dat'), 'rU') as infile:
        for line in infile:
            line = line.replace('\n', '')
            if line.startswith('#'):
                continue
            cols = line.split('\t')
            if len(cols) < 5:
                #blank or truncated line, there is no ratio column to test
                continue
            if args.stranded == 'RF':
                #then we want to keep negative ratios in cols[4]
                if not cols[4].startswith('-'):
                    removeList.append(cols[0])
            elif args.stranded == 'FR':
                #keep + values
                if cols[4].startswith('-'):
                    removeList.append(cols[0])
    #now parse the input fasta file removing records in list
    #membership is tested against a set so each lookup is constant time
    antisense = set(removeList)
    with open(output, 'w') as outfile:
        with open(input, 'rU') as infile:
            for record in SeqIO.parse(infile, 'fasta'):
                if record.id not in antisense:
                    outfile.write(">%s\n%s\n" % (record.description, str(record.seq)))
    lib.log.info("Removing %i antisense transcripts" % (len(removeList)))
break else: lib.log.error( "No species name given will cause problems downstream, please pass a name to -s,--species" ) sys.exit(1) else: organism = args.species if not args.isolate: isolate = '???' else: isolate = args.isolate ############################################################################ #start workflow here ProtCount = lib.countfasta(Proteins) lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded') #run PFAM-A search lib.log.info("Running HMMer search of PFAM domains") pfam_results = os.path.join(outputdir, 'annotate_misc', 'annotations.pfam.txt') if not lib.checkannotations(pfam_results): lib.PFAMsearch(Proteins, args.cpus, 1e-50, os.path.join(outputdir, 'annotate_misc'), pfam_results) num_annotations = lib.line_count(pfam_results) lib.log.info('{0:,}'.format(num_annotations) + ' annotations added') #run SwissProt Blast search lib.log.info("Running Blastp search of UniProt DB") blast_out = os.path.join(outputdir, 'annotate_misc', 'annotations.swissprot.txt') if not lib.checkannotations(blast_out):
#set up logging and record the exact command line at debug level
lib.setupLogging(log_name)
#handle on the null device
#NOTE(review): presumably used later to discard subprocess output -- it is not used in this section
FNULL = open(os.devnull, 'w')
cmd_args = " ".join(sys.argv)+'\n'
lib.log.debug(cmd_args)
print "-------------------------------------------------------"
lib.SystemInfo()

#get version of funannotate
version = lib.get_version()
lib.log.info("Running %s" % version)

#check buscos, download if necessary
#(the lineage is expected as a folder named args.busco_db under parentdir/DB)
if not os.path.isdir(os.path.join(parentdir, 'DB', args.busco_db)):
    lib.download_buscos(args.busco_db)

#report how many records the input fasta contains
ProtCount = lib.countfasta(args.input)
lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded')

#convert to proteins and screen with busco
lib.log.info("Looking for BUSCO models with %s DB" % args.busco_db)
BUSCODB = os.path.join(parentdir, 'DB', args.busco_db)
BUSCO = os.path.join(parentdir, 'util', 'funannotate-BUSCO2.py')
#the bundled BUSCO script is run with the current interpreter, in proteins mode,
#from the current directory
cmd = [sys.executable, BUSCO, '-i', os.path.abspath(args.input), '-m', 'proteins', '--lineage', BUSCODB, '-o', species, '--cpu', str(args.cpus), '-f']
lib.runSubprocess(cmd, '.', lib.log)

#check that it ran correctly
#(the results table is looked for at run_<species>/full_table_<species>.tsv)
busco_results = os.path.join('run_'+species, 'full_table_'+species+'.tsv')
if not lib.checkannotations(busco_results):
    lib.log.error("BUSCO failed, check logfile")
    sys.exit(1)
nameChange = {}
description='''Script that sorts input by length and then renames contig headers.''', epilog="""Written by Jon Palmer (2016) [email protected]""", formatter_class = MyFormatter) parser.add_argument('-i','--input', required=True, help='Multi-fasta genome file') parser.add_argument('-o','--out', required=True, help='Cleaned output (FASTA)') parser.add_argument('-b','--base', default='scaffold', help='Basename of contig header') args=parser.parse_args() def SortRenameHeaders(input, basename, output): #sort records and write temp file with open(output, 'w') as output: with open(input, 'rU') as input: records = list(SeqIO.parse(input, 'fasta')) records.sort(cmp=lambda x,y: cmp(len(y),len(x))) counter = 1 for rec in records: rec.name = '' rec.description = '' rec.id = basename + '_' + str(counter) if len(rec.id) > 16: print "Error. Fasta header too long %s. Choose a different --base name. Max is 16 characters" % rec.id os._exit(1) counter +=1 SeqIO.write(records, output, 'fasta') Count = lib.countfasta(args.input) print('{0:,}'.format(Count) + ' contigs records loaded') print("Sorting and renaming contig headers") SortRenameHeaders(args.input, args.base, args.out)
organism = f.qualifiers.get("organism", ["???"])[0] if not args.isolate: isolate = f.qualifiers.get("isolate", ["???"])[0] else: isolate = args.isolate break else: organism = args.species if not args.isolate: isolate = '???' else: isolate = args.isolate ############################################################################ #start workflow here ProtCount = lib.countfasta(Proteins) lib.log.info('{0:,}'.format(ProtCount) + ' protein records loaded') #run PFAM-A search lib.log.info("Running HMMer search of PFAM domains") pfam_results = os.path.join(outputdir, 'annotate_misc', 'annotations.pfam.txt') if not lib.checkannotations(pfam_results): lib.PFAMsearch(Proteins, args.cpus, 1e-50, os.path.join(outputdir, 'annotate_misc'), pfam_results) num_annotations = lib.line_count(pfam_results) lib.log.info('{0:,}'.format(num_annotations) + ' annotations added') #run SwissProt Blast search lib.log.info("Running Blastp search of UniProt DB") blast_out = os.path.join(outputdir, 'annotate_misc', 'annotations.swissprot.txt') if not lib.checkannotations(blast_out): lib.SwissProtBlast(Proteins, args.cpus, 1e-5, os.path.join(outputdir, 'annotate_misc'), blast_out) num_annotations = lib.line_count(blast_out)
def runPASAtrain(genome, transcripts, stranded, intronlen, cpus, dbname, output):
    '''
    function will run PASA align assembly and then choose best gene models for training

    genome:      genome fasta, passed to PASA with -g
    transcripts: transcript fasta, passed to PASA with -t
    stranded:    any value other than 'no' adds --transcribed_is_aligned_orient
    intronlen:   value for --MAX_INTRON_LENGTH
    cpus:        PASA gets half of these (integer division) when cpus > 2, otherwise 2
    dbname:      PASA database name, '-' is replaced with '_'
    output:      path the training GFF3 is copied to

    the PASA alignment step is skipped when <dbname>.assemblies.fasta already
    exists in the pasa folder under tmpdir. training models come from the
    TransDecoder programs when both are found by lib.which, otherwise from
    the PASA pasa_asmbls_to_training_set.dbi script.
    '''
    #floor division so --CPU stays an integer under python3 as well
    if cpus > 2:
        pasa_cpus = cpus // 2
    else:
        pasa_cpus = 2
    #create tmpdir
    folder = os.path.join(tmpdir, 'pasa')
    if not os.path.isdir(folder):
        os.makedirs(folder)
    #get config files and edit
    alignConfig = os.path.join(folder, 'alignAssembly.txt')
    pasaDBname = dbname.replace('-', '_')
    template = os.path.join(PASA, 'pasa_conf', 'pasa.alignAssembly.Template.txt')
    with open(alignConfig, 'w') as config1:
        with open(template, 'rU') as template1:
            for line in template1:
                line = line.replace('<__MYSQLDB__>', pasaDBname)
                config1.write(line)
    assemblies = os.path.join(folder, pasaDBname + '.assemblies.fasta')
    if not os.path.isfile(assemblies):
        #now run first PASA step, note this will dump any database with same name
        lib.log.info(
            "Running PASA alignment step using {:,} transcripts".format(
                lib.countfasta(transcripts)))
        cmd = [
            os.path.join(PASA, 'scripts', 'Launch_PASA_pipeline.pl'), '-c',
            os.path.abspath(alignConfig), '-r', '-C', '-R', '-g',
            os.path.abspath(genome), '--ALIGNERS', 'blat,gmap', '-t',
            os.path.abspath(transcripts), '--stringent_alignment_overlap',
            #str() because every argv element has to be a string
            str(args.pasa_alignment_overlap), '--TRANSDECODER',
            '--MAX_INTRON_LENGTH',
            str(intronlen), '--CPU',
            str(pasa_cpus)
        ]
        if stranded != 'no':
            cmd = cmd + ['--transcribed_is_aligned_orient']
        lib.runSubprocess(cmd, folder, lib.log)
    else:
        lib.log.info('Existing PASA assemblies found {:}'.format(assemblies))
    #generate TSV gene-transcripts
    numLoci = getPASAtranscripts2genes(
        os.path.join(folder, pasaDBname + '.pasa_assemblies.gff3'),
        os.path.join(folder, 'pasa.gene2transcripts.tsv'))
    numTranscripts = lib.countfasta(assemblies)
    lib.log.info(
        "Assigned {:,} transcipts to {:,} loci using {:}% overlap threshold".
        format(numTranscripts, numLoci, args.pasa_alignment_overlap))
    lib.log.info("Getting PASA models for training with TransDecoder")
    pasa_training_gff = os.path.join(
        folder, pasaDBname + '.assemblies.fasta.transdecoder.genome.gff3')
    #file names below are relative because the commands are run from inside folder
    if lib.which('TransDecoder.LongOrfs') and lib.which(
            'TransDecoder.Predict'):
        cmd = [
            'TransDecoder.LongOrfs', '-t', pasaDBname + '.assemblies.fasta',
            '--gene_trans_map', 'pasa.gene2transcripts.tsv'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = [
            'TransDecoder.Predict', '-t', pasaDBname + '.assemblies.fasta',
            '--single_best_only'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
        cmd = [
            os.path.join(PASA, 'pasa-plugins', 'transdecoder',
                         'cdna_alignment_orf_to_genome_orf.pl'),
            pasaDBname + '.assemblies.fasta.transdecoder.gff3',
            pasaDBname + '.pasa_assemblies.gff3',
            pasaDBname + '.assemblies.fasta'
        ]
        #NOTE(review): runSubprocess2 is handed pasa_training_gff, presumably
        #the file the command output is captured to -- confirm in lib
        lib.runSubprocess2(cmd, folder, lib.log, pasa_training_gff)
    else:
        cmd = [
            os.path.join(PASA, 'scripts', 'pasa_asmbls_to_training_set.dbi'),
            '--pasa_transcripts_fasta', pasaDBname + '.assemblies.fasta',
            '--pasa_transcripts_gff3', pasaDBname + '.pasa_assemblies.gff3'
        ]
        lib.runSubprocess(cmd, folder, lib.log)
    #grab final result
    shutil.copyfile(pasa_training_gff, output)