def run_annotation(infile, prefix=None, ident=70, threads=4,
                   kingdom='bacteria', trusted=None, trusted_format='uniprot',
                   callback=None, **kwargs):
    """
    Annotate nucleotide sequences (usually a draft assembly with contigs)
    using prodigal to call ORFs and blast against the prokka sequence
    databases, then hmmer (bacteria only) and aragorn.

    Side effects: the temporary prodigal output file is removed, and when
    kingdom is 'bacteria' the unassigned ORFs are written to 'unknowns.fa'
    in the current working directory.

    Args:
        infile: input fasta file
        prefix: optional prefix for locus_tags; derived from infile with
            create_locus_tag when not given
        ident: minimum percent identity for blast results, default 70
        threads: cpu threads to use
        kingdom: 'bacteria', 'viruses' or 'archaea'
        trusted: optional fasta file of trusted protein sequences for blast
        trusted_format: accepted for compatibility, not used here
        callback: accepted for compatibility, not used here
        kwargs: passed through to tools.blast_sequences
    returns:
        a dataframe of annotations and a list of SeqRecords with the
        features, one per contig
    """

    #get simple name for contig (drop the trailing _<orf number>)
    def get_contig(x):
        return '_'.join(x.split('_')[:-1])

    if prefix is None:
        prefix = create_locus_tag(infile)
    sprot = 'sprot_%s' % kingdom
    builtin_dbs = ('IS', 'amr', sprot)
    #databases are searched in this order, each with its own e-value cutoff
    dbs = ['IS', 'amr', trusted, sprot]
    evalues = [1e-10, 1e-100, 1e-4, 1e-4]

    #run prodigal
    resfile = prodigal(infile)
    try:
        #read in prodigal fasta to dataframe
        df = tools.fasta_to_dataframe(resfile)
        #get target seqs
        seqs = list(SeqIO.parse(resfile, 'fasta'))
    finally:
        #remove temp prodigal file even if parsing failed
        if os.path.exists(resfile):
            os.remove(resfile)

    df[['start', 'end', 'strand']] = df.description.apply(get_prodigal_coords, 1)
    df['feat_type'] = 'CDS'
    df['contig'] = df['name'].apply(get_contig)
    #remove trailing asterisks (stop codon marker)
    df['sequence'] = df.sequence.str.rstrip('*')

    #read input file nucleotide seqs
    contigs = SeqIO.to_dict(SeqIO.parse(infile, 'fasta'))

    res = []
    #zip keeps each db paired with its e-value without a manual counter
    for db, evalue in zip(dbs, evalues):
        if db is None:
            continue
        # NOTE(review): this is also called when db is the user supplied
        # trusted file path - confirm fetch_sequence_from_url copes with that
        fetch_sequence_from_url(db, path=prokkadbdir)
        #make blast db of prokka proteins
        if db in builtin_dbs:
            dbname = os.path.join(prokkadbdir, '%s.fa' % db)
        else:
            dbname = db
        tools.make_blast_database(dbname, dbtype='prot')
        print('blasting %s ORFs to %s' % (len(seqs), db))
        bl = tools.blast_sequences(dbname, seqs, maxseqs=1, evalue=evalue,
                                   cmd='blastp', show_cmd=True,
                                   threads=threads, **kwargs)
        #copy so the column assignments below don't act on a slice view
        bl = bl[bl.pident > ident].copy()
        if bl.empty:
            continue
        #get hit info from header (prokka format), else fall back to uniprot
        info_cols = ['protein_id', 'gene', 'product', 'cog']
        try:
            bl[info_cols] = bl.stitle.apply(prokka_header_info, 1)
        except Exception:
            bl[info_cols] = bl.stitle.apply(uniprot_header_info, 1)

        cols = ['qseqid', 'sseqid', 'pident', 'sstart', 'send',
                'protein_id', 'gene', 'product']
        #keep a single hit per query
        bl = bl.sort_values(['qseqid', 'pident'],
                            ascending=False).drop_duplicates(['qseqid'])[cols]
        #merge blast result with prodigal sequences
        found = df.merge(bl, left_on='name', right_on='qseqid', how='right')
        #get remaining sequences with no hits to this db
        df = df[~df.name.isin(bl.qseqid)]
        #new sequences to blast in the next iteration
        seqs = tools.dataframe_to_seqrecords(df, idkey='name')
        res.append(found)
        print('%s sequences unassigned' % len(df))

    #all results together; pd.concat raises on an empty list so fall back
    #to an empty frame with the prodigal columns when nothing was assigned
    if res:
        res = pd.concat(res)
    else:
        res = df.iloc[0:0].copy()

    #-------------------------------------------------------
    #run hmmer on unassigned
    if kingdom == 'bacteria':
        print('running hmmer')
        #write unknowns out
        SeqIO.write(seqs, 'unknowns.fa', 'fasta')
        hmmdf = hmmer('unknowns.fa', threads=threads)
        if hmmdf is not None:
            res = pd.concat([res, hmmdf], sort=False)

    #get tRNAs with aragorn
    print('running aragorn')
    arag = aragorn(infile)
    res = pd.concat([res, arag], sort=False)

    #remaining unknowns are hypothetical proteins
    unknown = df[~df.name.isin(res.name)].copy()
    unknown['product'] = 'hypothetical protein'
    res = pd.concat([res, unknown], sort=False)

    #post process dataframe
    res['translation'] = res.sequence
    res['length'] = res.sequence.str.len()
    res = res.reset_index(drop=True)

    #-------------------------------------------------------
    #we then write all found sequences to seqrecord/features
    locus_number = 1  #counter for assigning locus tags, runs across contigs
    recs = []
    qcols = ['gene', 'product', 'locus_tag', 'translation', 'length']
    #group by contig and get features for each protein found
    for contig_id, group in res.groupby('contig'):
        #truncated label for writing to genbank
        label = '_'.join(contig_id.split('_')[:2])
        nucseq = contigs[contig_id].seq
        rec = SeqRecord(nucseq, annotations={"molecule_type": "DNA"})
        rec.id = label
        rec.name = label
        # NOTE(review): plain attribute on the record - confirm whether this
        # was meant to go into rec.annotations so it reaches the output file
        rec.COMMENT = 'annotated with pathogenie'
        group = group.sort_values('start')
        for _, row in group.iterrows():
            row['locus_tag'] = '{p}_{l:05d}'.format(p=prefix, l=locus_number)
            #drop missing values so only real qualifiers are written
            row = row.dropna()
            cols = [q for q in qcols if q in row.index]
            quals = row[cols].to_dict()
            #plain ints for the coordinates as pandas hands back numpy
            #scalars; the strand lives on the location so the redundant
            #(deprecated in recent Biopython) SeqFeature strand keyword
            #is not passed
            location = FeatureLocation(int(row.start), int(row.end),
                                       row.strand)
            feat = SeqFeature(location, type=row.feat_type, qualifiers=quals)
            rec.features.append(feat)
            locus_number += 1
        recs.append(rec)
    print('done')
    return res, recs