Example #1
0
def run_annotation(infile,
                   prefix=None,
                   ident=70,
                   threads=4,
                   kingdom='bacteria',
                   trusted=None,
                   trusted_format='uniprot',
                   callback=None,
                   **kwargs):
    """
    Annotate nucleotide sequences (usually a draft assembly with contigs).

    ORFs are predicted with prodigal and then blasted, in order, against the
    'IS', 'amr', optional trusted and 'sprot_<kingdom>' protein databases.
    An ORF assigned by one database is not searched against later ones.
    For kingdom 'bacteria' the ORFs still unassigned are written to
    'unknowns.fa' in the working directory and passed to hmmer. aragorn is
    then run on the input file and its results are added. Any ORF still
    without an annotation is labelled 'hypothetical protein'.

    This function does not write a genbank file itself; it returns the
    records so the caller can do so.

    Args:
        infile: input fasta file
        prefix: optional prefix for locus_tags; when None one is created
            from infile with create_locus_tag
        ident: percent identity threshold for blast results, default 70
            (hits must be strictly greater than this value)
        threads: cpu threads to use
        kingdom: 'bacteria', 'viruses' or 'archaea'
        trusted: optional fasta file of trusted protein sequences for blast
        trusted_format: not used inside this function
        callback: not used inside this function
        **kwargs: passed through to tools.blast_sequences
    returns:
        a dataframe of annotations and a list of SeqRecords with the
        features, one per contig
    """

    #get simple name for contig by dropping the last '_' separated field
    def get_contig(x):
        return ('_').join(x.split('_')[:-1])

    if prefix is None:
        prefix = create_locus_tag(infile)
    sprot = 'sprot_%s' % kingdom
    #databases that live in prokkadbdir as '<name>.fa'
    bundled = ('IS', 'amr', sprot)
    #databases and their e-value cutoffs, searched in this order
    dbs = ['IS', 'amr', trusted, sprot]
    evalues = [1e-10, 1e-100, 1e-4, 1e-4]

    #run prodigal
    resfile = prodigal(infile)
    #read in prodigal fasta to dataframe
    df = tools.fasta_to_dataframe(resfile)
    df[['start', 'end',
        'strand']] = df.description.apply(get_prodigal_coords, 1)
    df['feat_type'] = 'CDS'
    df['contig'] = df['name'].apply(get_contig)
    #remove trailing asterisks
    df['sequence'] = df.sequence.str.rstrip('*')
    #get target seqs
    seqs = list(SeqIO.parse(resfile, 'fasta'))
    #remove temp prodigal file
    os.remove(resfile)
    #read input file nucleotide seqs
    contigs = SeqIO.to_dict(SeqIO.parse(infile, 'fasta'))

    hits = []
    #pairing each db with its cutoff avoids keeping a manual index in step
    for db, evalue in zip(dbs, evalues):
        if db is None:
            continue
        #NOTE(review): this is also called for the trusted file path;
        #confirm fetch_sequence_from_url tolerates non-bundled names
        fetch_sequence_from_url(db, path=prokkadbdir)
        #make blast db of prokka proteins
        if db in bundled:
            dbname = os.path.join(prokkadbdir, '%s.fa' % db)
        else:
            dbname = db
        tools.make_blast_database(dbname, dbtype='prot')
        print('blasting %s ORFs to %s' % (len(seqs), db))
        bl = tools.blast_sequences(dbname,
                                   seqs,
                                   maxseqs=1,
                                   evalue=evalue,
                                   cmd='blastp',
                                   show_cmd=True,
                                   threads=threads,
                                   **kwargs)
        #copy so the column assignments below act on an independent frame
        bl = bl[bl.pident > ident].copy()
        if bl.empty:
            continue
        #get hit info from header (prokka format), falling back to
        #uniprot style headers when that parse fails
        try:
            bl[['protein_id', 'gene', 'product',
                'cog']] = bl.stitle.apply(prokka_header_info, 1)
        except Exception:
            bl[['protein_id', 'gene', 'product',
                'cog']] = bl.stitle.apply(uniprot_header_info, 1)
        cols = [
            'qseqid', 'sseqid', 'pident', 'sstart', 'send', 'protein_id',
            'gene', 'product'
        ]
        #keep a single row per query, the first after the descending sort
        bl = bl.sort_values(['qseqid', 'pident'],
                            ascending=False).drop_duplicates(['qseqid'])[cols]

        #merge blast result with prodigal sequences
        found = df.merge(bl, left_on='name', right_on='qseqid', how='right')
        #get remaining sequences with no hits to this db
        df = df[~df.name.isin(bl.qseqid)]
        #new sequences to blast in the next iteration
        seqs = tools.dataframe_to_seqrecords(df, idkey='name')
        hits.append(found)
        print('%s sequences unassigned' % len(df))

    #all results together; pd.concat raises on an empty list, so fall back
    #to an empty frame with the prodigal columns when nothing was assigned
    if hits:
        res = pd.concat(hits)
    else:
        res = df.iloc[0:0].copy()

    #-------------------------------------------------------
    #run hmmer on unassigned
    if kingdom == 'bacteria':
        print('running hmmer')
        #write unknowns out
        SeqIO.write(seqs, 'unknowns.fa', 'fasta')
        hmmdf = hmmer('unknowns.fa', threads=threads)
        if hmmdf is not None:
            res = pd.concat([res, hmmdf], sort=False)

    #get tRNAs with aragorn
    print('running aragorn')
    arag = aragorn(infile)
    res = pd.concat([res, arag], sort=False)

    #remaining unknowns are hypothetical proteins
    unknown = df[~df.name.isin(res.name)].copy()
    unknown['product'] = 'hypothetical protein'
    res = pd.concat([res, unknown], sort=False)

    #post process dataframe
    res['translation'] = res.sequence
    res['length'] = res.sequence.str.len()
    res = res.reset_index(drop=True)

    #-------------------------------------------------------
    #we then write all found sequences to seqrecord/features
    locus_number = 1  #counter for assigning locus tags, shared by all contigs
    recs = []
    qcols = ['gene', 'product', 'locus_tag', 'translation', 'length']
    #group by contig and get features for each protein found
    for contig_name, group in res.groupby('contig'):
        #truncated label for writing to genbank
        label = ('_').join(contig_name.split('_')[:2])
        nucseq = contigs[contig_name].seq
        rec = SeqRecord(nucseq, annotations={"molecule_type": "DNA"})
        rec.id = label
        rec.name = label
        rec.COMMENT = 'annotated with pathogenie'
        group = group.sort_values('start')
        for _, row in group.iterrows():
            row['locus_tag'] = '{p}_{l:05d}'.format(p=prefix, l=locus_number)
            #drop missing values so they do not become qualifiers
            row = row.dropna()
            present = [q for q in qcols if q in row.index]
            quals = row[present].to_dict()
            #coordinates can come back as floats once frames with missing
            #values have been concatenated, so cast them to plain ints.
            #strand is carried by the location only; a separate strand
            #argument to SeqFeature is redundant.
            location = FeatureLocation(int(row.start), int(row.end),
                                       int(row.strand))
            feat = SeqFeature(location,
                              type=row.feat_type,
                              qualifiers=quals)
            rec.features.append(feat)
            locus_number += 1
        recs.append(rec)
    print('done')
    return res, recs