Python SeqRecord.COMMENT Examples

Programming Language: Python

Namespace/Package Name: Bio.SeqRecord

Class/Type: SeqRecord

Method/Function: COMMENT

Examples at hotexamples.com: 1

Python SeqRecord.COMMENT - 1 example found. This is the top-rated real-world Python example of Bio.SeqRecord.SeqRecord.COMMENT extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

SeqRecord(30)

annotations(30)

__init__(8)

_al_start(4)

_al_stop(4)

__add__(2)

COMMENT(1)

replace(1)

populate_attribs(1)

direction(1)

attributes(1)

_set_seq(1)

add_conservation_features(1)

accession(1)

__repr__(1)

__getitem__(1)

split(1)

Example #1

Show file

File: app.py Project: WenchaoLin/pathogenie

def run_annotation(infile,
                   prefix=None,
                   ident=70,
                   threads=4,
                   kingdom='bacteria',
                   trusted=None,
                   trusted_format='uniprot',
                   callback=None,
                   **kwargs):
    """
    Annotate nucleotide sequences (usually a draft assembly with contigs)
    using prodigal and blast to prokka seqs.

    ORFs predicted by prodigal are blasted against a series of protein
    databases in turn; ORFs with a hit are removed from the query set before
    the next database is searched. For bacteria the still-unassigned ORFs are
    passed to hmmer, aragorn is run on the input file, and any ORF left
    without an assignment is labelled 'hypothetical protein'.

    This function does not itself write a genbank file; it returns the
    records so the caller can do so.

    Side effects: deletes the temporary prodigal output file, writes
    'unknowns.fa' to the current working directory when kingdom is
    'bacteria', and prints progress messages.

    Args:
        infile: input fasta file
        prefix: optional prefix for locus_tags; when None one is derived
            from infile with create_locus_tag
        ident: minimum percent identity for blast results, default 70
        threads: cpu threads to use
        kingdom: 'bacteria', 'viruses' or 'archaea'
        trusted: optional fasta file of trusted protein sequences for blast
        trusted_format: accepted for API compatibility; not used here
        callback: accepted for API compatibility; not used here
        **kwargs: passed through to tools.blast_sequences
    returns:
        a dataframe of annotations and a list of SeqRecords with the
        features, one per contig
    """

    #get simple name for contig
    def get_contig(x):
        return '_'.join(x.split('_')[:-1])

    if prefix is None:
        prefix = create_locus_tag(infile)
    sprot = 'sprot_%s' % kingdom
    #databases bundled in prokkadbdir; anything else is used as a path as-is
    bundled_dbs = ['IS', 'amr', sprot]
    #each database is paired with its own e-value cutoff so that skipping
    #a database can never shift the cutoffs out of step
    searches = [('IS', 1e-10), ('amr', 1e-100), (trusted, 1e-4),
                (sprot, 1e-4)]

    #run prodigal
    resfile = prodigal(infile)
    #read in prodigal fasta to dataframe
    df = tools.fasta_to_dataframe(resfile)
    df[['start', 'end',
        'strand']] = df.description.apply(get_prodigal_coords)
    df['feat_type'] = 'CDS'
    df['contig'] = df['name'].apply(get_contig)
    #remove trailing asterisks
    df['sequence'] = df.sequence.str.rstrip('*')
    #get target seqs
    seqs = list(SeqIO.parse(resfile, 'fasta'))
    #remove temp prodigal file
    os.remove(resfile)
    #read input file nucleotide seqs
    contigs = SeqIO.to_dict(SeqIO.parse(infile, 'fasta'))

    hit_cols = ['protein_id', 'gene', 'product', 'cog']
    keep_cols = [
        'qseqid', 'sseqid', 'pident', 'sstart', 'send', 'protein_id',
        'gene', 'product'
    ]
    res = []
    for db, evalue in searches:
        if db is None:
            continue
        fetch_sequence_from_url(db, path=prokkadbdir)
        #make blast db of prokka proteins
        if db in bundled_dbs:
            dbname = os.path.join(prokkadbdir, '%s.fa' % db)
        else:
            dbname = db
        tools.make_blast_database(dbname, dbtype='prot')
        print('blasting %s ORFs to %s' % (len(seqs), db))
        bl = tools.blast_sequences(dbname,
                                   seqs,
                                   maxseqs=1,
                                   evalue=evalue,
                                   cmd='blastp',
                                   show_cmd=True,
                                   threads=threads,
                                   **kwargs)
        #copy so the column assignments below act on an independent frame
        bl = bl[bl.pident > ident].copy()
        if len(bl) == 0:
            continue
        #get hit info from header (prokka format), falling back to the
        #uniprot parser when the prokka parser fails
        try:
            bl[hit_cols] = bl.stitle.apply(prokka_header_info)
        except Exception:
            bl[hit_cols] = bl.stitle.apply(uniprot_header_info)

        #keep one hit per query, preferring the highest identity
        bl = bl.sort_values(['qseqid', 'pident'],
                            ascending=False).drop_duplicates(['qseqid'])
        bl = bl[keep_cols]

        #merge blast result with prodigal sequences
        found = df.merge(bl, left_on='name', right_on='qseqid', how='right')
        #get remaining sequences with no hits to this db
        df = df[~df.name.isin(bl.qseqid)]
        #new sequences to blast in the next iteration
        seqs = tools.dataframe_to_seqrecords(df, idkey='name')
        res.append(found)
        print('%s sequences unassigned' % len(df))

    #all results together; pd.concat raises on an empty list, so fall back
    #to an empty frame with the ORF columns when nothing was assigned
    if res:
        res = pd.concat(res)
    else:
        res = pd.DataFrame(columns=df.columns)

    #-------------------------------------------------------
    #run hmmer on unassigned
    if kingdom == 'bacteria':
        print('running hmmer')
        #write unknowns out
        SeqIO.write(seqs, 'unknowns.fa', 'fasta')
        hmmdf = hmmer('unknowns.fa', threads=threads)
        if hmmdf is not None:
            res = pd.concat([res, hmmdf], sort=False)

    #get tRNAs with aragorn
    print('running aragorn')
    arag = aragorn(infile)
    res = pd.concat([res, arag], sort=False)

    #remaining unknowns are hypothetical proteins
    unknown = df[~df.name.isin(res.name)].copy()
    unknown['product'] = 'hypothetical protein'
    res = pd.concat([res, unknown], sort=False)

    #post process dataframe
    res['translation'] = res.sequence
    res['length'] = res.sequence.str.len()
    res = res.reset_index(drop=True)

    #-------------------------------------------------------
    #we then write all found sequences to seqrecord/features
    locus_number = 1  #counter for assigning locus tags
    recs = []
    qcols = ['gene', 'product', 'locus_tag', 'translation', 'length']
    comment = 'annotated with pathogenie'
    #group by contig and get features for each protein found
    for c, group in res.groupby('contig'):
        #truncated label for writing to genbank
        label = '_'.join(c.split('_')[:2])
        nucseq = contigs[c].seq
        #the comment goes in annotations so that Biopython writers can see it
        rec = SeqRecord(nucseq,
                        annotations={
                            "molecule_type": "DNA",
                            "comment": comment
                        })
        rec.id = label
        rec.name = label
        #kept for backward compatibility with code reading this attribute
        rec.COMMENT = comment
        for _, row in group.sort_values('start').iterrows():
            row['locus_tag'] = '{p}_{l:05d}'.format(p=prefix, l=locus_number)
            #drop missing values so they do not become qualifiers
            row = row.dropna()
            quals = row[[q for q in qcols if q in row.index]].to_dict()
            #strand is carried by the location; it is not passed to
            #SeqFeature as well, since that argument is redundant
            feat = SeqFeature(FeatureLocation(row.start, row.end, row.strand),
                              type=row.feat_type,
                              qualifiers=quals)
            rec.features.append(feat)
            locus_number += 1
        recs.append(rec)
    print('done')
    return res, recs