Ejemplo n.º 1
0
def parse_annotation_from_column_9(col9):
    """
    Build a bioannotation.FunctionalAnnotation from a column-9 attribute string.

    The string is converted to a mapping with column_9_dict() and each key is
    handled as follows:

      product_name  -> stored on the annotation's product_name
      gene_symbol   -> stored on the annotation's gene_symbol
      Dbxref        -> values starting with "EC" become ECAnnotation objects,
                       everything else is passed to add_dbxref()
      Ontology_term -> values starting with "GO" become GOAnnotation objects,
                       any other term is dropped
      ID, Parent    -> ignored
      anything else -> copied verbatim into other_attributes

    Dbxref and Ontology_term may each hold either a single string or an
    iterable of strings.

    Returns the populated FunctionalAnnotation.
    """
    annotation = bioannotation.FunctionalAnnotation()
    attributes = column_9_dict(col9)

    # Keys that can appear in column 9 but carry no annotation information.
    ignored = ('ID', 'Parent')

    for key in attributes:
        value = attributes[key]

        if key == 'product_name':
            annotation.product_name = value
            continue

        if key == 'gene_symbol':
            annotation.gene_symbol = value
            continue

        if key == 'Dbxref':
            # A single cross-reference arrives as a bare string; wrap it so
            # both shapes go through the same loop.
            entries = [value] if isinstance(value, str) else value
            ec_numbers = []

            for entry in entries:
                if entry.startswith("EC"):
                    ec_numbers.append(entry)
                else:
                    annotation.add_dbxref(entry)

            # EC annotations are attached only after every entry was examined.
            for number in ec_numbers:
                annotation.add_ec_number(
                    bioannotation.ECAnnotation(number=number))
            continue

        if key == 'Ontology_term':
            terms = [value] if isinstance(value, str) else value
            go_ids = [term for term in terms if term.startswith("GO")]

            for go_id in go_ids:
                annotation.add_go_annotation(
                    bioannotation.GOAnnotation(go_id=go_id))
            continue

        if key not in ignored:
            # Keep any attribute we have no dedicated handling for.
            annotation.other_attributes[key] = value

    return annotation
Ejemplo n.º 2
0
def get_hmmdb_ec_nums(acc, c):
    """
    Look up the EC ids linked to the HMM with accession `acc`.

    `c` is a database cursor: the query is executed on it with `acc` bound to
    the single `?` placeholder, and the cursor is then iterated for results.

    Returns a list of bioannotation.ECAnnotation objects, one per row returned
    (an empty list when no row matches).
    """
    sql = "SELECT he.ec_id FROM hmm_ec he JOIN hmm ON he.hmm_id=hmm.id WHERE hmm.accession = ?"
    c.execute(sql, (acc,))

    # The first (only) selected column of each row is the EC identifier.
    return [bioannotation.ECAnnotation(number=record[0]) for record in c]
Ejemplo n.º 3
0
def get_uniref_ec_nums(acc, c):
    """
    Look up the EC numbers stored in the uniref_ec table for id `acc`.

    `c` is a database cursor: the query is executed on it with `acc` bound to
    the single `?` placeholder, and the cursor is then iterated for results.

    Returns a list of bioannotation.ECAnnotation objects, one per row returned
    (an empty list when no row matches).
    """
    sql = """
          SELECT us_ec.ec_num
            FROM uniref_ec us_ec
           WHERE us_ec.id = ?
          """
    c.execute(sql, (acc,))

    # The first (only) selected column of each row is the EC number.
    return [bioannotation.ECAnnotation(number=record[0]) for record in c]
Ejemplo n.º 4
0
def get_uspdb_ec_nums(acc, c):
    """
    Look up the EC numbers linked to the uniprot_sprot accession `acc`.

    The uniprot_sprot_ec table is joined to uniprot_sprot_acc on their shared
    id so the lookup can be done by accession.  `c` is a database cursor: the
    query is executed on it with `acc` bound to the single `?` placeholder,
    and the cursor is then iterated for results.

    Returns a list of bioannotation.ECAnnotation objects, one per row returned
    (an empty list when no row matches).
    """
    sql = """
          SELECT us_ec.ec_num
            FROM uniprot_sprot_ec us_ec
                 JOIN uniprot_sprot_acc us_acc ON us_ec.id=us_acc.id
           WHERE us_acc.accession = ?
          """
    c.execute(sql, (acc,))

    # The first (only) selected column of each row is the EC number.
    return [bioannotation.ECAnnotation(number=record[0]) for record in c]
Ejemplo n.º 5
0
def parse_annotation_from_column_9(col9):
    """
    Build a bioannotation.FunctionalAnnotation from a column-9 attribute string.

    The string is converted to a mapping with column_9_dict().  Only three
    keys are consulted:

      product_name  -> stored on the annotation's product_name
      Dbxref        -> values starting with "EC" become ECAnnotation objects;
                       other cross-references are ignored
      Ontology_term -> values starting with "GO" become GOAnnotation objects;
                       other terms are ignored

    Dbxref and Ontology_term may each hold either a single string or an
    iterable of strings.

    Returns the populated FunctionalAnnotation.
    """
    annotation = bioannotation.FunctionalAnnotation()
    attributes = column_9_dict(col9)

    if 'product_name' in attributes:
        annotation.product_name = attributes['product_name']

    if 'Dbxref' in attributes:
        dbxrefs = attributes['Dbxref']

        # A single cross-reference arrives as a bare string; wrap it so both
        # shapes are filtered the same way.
        if isinstance(dbxrefs, str):
            dbxrefs = [dbxrefs]

        ec_numbers = [ref for ref in dbxrefs if ref.startswith("EC")]

        for number in ec_numbers:
            annotation.add_ec_number(bioannotation.ECAnnotation(number=number))

    if 'Ontology_term' in attributes:
        terms = attributes['Ontology_term']

        if isinstance(terms, str):
            terms = [terms]

        go_ids = [term for term in terms if term.startswith("GO")]

        for go_id in go_ids:
            annotation.add_go_annotation(bioannotation.GOAnnotation(go_id=go_id))

    return annotation
Ejemplo n.º 6
0
def parse_kegg_blast_evidence(log_fh, polypeptides, blast_list, eval_cutoff):
    """
    Reads a list file of NCBI BLAST evidence against KEGG and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query which doesn't have
    'uncharacterized' or hypothetical in the product name.

    Parameters:
      log_fh       - writable file-like object; one INFO line is written for
                     every product name that gets updated
      polypeptides - mapping of query ID -> object exposing an `annotation`
                     attribute; the annotations are modified in place
      blast_list   - path handed to biocodeutils.read_list_file(), which
                     yields the BLAST result files to read
      eval_cutoff  - hits whose value in column 20 (index 19) is greater than
                     this are skipped

    Each BLAST file is tab-delimited.  The columns used are: index 0 (query
    ID), index 5 (hit accession), index 15 (hit product description) and
    index 19 (compared against eval_cutoff).

    Returns None.  Raises KeyError if a query ID is missing from
    `polypeptides`, and IndexError/ValueError on malformed non-blank lines.
    """
    # The product field looks like this:
    #   dam; adenine-specific DNA methyltransferase; K06223 DNA adenine methylase [EC:2.1.1.72]
    #   troponin I type 1 (skeletal, slow); K10371 troponin I, slow skeletal muscle
    # Compiled once, as raw strings, instead of per line.
    kegg_with_ec = re.compile(r"; (K\d+)\s+(.+) \[EC:(.+)\]")
    kegg_without_ec = re.compile(r"; (K\d+)\s+(.+)")

    for blast_file in biocodeutils.read_list_file(blast_list):
        last_qry_id = None

        # The context manager guarantees the file is closed, even on error.
        with open(blast_file) as blast_fh:
            for line in blast_fh:
                line = line.rstrip()

                # A blank line carries no hit; skip it rather than fail on
                # the column lookups below.
                if not line:
                    continue

                cols = line.split("\t")
                description = cols[15]

                # We're going to ignore any lines which have a few keywords in the name
                # First character left off for initcap reasons
                if 'ncharacterized' in description or 'ypothetical' in description:
                    continue

                this_qry_id = cols[0]

                # skip this line if it doesn't meet the cutoff
                if float(cols[19]) > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each
                # query first, so later hits for the same query are ignored
                if last_qry_id == this_qry_id:
                    continue

                annot = polypeptides[this_qry_id].annotation
                accession = cols[5]

                # save it, unless the gene product name has already changed from the default
                if annot.product_name == DEFAULT_PRODUCT_NAME:
                    has_ec = ' [EC' in description and description.endswith(']')
                    pattern = kegg_with_ec if has_ec else kegg_without_ec
                    m = pattern.search(description)

                    if m:
                        kegg_id = m.group(1)
                        product = m.group(2)
                        # Only the EC-aware pattern has a third group.
                        ec_num = m.group(3) if has_ec else None

                        annot.product_name = product
                        log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to KEGG accession '{2}'\n".format(this_qry_id, annot.product_name, accession))

                        # Truthiness covers both None and the empty string.
                        if ec_num:
                            ec = bioannotation.ECAnnotation(number=ec_num)
                            annot.add_ec_number(ec)

                        kegg_dbxref = bioannotation.Dbxref(db='KEGG', identifier=kegg_id)
                        annot.add_dbxref(kegg_dbxref)

                # remember the ID we just saw
                last_qry_id = this_qry_id