def get_hmmdb_go_terms( acc, c ):
    """
    Look up the GO terms attached to an HMM accession.

    Runs a parameterized query joining hmm_go to hmm on the passed cursor
    and wraps every go_id found in a bioannotation:GOAnnotation object.

    Args:
        acc: HMM accession matched against hmm.accession; also stored as the
             with_from value of each annotation.
        c:   Database cursor.  It must accept execute() with '?' placeholders
             and be iterable over the result rows.

    Returns:
        A list of GOAnnotation objects (evidence code 'ISM'), empty when the
        accession has no GO terms.
    """
    sql = "SELECT hg.go_id FROM hmm_go hg JOIN hmm ON hg.hmm_id=hmm.id WHERE hmm.accession = ?"
    c.execute(sql, (acc,))

    # the first (and only) selected column is the GO id
    return [
        annotation.GOAnnotation(go_id=record[0], ev_code='ISM', with_from=acc)
        for record in c
    ]
def get_uniref_go_terms( acc, c ):
    """
    Look up the GO terms recorded for a UniRef entry.

    Runs a parameterized query against the uniref_go table on the passed
    cursor and wraps every go_id found in a bioannotation:GOAnnotation object.

    Args:
        acc: Identifier matched against uniref_go.id; also stored as the
             with_from value of each annotation.
        c:   Database cursor.  It must accept execute() with '?' placeholders
             and be iterable over the result rows.

    Returns:
        A list of GOAnnotation objects (evidence code 'ISA'), empty when the
        entry has no GO terms.
    """
    sql = """
          SELECT us_go.go_id
            FROM uniref_go us_go
           WHERE us_go.id = ?
          """
    c.execute(sql, (acc,))

    # the first (and only) selected column is the GO id
    return [
        annotation.GOAnnotation(go_id=record[0], ev_code='ISA', with_from=acc)
        for record in c
    ]
def get_uspdb_go_terms( acc, c ):
    """
    Look up the GO terms recorded for a UniProt/SwissProt accession.

    Runs a parameterized query joining uniprot_sprot_go to uniprot_sprot_acc
    on the passed cursor and wraps every go_id found in a
    bioannotation:GOAnnotation object.

    Args:
        acc: Accession matched against uniprot_sprot_acc.accession; also
             stored as the with_from value of each annotation.
        c:   Database cursor.  It must accept execute() with '?' placeholders
             and be iterable over the result rows.

    Returns:
        A list of GOAnnotation objects (evidence code 'ISA'), empty when the
        accession has no GO terms.
    """
    sql = """
          SELECT us_go.go_id
            FROM uniprot_sprot_go us_go
                 JOIN uniprot_sprot_acc us_acc ON us_go.id=us_acc.id
           WHERE us_acc.accession = ?
          """
    c.execute(sql, (acc,))

    # the first (and only) selected column is the GO id
    return [
        annotation.GOAnnotation(go_id=record[0], ev_code='ISA', with_from=acc)
        for record in c
    ]
# Example #4
# 0
def get_go_annotations(feat):
    """
    Collect the GO terms listed among a feature's db_xref qualifiers.

    Every 'db_xref' value beginning with 'GO:' yields one
    annotation.GOAnnotation whose go_id is the text following that prefix.
    A GenBank section carrying such qualifiers looks like this:

     CDS             join(3366667..3366969,3463389..3463463)
                     /gene="ENSMUSG00000040653.6"
                     /protein_id="ENSMUSP00000149688.1"
                     /db_xref="EMBL:AC162384"
                     /db_xref="EMBL:CH466562"
                     /db_xref="GO:0005737"
                     /db_xref="GO:0042325"

    Args:
        feat: Object exposing a 'qualifiers' mapping; its 'db_xref' entry,
              when present, is iterated as a sequence of strings.

    Returns:
        A list of GOAnnotation objects in qualifier order; empty when there
        is no 'db_xref' qualifier or none of its values is a GO reference.
    """
    if 'db_xref' not in feat.qualifiers:
        return []

    # lazily match each cross-reference; non-GO entries produce None and are dropped
    hits = (re.match('GO:(.+)', xref) for xref in feat.qualifiers['db_xref'])

    return [annotation.GOAnnotation(go_id=hit.group(1)) for hit in hits if hit]
def parse_tmhmm_evidence(log_fh, polypeptides, htab_list):
    '''
    Reads a list of raw TMHMM evidence and a dict of polypeptides, adding annotation
    attributes where possible.

    Each query with at least MIN_HELICAL_SPANS (3) predicted TMhelix segments gets
    GO:0016021 added to its annotation.  Its product name is replaced with
    'Putative integral membrane protein' only if it still equals DEFAULT_PRODUCT_NAME.
    Either outcome is written to the log.

    Args:
        log_fh:       Writable handle; one INFO line is written per qualifying query.
        polypeptides: Mapping of query ID -> object with an 'annotation' attribute
                      (which provides 'product_name' and add_go_annotation()).  A
                      qualifying query ID missing from the mapping raises KeyError.
        htab_list:    Passed to utils.read_list_file(); each item it yields is opened
                      as a TMHMM output file.

    Returns:
        None.  The polypeptide annotations are modified in place.

    Notes from the esteemed M Giglio:
    The GO term to use would be GO:0016021 "integral component of membrane"
    Or if you want to be more conservative you could go with GO:0016020 "membrane"

    Depends on the evidence. For the prok pipe we are pretty conservative, we require five TMHMM
    domains and then we call it putative integral membrane protein.

    On ECO - in fact Marcus and I are the developers of ECO.  It is an ontology of evidence types.
    An annotation to an ECO term is used in conjunction with another annotation, like a GO term
    (but many other types of annotation can, and are, used with ECO). It provides additional
    information about the annotation. In fact for GO, the assignment of an evidence term along
    with a GO term is a required part of a GO annotation. (ECO terms are the "evidence codes" in GO.)

    INPUT: Expected TMHMM input (all HTML lines are skipped)
    # CHARM010_V2.mRNA.887 Length: 904
    # CHARM010_V2.mRNA.887 Number of predicted TMHs:  6
    # CHARM010_V2.mRNA.887 Exp number of AAs in TMHs: 133.07638
    # CHARM010_V2.mRNA.887 Exp number, first 60 AAs:  21.83212
    # CHARM010_V2.mRNA.887 Total prob of N-in:        0.99994
    # CHARM010_V2.mRNA.887 POSSIBLE N-term signal sequence
    CHARM010_V2.mRNA.887    TMHMM2.0    inside       1    11
    CHARM010_V2.mRNA.887    TMHMM2.0    TMhelix     12    34
    CHARM010_V2.mRNA.887    TMHMM2.0    outside     35   712
    CHARM010_V2.mRNA.887    TMHMM2.0    TMhelix    713   735
    CHARM010_V2.mRNA.887    TMHMM2.0    inside     736   755
    CHARM010_V2.mRNA.887    TMHMM2.0    TMhelix    756   773
    CHARM010_V2.mRNA.887    TMHMM2.0    outside    774   782
    CHARM010_V2.mRNA.887    TMHMM2.0    TMhelix    783   805
    CHARM010_V2.mRNA.887    TMHMM2.0    inside     806   809
    CHARM010_V2.mRNA.887    TMHMM2.0    TMhelix    810   832
    CHARM010_V2.mRNA.887    TMHMM2.0    outside    833   871
    CHARM010_V2.mRNA.887    TMHMM2.0    TMhelix    872   894
    CHARM010_V2.mRNA.887    TMHMM2.0    inside     895   904
    '''
    # The number of helices spanning the membrane required before counted as a membrane protein
    MIN_HELICAL_SPANS = 3

    # For successful matches, this is the product name which gets applied
    GENE_PRODUCT_NAME = 'Putative integral membrane protein'

    # Matches the "# <id> Length: <n>" line which opens each query's record.
    # Raw string so that \s and \d reach the regex engine untouched.
    header_re = re.compile(r"# (.+?)\s+Length: \d+")

    def apply_result(qry_id, helix_count):
        # Records the outcome for one finished query; does nothing below the threshold
        if helix_count < MIN_HELICAL_SPANS:
            return

        annot = polypeptides[qry_id].annotation

        if annot.product_name == DEFAULT_PRODUCT_NAME:
            annot.product_name = GENE_PRODUCT_NAME
            log_fh.write(
                "INFO: {0}: Updated product name to '{1}' because it had {2} TMHelix domains predicted by TMHMM\n"
                .format(qry_id, annot.product_name, helix_count))
        else:
            log_fh.write(
                "INFO: {0}: TMHMM predicted {1} TMHelix domains but gene product name unchanged because of previous assignment\n"
                .format(qry_id, helix_count))

        ## we add the GO terms no matter what
        annot.add_go_annotation(annotation.GOAnnotation(go_id='0016021'))

    for htab_path in utils.read_list_file(htab_list):
        last_qry_id = None
        current_helix_count = 0

        # 'with' guarantees the handle is closed even if a lookup below raises
        with open(htab_path) as htab_fh:
            for line in htab_fh:
                # HTML wrapper lines carry no evidence
                if line.startswith('<'):
                    continue

                m = header_re.match(line)

                if m:
                    # a new record begins, so the previous one is complete
                    apply_result(last_qry_id, current_helix_count)

                    # reset
                    last_qry_id = m.group(1)
                    current_helix_count = 0
                    continue

                cols = line.split()
                if len(cols) == 5 and cols[2] == 'TMhelix':
                    current_helix_count += 1

        # The final record of a file has no following header line to trigger its
        # processing, so it must be handled explicitly here.
        apply_result(last_qry_id, current_helix_count)