Esempio n. 1
0
def run_phenolog(ont, aset, args):
    """
    Like run_enrichment_test, but uses classes from a 2nd ontology/assocset to build the gene set.

    For every node of ``aset.ontology`` that has more than two associated
    subjects, the subjects are tested for enrichment against the second
    association set and each result row is printed.

    Args:
        ont: ontology used to look up labels for the nodes of ``aset``.
        aset: primary association set; its classes define the gene sets.
        args: parsed command-line arguments; ``args.resource2`` is the handle
            of the second ontology and ``args.file2`` its association file.

    Returns:
        None. Returns early (after logging an error) when fewer than two
        subjects are shared between the two association sets.
    """
    ont2 = OntologyFactory().create(args.resource2)
    aset2 = AssociationSetFactory().create(ontology=ont2, file=args.file2)

    # only test for genes (or other subjects of statements) in common
    # NOTE(review): `common` is only counted here; the enrichment below still
    # uses all of aset2.subjects as background -- confirm this is intended.
    common = set(aset.subjects).intersection(aset2.subjects)
    num_common = len(common)
    # The original template contained a stray backslash ("{}/\{}"), which is an
    # invalid escape sequence and leaked a literal "\" into the log message.
    logging.info("Genes in common between two KBs: {}/{} = {}".format(
        len(aset.subjects), len(aset2.subjects), num_common))
    if num_common < 2:
        logging.error("TOO FEW")
        return None
    for n in aset.ontology.nodes():
        nl = ont.label(n, id_if_null=True)
        genes = aset.query([n])
        num_genes = len(genes)
        if num_genes <= 2:
            # too few subjects to build a gene set from this class
            continue
        logging.info("BASE: {} {} num={}".format(n, nl, num_genes))
        enr = aset2.enrichment_test(subjects=genes,
                                    background=aset2.subjects,
                                    labels=True)
        for r in enr:
            print("{:8.3g} {} {:20s} <-> {} {:20s}".format(
                r['p'], n, nl, r['c'], str(r['n'])))
    def get(self):
        """
        Run an enrichment test for the requested subjects and return the results.

        Request arguments read from ``parser``: ``object_category``,
        ``ontology``, ``taxon``, ``max_p_value``, ``subject`` and
        ``background``. When no ontology is given, it is chosen from the
        object category ('function' -> 'go', 'phenotype' -> 'hp').

        Returns:
            dict with a single key ``'results'`` holding the output of
            ``enrichment_test``.
        """
        args = parser.parse_args()

        ocat = args.get('object_category')
        ontid = args.get('ontology')
        if ontid is None:
            # TODO: other phenotype ontologies
            default_ontologies = {'function': 'go', 'phenotype': 'hp'}
            ontid = default_ontologies.get(ocat)

        print("Loading: {}".format(ontid))
        ont = get_ontology(ontid)
        taxid = args.get('taxon')
        max_p_value = float(args.max_p_value)

        subjects = args.get('subject')
        background = args.get('background')
        afactory = AssociationSetFactory()
        aset = afactory.create(ontology=ont,
                               subject_category='gene',
                               object_category=ocat,
                               taxon=taxid)
        # The background set was parsed from the request but never forwarded,
        # so a caller-supplied background was silently ignored. An absent or
        # empty background still falls back to the library default (None).
        enr = aset.enrichment_test(subjects=subjects,
                                   background=background if background else None,
                                   threshold=max_p_value,
                                   labels=True)
        return {'results': enr}
Esempio n. 3
0
def test_semsearch():
    """
    Build a semantic-search engine over the GENES subset of the POMBASE
    annotations and check that every subject is cosine-similar to itself.
    """
    factory = AssociationSetFactory()
    gaf_path = POMBASE
    full_ontology = OntologyFactory().create(ONT)
    gaf_parser = GafParser()

    # keep only the associations whose subject label is one of the test genes
    selected = []
    for assoc in gaf_parser.parse(gaf_path, skipheader=True):
        if assoc['subject']['label'] in GENES:
            selected.append(assoc)

    aset = factory.create_from_assocs(selected, ontology=full_ontology)
    # restrict the ontology to what the association set's subontology() returns
    sub_ontology = aset.subontology()
    aset.ontology = sub_ontology
    logging.info('Genes={} Terms={}'.format(len(aset.subjects),
                                            len(sub_ontology.nodes())))

    stats = aset.as_dataframe().describe()
    print('STATS={}'.format(stats))

    engine = SemSearchEngine(assocmodel=aset)

    logging.info('Calculating all MICAs')
    engine.calculate_all_micas()

    # (saving the MICA/IC frame to tests/resources/mica_ic.h5 is disabled)

    logging.info('Doing pairwise')
    for left in aset.subjects:
        for right in aset.subjects:
            cosine = engine.pw_score_cosine(left, right)
            if left == right:
                # a subject compared with itself must score (almost) 1
                assert (cosine > 0.9999)
            best_matches = engine.pw_score_resnik_bestmatches(left, right)
            print('{} x {} = {} // {}'.format(left, right, cosine,
                                              best_matches))
Esempio n. 4
0
def test_factory():
    """
    Load POMBASE associations from file and verify that two specific
    evidence records (one ISO, one IDA) are present for a known gene/term pair.
    """
    target_pair = ('PomBase:SPBC2D10.10c', 'GO:0005730')

    factory = AssociationSetFactory()
    ont = OntologyFactory().create(ONT)
    aset = factory.create_from_file(POMBASE, ontology=ont, skim=False)

    matches = 0
    for subj in aset.subjects:
        print('{} {}'.format(subj, aset.label(subj)))
        for term in aset.annotations(subj):
            print('  {} {}'.format(term, ont.label(term)))
            for assoc in aset.associations(subj, term):
                ev = assoc['evidence']
                print('    {} {} {}'.format(ev['type'],
                                            ev['with_support_from'],
                                            ev['has_supporting_reference']))
                if (subj, term) != target_pair:
                    continue
                # expected ISO record: supported by SGD with the GO_REF reference
                is_expected_iso = (
                    ev['type'] == 'ISO'
                    and ev['with_support_from'] == ['SGD:S000002172']
                    and ev['has_supporting_reference'] == ['GO_REF:0000024'])
                if is_expected_iso:
                    matches += 1
                    logging.info('** FOUND: {}'.format(assoc))
                # expected IDA record: identified by its PMID only
                is_expected_ida = (
                    ev['type'] == 'IDA'
                    and ev['has_supporting_reference'] == ['PMID:16823372'])
                if is_expected_ida:
                    matches += 1
                    logging.info('** FOUND: {}'.format(assoc))

    assert len(aset.associations_by_subj) > 0
    assert matches == 2
Esempio n. 5
0
 def __init__(self, ont: str, taxon: str) -> None:
     """
     Store the ontology handle and taxon, then load the associations.

     :param ont: ontology identifier kept on ``self.ont``
     :param taxon: taxon identifier kept on ``self.taxon``
     """
     self.ont, self.taxon = ont, taxon
     # placeholders, presumably filled in by load_associations()
     self.associations = None
     self.ontology = ''
     self.assocs = ''
     self.afactory = AssociationSetFactory()
     # loading happens eagerly at construction time
     self.load_associations()
Esempio n. 6
0
def test_create_from_file_no_fmt():
    """
    Loading a GAF through the generic ``create`` entry point must still work
    when ``fmt`` is explicitly passed as None.
    """
    go = OntologyFactory().create('go')
    factory = AssociationSetFactory()
    loaded = factory.create(ontology=go, fmt=None, file=POMBASE)
    subjects = loaded.subjects
    print("SUBJS: {}".format(subjects))
    assert len(loaded.subjects) > 100
Esempio n. 7
0
 def load_associations(self, group):
     """
     Download and parse the GAF for ``group`` and store the resulting
     association set on ``self.associations``.

     :param group: annotation group name; 'human' uses the goa_human file,
         any other value is substituted into the generic file name pattern
     """
     gaf_parser = GafParser()
     factory = AssociationSetFactory()
     source = "http://geneontology.org/gene-associations/gene_association.{}.gz".format(
         group)
     if group == 'human':
         # human annotations live under a differently named file
         source = "http://geneontology.org/gene-associations/goa_human.gaf.gz"
     # drop parsed entries that carry a 'header' key
     records = []
     for entry in gaf_parser.parse(source):
         if 'header' not in entry.keys():
             records.append(entry)
     self.associations = factory.create_from_assocs(
         records, ontology=self.ontology)
Esempio n. 8
0
def test_gaf():
    """
    Test loading from gaf

    Loads the POMBASE GAF from an open file handle, queries the genes
    annotated to INTRACELLULAR and checks that G1 is among them.
    """
    ofactory = OntologyFactory()
    afactory = AssociationSetFactory()
    ont = ofactory.create('go')
    # The handle was previously opened inline and never closed; the context
    # manager releases it as soon as the association set has been built.
    with open(POMBASE, "r") as gaf_handle:
        aset = afactory.create_from_gaf(gaf_handle, ontology=ont)
    print(str(aset))
    genes = aset.query([INTRACELLULAR])
    for g in genes:
        print("G={} '{}'".format(g, aset.label(g)))
    assert G1 in genes
Esempio n. 9
0
def test_remote_disease():
    """
    Create a disease-to-phenotype association set for HUMAN via the factory
    and print the associations returned for PD.
    """
    doid = OntologyFactory().create('doid')
    create_args = {
        'ontology': doid,
        'subject_category': 'disease',
        'object_category': 'phenotype',
        'taxon': HUMAN,
    }
    aset = AssociationSetFactory().create(**create_args)

    pd_assocs = aset.query_associations([PD])
    print("Gene Assocs to PD: {} {}".format(pd_assocs, len(pd_assocs)))
Esempio n. 10
0
def test_learn():
    """
    Fit an OntologyLearner on the GAF associations, using the CC split of the
    subClassOf sub-ontology as targets, and write the report to target/index.md.
    """
    factory = AssociationSetFactory()
    ontology = OntologyFactory().create(ONT)

    assocs = factory.create_from_file(file=GAF, ontology=ontology)
    learner = ol.OntologyLearner(assocs=assocs)
    print('L={}'.format(learner))

    # split off the target associations using only subClassOf edges
    is_a_only = ontology.subontology(relations=['subClassOf'])
    learner.split_assocs(CC, ontology=is_a_only)
    print('L.assocs={}'.format(learner.assocs))
    print('L.tassocs={}'.format(learner.target_assocs))

    with open('target/index.md', 'w') as report:
        learner.fit_all(reportfile=report)
    print('L.targets={}'.format(learner.targets))
Esempio n. 11
0
def test_learn():
    """
    Fit an OntologyLearner that predicts the TGAF/TONT associations from the
    GAF/ONT associations and write the report to target/pheno_index.md.
    """
    source_ont = OntologyFactory().create(ONT)
    target_ont = OntologyFactory().create(TONT)
    factory = AssociationSetFactory()
    source_assocs = factory.create_from_file(file=GAF, ontology=source_ont)
    target_assocs = factory.create_from_file(file=TGAF, ontology=target_ont)

    learner = ol.OntologyLearner(
        assocs=source_assocs,
        target_assocs=target_assocs,
        score_threshold=0.6,
    )
    for template, value in (('L={}', learner),
                            ('L.assocs={}', learner.assocs),
                            ('L.tassocs={}', learner.target_assocs)):
        print(template.format(value))

    with open('target/pheno_index.md', 'w') as report:
        learner.fit_all(reportfile=report)
    print('L.targets={}'.format(learner.targets))
    def load_associations(self, ontology_name:str=None, subject_category:str=None, object_category:str=None, evidence=None, taxon:str=None, relation=None, file:Union[str, TextIO]=None, fmt:str=None, skim:bool=False) -> None:
        """
        Create the ontology named ``ontology_name`` and load an association
        set for it, storing the result on ``self.associations``.

        All remaining parameters are forwarded unchanged, by keyword, to
        ``AssociationSetFactory.create``.
        """
        ontology_factory = OntologyFactory()
        assoc_factory = AssociationSetFactory()

        # NOTE(review): subject_category is passed as the second positional
        # argument of OntologyFactory.create -- confirm that is the intended
        # parameter on the factory side.
        loaded_ontology = ontology_factory.create(ontology_name,
                                                  subject_category)

        create_kwargs = dict(
            ontology=loaded_ontology,
            subject_category=subject_category,
            object_category=object_category,
            evidence=evidence,
            taxon=taxon,
            relation=relation,
            file=file,
            fmt=fmt,
            skim=skim,
        )
        self.associations = assoc_factory.create(**create_kwargs)
Esempio n. 13
0
def test_remote_go():
    """
    Query remote mouse gene-to-function associations and check that the
    TF / nucleus query results are mutually consistent, then verify that
    NUCLEUS is strongly enriched among the TF genes.
    """
    go_subset = OntologyFactory().create('go').subontology(
        relations=['subClassOf', PART_OF])
    aset = AssociationSetFactory().create(ontology=go_subset,
                                          subject_category='gene',
                                          object_category='function',
                                          taxon=MOUSE)

    # genes annotated to the transcription-factor term
    tf_genes = aset.query([TRANSCRIPTION_FACTOR], [])
    print("Mouse genes annotated to TF: {} {}".format(tf_genes,
                                                      len(tf_genes)))
    for gene in tf_genes:
        print("  Gene: {} {}".format(gene, aset.label(gene)))

    # genes annotated to nucleus
    nucleus_genes = aset.query([NUCLEUS], [])
    print("Mouse genes annotated to nucleus: {} {}".format(
        nucleus_genes, len(nucleus_genes)))
    assert (len(nucleus_genes) > 100)

    # positive query on both terms
    nucleus_tf_genes = aset.query([TRANSCRIPTION_FACTOR, NUCLEUS], [])
    print("Mouse TF genes annotated to nucleus: {} {}".format(
        nucleus_tf_genes, len(nucleus_tf_genes)))
    assert (len(nucleus_tf_genes) > 100)
    assert (len(nucleus_tf_genes) < len(nucleus_genes))

    # nucleus as positive term, TF as negative term
    nucleus_non_tf_genes = aset.query([NUCLEUS], [TRANSCRIPTION_FACTOR])
    print("Mouse non-TF genes annotated to nucleus: {} {}".format(
        nucleus_non_tf_genes, len(nucleus_non_tf_genes)))
    assert (len(nucleus_non_tf_genes) > 100)
    assert (len(nucleus_non_tf_genes) < len(nucleus_genes))
    # the two partitions must add up to the full nucleus set
    partition_total = len(nucleus_tf_genes) + len(nucleus_non_tf_genes)
    assert (partition_total == len(nucleus_genes))

    enrichment = aset.enrichment_test(subjects=tf_genes, labels=True)
    print("ENRICHMENT (tf): {}".format(enrichment))
    # exactly one result row is expected for NUCLEUS
    [nucleus_row] = [row for row in enrichment if row['c'] == NUCLEUS]
    print("ENRICHMENT (tf) for NUCLEUS: {}".format(nucleus_row))
    assert nucleus_row['p'] < 0.00001
Esempio n. 14
0
def test_learn_from_phenotype():
    """
    Learn GO from Phenotypes

    (note: some phenotypes in FYPO have graph paths to GO classes,
    so GO will be used to predict GO, which may seem circular, but
    in fact the phenotype is different information)
    """
    source_ont = OntologyFactory().create(ONT)
    target_ont = OntologyFactory().create(TONT)
    factory = AssociationSetFactory()
    source_assocs = factory.create_from_file(file=GAF, ontology=source_ont)
    target_assocs = factory.create_from_file(file=TGAF, ontology=target_ont)

    learner = ol.OntologyLearner(
        assocs=source_assocs,
        target_assocs=target_assocs,
        score_threshold=0.9,
    )
    print('L={}'.format(learner))
    print('L.assocs={}'.format(learner.assocs))
    print('L.tassocs={}'.format(learner.target_assocs))

    # per-target output goes to this directory, the summary to its index.md
    out_dir = 'target/from_phenotype'
    report_path = out_dir + '/index.md'
    with open(report_path, 'w') as report:
        learner.fit_all(dir=out_dir, reportfile=report)
    print('L.targets={}'.format(learner.targets))
# Module-level logger named after this module.
log = logging.getLogger(__name__)

from ontobio.ontol_factory import OntologyFactory
from ontobio.assoc_factory import AssociationSetFactory

if __name__ == '__main__':
    # Sample script: fetch HPO, load human gene-phenotype associations and
    # print the 20 first enrichment results for a set of gene IDs.
    parser = argparse.ArgumentParser(
        description='Sample script to open phenotypes')
    # Arguments must be registered before parse_args() runs; in the original
    # order the positional 'input' was never known to the parser, so any
    # value given on the command line was rejected as unrecognized.
    parser.add_argument('input', help='Input')
    args = parser.parse_args()

    ## Create an ontology factory in order to fetch HPO
    ofactory = OntologyFactory()
    ont = ofactory.create("hp")

    ## Create an association factory to get gene-phenotype associations
    afactory = AssociationSetFactory()
    ## Load Associations from Monarch. Note the first time this runs Jupyter will show '*' - be patient
    aset = afactory.create(ontology=ont,
                           subject_category='gene',
                           object_category='phenotype',
                           taxon='NCBITaxon:9606')

    ## Run enrichment tests using all classes in ontology
    # NOTE(review): gene_ids is not defined anywhere in the visible code --
    # presumably it should be derived from args.input; confirm before use.
    enr = aset.enrichment_test(subjects=gene_ids,
                               threshold=0.00005,
                               labels=True)

    # print p-value, class id and class label of the first 20 results
    for r in enr[:20]:
        print("{:8.3g} {} {:40s}".format(r['p'], r['c'], str(r['n'])))
class GenericSimilarity(object):
    """
    Jaccard-based similarity search over the subjects of an association set.

    NOTE(review): ``load_associations`` reads ``self.ont``, which ``__init__``
    does not set -- presumably a subclass or the caller assigns it first;
    confirm against the callers.
    """

    def __init__(self) -> None:
        # placeholders until load_associations() has been called
        self.associations = ''
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()

    def load_associations(self, taxon):
        """
        Load the ontology ``self.ont`` and the associations for ``taxon``.

        For the 'go' ontology the annotations are parsed from the GAF file of
        the given taxon and restricted to the descendants of GO:0008150 and
        GO:0003674; for any other ontology gene-phenotype associations are
        requested from the association factory.

        :param taxon: 'human' or 'mouse'
        :raises KeyError: for an unknown taxon when the ontology is not 'go'
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        go_annotation_urls = {
            'mouse': "http://current.geneontology.org/annotations/mgi.gaf.gz",
            'human': "http://current.geneontology.org/annotations/goa_human.gaf.gz",
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(self.ont)
        if self.ont == 'go':
            go_roots = set(self.ontology.descendants('GO:0008150') + self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_roots)
            # an unknown taxon keeps the original behaviour of parsing ''
            url = go_annotation_urls.get(taxon, '')
            assocs = GafParser().parse(url)
            # the raw parse result is kept as-is on self.assocs
            self.assocs = assocs
            # drop header entries and annotations outside the selected terms
            assocs = [
                x for x in assocs
                if 'header' not in x.keys() and x['object']['id'] in go_roots
            ]
            self.associations = self.afactory.create_from_assocs(assocs, ontology=sub_ont)
        else:
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_map[taxon],
            )

    @staticmethod
    def jaccard_similarity(aset: 'AssociationSet', s1: str, s2: str) -> 'Tuple[float, set]':
        r"""
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        :param aset: association set providing ``inferred_types``
        :param s1: first subject id
        :param s2: second subject id
        :return: tuple ``(jaccard index, shared terms)``; ``(0.0, set())``
            when neither subject has any inferred type
        """
        a1 = aset.inferred_types(s1)
        a2 = aset.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0, set()

        shared_terms = a1.intersection(a2)
        return len(shared_terms) / num_union, shared_terms

    def compute_jaccard(self, input_genes: List[dict], lower_bound: float = 0.7) -> List[dict]:
        """
        Compare every input gene with every subject of the loaded associations.

        :param input_genes: dicts with the keys 'sim_input_curie' and
            'input_symbol'
        :param lower_bound: only hits scoring strictly above this are returned
        :return: one dict per hit with the keys input_id, input_symbol,
            hit_symbol, hit_id, score and shared_terms
        """
        similarities = []
        threshold = float(lower_bound)
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map.keys():
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene['sim_input_curie'],
                    subject_curie=subject_curie
                )
                # Compare by value: the previous identity test (`is not`) was
                # true for equal strings held in different objects, so a gene
                # could be reported as a hit for itself.
                if input_gene == subject_curie:
                    continue
                score, shared_terms = GenericSimilarity.jaccard_similarity(
                    self.associations, input_gene, subject_curie)
                if float(score) > threshold:
                    similarities.append({
                        'input_id': input_gene,
                        'input_symbol': igene['input_symbol'],
                        'hit_symbol': self.associations.label(subject_curie),
                        'hit_id': subject_curie,
                        'score': score,
                        'shared_terms': shared_terms,
                    })
        return similarities

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie):
        """
        Drop the first four characters of ``input_gene`` when it contains
        'MGI:MGI:' but ``subject_curie`` does not; otherwise return it as is.
        """
        if 'MGI:MGI:' in input_gene and 'MGI:MGI:' not in subject_curie:
            return input_gene[4:]
        return input_gene

    @staticmethod
    def sort_results(input_gene_set, results):
        """
        Turn ``results`` into a DataFrame, remove rows whose 'hit_id' already
        occurs in ``input_gene_set['hit_id']`` and sort by descending score.
        """
        results = pd.DataFrame(results)
        annotated_gene_set = input_gene_set['hit_id'].tolist()
        is_known = results['hit_id'].isin(annotated_gene_set)
        return results[~is_known].sort_values('score', ascending=False)
Esempio n. 17
0
def main():
    """
    Wrapper for OGR Assocs

    Parses the command line, loads the ontology named by ``--resource``,
    builds an association set (from ``--assocfile`` or, failing that, from
    remote services using ``--taxon`` and ``--category``) and dispatches to
    the function registered for the chosen sub-command as
    ``func(ont, aset, args)``.

    Raises:
        ValueError: if neither an association file nor both taxon and
            category were given.
        SystemExit: via argparse, on invalid arguments or when no
            sub-command was given.
    """

    def _parse_bool(value):
        # argparse's type=bool calls bool() on the raw string, so "False" or
        # "0" became True. Spelled-out false values are recognised here; every
        # other non-empty value stays True, as before.
        return str(value).strip().lower() not in ('', '0', 'false', 'f',
                                                  'no', 'n', 'off')

    parser = argparse.ArgumentParser(
        description='Wrapper for obographs assocmodel library'
        """
                                                 By default, ontologies and assocs are cached locally and synced from a remote sparql endpoint
                                                 """,
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('-r',
                        '--resource',
                        type=str,
                        required=False,
                        help='Name of ontology')
    parser.add_argument('-f',
                        '--assocfile',
                        type=str,
                        required=False,
                        help='Name of input file for associations')
    parser.add_argument(
        '--assocformat',
        type=str,
        default='gaf',
        required=False,
        help='Format of association file, if passed (default: gaf)')
    parser.add_argument('-o',
                        '--outfile',
                        type=str,
                        required=False,
                        help='Path to output file')
    parser.add_argument('-t',
                        '--to',
                        type=str,
                        required=False,
                        help='Output to (tree, dot, ...)')
    parser.add_argument('-d',
                        '--direction',
                        type=str,
                        default='u',
                        required=False,
                        help='u = up, d = down, ud = up and down')
    parser.add_argument('-e',
                        '--evidence',
                        type=str,
                        required=False,
                        help='ECO class')
    parser.add_argument('-p',
                        '--properties',
                        nargs='*',
                        type=str,
                        required=False,
                        help='Properties')
    parser.add_argument('-P',
                        '--plot',
                        type=_parse_bool,
                        default=False,
                        help='if set, plot output (requires plotly)')
    parser.add_argument('-y',
                        '--yamlconfig',
                        type=str,
                        required=False,
                        help='Path to setup/configuration yaml file')
    parser.add_argument('-S',
                        '--slim',
                        type=str,
                        default='',
                        required=False,
                        help='Slim type. m=minimal')
    parser.add_argument('-c',
                        '--container_properties',
                        nargs='*',
                        type=str,
                        required=False,
                        help='Properties to nest in graph')
    parser.add_argument('-C',
                        '--category',
                        nargs=2,
                        type=str,
                        required=False,
                        help='category tuple (SUBJECT OBJECT)')
    parser.add_argument('-T',
                        '--taxon',
                        type=str,
                        required=False,
                        help='Taxon of associations')
    parser.add_argument('-v',
                        '--verbosity',
                        default=0,
                        action='count',
                        help='Increase output verbosity')

    subparsers = parser.add_subparsers(dest='subcommand',
                                       help='sub-command help')

    # EXTRACT ONTOLOGY
    parser_n = subparsers.add_parser(
        'subontology',
        help=
        'Extract sub-ontology, include only annotated nodes or their descendants'
    )
    parser_n.add_argument('-M',
                          '--minimal',
                          dest='minimal',
                          action='store_true',
                          default=False,
                          help='If set, remove non-MRCA nodes')
    parser_n.set_defaults(function=extract_ontology)

    # ENRICHMENT
    parser_n = subparsers.add_parser(
        'enrichment',
        help=
        'Perform an enrichment test over a sample set of annotated entities')
    parser_n.add_argument(
        '-q',
        '--query',
        type=str,
        help='query all genes for this class an use as subject')
    parser_n.add_argument('-H',
                          '--hypotheses',
                          nargs='*',
                          help='list of classes to test against')
    parser_n.add_argument(
        '-s',
        '--sample_file',
        type=str,
        help='file containing list of gene IDs in sample set')
    parser_n.add_argument(
        '-b',
        '--background_file',
        type=str,
        help='file containing list of gene IDs in background set')
    parser_n.add_argument('-t',
                          '--threshold',
                          type=float,
                          help='p-value threshold')
    parser_n.add_argument('sample_ids',
                          nargs='*',
                          help='list of gene IDs in sample set')
    parser_n.set_defaults(function=run_enrichment_test)

    # PHENOLOG
    parser_n = subparsers.add_parser(
        'phenolog',
        help=
        'Perform multiple enrichment tests, using a second ontology and assoc set to build gene sets'
    )
    # The two help texts were swapped: run_phenolog passes resource2 to the
    # ontology factory and file2 to the association factory.
    parser_n.add_argument('-R',
                          '--resource2',
                          type=str,
                          required=True,
                          help='handle for second ontology')
    parser_n.add_argument('-F',
                          '--file2',
                          type=str,
                          required=True,
                          help='path to second GAF')
    parser_n.set_defaults(function=run_phenolog)

    # QUERY
    parser_n = subparsers.add_parser(
        'query',
        help=
        'Query for entities (e.g. genes) based on positive and negative terms')
    parser_n.add_argument('-q', '--query', nargs='*', help='positive classes')
    parser_n.add_argument('-N',
                          '--negative',
                          type=str,
                          help='negative classes')
    parser_n.set_defaults(function=run_query)

    # QUERY ASSOCIATIONS
    parser_n = subparsers.add_parser(
        'associations',
        help='Query for associations for a set of entities (e.g. genes)')
    parser_n.add_argument('subjects', nargs='*', help='subject ids')
    parser_n.add_argument('-D',
                          '--dendrogram',
                          type=_parse_bool,
                          default=False)
    parser_n.set_defaults(function=run_query_associations)

    # INTERSECTIONS
    parser_n = subparsers.add_parser('intersections',
                                     help='Query intersections')
    parser_n.add_argument('-X', '--xterms', nargs='*', help='x classes')
    parser_n.add_argument('-Y', '--yterms', nargs='*', help='y classes')
    parser_n.add_argument('--useids',
                          type=_parse_bool,
                          default=False,
                          help='if true, use IDs not labels on axes')
    parser_n.add_argument('terms', nargs='*', help='all terms (x and y)')
    parser_n.set_defaults(function=plot_intersections)

    # INTERSECTION DENDROGRAM (TODO: merge into previous?)
    parser_n = subparsers.add_parser('intersection-dendrogram',
                                     help='Plot dendrogram from intersections')
    parser_n.add_argument('-X', '--xterms', nargs='*', help='x classes')
    parser_n.add_argument('-Y', '--yterms', nargs='*', help='y classes')
    parser_n.add_argument('--useids',
                          type=_parse_bool,
                          default=False,
                          help='if true, use IDs not labels on axes')
    parser_n.add_argument('terms', nargs='*', help='all terms (x and y)')
    parser_n.set_defaults(function=plot_term_intersection_dendrogram)

    # SIMILARITY MATRIX (may move to another module)
    parser_n = subparsers.add_parser(
        'simmatrix', help='Plot dendrogram for similarities between subjects')
    parser_n.add_argument('-X', '--xsubjects', nargs='*', help='x subjects')
    parser_n.add_argument('-Y', '--ysubjects', nargs='*', help='y subjects')
    parser_n.add_argument('--useids',
                          type=_parse_bool,
                          default=False,
                          help='if true, use IDs not labels on axes')
    parser_n.add_argument('subjects', nargs='*', help='all terms (x and y)')
    parser_n.set_defaults(function=plot_simmatrix)

    args = parser.parse_args()

    # Sub-commands are optional for argparse, so without one `args.function`
    # does not exist; report that as a usage error instead of failing with an
    # AttributeError after the ontology and associations have been loaded.
    if getattr(args, 'function', None) is None:
        parser.error('a sub-command is required')

    if args.verbosity >= 2:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.WARNING)

    if not args.assocfile:
        if not args.taxon or not args.category:
            raise ValueError(
                "Must specify EITHER assocfile OR both taxon and category")

    logging.info("Welcome!")

    if args.yamlconfig is not None:
        logging.info("Setting config from: {}".format(args.yamlconfig))
        # note this sets a global:
        # we would not do this outside the context of a standalone script
        from ontobio.config import set_config
        set_config(args.yamlconfig)

    handle = args.resource

    # Ontology Factory
    ofactory = OntologyFactory()
    logging.info("Creating ont object from: {} {}".format(handle, ofactory))
    ont = ofactory.create(handle)
    logging.info("ont: {}".format(ont))

    # 'noiea' is shorthand for the negated evidence class -ECO:0000501
    # NOTE(review): `evidence` is computed but not used further in this
    # function -- confirm whether it should be passed to the factory.
    evidence = args.evidence
    if evidence is not None and evidence.lower() == 'noiea':
        evidence = "-ECO:0000501"

    # Association Factory
    afactory = AssociationSetFactory()
    aset = None
    if args.assocfile is not None:
        aset = afactory.create_from_file(file=args.assocfile,
                                         fmt=args.assocformat,
                                         ontology=ont)
    else:
        [subject_category, object_category] = args.category
        # create using GO/Monarch services
        aset = afactory.create(ontology=ont,
                               subject_category=subject_category,
                               object_category=object_category,
                               taxon=args.taxon)

    func = args.function
    func(ont, aset, args)
Esempio n. 18
0
import pickle
import pandas
import numpy as np

from ontobio.ontol_factory import OntologyFactory
from ontobio.assoc_factory import AssociationSetFactory



# Taxon identifier passed to the association factory below.
HUMAN = 'NCBITaxon:9606'

#ontology paths
##''/Users/marcin/Documents/VIMSS/ontology/NCATS/HPO/hp.obo')#mondo#hp

# Build the 'hp' ontology and a disease-to-phenotype association set for it.
# Both calls run at import time.
ofactory = OntologyFactory()
afactory = AssociationSetFactory()
print("creating...")
ont = ofactory.create('hp')
#ont = ofactory.create('mondo')
aset = afactory.create(ontology=ont,
                       subject_category='disease',
                       object_category='phenotype',
                       taxon=HUMAN)

###aset = afactory.create_from_gaf('my.gaf', ontology=ont)

                
# Example identifiers; only phenotype_ids[1] is used in the visible code.
disease_ids = ["DECIPHER:1", "DECIPHER:16", "OMIM:614696", "OMIM:614699", "Orphanet:99978"]
phenotype_ids = ["HP:0000007", "Orphanet:93299", "Orphanet:90794"]
# Prints the literal word "annotations", a tab and the second phenotype id.
print("annotations\t"+phenotype_ids[1])
Esempio n. 19
0
def test_factory():
    """
    Load ANNFILE in 'hpoa' format through the association factory and check
    that more than 40 subjects were read.
    """
    ontology = OntologyFactory().create(ONT)
    factory = AssociationSetFactory()
    load_args = {'ontology': ontology, 'fmt': 'hpoa', 'file': ANNFILE}
    loaded = factory.create(**load_args)
    print("SUBJS: {}".format(loaded.subjects))
    assert len(loaded.subjects) > 40
Esempio n. 20
0
def main():
    """
    Phenologs

    For every class resolved from the positional ``ids`` in ontology 1, take
    the genes annotated to it (phenotype associations) as a sample and run an
    enrichment test against the function associations of ontology 2. An
    optional ``--background`` class restricts the background gene set.

    Exits with status 1 when the background class cannot be resolved to
    exactly one class.
    """
    parser = argparse.ArgumentParser(
        description='Phenologs'
        """
                                                 By default, ontologies are cached locally and synced from a remote sparql endpoint
                                                 """,
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('-r',
                        '--resource1',
                        type=str,
                        required=False,
                        help='Name of ontology1')
    parser.add_argument('-R',
                        '--resource2',
                        type=str,
                        required=False,
                        help='Name of ontology2')
    parser.add_argument('-T',
                        '--taxon',
                        type=str,
                        default='NCBITaxon:10090',
                        required=False,
                        help='NCBITaxon ID')
    parser.add_argument('-s',
                        '--search',
                        type=str,
                        default='',
                        required=False,
                        help='Search type. p=partial, r=regex')
    parser.add_argument('-b',
                        '--background',
                        type=str,
                        default=None,
                        required=False,
                        help='Class to use for background')
    parser.add_argument('-p',
                        '--pthreshold',
                        type=float,
                        default=0.05,
                        required=False,
                        help='P-value threshold')
    parser.add_argument('-v',
                        '--verbosity',
                        default=0,
                        action='count',
                        help='Increase output verbosity')

    parser.add_argument('ids', nargs='*')

    args = parser.parse_args()

    if args.verbosity >= 2:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbosity == 1:
        logging.basicConfig(level=logging.INFO)
    logging.info("Welcome!")

    ofactory = OntologyFactory()
    afactory = AssociationSetFactory()
    ont1 = ofactory.create(args.resource1)
    ont2 = ofactory.create(args.resource2)
    logging.info("onts: {} {}".format(ont1, ont2))
    searchp = args.search

    category = 'gene'

    # same subjects (genes) in both sets; phenotypes in set 1, functions in set 2
    aset1 = afactory.create(ontology=ont1,
                            subject_category=category,
                            object_category='phenotype',
                            taxon=args.taxon)
    aset2 = afactory.create(ontology=ont2,
                            subject_category=category,
                            object_category='function',
                            taxon=args.taxon)

    bg_cls = None
    if args.background is not None:
        bg_ids = resolve(ont1, [args.background], searchp)
        if len(bg_ids) == 0:
            logging.error("Cannnot resolve: '{}' using {} in {}".format(
                args.background, searchp, ont1))
            sys.exit(1)
        elif len(bg_ids) > 1:
            logging.error("Multiple matches: '{}' using {} MATCHES={}".format(
                args.background, searchp, bg_ids))
            sys.exit(1)
        else:
            # Assign before logging: the original logged first and therefore
            # always reported "Background: None".
            [bg_cls] = bg_ids
            logging.info("Background: {}".format(bg_cls))

    for cls_id in resolve(ont1, args.ids, searchp):

        sample = aset1.query([cls_id], [])
        print("Gene set class:{} Gene set: {}".format(cls_id, sample))
        bg = None
        if bg_cls is not None:
            bg = aset1.query([bg_cls], [])
            print("BACKGROUND SUBJECTS: {}".format(bg))

        rs = aset2.enrichment_test(sample,
                                   bg,
                                   threshold=args.pthreshold,
                                   labels=True)
        print("RESULTS: {} < {}".format(len(rs), args.pthreshold))
        for r in rs:
            print(str(r))
# Write the selected sub-graph of the Wikidata ontology as OBO.
renderer = GraphRenderer.create('obo')
renderer.outfile = './output/wd-ontology.obo'
# renderer.write(wd_ontology)
# >> AttributeError: 'EagerWikidataOntology' object has no attribute 'all_logical_definitions'

renderer.write_subgraph(wd_ontology, nodes, query_ids=qids)

# Get GO terms
# The output file was previously opened and never closed, so buffered rows
# could be lost; the context manager closes it once all rows are written.
with open('./output/go-terms.tsv', 'w') as outfile:

    # search() must return exactly one match for the unpacking to succeed
    [ptsd] = wd_ontology.search('Sickle Cell Anemia')
    proteins = wd.canned_query('disease2protein', ptsd)

    go = onto_factory.create('go')

    afactory = AssociationSetFactory()
    aset = afactory.create(ontology=go,
                           subject_category='gene',
                           object_category='function',
                           taxon='NCBITaxon:9606')

    # for every ontology node, write the GO annotations of its proteins
    for n in wd_ontology.nodes():
        proteins = wd.canned_query('disease2protein', n)
        anns = [a for p in proteins for a in aset.annotations(p)]
        if anns:
            print("{} {}".format(n, wd_ontology.label(n)))
            for a in anns:
                outfile.write("{}\t{}\n".format(a, go.label(a)))

# Endpoints
# Base URL of the SciGraph ontology service (development instance, per the host name).
SCIGRAPH_ONTOLOGY = 'https://scigraph-ontology-dev.monarchinitiative.org/scigraph/'
Esempio n. 22
0
class GenericSimilarity(object):
    """
    Jaccard-similarity search of genes based on their ontology annotations.

    Call retrieve_associations() first to load an ontology and its
    gene associations, then compute_jaccard() to compare input genes
    against every subject of the loaded association set.
    """

    def __init__(self) -> None:
        # Placeholders; the real objects are set by retrieve_associations().
        self.associations = ''
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()

    def retrieve_associations(self, ont, group):
        """
        Load the ontology `ont` and the gene associations for `group`.

        :param ont: ontology handle; 'go' is loaded from GAF annotations,
            any other value is loaded as gene-to-phenotype associations
        :param group: 'human' or 'mouse'
        :raises KeyError: if `group` is not one of the supported values
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        gaf_urls = {
            'mouse': "http://current.geneontology.org/annotations/mgi.gaf.gz",
            'human': "http://current.geneontology.org/annotations/goa_human.gaf.gz",
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(ont)
        if ont == 'go':
            # Fail early on an unsupported group instead of silently
            # loading annotations of the wrong species.
            url = gaf_urls[group]
            # Keep only terms below biological_process (GO:0008150) and
            # molecular_function (GO:0003674).
            go_terms = set(
                self.ontology.descendants('GO:0008150') +
                self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_terms)
            assocs = GafParser().parse(url)
            # The unfiltered parser output is kept for callers.
            self.assocs = assocs
            assocs = [
                x for x in assocs
                if 'header' not in x and x['object']['id'] in go_terms
            ]
            self.associations = self.afactory.create_from_assocs(
                assocs, ontology=sub_ont)
        else:
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_map[group])

    def compute_jaccard(self,
                        input_genes: List[dict],
                        lower_bound: float = 0.7) -> List[dict]:
        """
        Compare every input gene with every subject of the association set.

        :param input_genes: dicts with the keys 'sim_input_curie' and
            'input_symbol'
        :param lower_bound: only hits scoring strictly above this are kept
        :return: list of hit dicts with the keys 'input_id', 'input_symbol',
            'hit_symbol', 'hit_id' and 'score'
        """
        threshold = float(lower_bound)
        similarities = []
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map:
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene['sim_input_curie'],
                    subject_curie=subject_curie)
                # Compare by value: identity ("is not") is practically never
                # false for two separately built strings, so a gene would be
                # reported as a hit for itself.
                if input_gene == subject_curie:
                    continue
                score = jaccard_similarity(self.associations, input_gene,
                                           subject_curie)
                if float(score) > threshold:
                    similarities.append({
                        'input_id': input_gene,
                        'input_symbol': igene['input_symbol'],
                        'hit_symbol': self.associations.label(subject_curie),
                        'hit_id': subject_curie,
                        'score': score,
                    })
        return similarities

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie):
        """
        Align the MGI prefix style of `input_gene` with `subject_curie`.

        When the input uses the doubled 'MGI:MGI:' prefix but the subject
        does not, the first four characters are dropped; otherwise the
        input is returned unchanged.
        """
        if 'MGI:MGI:' in input_gene and 'MGI:MGI:' not in subject_curie:
            return input_gene[4:]
        return input_gene
class GenericSimilarity(object):
    """
    Jaccard-similarity search of genes based on their ontology annotations.

    Set `ont` to the ontology handle, call load_associations() to load the
    ontology and its gene associations, then use compute_jaccard() and
    sort_results() to obtain ranked hits.
    """

    def __init__(self) -> None:
        # Placeholders; `ont` must be set by the caller before
        # load_associations() is invoked, the rest is filled in there.
        self.associations = None
        self.ont = ''
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()

    def load_associations(self, taxon) -> None:
        """
        Load the ontology named by `self.ont` and the associations for `taxon`.

        :param taxon: 'human' or 'mouse'
        :raises KeyError: if `taxon` is not one of the supported values
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        gaf_urls = {
            'mouse': "http://current.geneontology.org/annotations/mgi.gaf.gz",
            'human': "http://current.geneontology.org/annotations/goa_human.gaf.gz",
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(self.ont)
        if self.ont == 'go':
            # Fail early on an unsupported taxon rather than handing an
            # empty URL to the parser.
            url = gaf_urls[taxon]
            # CX: GO:0008150 is biological_process, GO:0003674 is molecular_function.
            # CX: These are 2 out of 3 top-level terms in GO ontology.
            # CX: The excluded term is cellular_component (where gene carries out a molecular function)
            go_terms = set(self.ontology.descendants('GO:0008150') + self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_terms)
            assocs = GafParser().parse(url)
            # The unfiltered parser output is kept for callers.
            self.assocs = assocs
            assocs = [x for x in assocs if 'header' not in x and x['object']['id'] in go_terms]
            self.associations = self.afactory.create_from_assocs(assocs, ontology=sub_ont)
        else:
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_map[taxon]
            )

    @staticmethod
    def jaccard_similarity(aset: AssociationSet, s1: str, s2: str) -> Tuple[float, list]:
        r"""
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        :return: tuple of (score, list of terms shared by both subjects);
            (0.0, []) when neither subject has any inferred term
        """
        a1 = aset.inferred_types(s1)
        a2 = aset.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0, []

        shared_terms = a1.intersection(a2)

        # Note: we need to convert the shared_terms set to a list
        # to avoid later JSON serialization problems
        return len(shared_terms) / num_union, list(shared_terms)

    def compute_jaccard(self, input_genes: List[dict], lower_bound: float = 0.7) -> List[dict]:
        """
        Compare every input gene with every subject of the association set.

        :param input_genes: dicts with the keys 'sim_input_curie' and
            'input_symbol'
        :param lower_bound: only hits scoring strictly above this are kept
        :return: list of hit dicts with the keys 'input_id', 'input_symbol',
            'hit_symbol', 'hit_id', 'score', 'shared_terms' and
            'shared_term_names'
        """
        threshold = float(lower_bound)
        similarities = []
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map:
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene['sim_input_curie'],
                    subject_curie=subject_curie
                )
                # Compare by value: identity ("is not") is practically never
                # false for two separately built strings, so a gene would be
                # scored against itself.
                if input_gene == subject_curie:
                    continue
                score, shared_terms = \
                    GenericSimilarity.jaccard_similarity(self.associations, input_gene, subject_curie)
                if float(score) > threshold:
                    # CX: addition of human-readable labels aka "shared_term_names"
                    shared_term_names = [self.associations.label(x) for x in shared_terms]
                    similarities.append({
                        'input_id': input_gene,
                        'input_symbol': igene['input_symbol'],
                        'hit_symbol': self.associations.label(subject_curie),
                        'hit_id': subject_curie,
                        'score': score,
                        'shared_terms': shared_terms,
                        'shared_term_names': shared_term_names
                    })
        return similarities

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie) -> str:
        """
        Align the MGI prefix style of `input_gene` with `subject_curie`.

        When the input uses the doubled 'MGI:MGI:' prefix but the subject
        does not, the first four characters are dropped; otherwise the
        input is returned unchanged.
        """
        if 'MGI:MGI:' in input_gene and 'MGI:MGI:' not in subject_curie:
            return input_gene[4:]
        return input_gene

    @staticmethod
    def sort_results(results) -> pd.DataFrame:
        """
        Turn a list of hit dicts into a DataFrame ordered by descending score.

        Rows in which a gene is matched with itself (hit_id == input_id) are
        dropped. An empty input yields an empty DataFrame.
        """
        results = pd.DataFrame(results)

        if results.empty:
            return results

        # CX: Some users need to know the scores that input genes have for each other.
        #     replacing code to remove GeneA input = GeneA output results
        not_self_hit = results.hit_id != results.input_id
        return results[not_self_hit].sort_values('score', ascending=False)
Esempio n. 24
0
def test_remote_go_pombase():
    """Load the PomBase GAF against GO and expect more than 100 annotated subjects."""
    go = OntologyFactory().create('go')
    association_set = AssociationSetFactory().create(ontology=go,
                                                     fmt='gaf',
                                                     file=POMBASE)
    subjects = association_set.subjects
    print("SUBJS: {}".format(subjects))
    assert len(subjects) > 100
Esempio n. 25
0
 def __init__(self) -> None:
     """Start with empty-string placeholders and a factory for building association sets."""
     self.associations = self.ontology = self.assocs = ''
     self.afactory = AssociationSetFactory()
Esempio n. 26
0
class GenericSimilarity(object):
    """
    Jaccard-similarity engine over the gene associations of one
    ontology/taxon pair.

    Instances are meant to be obtained through get_similarity_engine(),
    which keeps one instance per (ontology, taxon) combination.
    """
    # Class level singletons for similarity engines
    # (nested dict: ontology name -> taxon name -> GenericSimilarity instance)
    _ontology = {}

    # Class level cache for results of Jaccard similarity searches
    # (computation id -> asyncio task running compute_jaccard)
    _jaccard_similarity_tasks = {}

    @classmethod
    def get_similarity_engine(cls, ontology, taxon):
        """
        Returns a singleton GenericSimilarity instance
        for use in Jaccard similarity computations

        :param ontology: should be 'go', 'hp' or 'mp'
        :param taxon: should be 'human' or 'mouse'
        :return: GenericSimilarity() singleton
        """
        if ontology not in ['go', 'hp', 'mp']:
            raise OntologyServerException(
                "compute_jaccard() ERROR: ontology '" + ontology +
                "' not recognized.")

        if taxon not in ['human', 'mouse']:
            raise OntologyServerException("compute_jaccard() ERROR: taxon '" +
                                          taxon + "' not recognized.")

        if ontology not in cls._ontology:
            cls._ontology[ontology] = {}

        if taxon not in cls._ontology[ontology]:
            cls._ontology[ontology][taxon] = GenericSimilarity(ontology, taxon)

        return cls._ontology[ontology][taxon]

    def __init__(self, ont: str, taxon: str) -> None:
        """
        Remember the ontology/taxon pair and immediately load its associations.

        :param ont: ontology handle, stored as `self.ont`
        :param taxon: taxon name, stored as `self.taxon`
        """
        self.ont, self.taxon = ont, taxon
        # Placeholders that load_associations() overwrites.
        self.associations = None
        self.ontology = self.assocs = ''
        self.afactory = AssociationSetFactory()
        self.load_associations()

    def load_associations(self) -> None:
        """
        Load the ontology named by `self.ont` and build `self.associations`
        for `self.taxon`.

        For 'go' the annotations come from a GAF download filtered to the
        selected GO branches; for any other ontology gene-to-phenotype
        associations are requested from the association factory (raising
        KeyError when the taxon is not 'human' or 'mouse').
        """
        self.ontology = OntologyFactory().create(self.ont)
        gaf_parser = GafParser()

        if self.ont != 'go':
            taxon_ids = {
                'human': 'NCBITaxon:9606',
                'mouse': 'NCBITaxon:10090',
            }
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_ids[self.taxon])
            return

        # CX: GO:0008150 is biological_process, GO:0003674 is molecular_function.
        # CX: These are 2 out of 3 top-level terms in GO ontology.
        # CX: The excluded term is cellular_component (where gene carries out a molecular function)
        process_terms = self.ontology.descendants('GO:0008150')
        function_terms = self.ontology.descendants('GO:0003674')
        kept_terms = set(process_terms + function_terms)
        sub_ont = self.ontology.subontology(kept_terms)

        gaf_urls = {
            'mouse': "http://current.geneontology.org/annotations/mgi.gaf.gz",
            'human': "http://current.geneontology.org/annotations/goa_human.gaf.gz",
        }
        # An unknown taxon falls back to the empty string, which is handed
        # to the parser unchanged.
        parsed = gaf_parser.parse(gaf_urls.get(self.taxon, ''))
        # The unfiltered parser output is kept on the instance.
        self.assocs = parsed

        # Drop header records, then anything annotated outside the kept terms.
        selected = [
            record for record in parsed
            if 'header' not in record and record['object']['id'] in kept_terms
        ]
        self.associations = self.afactory.create_from_assocs(
            selected, ontology=sub_ont)

    @staticmethod
    def jaccard_similarity(aset: AssociationSet, s1: str,
                           s2: str) -> Tuple[float, list]:
        """
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        """
        a1 = aset.inferred_types(s1)
        a2 = aset.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0, list()

        shared_terms = a1.intersection(a2)

        # Note: we need to convert the shared_terms set to a list
        # to avoid later JSON serialization problems
        return len(shared_terms) / num_union, list(shared_terms)

    async def compute_jaccard(self,
                              input_genes: List[dict],
                              lower_bound: float = 0.7) -> List[dict]:
        similarities = []
        for index, igene in enumerate(input_genes):
            for subject_curie in self.associations.subject_label_map.keys():
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene.sim_input_curie,
                    subject_curie=subject_curie)
                if input_gene is not subject_curie:
                    score, shared_terms = \
                        GenericSimilarity.jaccard_similarity(self.associations, input_gene, subject_curie)
                    if score > lower_bound:
                        subject_label = self.associations.label(subject_curie)
                        # CX: addition of human-readable labels aka "shared_term_names"
                        shared_term_names = [
                            self.associations.label(x) for x in shared_terms
                        ]
                        similarities.append({
                            'input_id':
                            input_gene,
                            'input_symbol':
                            igene.input_symbol,
                            'hit_symbol':
                            subject_label,
                            'hit_id':
                            subject_curie,
                            'score':
                            score,
                            'shared_terms':
                            shared_terms,
                            'shared_term_names':
                            shared_term_names
                        })
        return similarities

    async def compute_jaccard_task(self, uuid: str, input_genes: List[dict],
                                   lower_bound: float):
        self._jaccard_similarity_tasks[uuid] = asyncio.create_task(
            self.compute_jaccard(input_genes, lower_bound))

    def compute_jaccard_async(self, input_genes: List[dict],
                              lower_bound: float):
        uuid = str(uuid4())
        asyncio.run(self.compute_jaccard_task(uuid, input_genes, lower_bound))
        return uuid

    @classmethod
    def get_jaccard_similarity_result(cls, computation_id: str):

        if computation_id in cls._jaccard_similarity_tasks:

            jaccard_similarity_task = cls._jaccard_similarity_tasks[
                computation_id]

            # Need to check if the result is ready to return, then return it
            if jaccard_similarity_task.done():

                try:
                    result = jaccard_similarity_task.result()

                except CancelledError:
                    raise JaccardSimilarityResultNotFound

                except InvalidStateError:
                    raise JaccardSimilarityComputationError

                return result

            else:
                raise JaccardSimilarityPending
        else:
            raise JaccardSimilarityResultNotFound

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie) -> str:
        if 'MGI:MGI:' in subject_curie and 'MGI:MGI:' in input_gene:
            return input_gene
        elif 'MGI:MGI:' not in subject_curie and 'MGI:MGI:' in input_gene:
            return input_gene[4:]

        else:
            return input_gene