def test_semsearch():
    """Exercise SemSearchEngine end to end on a slice of the POMBASE GAF.

    Steps:
      1. Parse POMBASE and keep only associations whose subject label is
         listed in GENES.
      2. Build an association set and replace its ontology with the
         sub-ontology the set itself reports.
      3. Precompute all MICAs, then score every ordered pair of subjects.

    The only hard assertion is that a subject's cosine score against
    itself exceeds 0.9999; everything else is printed for inspection.
    """
    afa = AssociationSetFactory()
    ont = OntologyFactory().create(ONT)
    parser = GafParser()
    assocs = [
        a for a in parser.parse(POMBASE, skipheader=True)
        if a['subject']['label'] in GENES
    ]
    aset = afa.create_from_assocs(assocs, ontology=ont)

    # From here on work with the sub-ontology derived from the association
    # set rather than the full ontology loaded above.
    ont = aset.subontology()
    aset.ontology = ont
    logging.info('Genes=%s Terms=%s', len(aset.subjects), len(ont.nodes()))
    print('STATS={}'.format(aset.as_dataframe().describe()))

    sse = SemSearchEngine(assocmodel=aset)
    logging.info('Calculating all MICAs')
    sse.calculate_all_micas()

    logging.info('Doing pairwise')
    for i in aset.subjects:
        for j in aset.subjects:
            sim = sse.pw_score_cosine(i, j)
            if i == j:
                # Self-similarity must be 1 up to floating point error.
                assert sim > 0.9999
            tups = sse.pw_score_resnik_bestmatches(i, j)
            print('{} x {} = {} // {}'.format(i, j, sim, tups))
def load_associations(self, group):
    """Fetch the GAF file for ``group`` and store it as ``self.associations``.

    The download URL is derived from ``group``; ``'human'`` is special-cased
    to the GOA human file. Parsed records that carry a ``'header'`` key are
    dropped before the association set is built against ``self.ontology``.

    :param group: annotation group name used to pick the GAF file
    """
    gaf_parser = GafParser()
    factory = AssociationSetFactory()

    if group == 'human':
        gaf_url = "http://geneontology.org/gene-associations/goa_human.gaf.gz"
    else:
        gaf_url = "http://geneontology.org/gene-associations/gene_association.{}.gz".format(
            group)

    # Keep only real association records; header entries are skipped.
    records = []
    for record in gaf_parser.parse(gaf_url):
        if 'header' in record.keys():
            continue
        records.append(record)

    self.associations = factory.create_from_assocs(records,
                                                   ontology=self.ontology)
class GenericSimilarity(object):
    """Jaccard-similarity search of input genes against an association set.

    Call :meth:`retrieve_associations` first to populate
    ``self.associations``; :meth:`compute_jaccard` then compares each input
    gene with every subject in that set.
    """

    def __init__(self) -> None:
        # Empty-string placeholders until retrieve_associations() is called.
        self.associations = ''
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()

    def retrieve_associations(self, ont, group):
        """Load the ontology ``ont`` and the associations for ``group``.

        For ``ont == 'go'`` the GAF file for the group is parsed and only
        associations whose object is a descendant of GO:0008150
        (biological_process) or GO:0003674 (molecular_function) are kept.
        For any other ontology, gene-to-phenotype associations are created
        for the taxon that ``group`` maps to.

        :param ont: ontology handle passed to ``OntologyFactory.create``
        :param group: ``'human'`` or ``'mouse'``
        :raises KeyError: for a non-GO ontology when ``group`` has no taxon
            mapping
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(ont)
        p = GafParser()
        url = ''
        if ont == 'go':
            go_roots = set(
                self.ontology.descendants('GO:0008150') +
                self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_roots)
            if group == 'mouse':
                url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
            if group == 'human':
                url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
            # Parse the file selected for this group. The previous
            # hard-coded local 'goa_human.gaf.gz' ignored ``url`` entirely,
            # so 'mouse' silently loaded human annotations.
            assocs = p.parse(url)
            # Keep the raw parse result (headers included) for callers.
            self.assocs = assocs
            assocs = [x for x in assocs if 'header' not in x]
            assocs = [x for x in assocs if x['object']['id'] in go_roots]
            self.associations = self.afactory.create_from_assocs(
                assocs, ontology=sub_ont)
        else:
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_map[group])

    def compute_jaccard(self, input_genes: List[dict],
                        lower_bound: float = 0.7) -> List[dict]:
        """Return every (input gene, subject) pair scoring above ``lower_bound``.

        :param input_genes: dicts with at least the keys ``'sim_input_curie'``
            and ``'input_symbol'``
        :param lower_bound: exclusive minimum Jaccard score for a hit
        :return: one dict per hit with keys ``input_id``, ``input_symbol``,
            ``hit_symbol``, ``hit_id`` and ``score``
        """
        similarities = []
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map:
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene['sim_input_curie'],
                    subject_curie=subject_curie)
                # Skip comparing a gene with itself. This must be a value
                # comparison: ``is not`` tests object identity, which is
                # almost always true for two separately built strings.
                if input_gene != subject_curie:
                    score = jaccard_similarity(self.associations, input_gene,
                                               subject_curie)
                    if float(score) > float(lower_bound):
                        subject_label = self.associations.label(subject_curie)
                        similarities.append({
                            'input_id': input_gene,
                            'input_symbol': igene['input_symbol'],
                            'hit_symbol': subject_label,
                            'hit_id': subject_curie,
                            'score': score,
                        })
        return similarities

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie):
        """Make ``input_gene`` use the same MGI prefix style as ``subject_curie``.

        When the input contains the doubled ``MGI:MGI:`` prefix but the
        subject does not, the first four characters of the input are
        dropped; in every other case the input is returned unchanged.
        """
        if 'MGI:MGI:' in input_gene and 'MGI:MGI:' not in subject_curie:
            return input_gene[4:]
        return input_gene
class GenericSimilarity(object):
    """Jaccard-similarity search of input genes against an association set.

    Typical use: set ``self.ont``, call :meth:`load_associations`, run
    :meth:`compute_jaccard`, then pass its output to :meth:`sort_results`.
    """

    def __init__(self) -> None:
        self.associations = None
        # Ontology handle; must be assigned by the caller before
        # load_associations() is invoked.
        self.ont = ''
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()

    def load_associations(self, taxon) -> None:
        """Load the ontology named by ``self.ont`` and associations for ``taxon``.

        For ``self.ont == 'go'`` the GAF file for the taxon is parsed and
        only associations whose object is in the selected GO branches are
        kept. For any other ontology, gene-to-phenotype associations are
        created for the mapped NCBI taxon.

        :param taxon: ``'human'`` or ``'mouse'``
        :raises KeyError: for a non-GO ontology when ``taxon`` has no mapping
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(self.ont)
        p = GafParser()
        url = ''
        if self.ont == 'go':
            # CX: GO:0008150 is biological_process, GO:0003674 is molecular_function.
            # CX: These are 2 out of 3 top-level terms in GO ontology.
            # CX: The excluded term is cellular_component (where gene carries out a molecular function)
            go_roots = set(
                self.ontology.descendants('GO:0008150') +
                self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_roots)
            if taxon == 'mouse':
                url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
            if taxon == 'human':
                url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
            assocs = p.parse(url)
            # Keep the raw parse result (headers included) for callers.
            self.assocs = assocs
            assocs = [x for x in assocs if 'header' not in x]
            assocs = [x for x in assocs if x['object']['id'] in go_roots]
            self.associations = self.afactory.create_from_assocs(
                assocs, ontology=sub_ont)
        else:
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_map[taxon],
            )

    @staticmethod
    def jaccard_similarity(aset: "AssociationSet", s1: str,
                           s2: str) -> Tuple[float, list]:
        r"""
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        :param aset: association set providing ``inferred_types``
        :param s1: first subject id
        :param s2: second subject id
        :return: ``(score, shared_terms)``; ``(0.0, [])`` when neither
            subject has any inferred type
        """
        a1 = aset.inferred_types(s1)
        a2 = aset.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0, list()
        shared_terms = a1.intersection(a2)
        # Note: we need to convert the shared_terms set to a list
        # to avoid later JSON serialization problems
        return len(shared_terms) / num_union, list(shared_terms)

    def compute_jaccard(self, input_genes: List[dict],
                        lower_bound: float = 0.7) -> List[dict]:
        """Return every (input gene, subject) pair scoring above ``lower_bound``.

        :param input_genes: dicts with at least the keys ``'sim_input_curie'``
            and ``'input_symbol'``
        :param lower_bound: exclusive minimum Jaccard score for a hit
        :return: one dict per hit with keys ``input_id``, ``input_symbol``,
            ``hit_symbol``, ``hit_id``, ``score``, ``shared_terms`` and
            ``shared_term_names``
        """
        similarities = []
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map:
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene['sim_input_curie'],
                    subject_curie=subject_curie
                )
                # Skip comparing a gene with itself. This must be a value
                # comparison: ``is not`` tests object identity, which is
                # almost always true for two separately built strings.
                if input_gene != subject_curie:
                    score, shared_terms = GenericSimilarity.jaccard_similarity(
                        self.associations, input_gene, subject_curie)
                    if float(score) > float(lower_bound):
                        subject_label = self.associations.label(subject_curie)
                        # CX: addition of human-readable labels aka "shared_term_names"
                        shared_term_names = [
                            self.associations.label(x) for x in shared_terms
                        ]
                        similarities.append({
                            'input_id': input_gene,
                            'input_symbol': igene['input_symbol'],
                            'hit_symbol': subject_label,
                            'hit_id': subject_curie,
                            'score': score,
                            'shared_terms': shared_terms,
                            'shared_term_names': shared_term_names
                        })
        return similarities

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie) -> str:
        """Make ``input_gene`` use the same MGI prefix style as ``subject_curie``.

        When the input contains the doubled ``MGI:MGI:`` prefix but the
        subject does not, the first four characters of the input are
        dropped; in every other case the input is returned unchanged.
        """
        if 'MGI:MGI:' in input_gene and 'MGI:MGI:' not in subject_curie:
            return input_gene[4:]
        return input_gene

    @staticmethod
    def sort_results(results) -> pd.DataFrame:
        """Turn hit dicts into a DataFrame sorted by descending score.

        Rows where a gene is reported as a hit for itself
        (``hit_id == input_id``) are removed. An empty input yields an
        empty DataFrame.

        :param results: list of hit dicts as produced by compute_jaccard()
        """
        results = pd.DataFrame(results)
        if not results.empty:
            # CX: Some users need to know the scores that input genes have for each other.
            #     Only GeneA input = GeneA output rows are removed here.
            not_self_hit = ~(results.hit_id == results.input_id)
            results = results[not_self_hit].sort_values('score',
                                                        ascending=False)
        return results
class GenericSimilarity(object):
    """Jaccard-similarity search of input genes against an association set.

    Typical use: set ``self.ont``, call :meth:`load_associations`, run
    :meth:`compute_jaccard`, then pass its output to :meth:`sort_results`.
    """

    def __init__(self) -> None:
        self.associations = ''
        # Ontology handle read by load_associations(); initialised here so
        # the attribute always exists. Callers assign the real value.
        self.ont = ''
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()

    def load_associations(self, taxon):
        """Load the ontology named by ``self.ont`` and associations for ``taxon``.

        For ``self.ont == 'go'`` the GAF file for the taxon is parsed and
        only associations whose object is a descendant of GO:0008150 or
        GO:0003674 are kept. For any other ontology, gene-to-phenotype
        associations are created for the mapped NCBI taxon.

        :param taxon: ``'human'`` or ``'mouse'``
        :raises KeyError: for a non-GO ontology when ``taxon`` has no mapping
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(self.ont)
        p = GafParser()
        url = ''
        if self.ont == 'go':
            go_roots = set(
                self.ontology.descendants('GO:0008150') +
                self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_roots)
            if taxon == 'mouse':
                url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
            if taxon == 'human':
                url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
            assocs = p.parse(url)
            # Keep the raw parse result (headers included) for callers.
            self.assocs = assocs
            assocs = [x for x in assocs if 'header' not in x]
            assocs = [x for x in assocs if x['object']['id'] in go_roots]
            self.associations = self.afactory.create_from_assocs(
                assocs, ontology=sub_ont)
        else:
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_map[taxon],
            )

    @staticmethod
    def jaccard_similarity(aset: "AssociationSet", s1: str, s2: str) -> tuple:
        r"""
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        :param aset: association set providing ``inferred_types``
        :param s1: first subject id
        :param s2: second subject id
        :return: ``(score, shared_terms)`` where ``shared_terms`` is a set;
            ``(0.0, set())`` when neither subject has any inferred type
        """
        a1 = aset.inferred_types(s1)
        a2 = aset.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0, set()
        shared_terms = a1.intersection(a2)
        return len(shared_terms) / num_union, shared_terms

    def compute_jaccard(self, input_genes: List[dict],
                        lower_bound: float = 0.7) -> List[dict]:
        """Return every (input gene, subject) pair scoring above ``lower_bound``.

        :param input_genes: dicts with at least the keys ``'sim_input_curie'``
            and ``'input_symbol'``
        :param lower_bound: exclusive minimum Jaccard score for a hit
        :return: one dict per hit with keys ``input_id``, ``input_symbol``,
            ``hit_symbol``, ``hit_id``, ``score`` and ``shared_terms``
        """
        similarities = []
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map:
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene['sim_input_curie'],
                    subject_curie=subject_curie
                )
                # Skip comparing a gene with itself. This must be a value
                # comparison: ``is not`` tests object identity, which is
                # almost always true for two separately built strings.
                if input_gene != subject_curie:
                    score, shared_terms = GenericSimilarity.jaccard_similarity(
                        self.associations, input_gene, subject_curie)
                    if float(score) > float(lower_bound):
                        subject_label = self.associations.label(subject_curie)
                        similarities.append({
                            'input_id': input_gene,
                            'input_symbol': igene['input_symbol'],
                            'hit_symbol': subject_label,
                            'hit_id': subject_curie,
                            'score': score,
                            'shared_terms': shared_terms,
                        })
        return similarities

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie):
        """Make ``input_gene`` use the same MGI prefix style as ``subject_curie``.

        When the input contains the doubled ``MGI:MGI:`` prefix but the
        subject does not, the first four characters of the input are
        dropped; in every other case the input is returned unchanged.
        """
        if 'MGI:MGI:' in input_gene and 'MGI:MGI:' not in subject_curie:
            return input_gene[4:]
        return input_gene

    @staticmethod
    def sort_results(input_gene_set, results):
        """Turn hit dicts into a DataFrame sorted by descending score.

        Hits whose ``hit_id`` appears in ``input_gene_set['hit_id']`` are
        removed. An empty ``results`` yields an empty DataFrame instead of
        failing on the missing ``hit_id`` column.

        :param input_gene_set: object whose ``['hit_id']`` column supports
            ``tolist()`` (presumably a DataFrame; confirm against caller)
        :param results: list of hit dicts as produced by compute_jaccard()
        """
        results = pd.DataFrame(results)
        if results.empty:
            # No columns exist on an empty frame, so filtering would raise.
            return results
        annotated_gene_set = input_gene_set['hit_id'].tolist()
        return results[~results['hit_id'].isin(annotated_gene_set)] \
            .sort_values('score', ascending=False)
class GenericSimilarity(object):
    """Jaccard-similarity engine for one (ontology, taxon) pair.

    Instances load their associations on construction. Use
    :meth:`get_similarity_engine` to obtain the shared instance for a pair,
    :meth:`compute_jaccard_async` to start a search, and
    :meth:`get_jaccard_similarity_result` to fetch its outcome by id.
    """

    # Class level singletons for similarity engines
    _ontology = {}

    # Class level cache for results of Jaccard similarity searches
    _jaccard_similarity_tasks = {}

    @classmethod
    def get_similarity_engine(cls, ontology, taxon):
        """
        Returns a singleton GenericSimilarity instance for use in
        Jaccard similarity computations

        :param ontology: should be 'go', 'hp' or 'mp'
        :param taxon: should be 'human' or 'mouse'
        :return: GenericSimilarity() singleton
        :raises OntologyServerException: if either argument is not one of
            the accepted values
        """
        if ontology not in ['go', 'hp', 'mp']:
            raise OntologyServerException(
                "compute_jaccard() ERROR: ontology '{}' not recognized.".format(ontology))
        if taxon not in ['human', 'mouse']:
            raise OntologyServerException(
                "compute_jaccard() ERROR: taxon '{}' not recognized.".format(taxon))

        # Engines are cached per ontology, then per taxon, and built lazily.
        engines_for_ontology = cls._ontology.setdefault(ontology, {})
        if taxon not in engines_for_ontology:
            engines_for_ontology[taxon] = GenericSimilarity(ontology, taxon)
        return engines_for_ontology[taxon]

    def __init__(self, ont: str, taxon: str) -> None:
        self.associations = None
        self.ont = ont
        self.taxon = taxon
        self.ontology = ''
        self.assocs = ''
        self.afactory = AssociationSetFactory()
        # Loading happens eagerly, so construction performs the parsing
        # work of load_associations().
        self.load_associations()

    def load_associations(self) -> None:
        """Load the ontology ``self.ont`` and associations for ``self.taxon``.

        For ``self.ont == 'go'`` the GAF file for the taxon is parsed and
        only associations whose object is in the selected GO branches are
        kept. For any other ontology, gene-to-phenotype associations are
        created for the mapped NCBI taxon.

        :raises KeyError: for a non-GO ontology when ``self.taxon`` has no
            taxon mapping
        """
        taxon_map = {
            'human': 'NCBITaxon:9606',
            'mouse': 'NCBITaxon:10090',
        }
        ofactory = OntologyFactory()
        self.ontology = ofactory.create(self.ont)
        p = GafParser()
        url = ''
        if self.ont == 'go':
            # CX: GO:0008150 is biological_process, GO:0003674 is molecular_function.
            # CX: These are 2 out of 3 top-level terms in GO ontology.
            # CX: The excluded term is cellular_component (where gene carries out a molecular function)
            go_roots = set(
                self.ontology.descendants('GO:0008150') +
                self.ontology.descendants('GO:0003674'))
            sub_ont = self.ontology.subontology(go_roots)
            if self.taxon == 'mouse':
                url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
            if self.taxon == 'human':
                url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
            assocs = p.parse(url)
            # Keep the raw parse result (headers included) for callers.
            self.assocs = assocs
            assocs = [x for x in assocs if 'header' not in x]
            assocs = [x for x in assocs if x['object']['id'] in go_roots]
            self.associations = self.afactory.create_from_assocs(
                assocs, ontology=sub_ont)
        else:
            self.associations = self.afactory.create(
                ontology=self.ontology,
                subject_category='gene',
                object_category='phenotype',
                taxon=taxon_map[self.taxon],
            )

    @staticmethod
    def jaccard_similarity(aset: "AssociationSet", s1: str,
                           s2: str) -> Tuple[float, list]:
        r"""
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\ ancs(s2)|
        ---
        |ancs(s1) \/ ancs(s2)|

        :param aset: association set providing ``inferred_types``
        :param s1: first subject id
        :param s2: second subject id
        :return: ``(score, shared_terms)``; ``(0.0, [])`` when neither
            subject has any inferred type
        """
        a1 = aset.inferred_types(s1)
        a2 = aset.inferred_types(s2)
        num_union = len(a1.union(a2))
        if num_union == 0:
            return 0.0, list()
        shared_terms = a1.intersection(a2)
        # Note: we need to convert the shared_terms set to a list
        # to avoid later JSON serialization problems
        return len(shared_terms) / num_union, list(shared_terms)

    async def compute_jaccard(self, input_genes: List[dict],
                              lower_bound: float = 0.7) -> List[dict]:
        """Return every (input gene, subject) pair scoring above ``lower_bound``.

        :param input_genes: items exposing ``sim_input_curie`` and
            ``input_symbol`` as attributes (note: attribute access, not
            dict keys, despite the ``List[dict]`` annotation)
        :param lower_bound: exclusive minimum Jaccard score for a hit
        :return: one dict per hit with keys ``input_id``, ``input_symbol``,
            ``hit_symbol``, ``hit_id``, ``score``, ``shared_terms`` and
            ``shared_term_names``
        """
        similarities = []
        for igene in input_genes:
            for subject_curie in self.associations.subject_label_map:
                input_gene = GenericSimilarity.trim_mgi_prefix(
                    input_gene=igene.sim_input_curie,
                    subject_curie=subject_curie)
                # Skip comparing a gene with itself. This must be a value
                # comparison: ``is not`` tests object identity, which is
                # almost always true for two separately built strings.
                if input_gene != subject_curie:
                    score, shared_terms = GenericSimilarity.jaccard_similarity(
                        self.associations, input_gene, subject_curie)
                    if score > lower_bound:
                        subject_label = self.associations.label(subject_curie)
                        # CX: addition of human-readable labels aka "shared_term_names"
                        shared_term_names = [
                            self.associations.label(x) for x in shared_terms
                        ]
                        similarities.append({
                            'input_id': input_gene,
                            'input_symbol': igene.input_symbol,
                            'hit_symbol': subject_label,
                            'hit_id': subject_curie,
                            'score': score,
                            'shared_terms': shared_terms,
                            'shared_term_names': shared_term_names
                        })
        return similarities

    async def compute_jaccard_task(self, uuid: str, input_genes: List[dict],
                                   lower_bound: float):
        """Schedule compute_jaccard() as a task and register it under ``uuid``.

        Must run inside an event loop, since it calls
        ``asyncio.create_task``.
        """
        self._jaccard_similarity_tasks[uuid] = asyncio.create_task(
            self.compute_jaccard(input_genes, lower_bound))

    def compute_jaccard_async(self, input_genes: List[dict],
                              lower_bound: float):
        """Start a Jaccard search and return the id used to fetch its result.

        :return: a fresh UUID string to pass to
            get_jaccard_similarity_result()
        """
        computation_id = str(uuid4())
        # NOTE(review): asyncio.run() drives a private event loop to
        # completion before returning, and compute_jaccard() has no await
        # points, so the search appears to run synchronously inside this
        # call rather than in the background - confirm this is intended.
        asyncio.run(
            self.compute_jaccard_task(computation_id, input_genes,
                                      lower_bound))
        return computation_id

    @classmethod
    def get_jaccard_similarity_result(cls, computation_id: str):
        """Return the result of a search started by compute_jaccard_async().

        :param computation_id: id returned by compute_jaccard_async()
        :raises JaccardSimilarityResultNotFound: unknown id, or the task
            was cancelled
        :raises JaccardSimilarityPending: the task has not finished yet
        :raises JaccardSimilarityComputationError: the task result could
            not be read (InvalidStateError)
        """
        if computation_id not in cls._jaccard_similarity_tasks:
            raise JaccardSimilarityResultNotFound

        jaccard_similarity_task = cls._jaccard_similarity_tasks[computation_id]

        # Need to check if the result is ready to return, then return it
        if not jaccard_similarity_task.done():
            raise JaccardSimilarityPending

        try:
            return jaccard_similarity_task.result()
        except CancelledError:
            raise JaccardSimilarityResultNotFound
        except InvalidStateError:
            raise JaccardSimilarityComputationError

    @staticmethod
    def trim_mgi_prefix(input_gene, subject_curie) -> str:
        """Make ``input_gene`` use the same MGI prefix style as ``subject_curie``.

        When the input contains the doubled ``MGI:MGI:`` prefix but the
        subject does not, the first four characters of the input are
        dropped; in every other case the input is returned unchanged.
        """
        if 'MGI:MGI:' in input_gene and 'MGI:MGI:' not in subject_curie:
            return input_gene[4:]
        return input_gene