def fix_disease_id(self, evidence, logger=logging.getLogger(__name__)):
    """Replace ``evidence['disease']['id']`` with its ontology code, in place.

    The current id is passed through ``get_ontology_code_from_url`` and the
    result is written back unconditionally ("added anyway"). Two warnings may
    be emitted on ``self.logger``:

    * the result does not consist of exactly two ``_``-separated parts;
    * the result is empty or ``None``.

    Args:
        evidence: dict with at least ``evidence['disease']['id']``;
            ``evidence['id']`` is read only when the second warning is logged.
        logger: not used by this method (messages go to ``self.logger``);
            kept so existing callers that pass it keep working.

    Returns:
        None. ``evidence`` is modified in place.
    """
    disease_id = evidence['disease']['id']
    new_disease_id = get_ontology_code_from_url(disease_id)

    # Check for None before calling .split(): otherwise a missing code would
    # raise AttributeError here and the "No valid disease.id" warning below
    # could never be reached for that case.
    if new_disease_id is None or len(new_disease_id.split('_')) != 2:
        self.logger.warning("could not recognize disease.id: %s | added anyway", disease_id)

    evidence['disease']['id'] = new_disease_id

    if not new_disease_id:
        self.logger.warning(
            "No valid disease.id could be found in evidence: %s. Offending disease.id: %s",
            evidence['id'],
            disease_id,
        )
def get_extended_evidence(self, evidence):
    """Return a new ``Evidence`` enriched with gene, EFO and ECO information.

    Starting from a shallow copy of ``evidence.evidence`` this method:

    * attaches ``ExtendedInfoGene`` data under ``target``;
    * attaches ``ExtendedInfoEFO`` data under ``disease``;
    * attaches a list of ``ExtendedInfoECO`` data under ``evidence`` (or
      ``None`` if anything in the ECO step raises);
    * fills a ``private`` section (efo/eco codes, datasource, datatype and
      the ``reactome`` / ``uniprot_keywords`` / ``go`` / ``target_class``
      facets, each only when non-empty).

    Args:
        evidence: object exposing ``evidence`` (dict), ``datasource`` and
            ``datatype``.

    Returns:
        ``Evidence(extended_evidence, self.datasources_to_datatypes)``.

    Note:
        The copy is shallow, so the nested ``target``, ``disease`` and
        ``evidence`` dicts are shared with the input object; in particular
        the functional-consequence code is appended to the input's own
        ``evidence_codes`` list.
    """
    extended_evidence = copy.copy(evidence.evidence)
    extended_evidence['private'] = dict()

    # Get generic gene info
    pathway_data = dict(pathway_type_code=[], pathway_code=[])
    GO_terms = dict(biological_process=[], cellular_component=[], molecular_function=[])
    target_class = dict(level1=[], level2=[])
    uniprot_keywords = []
    # TODO: handle domains
    geneid = extended_evidence['target']['id']
    gene = self._get_gene_obj(geneid)
    genes_info = ExtendedInfoGene(gene)
    if 'facets' in gene._private and 'reactome' in gene._private['facets']:
        reactome = gene._private['facets']['reactome']
        pathway_data['pathway_type_code'].extend(reactome['pathway_type_code'])
        pathway_data['pathway_code'].extend(reactome['pathway_code'])

    # The first character of the term selects the GO facet; the text from
    # index 2 onwards is stored as the term.
    go_facet_by_category = {
        'P': 'biological_process',
        'F': 'molecular_function',
        'C': 'cellular_component',
    }
    if gene.go:
        for go in gene.go:
            go_code, data = go['id'], go['value']
            try:
                category, term = data['term'][0], data['term'][2:]
                facet = go_facet_by_category.get(category)
                if facet is not None:
                    GO_terms[facet].append(dict(code=go_code, term=term))
            except (KeyError, IndexError, TypeError):
                # Best effort: skip GO entries whose 'term' is missing, empty
                # or not indexable, without hiding unrelated errors.
                continue

    if gene.uniprot_keywords:
        uniprot_keywords = gene.uniprot_keywords

    if genes_info:
        extended_evidence["target"][ExtendedInfoGene.root] = genes_info.data

    if pathway_data['pathway_code']:
        pathway_data['pathway_type_code'] = list(set(pathway_data['pathway_type_code']))
        pathway_data['pathway_code'] = list(set(pathway_data['pathway_code']))

    if 'chembl' in gene.protein_classification and gene.protein_classification['chembl']:
        chembl_classes = gene.protein_classification['chembl']
        # Each level receives ONE element that is itself a list of labels
        # (append, not extend); kept as-is since consumers are not visible here.
        target_class['level1'].append([i['l1'] for i in chembl_classes if 'l1' in i])
        target_class['level2'].append([i['l2'] for i in chembl_classes if 'l2' in i])

    # Get generic efo info
    # can it happen you get no efo codes but just one disease?
    all_efo_codes = []
    diseaseid = extended_evidence['disease']['id']
    efo = self._get_efo_obj(diseaseid)
    efo_info = ExtendedInfoEFO(efo)
    if efo_info:
        for path in efo_info.data['path']:
            all_efo_codes.extend(path)
        extended_evidence["disease"][ExtendedInfoEFO.root] = efo_info.data
    all_efo_codes = list(set(all_efo_codes))

    # Get generic eco info
    try:
        all_eco_codes = extended_evidence['evidence']['evidence_codes']
        try:
            all_eco_codes.append(
                get_ontology_code_from_url(
                    extended_evidence['evidence']['gene2variant']['functional_consequence']))
        except KeyError:
            # No gene2variant functional consequence: nothing to add.
            pass
        ecos_info = []
        for eco_id in all_eco_codes:
            eco = self._get_eco_obj(eco_id)
            if eco is not None:
                ecos_info.append(ExtendedInfoECO(eco))
            else:
                self.logger.warning(
                    "eco uri %s is not in the ECO LUT so it will not be considered as included",
                    eco_id)
        if ecos_info:
            extended_evidence['evidence'][ExtendedInfoECO.root] = [
                eco_info.data for eco_info in ecos_info
            ]
    except Exception:
        # Deliberate best effort: any failure in the ECO step yields an
        # evidence without ECO info and without ECO facet codes.
        extended_evidence['evidence'][ExtendedInfoECO.root] = None
        all_eco_codes = []

    # Add private objects used just for faceting
    private = extended_evidence['private']
    private['efo_codes'] = all_efo_codes
    private['eco_codes'] = all_eco_codes
    private['datasource'] = evidence.datasource
    private['datatype'] = evidence.datatype
    private['facets'] = {}
    if pathway_data['pathway_code']:
        private['facets']['reactome'] = pathway_data
    if uniprot_keywords:
        private['facets']['uniprot_keywords'] = uniprot_keywords
    if (GO_terms['biological_process']
            or GO_terms['molecular_function']
            or GO_terms['cellular_component']):
        private['facets']['go'] = GO_terms
    if target_class['level1']:
        private['facets']['target_class'] = target_class

    return Evidence(extended_evidence, self.datasources_to_datatypes)
def fix_evidence(self, evidence):
    """Normalise a raw evidence string in place and report whether it changed.

    Fixes known data problems so that providers do not need to be asked for
    corrections. Steps, in order:

    1. Blank database / dbxref ``version`` values that cannot be parsed as a
       float (``variant2disease`` and ``gene2variant`` provenance).
    2. Re-label ``eva`` + ``somatic_mutation`` evidence as ``eva_somatic``.
    3. Move ``genetic_literature`` to ``genetic_association`` (this does not
       set the ``fixed`` flag).
    4. Stringify the top-level provenance database version.
    5. For ``genetic_association``, enforce the score from
       ``self.eco_scores`` on ``gene2variant.resource_score``.
    6. Clean target ids (``self.fix_target_id``), target type / activity,
       and the disease id (``self.fix_disease_id``).
    7. Collect the evidence codes, convert them with
       ``get_ontology_code_from_url`` and store the de-duplicated result in
       ``evidence['evidence']['evidence_codes']``.

    Args:
        evidence: object whose ``evidence`` attribute is the dict to fix.

    Returns:
        tuple: ``(Evidence(fixed_dict, self.datasources_to_datatypes), fixed)``
        where ``fixed`` is True if a version was blanked, the EVA source was
        split, or an enforced ECO score differed from the supplied one.
    """
    evidence = evidence.evidence
    fixed = False

    def version_holder(section, nested_key=None):
        """Return the dict expected to carry ``version`` for *section*."""
        database = evidence['evidence'][section]['provenance_type']['database']
        return database if nested_key is None else database[nested_key]

    # fix errors in data here so nobody needs to ask corrections to the data provider
    # fix missing version in gwas catalog data
    # NOTE(review): the section name is looked up on the top-level dict while
    # the values are read from evidence['evidence'][section] - confirm which
    # level is intended before changing it.
    for section in ('variant2disease', 'gene2variant'):
        if section not in evidence:
            continue
        for nested_key in (None, 'dbxref'):
            try:
                float(version_holder(section, nested_key)['version'])
            except (KeyError, TypeError, ValueError):
                # Missing or non-numeric version: blank it. The holder is
                # resolved again outside the try so that a missing container
                # still surfaces as an error instead of being hidden.
                version_holder(section, nested_key)['version'] = ''
                fixed = True

    # Split EVA in two datasources depending on the datatype
    if evidence['sourceID'] == 'eva' and evidence['type'] == 'somatic_mutation':
        evidence['sourceID'] = 'eva_somatic'
        fixed = True

    # Move genetic_literature to genetic_association
    if evidence['type'] == 'genetic_literature':
        evidence['type'] = 'genetic_association'

    if ('provenance_type' in evidence
            and 'database' in evidence['provenance_type']
            and 'version' in evidence['provenance_type']['database']):
        evidence['provenance_type']['database']['version'] = str(
            evidence['provenance_type']['database']['version'])

    # Enforce eco-based score for genetic_association evidencestrings
    if evidence['type'] == 'genetic_association':
        available_score = None
        eco_uri = None
        try:
            available_score = evidence['evidence']['gene2variant']['resource_score']['value']
        except KeyError:
            if ('resource_score' in evidence['evidence']
                    and 'value' in evidence['evidence']['resource_score']):
                available_score = evidence['evidence']['resource_score']['value']

        try:
            eco_uri = evidence['evidence']['gene2variant']['functional_consequence']
            if 'evidence_codes' in evidence['evidence']:
                # NOTE(review): this stores the whole evidence_codes container,
                # not a single code, as the lookup key - confirm the intent.
                eco_uri = evidence['evidence']['evidence_codes']
        except KeyError:
            if 'evidence_codes' in evidence['evidence']:
                # str.rstrip() returns a new string; keep the result so the
                # lookup below is not defeated by trailing whitespace.
                eco_uri = evidence['evidence']['evidence_codes'][0].rstrip()

        if eco_uri in self.eco_scores:
            if 'gene2variant' in evidence['evidence']:
                gene2variant = evidence['evidence']['gene2variant']
                if 'resource_score' not in gene2variant:
                    gene2variant['resource_score'] = {}
                gene2variant['resource_score']['value'] = self.eco_scores[eco_uri]
                gene2variant['resource_score']['type'] = 'probability'
                if available_score != self.eco_scores[eco_uri]:
                    fixed = True
        else:
            self.logger.warning(
                "Cannot find a score for eco code %s in evidence id %s",
                eco_uri, evidence['id'])

    # Remove identifiers.org from genes and map to ensembl ids
    self.fix_target_id(evidence, self.available_genes, self.non_reference_genes)

    # Remove identifiers.org from cttv activity and target type ids
    if 'target_type' in evidence['target']:
        evidence['target']['target_type'] = evidence['target']['target_type'].split('/')[-1]
    if 'activity' in evidence['target']:
        evidence['target']['activity'] = evidence['target']['activity'].split('/')[-1]

    # Remove identifiers.org from efos
    self.fix_disease_id(evidence)

    # Remove identifiers.org from ecos
    evidence_section = evidence['evidence']
    if 'evidence_codes' in evidence_section:
        eco_ids = evidence_section['evidence_codes']
    elif 'variant2disease' in evidence_section:
        # Copy before extending so the variant2disease list is not polluted
        # with the gene2variant codes.
        eco_ids = list(evidence_section['variant2disease']['evidence_codes'])
        if 'gene2variant' in evidence_section:
            eco_ids.extend(evidence_section['gene2variant']['evidence_codes'])
    elif 'target2drug' in evidence_section:
        # Same as above: do not extend the target2drug list in place.
        eco_ids = list(evidence_section['target2drug']['evidence_codes'])
        eco_ids.extend(evidence_section['drug2clinic']['evidence_codes'])
    elif 'biological_model' in evidence_section:
        eco_ids = evidence_section['biological_model']['evidence_codes']
    else:
        eco_ids = []  # something wrong here...

    eco_ids = list(set(eco_ids))
    converted_codes = (
        get_ontology_code_from_url(idorg_eco_uri.strip()) for idorg_eco_uri in eco_ids
    )
    new_eco_ids = [code for code in converted_codes if code is not None]
    evidence_section['evidence_codes'] = list(set(new_eco_ids))
    if not new_eco_ids:
        self.logger.warning(
            "No valid ECO could be found in evidence: %s. original ECO mapping: %s",
            evidence['id'], str(eco_ids)[:100])

    return Evidence(evidence, self.datasources_to_datatypes), fixed