Ejemplo n.º 1
0
 def fix_disease_id(self, evidence, logger=logging.getLogger(__name__)):
     """Replace ``evidence['disease']['id']`` with its ontology code.

     The current id is passed through ``get_ontology_code_from_url`` and the
     result is written back into ``evidence`` in place, whatever it is.

     A warning is logged on ``self.logger`` when no code could be extracted,
     or when the code does not split into exactly two parts on ``'_'``.

     :param evidence: evidence dict; must contain ``['disease']['id']``
         (and ``['id']``, which is used in the warning message).
     :param logger: unused; accepted for backward compatibility. Messages
         are emitted on ``self.logger``.
     """
     disease_id = evidence['disease']['id']
     new_disease_id = get_ontology_code_from_url(disease_id)
     evidence['disease']['id'] = new_disease_id
     # Check for a missing code first: calling .split() on None would raise
     # AttributeError and hide the warning below.
     if not new_disease_id:
         self.logger.warning(
             "No valid disease.id could be found in evidence: %s. Offending disease.id: %s",
             evidence['id'], disease_id)
     elif len(new_disease_id.split('_')) != 2:
         self.logger.warning("could not recognize disease.id: %s | added anyway", disease_id)
Ejemplo n.º 2
0
    def get_extended_evidence(self, evidence):
        """Return a new ``Evidence`` enriched with gene, disease and ECO details.

        The target gene, the disease and the evidence codes are looked up via
        ``self._get_gene_obj``, ``self._get_efo_obj`` and ``self._get_eco_obj``.
        Their extended info is attached under the ``target``, ``disease`` and
        ``evidence`` sections, and a ``private`` section is filled with the
        codes and facets collected along the way.

        :param evidence: object exposing ``evidence`` (the evidence dict),
            ``datasource`` and ``datatype``.
        :return: ``Evidence`` built from the extended dict and
            ``self.datasources_to_datatypes``.
        """
        # NOTE: shallow copy -- the nested 'target', 'disease' and 'evidence'
        # dicts are shared with the input, so writes into them below are
        # visible on the original evidence as well.
        extended_evidence = copy.copy(evidence.evidence)
        extended_evidence['private'] = dict()

        # Get generic gene info
        pathway_data = dict(pathway_type_code=[],
                            pathway_code=[])
        GO_terms = dict(biological_process=[],
                        cellular_component=[],
                        molecular_function=[],
                        )
        target_class = dict(level1=[],
                            level2=[])
        uniprot_keywords = []
        # TODO: handle domains
        geneid = extended_evidence['target']['id']
        gene = self._get_gene_obj(geneid)
        genes_info = ExtendedInfoGene(gene)
        if 'facets' in gene._private and 'reactome' in gene._private['facets']:
            reactome = gene._private['facets']['reactome']
            pathway_data['pathway_type_code'].extend(reactome['pathway_type_code'])
            pathway_data['pathway_code'].extend(reactome['pathway_code'])
        if gene.go:
            # The first character of the term string selects the GO facet;
            # the text from index 2 onwards is kept as the term itself.
            go_facet_by_category = {'P': 'biological_process',
                                    'F': 'molecular_function',
                                    'C': 'cellular_component'}
            for go in gene.go:
                go_code, data = go['id'], go['value']
                try:
                    category, term = data['term'][0], data['term'][2:]
                    facet = go_facet_by_category.get(category)
                except (KeyError, IndexError, TypeError):
                    # Best effort: skip GO entries with a missing or
                    # malformed term instead of failing the whole evidence.
                    continue
                if facet is not None:
                    GO_terms[facet].append(dict(code=go_code,
                                                term=term))
        if gene.uniprot_keywords:
            uniprot_keywords = gene.uniprot_keywords

        if genes_info:
            extended_evidence["target"][ExtendedInfoGene.root] = genes_info.data

        if pathway_data['pathway_code']:
            pathway_data['pathway_type_code'] = list(set(pathway_data['pathway_type_code']))
            pathway_data['pathway_code'] = list(set(pathway_data['pathway_code']))
        chembl_classes = gene.protein_classification['chembl'] \
            if 'chembl' in gene.protein_classification else None
        if chembl_classes:
            # Each level receives ONE nested list holding all values (append,
            # not extend); kept as-is because the output shape depends on it.
            target_class['level1'].append([i['l1'] for i in chembl_classes if 'l1' in i])
            target_class['level2'].append([i['l2'] for i in chembl_classes if 'l2' in i])

        # Get generic efo info
        # can it happen you get no efo codes but just one disease?
        all_efo_codes = []
        diseaseid = extended_evidence['disease']['id']
        efo = self._get_efo_obj(diseaseid)
        efo_info = ExtendedInfoEFO(efo)

        if efo_info:
            for path in efo_info.data['path']:
                all_efo_codes.extend(path)
            extended_evidence["disease"][ExtendedInfoEFO.root] = efo_info.data

        all_efo_codes = list(set(all_efo_codes))

        # Get generic eco info
        try:
            # This is the evidence's own list, so the append below also
            # extends extended_evidence['evidence']['evidence_codes'].
            all_eco_codes = extended_evidence['evidence']['evidence_codes']
            try:
                all_eco_codes.append(
                    get_ontology_code_from_url(extended_evidence['evidence']['gene2variant']['functional_consequence']))
            except KeyError:
                pass
            ecos_info = []
            for eco_id in all_eco_codes:
                eco = self._get_eco_obj(eco_id)
                if eco is not None:
                    ecos_info.append(ExtendedInfoECO(eco))
                else:
                    self.logger.warning("eco uri %s is not in the ECO LUT so it will not be considered as included", eco_id)

            if ecos_info:
                extended_evidence['evidence'][ExtendedInfoECO.root] = [eco_info.data for eco_info in ecos_info]
        except Exception:
            # Deliberate best effort: any failure while collecting ECO info
            # leaves the evidence without ECO details rather than aborting.
            extended_evidence['evidence'][ExtendedInfoECO.root] = None
            all_eco_codes = []

        # Add private objects used just for faceting
        private = extended_evidence['private']
        private['efo_codes'] = all_efo_codes
        private['eco_codes'] = all_eco_codes
        private['datasource'] = evidence.datasource
        private['datatype'] = evidence.datatype
        private['facets'] = {}
        if pathway_data['pathway_code']:
            private['facets']['reactome'] = pathway_data
        if uniprot_keywords:
            private['facets']['uniprot_keywords'] = uniprot_keywords
        if GO_terms['biological_process'] or \
                GO_terms['molecular_function'] or \
                GO_terms['cellular_component']:
            private['facets']['go'] = GO_terms

        if target_class['level1']:
            private['facets']['target_class'] = target_class

        return Evidence(extended_evidence, self.datasources_to_datatypes)
Ejemplo n.º 3
0
    def fix_evidence(self, evidence):
        """Normalise known data problems in an evidence string.

        Fixes are applied here so that corrections do not have to be
        requested from the data provider. The underlying evidence dict is
        modified in place.

        :param evidence: object whose ``evidence`` attribute is the evidence
            dict to fix.
        :return: tuple ``(Evidence, fixed)``. ``fixed`` is True when a
            database version was blanked, the ``eva`` source was split into
            ``eva_somatic``, or an ECO-based score was written that differs
            from the score previously available.
        """
        evidence = evidence.evidence
        fixed = False

        def version_holder(step, in_dbxref):
            # Dict that carries the 'version' key for the given evidence step.
            database = evidence['evidence'][step]['provenance_type']['database']
            return database['dbxref'] if in_dbxref else database

        # fix errors in data here so nobody needs to ask corrections to the data provider
        # fix missing version in gwas catalog data
        # NOTE(review): the membership test looks at the top-level dict while
        # the values live under evidence['evidence'] -- confirm this is intended.
        for step in ('variant2disease', 'gene2variant'):
            if step not in evidence:
                continue
            for in_dbxref in (False, True):
                try:
                    float(version_holder(step, in_dbxref)['version'])
                except (KeyError, TypeError, ValueError, OverflowError):
                    # Missing or non-numeric version: blank it out.
                    version_holder(step, in_dbxref)['version'] = ''
                    fixed = True
        # Split EVA in two datasources depending on the datatype
        if (evidence['sourceID'] == 'eva') and \
                (evidence['type'] == 'somatic_mutation'):
            evidence['sourceID'] = 'eva_somatic'
            fixed = True
        # Move genetic_literature to genetic_association
        if evidence['type'] == 'genetic_literature':
            evidence['type'] = 'genetic_association'

        if 'provenance_type' in evidence and \
                'database' in evidence['provenance_type'] and \
                'version' in evidence['provenance_type']['database']:
            evidence['provenance_type']['database']['version'] = str(evidence['provenance_type']['database']['version'])

        # Enforce eco-based score for genetic_association evidencestrings
        if evidence['type'] == 'genetic_association':
            available_score = None
            try:
                available_score = evidence['evidence']['gene2variant']['resource_score']['value']
            except KeyError:
                if 'resource_score' in evidence['evidence'] and \
                        'value' in evidence['evidence']['resource_score']:
                    available_score = evidence['evidence']['resource_score']['value']
            try:
                eco_uri = evidence['evidence']['gene2variant']['functional_consequence']
            except KeyError:
                eco_uri = None
            # Evidence-level codes, when present, take precedence. Use the
            # first code with trailing whitespace stripped: the whole list is
            # unhashable and cannot be looked up in self.eco_scores, and the
            # stripped value has to be kept for the strip to have any effect.
            if 'evidence_codes' in evidence['evidence']:
                codes = evidence['evidence']['evidence_codes']
                if codes:
                    eco_uri = codes[0].rstrip()

            if eco_uri in self.eco_scores:
                gene2variant = evidence['evidence'].get('gene2variant') \
                    if 'gene2variant' in evidence['evidence'] else None
                # Only fill in a score when the gene2variant step has none.
                if 'gene2variant' in evidence['evidence'] and 'resource_score' not in gene2variant:
                    gene2variant['resource_score'] = {}
                    gene2variant['resource_score']['value'] = self.eco_scores[eco_uri]
                    gene2variant['resource_score']['type'] = 'probability'
                    if available_score != self.eco_scores[eco_uri]:
                        fixed = True
            else:
                self.logger.warning("Cannot find a score for eco code %s in evidence id %s" % (eco_uri, evidence['id']))

        # Remove identifiers.org from genes and map to ensembl ids
        self.fix_target_id(evidence, self.available_genes, self.non_reference_genes)

        # Remove identifiers.org from cttv activity  and target type ids
        if 'target_type' in evidence['target']:
            evidence['target']['target_type'] = evidence['target']['target_type'].split('/')[-1]
        if 'activity' in evidence['target']:
            evidence['target']['activity'] = evidence['target']['activity'].split('/')[-1]

        # Remove identifiers.org from efos
        self.fix_disease_id(evidence)

        # Remove identifiers.org from ecos
        details = evidence['evidence']
        if 'evidence_codes' in details:
            eco_ids = details['evidence_codes']
        elif 'variant2disease' in details:
            # extend() works on the variant2disease list itself, so that list
            # also gains the gene2variant codes (behaviour kept from before).
            eco_ids = details['variant2disease']['evidence_codes']
            if 'gene2variant' in details:
                eco_ids.extend(details['gene2variant']['evidence_codes'])
        elif 'target2drug' in details:
            # Same in-place extension as above, on the target2drug list.
            eco_ids = details['target2drug']['evidence_codes']
            eco_ids.extend(details['drug2clinic']['evidence_codes'])
        elif 'biological_model' in details:
            eco_ids = details['biological_model']['evidence_codes']
        else:
            eco_ids = []  # something wrong here...
        eco_ids = list(set(eco_ids))
        new_eco_ids = []
        for idorg_eco_uri in eco_ids:
            code = get_ontology_code_from_url(idorg_eco_uri.strip())
            if code is not None:
                new_eco_ids.append(code)
        details['evidence_codes'] = list(set(new_eco_ids))
        if not new_eco_ids:
            self.logger.warning("No valid ECO could be found in evidence: %s. original ECO mapping: %s" % (
                evidence['id'], str(eco_ids)[:100]))

        return Evidence(evidence, self.datasources_to_datatypes), fixed