def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
        """
        Link a gene to each of its pipe-delimited cross references.

        Each xref is rewritten as a curie (its prefix is translated through
        ``self.localtt`` when listed there) and then handled by prefix:

        * ``HPRD`` entries become ``has gene product`` triples,
        * prefixes in the local filter list are dropped,
        * ``ENSEMBL`` entries additionally receive an xref,
        * ``OMIM`` entries are swapped for a replacement typed as a gene
          (via ``self.omim_replaced``) and dropped when ``self.omim_type``
          types them as anything other than a gene.

        Whatever remains is asserted as an equivalent class when
        ``self.class_or_indiv`` records ``gene_id`` as a class ('C'),
        otherwise as the same individual.

        The clique leader map is read from
        ``self.resources['clique_leader']`` (/resources/clique_leader.yaml);
        when it names a prefix for ``taxon``, the node carrying that prefix
        (the xref, else the gene) is made the clique leader.

        :param xrefs: pipe-delimited xrefs, e.g.
            MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
        :param gene_id: curie of the gene being described
        :param taxon: key looked up in the clique leader map
        :return: None
        """

        clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

        graph = self.testgraph if self.test_mode else self.graph
        model = Model(graph)
        skipped_prefixes = ['Vega', 'IMGT/GENE-DB', 'Araport']

        for raw_xref in xrefs.strip().split('|'):
            pieces = raw_xref.split(':')
            # everything before the last colon is the prefix ('HGNC:HGNC')
            prefix = ':'.join(pieces[:-1]).strip()
            if prefix in self.localtt:
                prefix = self.localtt[prefix]
            xref_curie = ':'.join((prefix, pieces[-1]))

            # an xref without any prefix cannot form a curie
            if prefix == '':
                continue

            if prefix == 'HPRD':  # proteins are not == genes.
                model.addTriple(
                    gene_id, self.globaltt['has gene product'], xref_curie)
                continue

            # skip some of these for now based on curie prefix
            if prefix in skipped_prefixes:
                continue

            if prefix == 'ENSEMBL':
                model.addXref(gene_id, xref_curie)

            if prefix == 'OMIM':
                if xref_curie in self.omim_replaced:
                    # the last replacement typed as a gene wins
                    for candidate in self.omim_replaced[xref_curie]:
                        if candidate in self.omim_type and \
                                self.omim_type[candidate] == self.globaltt['gene']:
                            xref_curie = candidate
                if xref_curie in self.omim_type and \
                        self.omim_type[xref_curie] != self.globaltt['gene']:
                    continue

            try:
                if self.class_or_indiv.get(gene_id) != 'C':
                    model.addSameIndividual(gene_id, xref_curie)
                else:
                    model.addEquivalentClass(gene_id, xref_curie)
                    if taxon in clique_map:
                        leader_prefix = clique_map[taxon]
                        if leader_prefix == prefix:
                            model.makeLeader(xref_curie)
                        elif leader_prefix == gene_id.split(':')[0]:
                            model.makeLeader(gene_id)
            except AssertionError as err:
                LOG.warning("Error parsing %s: %s", gene_id, err)
Exemple #2
0
    def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
        """
        Add equivalentClass and sameAs relationships for a gene's xrefs.

        Uses the external resource map located in
        /resources/clique_leader.yaml to determine
        if an ID space is a clique leader.

        Each pipe-delimited xref is normalised with ``self._cleanup_id``.
        HPRD xrefs become ``has_gene_product`` triples, xrefs whose prefix
        is filtered (globally, or for the given taxon) are skipped, and
        OMIM xrefs that ``DipperUtil.is_omim_disease`` reports as diseases
        are skipped.  The rest are asserted as equivalent classes when
        ``self.class_or_indiv`` records ``gene_id`` as a class ('C'),
        otherwise as the same individual.

        :param xrefs: pipe-delimited xrefs, e.g.
            MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
        :param gene_id: curie of the gene being described
        :param taxon: taxon number as a string; converted with ``int()``
            for the clique leader lookup
        :return: None
        """

        clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

        graph = self.testgraph if self.testMode else self.graph

        filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']
        # prefixes that are skipped only for particular taxa
        taxon_spec_filters = {
            '10090': ['ENSEMBL'],
        }
        filter_out += taxon_spec_filters.get(taxon, [])

        model = Model(graph)
        # deal with the xrefs
        # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
        for ref in xrefs.strip().split('|'):
            xref_curie = self._cleanup_id(ref)
            if xref_curie is None or xref_curie.strip() == '':
                continue

            if xref_curie.startswith('HPRD'):
                # proteins are not == genes.
                model.addTriple(
                    gene_id,
                    self.properties['has_gene_product'], xref_curie)
                continue

            xref_prefix = xref_curie.split(':')[0]
            # skip some of these for now
            if xref_prefix in filter_out:
                continue

            if xref_curie.startswith('OMIM') \
                    and DipperUtil.is_omim_disease(xref_curie):
                continue

            try:
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addEquivalentClass(gene_id, xref_curie)
                    # the clique map is keyed by integer taxon number;
                    # convert once per xref instead of once per lookup
                    taxon_num = int(taxon)
                    if taxon_num in clique_map:
                        leader_prefix = clique_map[taxon_num]
                        if leader_prefix == xref_prefix:
                            model.makeLeader(xref_curie)
                        elif leader_prefix == gene_id.split(':')[0]:
                            model.makeLeader(gene_id)
                else:
                    model.addSameIndividual(gene_id, xref_curie)
            except AssertionError as err:
                # Logger.warn is a deprecated alias; use warning() with
                # lazy %-style arguments instead
                logger.warning("Error parsing %s: %s", gene_id, err)
Exemple #3
0
    def make_association(self, record):
        """
        Construct the gene-to-phenotype association for one record and
        add it, with its labels, references and evidence, to the graph.

        The record is annotated in place: ``record['experiment_type']``
        receives a list of ``{'id', 'term'}`` dicts and
        ``record['pheno_obj']`` the parsed phenotype.  A phenotype written
        as "entity: quality" yields a minted ``MONARCH:`` identifier built
        from both APO ids; any other phenotype is looked up directly in
        ``self.apo_term_id``.

        :param record: mapping with the keys 'Experiment Type',
            'Phenotype', 'SGDID', 'Gene Name' and 'Reference'
            (pipe-delimited); whatever ``self._make_description`` reads
            is not visible here
        :return: None
        :raises KeyError: when an experiment type or phenotype term is not
            present in ``self.apo_term_id``
        """
        # prep record
        # remove the parenthesised description and map each
        # Experiment Type to its apo term
        experiment_types = record['Experiment Type'].split('(')[0].split(',')
        record['experiment_type'] = []
        for exp_type in experiment_types:
            exp_type = exp_type.strip()
            record['experiment_type'].append(
                {
                    'id': self.apo_term_id[exp_type],
                    'term': exp_type,
                })

        phenotype = record['Phenotype']
        pheno_obj = {
            'entity': {
                'term': None,
                'apo_id': None
            },
            'quality': {
                'term': None,
                'apo_id': None
            },
            # False = phenotype was descriptive; don't look for a quality
            'has_quality': False
        }
        if ':' in phenotype:
            pheno_obj['has_quality'] = True
            ent_qual = phenotype.split(': ')
            if len(ent_qual) < 2:
                # the colon is not followed by a space ("entity:quality");
                # splitting on ': ' alone would leave no quality part
                ent_qual = [part.strip() for part in phenotype.split(':')]
            entity = ent_qual[0]
            quality = ent_qual[1]
            pheno_obj['entity']['term'] = entity
            pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
            pheno_obj['quality']['term'] = quality
            pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
        else:
            pheno_obj['entity']['term'] = phenotype
            pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
        record['pheno_obj'] = pheno_obj

        # begin modeling
        model = Model(self.graph)

        # define the triple
        gene = 'SGD:{}'.format(record['SGDID'])
        relation = Model.object_properties['has_phenotype']  # has phenotype

        if pheno_obj['has_quality']:
            pheno_label = '{0}:{1}'.format(
                pheno_obj['entity']['term'],
                pheno_obj['quality']['term'])
            pheno_id = 'MONARCH:{0}{1}'.format(
                pheno_obj['entity']['apo_id'].replace(':', '_'),
                pheno_obj['quality']['apo_id'].replace(':', '_')
            )
        else:
            pheno_label = pheno_obj['entity']['term']
            pheno_id = pheno_obj['entity']['apo_id']

        g2p_assoc = Assoc(
            self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)
        if not pheno_obj['has_quality']:
            # only the descriptive case sets an explicit association id
            # NOTE(review): the entity/quality case leaves the id to Assoc;
            # confirm this asymmetry is intended
            assoc_id = g2p_assoc.make_association_id(
                definedby='yeastgenome.org', subject=gene,
                predicate=relation, object=pheno_id)
            g2p_assoc.set_association_id(assoc_id=assoc_id)

        # add to graph to mint assoc id
        g2p_assoc.add_association_to_graph()

        model.addLabel(subject_id=gene, label=record['Gene Name'])

        # add the association triple
        model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

        # make pheno subclass of UPHENO:0001001
        model.addTriple(
            subject_id=pheno_id,
            predicate_id=Model.object_properties['subclass_of'],
            obj='UPHENO:0001001')

        # label nodes
        # pheno label
        model.addLabel(subject_id=pheno_id, label=pheno_label)

        g2p_assoc.description = self._make_description(record)

        # add the references; an empty field splits to [''], so blank
        # entries are dropped rather than recorded as an empty identifier
        references = [
            ref for ref in record['Reference'].replace(' ', '').split('|')
            if ref
        ]

        if references:
            # make first ref in list the source
            primary_ref = references[0]
            g2p_assoc.add_source(identifier=primary_ref)
            ref_model = Reference(
                self.graph, primary_ref,
                Reference.ref_types['publication']
            )
            ref_model.addRefToGraph()

            # create equivalent source for any other refs in list
            for ref in references[1:]:
                model.addSameIndividual(sub=primary_ref, obj=ref)

        # add experiment type as evidence
        for exp_type in record['experiment_type']:
            g2p_assoc.add_evidence(exp_type['id'])
            model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

        # second pass writes the description, sources and evidence set
        # above; kept best-effort as in the original
        try:
            g2p_assoc.add_association_to_graph()
        except Exception as e:
            print(e)
Exemple #4
0
    def _build_gene_disease_model(
            self, gene_id, relation_id, disease_id, variant_label,
            consequence_predicate=None, consequence_id=None,
            allelic_requirement=None, pmids=None):
        """
        Build a gene (or gene variant) to disease association.

        When both ``consequence_predicate`` and ``consequence_id`` are
        given, a blank-node variant derived from ``variant_label`` is
        created, tied to the gene as its affected locus, and used as the
        subject of the association; otherwise the gene itself is the
        subject.  An allelic requirement is attached to the association
        only in the gene-level (non-variant) case.

        :param gene_id: curie of the gene
        :param relation_id: predicate of the association
        :param disease_id: curie of the disease
        :param variant_label: label of the variant; also the seed of its
            blank-node id
        :param consequence_predicate: predicate linking variant and
            consequence
        :param consequence_id: consequence term; ids starting with ':'
            also get a label derived from the id
        :param allelic_requirement: allelic requirement term; ids starting
            with ':' also get a label derived from the id
        :param pmids: sources of the association, defaults to an empty list
        :return: None
        """
        model = Model(self.graph)
        geno = Genotype(self.graph)

        if pmids is None:
            pmids = []

        variant_bnode = self.make_id(variant_label, "_")
        has_consequence = (
            consequence_predicate is not None and consequence_id is not None)

        subject_id = gene_id
        if has_consequence:
            model.addTriple(
                variant_bnode, consequence_predicate, consequence_id)
            # Hack to add labels to terms that
            # don't exist in an ontology
            if consequence_id.startswith(':'):
                model.addLabel(
                    consequence_id,
                    consequence_id.strip(':').replace('_', ' '))

            subject_id = variant_bnode
            # Typically we would type the variant using the
            # molecular consequence, but these are not specific
            # enough for us to make mappings (see translation table)
            model.addIndividualToGraph(
                variant_bnode, variant_label, self.globaltt['variant_locus'])
            geno.addAffectedLocus(variant_bnode, gene_id)
            model.addBlankNodeAnnotation(variant_bnode)

        assoc = G2PAssoc(
            self.graph, self.name, subject_id, disease_id, relation_id)
        assoc.source = pmids
        assoc.add_association_to_graph()

        # allelic requirements are only recorded for gene-level associations
        if allelic_requirement is None or has_consequence:
            return

        model.addTriple(
            assoc.assoc_id, self.globaltt['has_allelic_requirement'],
            allelic_requirement)
        if allelic_requirement.startswith(':'):
            model.addLabel(
                allelic_requirement,
                allelic_requirement.strip(':').replace('_', ' '))
Exemple #5
0
 def make_triples(self, source, package):
     """
     Add the triples describing one drug ``package`` to ``self.graph``.

     For ``source == 'drugbank'`` every entry of ``package['targets']``
     yields the drug-action-target triple plus typing and labelling
     triples.  For ``source == 'drugcentral'`` every entry of
     ``package['indications']`` and of ``package['interactions']`` is
     modelled.  Any other source adds nothing.

     :param source: 'drugbank' or 'drugcentral'
     :param package: dict holding 'unii' and, depending on the source,
         'drugbank_id' and 'targets', or 'indications' and 'interactions'
     :return: None
     """
     model = Model(self.graph)

     if source == 'drugbank':
         for target in package['targets']:
             drug = package['unii']
             action = target['action']
             protein = target['uniprot']

             # drug --action--> target protein
             model.addTriple(
                 subject_id=drug, predicate_id=action, obj=protein)
             model.addLabel(subject_id=protein, label=target['name'])
             model.addTriple(
                 subject_id=protein,
                 predicate_id=self.globaltt['subclass_of'],
                 obj=self.globaltt['polypeptide'])
             model.addTriple(
                 subject_id=package['drugbank_id'],
                 predicate_id=self.globaltt['equivalent_class'],
                 obj=drug)
             # the action is declared a sub-property of the generic
             # interaction relation
             model.addTriple(
                 subject_id=action,
                 predicate_id=self.globaltt['subPropertyOf'],
                 obj=self.globaltt['molecularly_interacts_with'])
             model.addTriple(
                 subject_id=drug,
                 predicate_id=self.globaltt['subclass_of'],
                 obj=self.globaltt['molecular entity'])

     elif source == 'drugcentral':
         for indication in package['indications']:
             drug = package['unii']
             condition = indication['snomed_id']

             model.addTriple(
                 subject_id=drug,
                 predicate_id=self.globaltt['is substance that treats'],
                 obj=condition)
             model.addTriple(
                 subject_id=drug,
                 predicate_id=self.globaltt['subclass_of'],
                 obj=self.globaltt['molecular entity'])
             model.addTriple(
                 subject_id=condition,
                 predicate_id=self.globaltt['subclass_of'],
                 obj=self.globaltt['disease'])
             model.addLabel(
                 subject_id=condition, label=indication['snomed_name'])

         for interaction in package['interactions']:
             drug = package['unii']
             protein = interaction['uniprot']

             model.addTriple(
                 subject_id=drug,
                 predicate_id=self.globaltt['molecularly_interacts_with'],
                 obj=protein)
             # model.addLabel(
             #    subject_id=interaction['uniprot'],
             #    label='Protein_{}'.format(interaction['uniprot']))
             model.addLabel(
                 subject_id=protein, label=interaction['target_name'])
             model.addTriple(
                 subject_id=drug,
                 predicate_id=self.globaltt['subclass_of'],
                 obj=self.globaltt['molecular entity'])
             model.addDescription(
                 subject_id=protein,
                 description=interaction['target_class'])
             model.addTriple(
                 subject_id=protein,
                 predicate_id=self.globaltt['subclass_of'],
                 obj=self.globaltt['polypeptide'])
Exemple #6
0
    def make_triples(self, source, package):
        """
        Add the triples describing one drug ``package`` to ``self.graph``.

        For ``source == 'drugbank'`` every entry of ``package['targets']``
        yields the drug-action-target triple plus typing and labelling
        triples.  For ``source == 'drugcentral'`` every entry of
        ``package['indications']`` and of ``package['interactions']`` is
        modelled.  Any other source adds nothing.  Ontology terms are
        written as literal curies in this method.

        :param source: 'drugbank' or 'drugcentral'
        :param package: dict holding 'unii' and, depending on the source,
            'drugbank_id' and 'targets', or 'indications' and
            'interactions'
        :return: None
        """
        model = Model(self.graph)

        if source == 'drugbank':
            for target in package['targets']:
                drug = package['unii']
                action = target['action']
                protein = target['uniprot']

                # drug --action--> target protein
                model.addTriple(
                    subject_id=drug, predicate_id=action, obj=protein)
                model.addLabel(subject_id=protein, label=target['name'])
                model.addTriple(
                    subject_id=protein,
                    predicate_id=Model.object_properties['subclass_of'],
                    obj='SO:0000104')
                model.addTriple(
                    subject_id=package['drugbank_id'],
                    predicate_id=Model.object_properties['equivalent_class'],
                    obj=drug)
                model.addTriple(
                    subject_id=action,
                    predicate_id='rdfs:subPropertyOf',
                    obj='RO:0002436')
                model.addTriple(
                    subject_id=drug,
                    predicate_id=Model.object_properties['subclass_of'],
                    obj='CHEBI:23367')

        elif source == 'drugcentral':
            for indication in package['indications']:
                drug = package['unii']
                condition = indication['snomed_id']

                model.addTriple(
                    subject_id=drug, predicate_id='RO:0002606', obj=condition)
                model.addTriple(
                    subject_id=drug,
                    predicate_id=Model.object_properties['subclass_of'],
                    obj='CHEBI:23367')
                model.addTriple(
                    subject_id=condition,
                    predicate_id=Model.object_properties['subclass_of'],
                    obj='DOID:4')
                model.addLabel(
                    subject_id=condition, label=indication['snomed_name'])

            for interaction in package['interactions']:
                drug = package['unii']
                protein = interaction['uniprot']

                model.addTriple(
                    subject_id=drug, predicate_id='RO:0002436', obj=protein)
                # model.addLabel(subject_id=interaction['uniprot'], label='Protein_{}'.format(interaction['uniprot']))
                model.addLabel(
                    subject_id=protein, label=interaction['target_name'])
                model.addTriple(
                    subject_id=drug,
                    predicate_id=Model.object_properties['subclass_of'],
                    obj='CHEBI:23367')
                model.addDescription(
                    subject_id=protein,
                    description=interaction['target_class'])
                model.addTriple(
                    subject_id=protein,
                    predicate_id=Model.object_properties['subclass_of'],
                    obj='SO:0000104')
Exemple #7
0
    def process_gaf(self, gaffile, limit, id_map=None):
        """
        Parse a gzipped, tab-delimited GAF file and add gene-to-GO
        associations (plus derived gene-to-phenotype associations for IMP
        evidence) to the graph.

        Per data row the method:

        * skips blank rows, '!' comment rows, rows lacking a field needed
          to build the association, and rows whose qualifier contains NOT;
        * maps UniProtKB object ids to gene ids through ``id_map`` (rows
          without a mapping are skipped and counted as misses);
        * labels and types the gene, adds its description, synonyms and
          taxon;
        * builds an association from the gene to the GO term with its
          evidence code and references;
        * for IMP evidence with a With/From value, builds one
          gene-to-phenotype association per usable With/From item.

        :param gaffile: path of the gzipped GAF file
        :param limit: stop once ``reader.line_num`` exceeds this value
            (ignored in test mode); ``None`` for no limit
        :param id_map: optional dict from UniProtKB accession to a gene
            curie of the form ``prefix:local_id``
        :return: None
        :raises SystemExit: when a data row has an unexpected column count
        """

        graph = self.testgraph if self.test_mode else self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", gaffile)
        uniprot_hit = 0
        uniprot_miss = 0
        col = self.gaf_columns

        # Column positions do not change between rows: resolve them once.
        idx = {name: col.index(name) for name in (
            'DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID',
            'DB:Reference', 'Evidence Code', 'With (or) From', 'Aspect',
            'DB_Object_Name', 'DB_Object_Synonym',
            'Taxon and Interacting taxon')}

        with gzip.open(gaffile, 'rb') as csvfile:
            reader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                delimiter='\t',
                                quotechar='\"')
            for row in reader:
                # blank lines parse to an empty row;
                # comments start with exclamation
                if not row or row[0].startswith('!'):
                    continue

                if len(row) != len(col):
                    LOG.error(
                        "Wrong number of columns %i, expected %i, got:\n\t%s",
                        len(row), len(col), row)
                    raise SystemExit(1)

                dbase = row[idx['DB']].strip()
                gene_num = row[idx['DB_Object_ID']].strip()
                gene_symbol = row[idx['DB_Object_Symbol']].strip()
                qualifier = row[idx['Qualifier']]
                go_id = row[idx['GO_ID']].strip()
                ref = row[idx['DB:Reference']].strip()
                eco_symbol = row[idx['Evidence Code']].strip()
                with_or_from = row[idx['With (or) From']]
                aspect = row[idx['Aspect']].strip()
                gene_name = row[idx['DB_Object_Name']]
                gene_synonym = row[idx['DB_Object_Synonym']]
                # object_type = row[col.index('DB_Object_Type')].strip()
                taxon = row[idx['Taxon and Interacting taxon']].strip()
                # date = row[col.index('Date')].strip()
                # assigned_by = row[col.index('Assigned_By')].strip()
                # annotation_extension = row[col.index('Annotation_Extension')]
                # gene_product_form_id = row[col.index('Gene_Product_Form_ID')]

                # test for required fields.
                # The previous test, `'' in [row[:10], row[12]]`, compared
                # a list slice against '' and therefore only ever looked at
                # row[12].  Qualifier, With/From, name and synonym may be
                # empty, so only the fields used to build the association
                # are tested.
                required = (
                    dbase, gene_num, gene_symbol, go_id, ref, eco_symbol,
                    aspect, taxon)
                if '' in required:
                    LOG.error(
                        "Missing required part of annotation on row %i:\n%s",
                        reader.line_num, str(row[:-4]))
                    continue

                # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
                if 'NOT' in qualifier:
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is None:
                        # without a mapping no gene id can be derived;
                        # skip instead of continuing with gene_id = None
                        uniprot_miss += 1
                        continue
                    # try/except much faster than checking
                    # for dict key membership
                    try:
                        gene_id = id_map[gene_num]
                    except KeyError:
                        # LOG.warning(
                        #   "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and gene_id[:9] != 'NCBIGene:' and\
                        gene_num not in self.test_ids:
                    continue

                model.addLabel(gene_id, gene_symbol)
                model.addType(gene_id, self.globaltt['gene'])

                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in gene_synonym.split('|'):
                        syn = syn.strip()
                        if syn[:10] == 'UniProtKB:':
                            model.addTriple(gene_id,
                                            self.globaltt['has gene product'],
                                            syn)
                        elif re.fullmatch(graph.curie_regexp, syn) is not None and\
                                syn.split(':')[0] not in self.wont_prefix:
                            # looks like an identifier but is kept as a
                            # literal synonym
                            LOG.warning(
                                'possible curie "%s" as a literal synomym for %s',
                                syn, gene_id)
                            if syn != '':
                                model.addSynonym(gene_id, syn)
                        elif syn != '':
                            model.addSynonym(gene_id, syn)

                # First taxon is for the gene, after the pipe are interacting taxa
                tax_curie = taxon.split('|')[0].replace('taxon', 'NCBITaxon')
                # this is a required field but good to safe
                if tax_curie:
                    geno.addTaxon(tax_curie, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = self.gaf_eco[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                # Normalise the references once; the curies are reused
                # for the derived phenotype associations below.
                ref_curies = []
                for raw_ref in ref.split('|'):
                    raw_ref = raw_ref.strip()
                    if raw_ref == '':
                        continue
                    ref_parts = raw_ref.split(':')
                    if len(ref_parts) < 2:
                        LOG.warning(
                            "Skipping reference without a prefix: %s",
                            raw_ref)
                        continue
                    prefix = ref_parts[-2]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref_curie = ':'.join((prefix, ref_parts[-1]))
                    ref_curies.append(ref_curie)
                    refg = Reference(graph, ref_curie)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref_curie)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    # the aspect was handed back unchanged
                    # NOTE(review): the contributes_to case sets a
                    # relationship but never adds the association to the
                    # graph -- confirm whether that is intended
                    if aspect == 'F' and re.search(r'contributes_to',
                                                   qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n",
                                str(assoc))

                # NOTE(review): the description is set after the
                # association was added above -- confirm it is still emitted
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                ########################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for itm in with_or_from.split('|'):
                        if itm == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                            LOG.warning("Skipping  %s from or with %s",
                                        uniprotid, itm)
                            continue
                        # sanity check/conversion on go curie prefix
                        itm_parts = itm.split(':')
                        if len(itm_parts) < 2:
                            LOG.warning(
                                "Skipping with/from item without a prefix: %s",
                                itm)
                            continue
                        (pfx, lclid) = itm_parts[-2:]  # last prefix wins
                        if pfx in self.localtt:
                            pfx = self.localtt[pfx]
                        itm = ':'.join((pfx, lclid))

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', itm):
                            targeted_gene_id = self.zfin.make_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(
                                itm, gene_id, targeted_gene_id)
                            g2p_assoc = G2PAssoc(graph, self.name,
                                                 targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', itm):
                            targeted_gene_id = self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(
                                itm, gene_id, targeted_gene_id)
                            g2p_assoc = G2PAssoc(graph, self.name,
                                                 targeted_gene_id, phenotypeid)
                        else:
                            g2p_assoc = G2PAssoc(graph, self.name, itm,
                                                 phenotypeid)
                        for ref_curie in ref_curies:
                            g2p_assoc.add_source(ref_curie)
                            # experimental phenotypic evidence
                            g2p_assoc.add_evidence(self.globaltt[
                                'experimental phenotypic evidence'])
                        g2p_assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be the evidence for the GO assoc?

                if not self.test_mode and limit is not None and \
                        reader.line_num > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the mapping download",
                uniprot_per, uniprot_tot)
Exemple #8
0
    def _add_study_provenance(
            self, phenotyping_center, colony, project_fullname,
            pipeline_name, pipeline_stable_id, procedure_stable_id,
            procedure_name, parameter_stable_id, parameter_name,
            statistical_method, resource_name, row_num):
        """
        Describe the study behind one result row and return its node.

        The study is a blank node whose id is derived from the
        concatenation of the center, colony, project, pipeline, procedure,
        parameter, statistical method and resource.  It is linked to its
        parts (procedure and statistical method), the parameter it
        measures, its agent (the phenotyping center) and the pipeline and
        project it is part of.  The colony is added as an individual of
        its own.

        :param phenotyping_center: str, from self.files['all']
        :param colony: str, from self.files['all']
        :param project_fullname: str, from self.files['all']
        :param pipeline_name: str, from self.files['all']
        :param pipeline_stable_id: str, from self.files['all']
        :param procedure_stable_id: str, from self.files['all']
        :param procedure_name: str, from self.files['all']
        :param parameter_stable_id: str, from self.files['all']
        :param parameter_name: str, from self.files['all']
        :param statistical_method: str, from self.files['all']
        :param resource_name: str, from self.files['all']
        :param row_num: not used by this method
        :return: study bnode
        """

        provenance_model = Provenance(self.graph)
        model = Model(self.graph)

        # A study is a blank node equal to its parts
        study_key = "{0}{1}{2}{3}{4}{5}{6}{7}".format(
            phenotyping_center, colony, project_fullname, pipeline_stable_id,
            procedure_stable_id, parameter_stable_id, statistical_method,
            resource_name)
        study_bnode = self.make_id(study_key, '_')
        model.addIndividualToGraph(study_bnode, None, self.globaltt['study'])

        # Study parts: nodes linked to the study with has_part
        procedure_id = self.resolve(procedure_stable_id)
        model.addIndividualToGraph(procedure_id, procedure_name)
        study_parts = [
            self.resolve(procedure_stable_id),
            self.resolve(statistical_method),
        ]
        provenance_model.add_study_parts(study_bnode, study_parts)

        # Parameter/measure statement: study measures parameter
        parameter_label = "{0} ({1})".format(parameter_name, procedure_name)

        logging.info("Adding Provenance")
        parameter_id = self.resolve(parameter_stable_id)
        model.addIndividualToGraph(parameter_id, parameter_label)
        provenance_model.add_study_measure(
            study_bnode, self.resolve(parameter_stable_id))

        # Colony
        colony_bnode = self.make_id("{0}".format(colony), '_')
        model.addIndividualToGraph(colony_bnode, colony)

        # Study agent
        center_id = self.resolve(phenotyping_center)
        model.addIndividualToGraph(
            center_id, phenotyping_center, self.globaltt['organization'])
        model.addTriple(
            study_bnode, self.globaltt['has_agent'],
            self.resolve(phenotyping_center))

        # Pipeline and project
        pipeline_id = self.resolve(pipeline_stable_id)
        model.addIndividualToGraph(pipeline_id, pipeline_name)
        model.addTriple(
            study_bnode, self.globaltt['part_of'],
            self.resolve(pipeline_stable_id))

        project_id = self.resolve(project_fullname)
        model.addIndividualToGraph(
            project_id, project_fullname, self.globaltt['project'])
        model.addTriple(
            study_bnode, self.globaltt['part_of'],
            self.resolve(project_fullname))

        return study_bnode
Exemple #9
0
    def _add_study_provenance(
            self,
            phenotyping_center,
            colony,
            project_fullname,
            pipeline_name,
            pipeline_stable_id,
            procedure_stable_id,
            procedure_name,
            parameter_stable_id,
            parameter_name,
            statistical_method,
            resource_name,
            row_num
    ):
        """
        Write one study, and the things it is connected to, into self.graph.

        The study identifier is minted with self.make_id (prefix '_') from the
        concatenation of: phenotyping_center, colony, project_fullname,
        pipeline_stable_id, procedure_stable_id, parameter_stable_id,
        statistical_method and resource_name.  The *_name arguments are only
        used as labels; they do not take part in the identifier.

        :param phenotyping_center: str, from self.files['all']
        :param colony: str, from self.files['all']
        :param project_fullname: str, from self.files['all']
        :param pipeline_name: str, from self.files['all']
        :param pipeline_stable_id: str, from self.files['all']
        :param procedure_stable_id: str, from self.files['all']
        :param procedure_name: str, from self.files['all']
        :param parameter_stable_id: str, from self.files['all']
        :param parameter_name: str, from self.files['all']
        :param statistical_method: str, from self.files['all']
        :param resource_name: str, from self.files['all']
        :param row_num: accepted for the caller's convenience; not read here
        :return: study bnode
        """

        prov = Provenance(self.graph)
        graph_model = Model(self.graph)

        # The study node is identified purely by the fields that define it
        id_fields = (
            phenotyping_center, colony, project_fullname, pipeline_stable_id,
            procedure_stable_id, parameter_stable_id, statistical_method,
            resource_name)
        study_bnode = self.make_id(
            "{}{}{}{}{}{}{}{}".format(*id_fields), '_')
        graph_model.addIndividualToGraph(
            study_bnode, None, self.globaltt['study'])

        # Study parts: the (labelled) procedure and the statistical method
        graph_model.addIndividualToGraph(
            self.resolve(procedure_stable_id), procedure_name)
        prov.add_study_parts(
            study_bnode,
            [self.resolve(procedure_stable_id),
             self.resolve(statistical_method)])

        # The measured parameter is labelled "<parameter> (<procedure>)"
        measure_label = "{} ({})".format(parameter_name, procedure_name)

        logging.info("Adding Provenance")
        graph_model.addIndividualToGraph(
            self.resolve(parameter_stable_id), measure_label)
        prov.add_study_measure(study_bnode, self.resolve(parameter_stable_id))

        # Colony gets its own node; nothing here links it to the study
        graph_model.addIndividualToGraph(
            self.make_id("{}".format(colony), '_'), colony)

        # The phenotyping center is the study's agent
        graph_model.addIndividualToGraph(
            self.resolve(phenotyping_center), phenotyping_center,
            self.globaltt['organization'])
        graph_model.addTriple(
            study_bnode, self.globaltt['has_agent'],
            self.resolve(phenotyping_center))

        # The study is part of both its pipeline and its project
        graph_model.addIndividualToGraph(
            self.resolve(pipeline_stable_id), pipeline_name)
        part_of = self.globaltt['part_of']
        graph_model.addTriple(
            study_bnode, part_of, self.resolve(pipeline_stable_id))

        graph_model.addIndividualToGraph(
            self.resolve(project_fullname), project_fullname,
            self.globaltt['project'])
        graph_model.addTriple(
            study_bnode, part_of, self.resolve(project_fullname))

        return study_bnode
Exemple #10
0
class Dataset:
    """
     This class produces metadata about a dataset that is compliant with the
     HCLS dataset specification:
     https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/#s4_4

     Summary level: The summary level provides a description of a dataset that is
     independent of a specific version or format. (e.g. the Monarch ingest of CTD)
     CURIE for this is something like MonarchData:[SOURCE IDENTIFIER]

     Version level: The version level captures version-specific characteristics of a
     dataset. (e.g. the 01-02-2018 ingest of CTD)
     CURIE for this is something like MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP]

     Distribution level: The distribution level captures metadata about a specific form
     and version of a dataset (e.g. turtle file for 01-02-2018 ingest of CTD). There is
     a [distribution level resource] for each different downloadable file we emit,
     i.e. one for the TTL file, one for the ntriples file, etc.
     CURIE for this is like MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].ttl
     or
     MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].nt
     or
     MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].[whatever file format]

     We write out at least the following triples:

     SUMMARY LEVEL TRIPLES:
     [summary level resource] - rdf:type -> dctypes:Dataset
     [summary level resource] - dct:title -> title (literal)
     [summary level resource] - dct:description -> description (literal)
                                                (use docstring from Source class)
     [summary level resource] - dcterms:source -> [source web page, e.g. omim.org]
     [summary level resource] - schema:logo -> [source logo IRI]
                                                (only when a logo is supplied)
     [summary level resource] - dct:publisher -> monarchinitiative.org
        n.b: about summary level resource triples:
        -- HCLS spec says we "should" link to our logo and web page, but I'm not,
        because it would confuse the issue of whether we are pointing to our logo/page
        or the logo/page of the data source for this ingest. Same below for
        [version level resource] and [distribution level resource] - I'm not linking
        to our page/logo down there either.
        - spec says we "should" include summary level triples describing Update
        frequency and SPARQL endpoint but I'm omitting this for now, because these are
        not clearly defined at the moment

     VERSION LEVEL TRIPLES:
     [version level resource] - rdf:type -> dctypes:Dataset
     [version level resource] - dct:title -> version title (literal)
     [version level resource] - dct:description -> version description (literal)
     [version level resource] - dct:created -> ingest timestamp [ISO 8601 compliant]
     [version level resource] - pav:version -> ingest timestamp (same one above)
     [version level resource] - dct:creator -> monarchinitiative.org
     [version level resource] - dct:publisher -> monarchinitiative.org
     [version level resource] - dct:isVersionOf -> [summary level resource]
     [version level resource] - dcterms:source -> [source file 1 IRI]
     [version level resource] - dcterms:source -> [source file 2 IRI]
     ...

     [source file 1 IRI] - pav:retrievedOn -> [download date timestamp]
     [source file 1 IRI] - pav:version -> [source version (if set, optional)]
     [source file 2 IRI] - pav:retrievedOn -> [download date timestamp]
     [source file 2 IRI] - pav:version -> [source version (if set, optional)]
     ...

     [version level resource] - pav:createdWith -> [Dipper github URI]
     [version level resource] - void:dataset -> [distribution level resource]

     [version level resource] - cito:citesAsAuthority -> [citation id 1]
     [version level resource] - cito:citesAsAuthority -> [citation id 2]
     [version level resource] - cito:citesAsAuthority -> [citation id 3]

        n.b: about version level resource triples:
        - spec says we "should" include Date of issue/dct:issued triple, but I'm not
        because it is redundant with this triple above:
        [version level resource] - dct:created -> time stamp
        and would introduce ambiguity and confusion if the two disagree. Same below
        for [distribution level resource] - dct:created -> time stamp below
        Also omitting:
          - triples linking to our logo and page, see above.
          - License/dct:license triple, because we will make this triple via the
            [distribution level resource] below
          - Language/dct:language triple b/c it seems superfluous. Same below for
            [distribution level resource] - no language triple.
        - [version level resource] - pav:version triple is also a bit redundant
        with the pav:version triple below, but the spec requires both these triples
        - I'm omitting the [version level resource] -> pav:previousVersion because
        Dipper doesn't know this info for certain at run time. Same below for
        [distribution level resource] - pav:previousVersion.


     DISTRIBUTION LEVEL TRIPLES:
     [distribution level resource] - rdf:type -> dctypes:Dataset
     [distribution level resource] - rdf:type -> dcat:Distribution
     [distribution level resource] - dct:title -> distribution title (literal)
     [distribution level resource] - dct:description -> distribution description (lit.)
     [distribution level resource] - dct:created -> ingest timestamp[ISO 8601 compliant]
     [distribution level resource] - pav:version -> ingest timestamp (same as above)
     [distribution level resource] - dct:creator -> monarchinitiative.org
     [distribution level resource] - dct:publisher -> monarchinitiative.org
     [distribution level resource] - dct:license -> [license info, if available
                    otherwise indicate unknown]
     [distribution level resource] - dcterms:rights -> [data rights IRI]
     [distribution level resource] - pav:createdWith -> [Dipper github URI]
     [distribution level resource] - dct:format -> [IRI of ttl|nt|whatever spec]
     [distribution level resource] - dct:downloadURL -> [ttl|nt URI]
     [distribution level resource] - void:triples -> [triples count (literal)]
     [distribution level resource] - void:entities -> [entities count (literal)]
     [distribution level resource] - void:distinctSubjects -> [subject count (literal)]
     [distribution level resource] - void:distinctObjects -> [object count (literal)]
     [distribution level resource] - void:properties -> [properties count (literal)]
     ...

        n.b: about distribution level resource triples:
        - omitting Vocabularies used/void:vocabulary and Standards
        used/dct:conformTo triples, because they are described in the ttl file
        - also omitting Example identifier/idot:exampleIdentifier and
        Example resource/void:exampleResource, because we don't really have one
        canonical example of either - they're all very different.
        - [distribution level resource] - dct:created should have the exact same
        time stamp as this triple above:
        [version level resource] - dct:created -> time stamp
        - this [distribution level resource] - pav:version triple should have the
        same object as [version level resource] - pav:version triple above
        - Data source provenance/dct:source triples are above in the
        [version level resource]
        - omitting Byte size/dct:byteSize, RDF File URL/void:dataDump, and
        Linkset/void:subset triples because they probably aren't necessary for MI right
        now
        - these triples "should" be emitted, but we will do this in a later iteration:
        # of classes    void:classPartition IRI
        # of literals   void:classPartition IRI
        # of RDF graphs void:classPartition IRI

     Note: Do not use blank nodes in the dataset graph. This dataset graph is added to
     the main Dipper graph in Source.write() like so

        $ mainGraph = mainGraph + datasetGraph

     which apparently in theory could lead to blank node ID collisions between the two
     graphs.

     Note also that this implementation currently does not support producing metadata
     for StreamedGraph graphs (see dipper/graph/StreamedGraph.py). StreamedGraph is
     currently not being used for any ingests, so this isn't a problem. There was
     talk of using StreamedGraph for a rewrite/refactor of the Clinvar ingest, which
     would probably require adding support here for StreamedGraph's.
    """
    def __init__(
            self,
            identifier,
            data_release_version,
            ingest_name,
            ingest_title,
            ingest_url,
            ingest_logo=None,
            ingest_description=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',  # rdf_graph, streamed_graph
            file_handle=None,
            distribution_type='ttl',
            dataset_curie_prefix='MonarchArchive'):
        """
        Build the dataset graph and immediately write the summary, version and
        distribution level triples into it.

        :param identifier: source identifier; joined to dataset_curie_prefix
            with ':' to form the dataset CURIE
        :param data_release_version: release version string; when None,
            today's date formatted as YYYYMMDD is used
        :param ingest_name: name used as the file stem of the download URL
        :param ingest_title: title literal; when None the dataset CURIE is used
        :param ingest_url: source web page; when None no source triple is
            written at the summary level
        :param ingest_logo: logo file name, appended to the 'MonarchLogoRepo'
            entry of the curie map; when None no schema:logo triple is written
        :param ingest_description: description literal (optional)
        :param license_url: license IRI; when None an "unknown license" IRI
            is written instead
        :param data_rights: data rights IRI (optional)
        :param graph_type: 'rdf_graph', 'streamed_graph' or None
        :param file_handle: passed through to StreamedGraph only
        :param distribution_type: file extension of the distribution
        :param dataset_curie_prefix: CURIE prefix for all three levels
        :raises ValueError: if graph_type is not one of the accepted values
        """
        dataset_curie = ':'.join([dataset_curie_prefix, identifier])

        if graph_type is None:
            self.graph = RDFGraph(None, dataset_curie)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True,
                                       dataset_curie,
                                       file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True, dataset_curie)
        else:
            # Fail here with a clear message rather than later with an
            # AttributeError on the never-assigned self.graph.
            raise ValueError(
                "Unknown graph_type {!r}; expected 'rdf_graph', "
                "'streamed_graph' or None".format(graph_type))

        if data_release_version is not None:
            self.data_release_version = data_release_version
        else:
            self.data_release_version = datetime.today().strftime("%Y%m%d")

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.identifier = dataset_curie
        self.citation = set()

        self.ingest_name = ingest_name
        self.ingest_title = ingest_title
        if self.ingest_title is None:
            self.ingest_title = dataset_curie

        self.ingest_url = ingest_url
        # ingest_logo is optional: concatenating None onto the repository
        # prefix would raise TypeError, so only build the IRI when given.
        if ingest_logo is None:
            self.ingest_logo = None
        else:
            self.ingest_logo = \
                self.curie_map.get('MonarchLogoRepo') + ingest_logo
        self.ingest_description = ingest_description

        self.date_issued = None

        self.license_url = license_url
        self.data_rights = data_rights
        self.distribution_type = distribution_type

        # set HCLS resource CURIEs
        self.summary_level_curie = ':'.join(
            [dataset_curie_prefix, '#' + identifier])
        self.version_level_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/#' + identifier
        self.distribution_level_turtle_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/rdf/' + \
            identifier + "." + self.distribution_type

        # The following might seem a little odd, but we need to set downloadURLs this
        # way in order for them to point to where they will end up in archive.MI.org as
        # of Sept 2019. URL is:
        #  https://archive.MI.org/[release version]/rdf/[ingest name].[dist type]
        self.download_url = \
            self.curie_map.get("MonarchArchive") + self.data_release_version + \
            "/rdf/" + self.ingest_name + "." + self.distribution_type

        self._set_summary_level_triples()
        self._set_version_level_triples()
        self._set_distribution_level_triples()

    def _set_summary_level_triples(self):
        """
        Write the summary level triples: type, title, publisher, logo (when
        one was supplied), identifier, and - when set - source page and
        description.
        :return: None
        """
        self.model.addType(self.summary_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(self.summary_level_curie, self.globaltt['title'],
                             self.ingest_title, True)
        self.model.addTriple(self.summary_level_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))
        if self.ingest_logo is not None:
            self.model.addTriple(self.summary_level_curie, "schema:logo",
                                 self.ingest_logo)
        self.graph.addTriple(self.summary_level_curie,
                             self.globaltt['identifier'],
                             self.summary_level_curie)
        if self.ingest_url is not None:
            self.graph.addTriple(self.summary_level_curie,
                                 self.globaltt["Source"], self.ingest_url)
        if self.ingest_description is not None:
            self.model.addDescription(self.summary_level_curie,
                                      self.ingest_description)

    def _set_version_level_triples(self):
        """
        Write the version level triples: type, title, optional description,
        creation date, version, creator, publisher, and the links to the
        summary level and distribution level resources.
        :return: None
        """
        self.model.addType(self.version_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['title'],
            self.ingest_title + " Monarch version " +
            self.data_release_version, True)
        if self.ingest_description is not None:
            self.model.addDescription(self.version_level_curie,
                                      self.ingest_description)
        # NOTE(review): the value is formatted YYYYMMDD but typed xsd:date,
        # whose lexical form is YYYY-MM-DD - confirm what consumers expect
        # before changing the format.
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['Date Created'],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date))
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['creator'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['isVersionOf'],
                             self.summary_level_curie,
                             object_is_literal=False)
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['distribution'],
                             self.distribution_level_turtle_curie,
                             object_is_literal=False)

    def _set_distribution_level_triples(self):
        """
        Write the distribution level triples: types, title, optional
        description, version, creation date, creator, publisher, created-with,
        format, download URL, license (an "unknown license" IRI when none was
        given) and optional rights.  Finishes by calling
        _declare_as_ontology().
        :return: None
        """
        self.model.addType(self.distribution_level_turtle_curie,
                           self.globaltt['Dataset'])
        self.model.addType(self.distribution_level_turtle_curie,
                           self.globaltt['distribution'])
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['title'],
            self.ingest_title + " distribution " + self.distribution_type,
            True)
        if self.ingest_description is not None:
            self.model.addDescription(self.distribution_level_turtle_curie,
                                      self.ingest_description)
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(
            self.distribution_level_turtle_curie,
            self.globaltt['Date Created'],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date))
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['creator'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['created_with'],
                             "https://github.com/monarch-initiative/dipper")
        # NOTE(review): the format IRI is always the Turtle spec, whatever
        # distribution_type is - confirm whether other types need their own.
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['format'],
                             "https://www.w3.org/TR/turtle/")
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['downloadURL'], self.download_url)
        if self.license_url is None:
            self.graph.addTriple(
                self.distribution_level_turtle_curie, self.globaltt['license'],
                'https://project-open-data.cio.gov/unknown-license/')
        else:
            self.graph.addTriple(self.distribution_level_turtle_curie,
                                 self.globaltt['license'], self.license_url)

        if self.data_rights is not None:
            self.graph.addTriple(self.distribution_level_turtle_curie,
                                 self.globaltt['rights'], self.data_rights)

        self._declare_as_ontology()

    def set_ingest_source_file_version_num(self, file_iri, version):
        """
        This method sets the version of a remote file or resource that is used in the
        ingest. It writes this triple:

        file_iri - 'pav:version' -> version

        Version is an untyped literal

        Note: if your version is a date or timestamp, use
        set_ingest_source_file_version_date()
        instead

        :param file_iri: a remote file or resource used in ingest
        :param version: a number or string (e.g. v1.2.3) that the source (OMIM, CTD)
        uses to refer to this version of the file/resource used during the ingest
        :return: None
        """
        self.graph.addTriple(file_iri,
                             self.globaltt['version'],
                             version,
                             object_is_literal=True)

    def set_ingest_source_file_version_date(self,
                                            file_iri,
                                            date,
                                            datatype=XSD.date):
        """
        This method sets the version that the source (OMIM, CTD, whatever) uses to
        refer to this version of the remote file/resource that was used in the ingest

        It writes this triple:

        file_iri - 'pav:version' -> date or timestamp

        Version is added as a literal of datatype XSD date by default

        Note: if file_iri was retrieved using get_files(), then the following triple
        was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: a date in YYYYMMDD format that the source (OMIM, CTD)
        uses to refer to this version of the file/resource used during the ingest.
        You can add a timestamp as a version by using a different datatype (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        :return: None
        """
        self.graph.addTriple(file_iri,
                             self.globaltt['version'],
                             date,
                             object_is_literal=True,
                             literal_type=datatype)

    def set_ingest_source_file_version_retrieved_on(self,
                                                    file_iri,
                                                    date,
                                                    datatype=XSD.date):
        """
        This method sets the date on which a remote file/resource (from OMIM, CTD, etc)
        was retrieved.

        It writes this triple:

        file_iri - 'pav:retrievedOn' -> date or timestamp

        The date is added as a literal of datatype XSD date by default

        Note: if file_iri was retrieved using get_files(), then the following triple
        was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: the retrieval date in YYYYMMDD format. You can
        add a timestamp instead by using a different datatype (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        :return: None
        """
        self.graph.addTriple(file_iri,
                             self.globaltt['retrieved_on'],
                             date,
                             object_is_literal=True,
                             literal_type=datatype)

    def set_ingest_source(self, url, predicate=None, is_object_literal=False):
        """
        This method writes a triple to the dataset graph indicating that the ingest
        used a file or resource at [url] during the ingest.

        Triple emitted is version_level_curie dcterms:source [url]

        This triple is likely to be redundant if Source.get_files() is used to retrieve
        the remote files/resources, since this triple should also be emitted
        as files/resources are being retrieved. This method is provided as a convenience
        method for sources that do their own downloading of files.

        :param url: a remote resource used as a source during ingest
        :param predicate: the predicate to use for the triple ["dcterms:source"]
                from spec (https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/)
                "Use dct:source when the source dataset was used in whole or in part.
                Use pav:retrievedFrom when the source dataset was used in whole and was
                not modified from its original distribution. Use prov:wasDerivedFrom
                when the source dataset was in whole or in part and was modified from
                its original distribution."
        :param is_object_literal: passed to addTriple as object_is_literal
                [False]
        :return: None
        """
        if predicate is None:
            predicate = self.globaltt["Source"]
        self.graph.addTriple(self.version_level_curie,
                             predicate,
                             url,
                             object_is_literal=is_object_literal,
                             subject_category=blv.terms['DataSetVersion'])

    def get_graph(self):
        """
        This method returns the dataset graph
        :return: dataset graph
        """
        return self.graph

    def get_license(self):
        """
        This method returns the license info
        :return: license info (the license_url given at construction)
        """
        return self.license_url

    def set_citation(self, citation_id):
        """
        This method adds [citation_id] argument to the set of citations, and also
        adds a triple indicating that version level cito:citesAsAuthority [citation_id]
        :param citation_id: identifier of the cited resource
        :return: None
        """
        self.citation.add(citation_id)
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['citesAsAuthority'], citation_id)

    def _declare_as_ontology(self, version_info=None):
        """
        Declare the summary level IRI as an ontology, and register the version
        level IRI as its OWL version IRI.  When version_info is given it is
        attached, as OWL version info, to the distribution level IRI.

        TEC: I am not convinced dipper reformatting external data as RDF triples
        makes an OWL ontology (nor that it should be considered a goal).

        Proper ontologies are built by ontologists. Dipper reformats data
        and annotates/decorates it with a minimal set of carefully arranged
        terms drawn from multiple proper ontologies.
        Which allows the whole (dipper's RDF triples and parent ontologies)
        to function as a single ontology we can reason over when combined
        in a store such as SciGraph.

        Including more than the minimal ontological terms in dipper's RDF
        output constitutes a liability as it allows greater divergence
        between dipper artifacts and the proper ontologies.

        :param version_info: a string describing version info for the ontology
        :return: None

        """
        model = Model(self.graph)
        model.addOntologyDeclaration(self.summary_level_curie)
        model.addOWLVersionIRI(self.summary_level_curie,
                               self.version_level_curie)
        if version_info is not None:
            model.addOWLVersionInfo(self.distribution_level_turtle_curie,
                                    version_info)

    @staticmethod
    def make_id(long_string, prefix='MONARCH'):
        """
        A method to create DETERMINISTIC identifiers
        based on a string's digest. currently implemented with sha1
        Duplicated from Source.py to avoid circular imports.
        :param long_string: string to use to generate identifier
        :param prefix: prefix to prepend to identifier [MONARCH]
        :return: a Monarch identifier of the form prefix:hash
        """
        return ':'.join((prefix, Dataset.hash_id(long_string)))

    @staticmethod
    def hash_id(word):  # same as graph/GraphUtils.digest_id(wordage)
        """
        Given a string, make a hash
        Duplicated from Source.py.

        The result is 20 characters: a literal 'b' followed by characters
        1..19 of the sha1 hex digest of the UTF-8 encoded word.

        :param word: str string to be hashed
        :return: hash of id
        """
        return 'b' + hashlib.sha1(word.encode('utf-8')).hexdigest()[1:20]
Exemple #11
0
    def make_association(self, record):
        """
        contstruct the association
        :param record:
        :return: modeled association of  genotype to mammalian??? phenotype
        """
        # prep record
        # remove description and mapp Experiment Type to apo term
        experiment_type = record['Experiment Type'].split('(')[0]
        experiment_type = experiment_type.split(',')
        record['experiment_type'] = list()
        for exp_type in experiment_type:
            exp_type = exp_type.lstrip().rstrip()
            record['experiment_type'].append({
                'id': self.apo_term_id[exp_type],
                'term': exp_type,
            })
        sgd_phenotype = record['Phenotype']
        pheno_obj = {
            'entity': {
                'term': None,
                'apo_id': None
            },
            'quality': {
                'term': None,
                'apo_id': None
            },
            'has_quality':
            False  # descriptive and don't bother looking for a quality
        }
        phenotype = record['Phenotype']
        if ':' in phenotype:
            pheno_obj['has_quality'] = True
            ent_qual = sgd_phenotype.split(': ')
            entity = ent_qual[0]
            quality = ent_qual[1]
            pheno_obj['entity']['term'] = entity
            pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
            pheno_obj['quality']['term'] = quality
            pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
        else:
            pheno_obj['entity']['term'] = phenotype
            pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
        record['pheno_obj'] = pheno_obj

        # begin modeling
        model = Model(self.graph)

        # define the triple
        gene = 'SGD:{}'.format(record['SGDID'])
        relation = self.globaltt['has phenotype']

        if record['pheno_obj']['has_quality']:
            pheno_label = '{0}:{1}'.format(
                record['pheno_obj']['entity']['term'],
                record['pheno_obj']['quality']['term'])
            pheno_id = 'MONARCH:{0}{1}'.format(
                record['pheno_obj']['entity']['apo_id'].replace(':', '_'),
                record['pheno_obj']['quality']['apo_id'].replace(':', '_'))
            g2p_assoc = Assoc(self.graph,
                              self.name,
                              sub=gene,
                              obj=pheno_id,
                              pred=relation)
        else:
            pheno_label = record['pheno_obj']['entity']['term']
            pheno_id = record['pheno_obj']['entity']['apo_id']
            g2p_assoc = Assoc(self.graph,
                              self.name,
                              sub=gene,
                              obj=pheno_id,
                              pred=relation)
            assoc_id = g2p_assoc.make_association_id('yeastgenome.org', gene,
                                                     relation, pheno_id)
            g2p_assoc.set_association_id(assoc_id=assoc_id)

        # add to graph to mint assoc id
        g2p_assoc.add_association_to_graph()

        model.addLabel(subject_id=gene, label=record['Gene Name'])

        # add the association triple
        model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

        model.addTriple(subject_id=pheno_id,
                        predicate_id=self.globaltt['subclass_of'],
                        obj=self.globaltt['phenotype'])

        # label nodes
        # pheno label

        model.addLabel(subject_id=pheno_id, label=pheno_label)

        g2p_assoc.description = self._make_description(record)

        # add the references
        references = record['Reference']
        references = references.replace(' ', '')
        references = references.split('|')

        #  created Ref prefix in curie map to route to proper reference URL in SGD
        if len(references) > 0:
            # make first ref in list the source
            g2p_assoc.add_source(identifier=references[0])
            ref_model = Reference(self.graph, references[0],
                                  self.globaltt['publication'])
            ref_model.addRefToGraph()

        if len(references) > 1:
            # create equivalent source for any other refs in list
            for ref in references[1:]:
                model.addSameIndividual(sub=references[0], obj=ref)

        # add experiment type as evidence
        for exp_type in record['experiment_type']:
            g2p_assoc.add_evidence(exp_type['id'])
            model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

        try:
            g2p_assoc.add_association_to_graph()
        except Exception as e:
            print(e)
        return
Exemple #12
0
    def make_association(self, record):
        """
        Construct the gene-to-phenotype association for one record.

        The record is annotated in place with two derived keys,
        'experiment_type' and 'pheno_obj', and the association together with
        its labels, description, references and evidence is added to
        self.graph.

        :param record: dict for one phenotype row. Keys read here:
            'Experiment Type', 'Phenotype', 'SGDID', 'Gene Name',
            'Strain Background', 'Allele', 'Chemical', 'Condition',
            'Details', 'Feature Name', 'Mutant Type', 'Reporter'
            and 'Reference'.
        :return: None
        """
        # prep record
        # drop the parenthesised description and map every comma separated
        # experiment type to its apo term
        # (the self.apo_term_id lookups fail for unknown terms;
        #  presumably a KeyError -- TODO confirm it is a plain dict)
        experiment_types = record['Experiment Type'].split('(')[0].split(',')
        record['experiment_type'] = [
            {
                'id': self.apo_term_id[exp_type],
                'term': exp_type,
            }
            for exp_type in (raw.strip() for raw in experiment_types)
        ]

        pheno_obj = {
            'entity': {
                'term': None,
                'apo_id': None
            },
            'quality': {
                'term': None,
                'apo_id': None
            },
            # False = phenotype was descriptive, don't bother looking
            # for a quality
            'has_quality': False
        }
        phenotype = record['Phenotype']
        if ':' in phenotype:
            # "<entity>: <quality>"
            pheno_obj['has_quality'] = True
            ent_qual = phenotype.split(': ')
            entity = ent_qual[0]
            quality = ent_qual[1]
            pheno_obj['entity']['term'] = entity
            pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
            pheno_obj['quality']['term'] = quality
            pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
        else:
            pheno_obj['entity']['term'] = phenotype
            pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
        record['pheno_obj'] = pheno_obj

        # begin modeling
        model = Model(self.graph)

        # define the triple
        gene = 'SGD:{}'.format(record['SGDID'])
        relation = Model.object_properties['has_phenotype']  # has phenotype

        if record['pheno_obj']['has_quality']:
            # entity + quality: build a combined id from both apo ids
            pheno_label = '{0}:{1}'.format(
                record['pheno_obj']['entity']['term'],
                record['pheno_obj']['quality']['term'])
            pheno_id = 'MONARCH:{0}{1}'.format(
                record['pheno_obj']['entity']['apo_id'].replace(':', '_'),
                record['pheno_obj']['quality']['apo_id'].replace(':', '_'))
            g2p_assoc = Assoc(self.graph,
                              self.name,
                              sub=gene,
                              obj=pheno_id,
                              pred=relation)
        else:
            pheno_label = record['pheno_obj']['entity']['term']
            pheno_id = record['pheno_obj']['entity']['apo_id']
            g2p_assoc = Assoc(self.graph,
                              self.name,
                              sub=gene,
                              obj=pheno_id,
                              pred=relation)
            # only this branch sets the association id explicitly
            assoc_id = g2p_assoc.make_association_id(
                definedby='yeastgenome.org',
                subject=gene,
                predicate=relation,
                object=pheno_id)
            g2p_assoc.set_association_id(assoc_id=assoc_id)

        # add to graph to mint assoc id
        g2p_assoc.add_association_to_graph()

        model.addLabel(subject_id=gene, label=record['Gene Name'])

        # add the association triple
        model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

        # make pheno subclass of UPHENO:0001001
        model.addTriple(subject_id=pheno_id,
                        predicate_id=Model.object_properties['subclass_of'],
                        obj='UPHENO:0001001')

        # label nodes
        # pheno label
        model.addLabel(subject_id=pheno_id, label=pheno_label)

        # add the description: all the unmodeled data in a ' | ' delimited list
        description = [
            'genomic_background: {}'.format(record['Strain Background']),
            'allele: {}'.format(record['Allele']),
            'chemical: {}'.format(record['Chemical']),
            'condition: {}'.format(record['Condition']),
            'details: {}'.format(record['Details']),
            'feature_name: {}'.format(record['Feature Name']),
            'gene_name: {}'.format(record['Gene Name']),
            'mutant_type: {}'.format(record['Mutant Type']),
            'reporter: {}'.format(record['Reporter']),
        ]
        g2p_assoc.description = " | ".join(description)

        # add the references
        # str.split always yields at least one item, so an empty column
        # (or a doubled / trailing '|') produces '' entries; drop them so
        # no source or reference node is created for an empty identifier
        references = [
            ref
            for ref in record['Reference'].replace(' ', '').split('|')
            if ref
        ]

        if references:
            # make first ref in list the source
            primary_ref = references[0]
            g2p_assoc.add_source(identifier=primary_ref)
            ref_model = Reference(self.graph, primary_ref,
                                  Reference.ref_types['publication'])
            ref_model.addRefToGraph()

            # create equivalent source for any other refs in list
            for ref in references[1:]:
                model.addSameIndividual(sub=primary_ref, obj=ref)

        # add experiment type as evidence
        for exp_type in record['experiment_type']:
            g2p_assoc.add_evidence(exp_type['id'])
            model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

        # second write, now that description, source and evidence are set;
        # best effort: a failure is reported and the record is skipped
        try:
            g2p_assoc.add_association_to_graph()
        except Exception as e:
            print(e)
        return
Exemple #13
0
    def _add_study_provenance(self, phenotyping_center, colony, project_name,
                              pipeline_name, pipeline_stable_id,
                              procedure_stable_id, procedure_name,
                              parameter_stable_id, parameter_name,
                              statistical_method, resource_name):
        """
        Add the provenance of one study to self.graph and return its node id.

        The study is identified by an id made from the concatenation of its
        parts. It is linked to the procedure and the resolved statistical
        method (study parts), the measured parameter, the phenotyping
        center (agent), the pipeline and each project (part_of).

        :param phenotyping_center: str, from self.files['g2p_assertions']['columns']
        :param colony: str, from self.files['g2p_assertions']
        :param project_name: str, from self.files['g2p_assertions'];
            may hold several comma separated project names, may be None or ''
        :param pipeline_name: str, from self.files['g2p_assertions']
        :param pipeline_stable_id: str, from self.files['g2p_assertions']
        :param procedure_stable_id: str, from self.files['g2p_assertions']
        :param procedure_name: str, from self.files['g2p_assertions']
        :param parameter_stable_id: str, from self.files['g2p_assertions']
        :param parameter_name: str, from self.files['g2p_assertions']
        :param statistical_method: str, from self.files['g2p_assertions']
        :param resource_name: str, from self.files['g2p_assertions']
        :return: study bnode
        """

        provenance_model = Provenance(self.graph)
        model = Model(self.graph)

        # Add provenance
        # A study is a blank node equal to its parts
        study_bnode = self.make_id(
            "{0}{1}{2}{3}{4}{5}{6}{7}".format(
                phenotyping_center,
                colony,
                project_name,  # switched to from 'project_fullname'  2020  V12
                pipeline_stable_id,
                procedure_stable_id,
                parameter_stable_id,
                statistical_method,
                resource_name),
            '_')

        model.addIndividualToGraph(study_bnode, None, self.globaltt['study'])

        # List of nodes linked to study with has_part property
        study_parts = []

        pipeline_curie = 'IMPC-pipe:' + pipeline_stable_id
        procedure_curie = 'IMPC-proc:' + procedure_stable_id
        # the parameter curie is the procedure id with the parameter id
        # appended as a '#' fragment
        parameter_curie = 'IMPC-param:' + procedure_stable_id
        parameter_curie += '#' + parameter_stable_id

        # Add study parts

        model.addIndividualToGraph(procedure_curie, procedure_name)
        study_parts.append(procedure_curie)  # ? stable or curie

        study_parts.append(self.resolve(statistical_method))
        provenance_model.add_study_parts(study_bnode, study_parts)

        # Add parameter/measure statement: study measures parameter
        parameter_label = "{0} ({1})".format(parameter_name, procedure_name)

        # logging.info("Adding Provenance for %s", project_name)
        model.addIndividualToGraph(parameter_curie, parameter_label)
        provenance_model.add_study_measure(study_bnode,
                                           parameter_curie,
                                           object_is_literal=False)

        # Add Colony
        # NOTE(review): the colony individual is created but not linked
        # to the study in this method
        colony_bnode = self.make_id("{0}".format(colony), '_')
        model.addIndividualToGraph(colony_bnode, colony)

        # Add study agent
        # the center must be present in the local translation table
        phenotyping_center_id = self.localtt[phenotyping_center]
        model.addIndividualToGraph(phenotyping_center_id, phenotyping_center,
                                   self.globaltt['organization'])

        # self.graph
        model.addTriple(study_bnode, self.globaltt['has_agent'],
                        phenotyping_center_id)

        # add pipeline and project
        model.addIndividualToGraph(pipeline_curie, pipeline_name)
        # self.graph
        model.addTriple(study_bnode, self.globaltt['part_of'], pipeline_curie)

        # as of V12 col 'project_fullname' became empty switched to 'project_name'
        if project_name is not None and project_name != '':
            for prj_nm in project_name.split(','):
                project_name_id = self.localtt[prj_nm]

                # label each project with its own name, not with the whole
                # comma separated list it was split from
                model.addIndividualToGraph(project_name_id, prj_nm,
                                           self.globaltt['project'])
                model.addTriple(study_bnode, self.globaltt['part_of'],
                                project_name_id)

        return study_bnode
Exemple #14
0
 def make_triples(self, source, package):
     """
     Add the triples describing one drug ``package`` to self.graph.

     :param source: 'drugbank' or 'drugcentral'; any other value adds nothing
     :param package: dict for one drug. Always read with key 'unii';
         for 'drugbank' also 'drugbank_id' and 'targets' (each target with
         'action', 'uniprot', 'name'); for 'drugcentral' also 'indications'
         (each with 'snomed_id', 'snomed_name') and 'interactions'
         (each with 'uniprot', 'target_name', 'target_class')
     :return: None
     """
     model = Model(self.graph)

     def add_subclass(child, parent_term):
         # child subclass_of <global term named parent_term>
         model.addTriple(subject_id=child,
                         predicate_id=self.globaltt['subclass_of'],
                         obj=self.globaltt[parent_term])

     if source == 'drugbank':
         for target in package['targets']:
             drug = package['unii']
             action = target['action']
             protein = target['uniprot']

             # drug --action--> protein target
             model.addTriple(
                 subject_id=drug, predicate_id=action, obj=protein)
             model.addLabel(subject_id=protein, label=target['name'])
             add_subclass(protein, 'polypeptide')

             # the drugbank id and the unii denote the same class
             model.addTriple(
                 subject_id=package['drugbank_id'],
                 predicate_id=self.globaltt['equivalent_class'],
                 obj=drug)

             # every action is a kind of molecular interaction
             model.addTriple(
                 subject_id=action,
                 predicate_id=self.globaltt['subPropertyOf'],
                 obj=self.globaltt['molecularly_interacts_with'])
             add_subclass(drug, 'molecular entity')

     elif source == 'drugcentral':
         for indication in package['indications']:
             drug = package['unii']
             treats = self.globaltt['is substance that treats']
             disease = indication['snomed_id']

             model.addTriple(
                 subject_id=drug, predicate_id=treats, obj=disease)
             add_subclass(drug, 'molecular entity')
             add_subclass(disease, 'disease')
             model.addLabel(
                 subject_id=disease, label=indication['snomed_name'])

         for interaction in package['interactions']:
             drug = package['unii']
             interacts = self.globaltt['molecularly_interacts_with']
             protein = interaction['uniprot']

             model.addTriple(
                 subject_id=drug, predicate_id=interacts, obj=protein)
             model.addLabel(
                 subject_id=protein, label=interaction['target_name'])
             add_subclass(drug, 'molecular entity')
             model.addDescription(
                 subject_id=protein,
                 description=interaction['target_class'])
             add_subclass(protein, 'polypeptide')
Exemple #15
0
    def make_triples(self, source, package):
        """
        Add the triples describing one drug ``package`` to self.graph.

        :param source: 'drugbank' or 'drugcentral'; any other value adds nothing
        :param package: dict for one drug. Always read with key 'unii';
            for 'drugbank' also 'drugbank_id' and 'targets' (each target with
            'action', 'uniprot', 'name'); for 'drugcentral' also
            'indications' (each with 'snomed_id', 'snomed_name') and
            'interactions' (each with 'uniprot', 'target_name',
            'target_class')
        :return: None
        """
        model = Model(self.graph)

        def add_subclass(child, parent):
            # child subclass_of parent
            model.addTriple(
                subject_id=child,
                predicate_id=Model.object_properties['subclass_of'],
                obj=parent)

        if source == 'drugbank':
            for target in package['targets']:
                drug = package['unii']
                action = target['action']
                protein = target['uniprot']

                # drug --action--> protein target
                model.addTriple(
                    subject_id=drug, predicate_id=action, obj=protein)
                model.addLabel(subject_id=protein, label=target['name'])
                add_subclass(protein, 'SO:0000104')

                # the drugbank id and the unii denote the same class
                model.addTriple(
                    subject_id=package['drugbank_id'],
                    predicate_id=Model.object_properties['equivalent_class'],
                    obj=drug)

                # every action is a sub property of RO:0002436
                model.addTriple(
                    subject_id=action,
                    predicate_id='rdfs:subPropertyOf',
                    obj='RO:0002436')
                add_subclass(drug, 'CHEBI:23367')

        elif source == 'drugcentral':
            for indication in package['indications']:
                drug = package['unii']
                disease = indication['snomed_id']

                model.addTriple(
                    subject_id=drug, predicate_id='RO:0002606', obj=disease)
                add_subclass(drug, 'CHEBI:23367')
                add_subclass(disease, 'DOID:4')
                model.addLabel(
                    subject_id=disease, label=indication['snomed_name'])

            for interaction in package['interactions']:
                drug = package['unii']
                protein = interaction['uniprot']

                model.addTriple(
                    subject_id=drug, predicate_id='RO:0002436', obj=protein)
                model.addLabel(
                    subject_id=protein, label=interaction['target_name'])
                add_subclass(drug, 'CHEBI:23367')
                model.addDescription(
                    subject_id=protein,
                    description=interaction['target_class'])
                add_subclass(protein, 'SO:0000104')
Exemple #16
0
    def _add_gene_equivalencies(self, dbxrefs, gene_id, taxon):
        """
        Add equivalentClass and sameAs relationships

        Uses external resource map located in
        /resources/clique_leader.yaml to determine
        if an NCBITaxon ID space is a clique leader

        :param dbxrefs: '|' delimited string of cross references,
            each of the form <prefix>:<local id>
        :param gene_id: curie of the gene the cross references belong to
        :param taxon: key looked up in the clique leader map
        """

        clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

        # write to the test graph when running in test mode
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        # prefixes to skip; '' also drops entries that have no prefix at all
        filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport', '']

        # deal with the dbxrefs
        # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696

        for dbxref in dbxrefs.strip().split('|'):
            # the prefix is everything before the last ':'
            prefix = ':'.join(dbxref.split(':')[:-1]).strip()  # restore nonterminal ':'

            # translate the source's prefix to the local one where mapped
            if prefix in self.localtt:
                prefix = self.localtt[prefix]

            # skip some of these for now based on curie prefix
            if prefix in filter_out:
                continue

            dbxref_curie = ':'.join((prefix, dbxref.split(':')[-1]))
            # NOTE(review): str.join always returns a str,
            # so this test cannot be False
            if dbxref_curie is not None:
                if prefix == 'HPRD':  # proteins are not == genes.
                    model.addTriple(
                        gene_id, self.globaltt['has gene product'], dbxref_curie)
                    continue

                # ENSEMBL gets an xref in addition to the equivalence below
                if prefix == 'ENSEMBL':
                    model.addXref(gene_id, dbxref_curie)
                if prefix == 'OMIM':
                    # drop the leading 'OMIM:' (5 characters)
                    omim_num = dbxref_curie[5:]
                    if omim_num in self.omim_replaced:
                        # use the replacement(s) that are typed as gene
                        # NOTE(review): when no replacement is a gene the
                        # original curie still reaches the equivalence
                        # step below -- confirm this is intended
                        repl = self.omim_replaced[omim_num]
                        for omim in repl:
                            if omim in self.omim_type and \
                                    self.omim_type[omim] == self.globaltt['gene']:
                                dbxref_curie = 'OMIM:' + omim
                                model.addXref(gene_id, dbxref_curie)
                                omim_num = omim  # last wins

                    elif omim_num in self.omim_type and\
                            self.omim_type[omim_num] == self.globaltt['gene']:
                        model.addXref(gene_id, dbxref_curie)
                    else:
                        continue  # no equivalence between ncbigene and omim non-gene
                # designate clique leaders
                # (perhaps premature as this ingest can't know what else exists)
                try:
                    # 'C' marks the gene as a class: use class equivalence,
                    # anything else is treated as an individual
                    if self.class_or_indiv.get(gene_id) == 'C':
                        model.addEquivalentClass(gene_id, dbxref_curie)
                        if taxon in clique_map:
                            # the leader is whichever side carries the
                            # prefix configured for this taxon
                            if clique_map[taxon] == prefix:
                                model.makeLeader(dbxref_curie)
                            elif clique_map[taxon] == gene_id.split(':')[0]:
                                model.makeLeader(gene_id)
                    else:
                        model.addSameIndividual(gene_id, dbxref_curie)
                except AssertionError as err:
                    LOG.warning("Error parsing %s: %s", gene_id, err)