Beispiel #1
0
    def _build_gene_disease_model(self,
                                  gene_id,
                                  relation_id,
                                  disease_id,
                                  variant_label,
                                  consequence_predicate=None,
                                  consequence_id=None,
                                  allelic_requirement=None,
                                  pmids=None):
        """
        Add a gene-to-disease or variant-to-disease association to the graph.

        When both consequence_predicate and consequence_id are given, a
        blank-node variant is created from variant_label, linked to its
        consequence and to gene_id as affected locus, and used as the
        subject of the association.  Otherwise the gene is the subject and
        the allelic requirement, when given, is attached to the association.

        :param gene_id: gene identifier
        :param relation_id: predicate between the subject and the disease
        :param disease_id: disease identifier
        :param variant_label: variant label, also the seed for its bnode id
        :param consequence_predicate: predicate from variant to consequence
        :param consequence_id: consequence term identifier
        :param allelic_requirement: term attached to gene-level associations
        :param pmids: values stored as the source of the association
        :return: None
        """
        graph_model = Model(self.graph)
        genotype = Genotype(self.graph)

        if pmids is None:
            pmids = []

        # The bnode id is minted from the label even if it ends up unused
        variant_bnode = self.make_id(variant_label, "_")

        has_consequence = (consequence_predicate is not None
                           and consequence_id is not None)
        association_subject = gene_id

        if has_consequence:
            graph_model.addTriple(
                variant_bnode, consequence_predicate, consequence_id)
            # Hack to add labels to terms that
            # don't exist in an ontology
            if consequence_id.startswith(':'):
                readable = consequence_id.strip(':').replace('_', ' ')
                graph_model.addLabel(consequence_id, readable)

            association_subject = variant_bnode
            # Typically we would type the variant using the
            # molecular consequence, but these are not specific
            # enough for us to make mappings (see translation table)
            graph_model.addIndividualToGraph(
                variant_bnode, variant_label, self.globaltt['variant_locus'])
            genotype.addAffectedLocus(variant_bnode, gene_id)
            graph_model.addBlankNodeAnnotation(variant_bnode)

        association = G2PAssoc(
            self.graph, self.name, association_subject, disease_id,
            relation_id)
        association.source = pmids
        association.add_association_to_graph()

        # The allelic requirement is only recorded for gene-level associations
        if allelic_requirement is None or has_consequence:
            return

        graph_model.addTriple(
            association.assoc_id,
            self.globaltt['has_allelic_requirement'],
            allelic_requirement)
        if allelic_requirement.startswith(':'):
            readable = allelic_requirement.strip(':').replace('_', ' ')
            graph_model.addLabel(allelic_requirement, readable)
Beispiel #2
0
 def make_triples(self, source, package):
     """
     Add the triples for one drug record coming from the given source.

     For 'drugbank', each entry of package['targets'] yields the
     drug-action-target triple, a target label, and typing triples for
     the target, the drug and the action predicate.  For 'drugcentral',
     package['indications'] and then package['interactions'] are
     processed.  Any other source adds nothing.

     :param source: 'drugbank' or 'drugcentral'
     :param package: mapping describing the drug (read via 'unii' etc.)
     :return: None
     """
     model = Model(self.graph)

     if source == 'drugbank':
         for target in package['targets']:
             drug = package['unii']
             action = target['action']
             protein = target['uniprot']
             model.addTriple(
                 subject_id=drug, predicate_id=action, obj=protein)
             model.addLabel(subject_id=protein, label=target['name'])

             subclass_of = self.globaltt['subclass_of']
             model.addTriple(
                 subject_id=protein,
                 predicate_id=subclass_of,
                 obj=self.globaltt['polypeptide'])
             model.addTriple(
                 subject_id=package['drugbank_id'],
                 predicate_id=self.globaltt['equivalent_class'],
                 obj=drug)
             model.addTriple(
                 subject_id=action,
                 predicate_id=self.globaltt['subPropertyOf'],
                 obj=self.globaltt['molecularly_interacts_with'])
             model.addTriple(
                 subject_id=drug,
                 predicate_id=subclass_of,
                 obj=self.globaltt['molecular entity'])

     if source == 'drugcentral':
         for indication in package['indications']:
             drug = package['unii']
             treats = self.globaltt['is substance that treats']
             disease = indication['snomed_id']
             model.addTriple(
                 subject_id=drug, predicate_id=treats, obj=disease)

             subclass_of = self.globaltt['subclass_of']
             model.addTriple(
                 subject_id=drug,
                 predicate_id=subclass_of,
                 obj=self.globaltt['molecular entity'])
             model.addTriple(
                 subject_id=disease,
                 predicate_id=subclass_of,
                 obj=self.globaltt['disease'])
             model.addLabel(
                 subject_id=disease, label=indication['snomed_name'])

         for interaction in package['interactions']:
             drug = package['unii']
             interacts_with = self.globaltt['molecularly_interacts_with']
             protein = interaction['uniprot']
             model.addTriple(
                 subject_id=drug, predicate_id=interacts_with, obj=protein)
             model.addLabel(
                 subject_id=protein, label=interaction['target_name'])

             subclass_of = self.globaltt['subclass_of']
             model.addTriple(
                 subject_id=drug,
                 predicate_id=subclass_of,
                 obj=self.globaltt['molecular entity'])
             model.addDescription(
                 subject_id=protein,
                 description=interaction['target_class'])
             model.addTriple(
                 subject_id=protein,
                 predicate_id=subclass_of,
                 obj=self.globaltt['polypeptide'])
Beispiel #3
0
    def make_triples(self, source, package):
        """
        Add the triples for one drug record coming from the given source.

        For 'drugbank', each entry of package['targets'] yields the
        drug-action-target triple, a target label, and typing triples for
        the target, the drug and the action predicate.  For 'drugcentral',
        package['indications'] and then package['interactions'] are
        processed.  Any other source adds nothing.

        :param source: 'drugbank' or 'drugcentral'
        :param package: mapping describing the drug (read via 'unii' etc.)
        :return: None
        """
        model = Model(self.graph)

        if source == 'drugbank':
            for target in package['targets']:
                drug = package['unii']
                action = target['action']
                protein = target['uniprot']
                model.addTriple(
                    subject_id=drug, predicate_id=action, obj=protein)
                model.addLabel(subject_id=protein, label=target['name'])

                subclass_of = Model.object_properties['subclass_of']
                # SO:0000104 is the Sequence Ontology term "polypeptide"
                model.addTriple(
                    subject_id=protein,
                    predicate_id=subclass_of,
                    obj='SO:0000104')
                model.addTriple(
                    subject_id=package['drugbank_id'],
                    predicate_id=Model.object_properties['equivalent_class'],
                    obj=drug)
                # RO:0002436 is "molecularly interacts with"
                model.addTriple(
                    subject_id=action,
                    predicate_id='rdfs:subPropertyOf',
                    obj='RO:0002436')
                # CHEBI:23367 is "molecular entity"
                model.addTriple(
                    subject_id=drug,
                    predicate_id=subclass_of,
                    obj='CHEBI:23367')

        if source == 'drugcentral':
            for indication in package['indications']:
                drug = package['unii']
                disease = indication['snomed_id']
                # RO:0002606 is "is substance that treats"
                model.addTriple(
                    subject_id=drug, predicate_id='RO:0002606', obj=disease)

                subclass_of = Model.object_properties['subclass_of']
                model.addTriple(
                    subject_id=drug,
                    predicate_id=subclass_of,
                    obj='CHEBI:23367')
                # DOID:4 is the Disease Ontology root term "disease"
                model.addTriple(
                    subject_id=disease,
                    predicate_id=subclass_of,
                    obj='DOID:4')
                model.addLabel(
                    subject_id=disease, label=indication['snomed_name'])

            for interaction in package['interactions']:
                drug = package['unii']
                protein = interaction['uniprot']
                model.addTriple(
                    subject_id=drug, predicate_id='RO:0002436', obj=protein)
                model.addLabel(
                    subject_id=protein, label=interaction['target_name'])

                subclass_of = Model.object_properties['subclass_of']
                model.addTriple(
                    subject_id=drug,
                    predicate_id=subclass_of,
                    obj='CHEBI:23367')
                model.addDescription(
                    subject_id=protein,
                    description=interaction['target_class'])
                model.addTriple(
                    subject_id=protein,
                    predicate_id=subclass_of,
                    obj='SO:0000104')
Beispiel #4
0
    def make_triples(self, source, package):
        """
        Translate one drug record into graph triples.

        A 'drugbank' record contributes, per entry of package['targets'],
        the drug-action-target statement plus label and typing triples.
        A 'drugcentral' record contributes triples for every entry of
        package['indications'] followed by package['interactions'].
        Records from any other source are ignored.

        :param source: 'drugbank' or 'drugcentral'
        :param package: mapping describing the drug (read via 'unii' etc.)
        :return: None
        """
        model = Model(self.graph)

        if source == 'drugbank':
            for target in package['targets']:
                unii = package['unii']
                action = target['action']
                uniprot = target['uniprot']
                model.addTriple(
                    subject_id=unii, predicate_id=action, obj=uniprot)
                model.addLabel(subject_id=uniprot, label=target['name'])

                is_a = Model.object_properties['subclass_of']
                # target is a polypeptide (SO:0000104)
                model.addTriple(
                    subject_id=uniprot, predicate_id=is_a, obj='SO:0000104')
                model.addTriple(
                    subject_id=package['drugbank_id'],
                    predicate_id=Model.object_properties['equivalent_class'],
                    obj=unii)
                # the action refines "molecularly interacts with" (RO:0002436)
                model.addTriple(
                    subject_id=action,
                    predicate_id='rdfs:subPropertyOf',
                    obj='RO:0002436')
                # drug is a molecular entity (CHEBI:23367)
                model.addTriple(
                    subject_id=unii, predicate_id=is_a, obj='CHEBI:23367')

        if source == 'drugcentral':
            for indication in package['indications']:
                unii = package['unii']
                snomed_id = indication['snomed_id']
                # RO:0002606 is "is substance that treats"
                model.addTriple(
                    subject_id=unii, predicate_id='RO:0002606', obj=snomed_id)

                is_a = Model.object_properties['subclass_of']
                model.addTriple(
                    subject_id=unii, predicate_id=is_a, obj='CHEBI:23367')
                # indication is a disease (DOID:4)
                model.addTriple(
                    subject_id=snomed_id, predicate_id=is_a, obj='DOID:4')
                model.addLabel(
                    subject_id=snomed_id, label=indication['snomed_name'])

            for interaction in package['interactions']:
                unii = package['unii']
                uniprot = interaction['uniprot']
                model.addTriple(
                    subject_id=unii, predicate_id='RO:0002436', obj=uniprot)
                model.addLabel(
                    subject_id=uniprot, label=interaction['target_name'])

                is_a = Model.object_properties['subclass_of']
                model.addTriple(
                    subject_id=unii, predicate_id=is_a, obj='CHEBI:23367')
                model.addDescription(
                    subject_id=uniprot,
                    description=interaction['target_class'])
                model.addTriple(
                    subject_id=uniprot, predicate_id=is_a, obj='SO:0000104')
Beispiel #5
0
    def _parse_aeolus_data(self, document, or_limit=None):
        """
        Add one AEOLUS drug document and its outcome associations to the graph.

        The drug is labelled under both its RXCUI and UNII curies, the two
        are declared the same individual, and the InChI key is attached to
        the RXCUI curie as a literal.  Each retained outcome then becomes a
        drug-to-MedDRA association carrying its 'ror' value, evidence and
        provenance.

        :param document: mapping with 'aeolus' and 'unii' sub-mappings
        :param or_limit: when not None, only outcomes that have a 'ror'
            value >= or_limit are kept; when None every outcome is kept.
            The value is formatted into the association description as-is.
        :return: None
        """
        model = Model(self.graph)

        aeolus = document['aeolus']
        rxcui_curie = "RXCUI:{}".format(aeolus['rxcui'])
        uni_curie = "UNII:{}".format(aeolus['unii'])
        drug_name = aeolus['drug_name']
        model.addLabel(rxcui_curie, drug_name)
        model.addLabel(uni_curie, drug_name)

        model.addSameIndividual(rxcui_curie, uni_curie)
        self.graph.addTriple(
            rxcui_curie,
            model.annotation_properties['inchi_key'],
            document['unii']['inchikey'],
            object_is_literal=True)

        apply_cutoff = or_limit is not None

        for outcome in aeolus['outcomes']:
            # Skip outcomes below the cut-off (or lacking a ratio) if asked to
            if apply_cutoff and not (
                    'ror' in outcome and outcome['ror'] >= or_limit):
                continue

            association = Assoc(self.graph, self.name)

            meddra_curie = "MEDDRA:{}".format(outcome['code'])
            model.addLabel(meddra_curie, outcome['name'])

            association.sub = rxcui_curie
            association.obj = meddra_curie
            association.rel = Assoc.object_properties['causes_or_contributes']
            association.description = (
                "A proportional reporting ratio or odds "
                "ratio greater than or equal to {} in the "
                "AEOLUS data was the significance cut-off "
                "used for creating drug-outcome associations"
            ).format(or_limit)
            association.add_association_to_graph()
            association.add_predicate_object(
                Assoc.annotation_properties['probabalistic_quantifier'],
                outcome['ror'],
                'Literal')

            assoc_id = association.assoc_id
            self._add_outcome_evidence(assoc_id, outcome)
            self._add_outcome_provenance(association.assoc_id, outcome)
Beispiel #6
0
    def _parse_aeolus_data(self, document, or_limit=None):
        """
        Add one AEOLUS drug document and its outcome associations to the graph.

        The drug is labelled under both its RXCUI and UNII curies, the two
        are declared the same individual, and the InChI key is attached to
        the RXCUI curie as a literal.  Each retained outcome then becomes a
        drug-to-MedDRA association carrying its 'ror' value, evidence and
        provenance.

        :param document: mapping with 'aeolus' and 'unii' sub-mappings
        :param or_limit: when not None, only outcomes that have a 'ror'
            value >= or_limit are kept; when None every outcome is kept.
            The value is formatted into the association description as-is.
        :return: None
        """
        model = Model(self.graph)

        aeolus = document['aeolus']
        rxcui_curie = "RXCUI:{}".format(aeolus['rxcui'])
        uni_curie = "UNII:{}".format(aeolus['unii'])
        drug_name = aeolus['drug_name']
        model.addLabel(rxcui_curie, drug_name)
        model.addLabel(uni_curie, drug_name)

        model.addSameIndividual(rxcui_curie, uni_curie)
        self.graph.addTriple(
            rxcui_curie,
            self.globaltt['inchi_key'],
            document['unii']['inchikey'],
            object_is_literal=True)

        apply_cutoff = or_limit is not None

        for outcome in aeolus['outcomes']:
            # Skip outcomes below the cut-off (or lacking a ratio) if asked to
            if apply_cutoff and not (
                    'ror' in outcome and outcome['ror'] >= or_limit):
                continue

            association = Assoc(self.graph, self.name)

            meddra_curie = "MEDDRA:{}".format(outcome['code'])
            model.addLabel(meddra_curie, outcome['name'])

            association.sub = rxcui_curie
            association.obj = meddra_curie
            association.rel = self.globaltt['causes_or_contributes']
            association.description = (
                "A proportional reporting ratio or odds "
                "ratio greater than or equal to {} in the "
                "AEOLUS data was the significance cut-off "
                "used for creating drug-outcome associations"
            ).format(or_limit)
            association.add_association_to_graph()
            association.add_predicate_object(
                self.globaltt['probabalistic_quantifier'],
                outcome['ror'],
                'Literal')

            assoc_id = association.assoc_id
            self._add_outcome_evidence(assoc_id, outcome)
            self._add_outcome_provenance(association.assoc_id, outcome)
Beispiel #7
0
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.

    """

    def __init__(
            self,
            graph,
            feature_id=None,
            label=None,
            feature_type=None,
            description=None,
            feature_category=None
    ):
        """
        Create a feature bound to a graph; nothing is written to the graph yet.

        :param graph: Graph instance the feature's triples are added to
        :param feature_id: identifier of the feature
        :param label: label of the feature
        :param feature_type: type identifier of the feature
        :param description: description of the feature
        :param feature_category: category passed along with the feature's
            triples
        :raises ValueError: if graph is not a Graph instance
        """

        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.gfxutl = GraphUtils(self.curie_map)
        self.fid = feature_id
        self.feature_category = feature_category
        self.label = label
        self.ftype = feature_type
        self.description = description
        # start/stop hold location dicts built by _getLocation, or None
        self.start = None
        self.stop = None
        self.taxon = None

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None, position_types=None
    ):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        """

        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand, position_types)

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None, position_types=None
    ):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        """

        self.stop = self._getLocation(coordinate, reference_id, strand, position_types)

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types: extra position types; an explicitly empty
            list results in the generic 'Position' type being added

        :return: dict with keys 'coordinate', 'reference' and 'type'
        """

        loc = {}
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        if position_types == []:
            loc['type'].append(self.globaltt['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        Map a strand symbol to its translation-table term.

        :param strand: '+', '-', '.' or None (unknown)
        :return: the mapped term, or None when the strand is None or
            cannot be mapped (the latter is logged as a warning)
        """
        strand_id = None
        if strand == '+':
            strand_id = self.globaltt['plus_strand']
        elif strand == '-':
            strand_id = self.globaltt['minus_strand']
        elif strand == '.':
            strand_id = self.globaltt['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            LOG.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(
            self, add_region=True, region_id=None, feature_as_class=False,
            feature_category=None):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
        faldo:location region_id
        region_id a faldo:region
        faldo:begin start_position
        faldo:end end_position
        start_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id
        end_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        A feature for which only one of start/stop is known still gets a
        region; the missing coordinate shows up as 'UN' in the region label.

        :param add_region [True]
        :param region_id [None]
        :param feature_as_class [False]
        :param feature_category: a biolink category CURIE for feature
        """

        if feature_category is None:
            feature_category = self.feature_category

        if feature_as_class:
            self.model.addClassToGraph(
                self.fid, self.label, self.ftype, self.description,
                class_category=feature_category)
        else:
            self.model.addIndividualToGraph(
                self.fid, self.label, self.ftype, self.description,
                ind_category=feature_category)

        # without any location there is nothing to hang a region on
        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            if region_id is None:
                rid = self._makeRegionLabel()
                # blank node, bnode
                curie = '_:' + self.gfxutl.digest_id(rid)
                self.model.addLabel(curie, rid)
                region_id = curie

            self.graph.addTriple(
                self.fid,
                self.globaltt['location'],
                region_id,
                subject_category=feature_category
            )
            self.model.addIndividualToGraph(region_id, None, self.globaltt['Region'])
        else:
            # the feature itself plays the role of the region
            region_id = self.fid
            self.model.addType(region_id, self.globaltt['region'])

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(
                self.start['reference'], self.start['coordinate'], self.start['type'])
            self.addPositionToGraph(
                self.start['reference'], self.start['coordinate'], self.start['type'],
            )

        if self.stop is not None:
            endp = self._makePositionId(
                self.stop['reference'], self.stop['coordinate'], self.stop['type'])
            self.addPositionToGraph(
                self.stop['reference'], self.stop['coordinate'], self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

    def _makeRegionLabel(self):
        """
        Build the label from which the region blank node id is derived.

        The label is '<reference>-<start>-<stop>[-<strand>]-Region', where an
        unknown coordinate is written as 'UN'.  At least one of self.start /
        self.stop must be set.

        :return: the region label string
        """
        # Take the reference from whichever end is defined.  Only self.start
        # used to be consulted, which failed for features with just a stop.
        anchor = self.start if self.start is not None else self.stop
        regionchr = re.sub(r'\w+\:_?', '', anchor['reference'])

        # in case the values are undefined
        # if we know only one of the coordinates,
        # then we'll add an "unknown" other.
        st = sp = 'UN'
        strand = None
        if self.start is not None and self.start['coordinate'] is not None:
            st = str(self.start['coordinate'])
            strand = self._getStrandStringFromPositionTypes(self.start['type'])
        if self.stop is not None and self.stop['coordinate'] is not None:
            sp = str(self.stop['coordinate'])
            # NOTE(review): the stop strand is only consulted when the start
            # already supplied one; 'is None' may have been intended.  Left
            # as is because changing it would alter generated region ids.
            if strand is not None:
                strand = self._getStrandStringFromPositionTypes(
                    self.stop['type'])
        # assume that the strand is the same for both start and stop.
        # this will need to be fixed in the future
        region_items = [regionchr, st, sp]
        if strand is not None:
            region_items += [strand]
        rid = '-'.join(region_items)
        rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
        return rid + "-Region"

    def _getStrandStringFromPositionTypes(self, tylist):
        """
        Find the strand named by a list of position types.

        :param tylist: list of position type terms
        :return: 'plus', 'minus', 'both', or None if no strand term is present
        """
        strand = None
        if self.globaltt['plus_strand'] in tylist:
            strand = 'plus'
        elif self.globaltt['minus_strand'] in tylist:
            strand = 'minus'
        elif self.globaltt['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        The resulting blank node is also given a label made of the
        reference, coordinate and strand it was derived from.
        :param reference:
        :param coordinate:
        :param types:
        :return: bnode_curie, or None when reference is None
        """
        # blank node, bnode
        if reference is None:
            LOG.error("Trying to make position with no reference.")
            return None

        reference = re.sub(r'\w+\:', '', reference, 1)
        # startswith rather than [0] so an empty remainder does not raise
        if reference.startswith('_'):
            # in this case the reference is a bnode curie as well
            # ... this is a bad smell of over modeling
            reference = reference[1:]
        unique_words = reference
        if coordinate is not None:
            # just in case it isn't a string already
            unique_words = '-'.join((unique_words, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                unique_words = '-'.join((unique_words, tstring))

        curie = '_:' + self.gfxutl.digest_id(unique_words)

        # attach the wordage via a label
        # I want to see more of this (TEC 201905)
        # including a type should be mandatory as well
        self.model.addLabel(curie, unique_words)
        return curie

    def addRegionPositionToGraph(self, region_id, begin_position_id, end_position_id):
        """
        Link a region to its begin and end positions; a None position is
        silently skipped.

        :param region_id:
        :param begin_position_id:
        :param end_position_id:
        """

        if begin_position_id is None:
            pass
            # LOG.warn("No begin position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.globaltt['begin'], begin_position_id)

        if end_position_id is None:
            pass
            # LOG.warn("No end position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.globaltt['end'], end_position_id)

    def addPositionToGraph(
            self, reference_id, position, position_types=None, strand=None
    ):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.
        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param reference_id:
        :param position:
        :param position_types:
        :param strand:

        :return:  Identifier of the position created

        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if position is not None:
            self.graph.addTriple(
                pos_id,
                self.globaltt['position'],
                position,
                object_is_literal=True,
                literal_type="xsd:integer"
            )
        self.graph.addTriple(
            pos_id, self.globaltt['reference'], reference_id
        )
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)
        strnd = None
        if strand is not None:
            strnd = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                strnd = self._getStrandType(strand)
        # else:
        #    strnd = self.globaltt['both_strand']
        # fall back to the generic type only when nothing else typed the position
        if strnd is None and (position_types is None or position_types == []):
            strnd = self.globaltt['Position']

        if strnd is not None:
            self.model.addType(pos_id, strnd)

        return pos_id

    def addSubsequenceOfFeature(
            self, parentid, subject_category=None, object_category=None
    ):
        """
        This will add reciprocal triples like:
        feature <is subsequence of> parent
        parent has_subsequence feature
        :param parentid:
        :param subject_category: category of this feature
        :param object_category: category of the parent

        :return:

        """
        self.graph.addTriple(
            self.fid,
            self.globaltt['is subsequence of'],
            parentid,
            subject_category=subject_category,
            object_category=object_category
        )
        # this should be expected to be done in reasoning not ETL
        # (categories are swapped because subject and object swap roles)
        self.graph.addTriple(
            parentid,
            self.globaltt['has subsequence'],
            self.fid,
            subject_category=object_category,
            object_category=subject_category
        )

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        The taxon id is also remembered on the instance.
        :param taxonid:
        :return:
        """
        self.taxon = taxonid
        self.graph.addTriple(
            self.fid,
            self.globaltt['in taxon'],
            self.taxon,
            subject_category=self.feature_category
        )

    def addFeatureProperty(self, property_type, feature_property):
        """
        Add the triple: feature property_type feature_property
        :param property_type: predicate to use
        :param feature_property: object of the triple
        """

        self.graph.addTriple(
            self.fid,
            property_type,
            feature_property,
            subject_category=self.feature_category
        )
Beispiel #8
0
    def _build_gene_disease_model(
            self,
            gene_id,
            relation_id,
            disease_id,
            variant_label,
            consequence_predicate=None,
            consequence_id=None,
            allelic_requirement=None,
            pmids=None):
        """
        Add a gene-to-disease or variant-to-disease association to the graph.

        When both consequence_predicate and consequence_id are given, a
        blank-node variant is created from variant_label, linked to its
        consequence and to gene_id as affected locus, and used as the
        subject of the association.  Otherwise the gene is the subject and
        the allelic requirement, when given, is attached to the association.

        :param gene_id: gene identifier
        :param relation_id: predicate between the subject and the disease
        :param disease_id: disease identifier
        :param variant_label: variant label, also the seed for its bnode id
        :param consequence_predicate: predicate from variant to consequence
        :param consequence_id: consequence term identifier
        :param allelic_requirement: term attached to gene-level associations
        :param pmids: values stored as the source of the association
        :return: None
        """
        graph_model = Model(self.graph)
        genotype = Genotype(self.graph)

        if pmids is None:
            pmids = []

        # The bnode id is minted from the label even if it ends up unused
        variant_bnode = self.make_id(variant_label, "_")

        has_consequence = not (
            consequence_predicate is None or consequence_id is None)
        association_subject = gene_id

        if has_consequence:
            graph_model.addTriple(
                variant_bnode, consequence_predicate, consequence_id)
            # Hack to add labels to terms that
            # don't exist in an ontology
            if consequence_id.startswith(':'):
                readable = consequence_id.strip(':').replace('_', ' ')
                graph_model.addLabel(consequence_id, readable)

            association_subject = variant_bnode
            # Typically we would type the variant using the
            # molecular consequence, but these are not specific
            # enough for us to make mappings (see translation table)
            graph_model.addIndividualToGraph(
                variant_bnode, variant_label, self.globaltt['variant_locus'])
            genotype.addAffectedLocus(variant_bnode, gene_id)
            graph_model.addBlankNodeAnnotation(variant_bnode)

        association = G2PAssoc(
            self.graph, self.name, association_subject, disease_id,
            relation_id)
        association.source = pmids
        association.add_association_to_graph()

        # The allelic requirement is only recorded for gene-level associations
        if allelic_requirement is None or has_consequence:
            return

        graph_model.addTriple(
            association.assoc_id,
            self.globaltt['has_allelic_requirement'],
            allelic_requirement)
        if allelic_requirement.startswith(':'):
            readable = allelic_requirement.strip(':').replace('_', ' ')
            graph_model.addLabel(allelic_requirement, readable)
Beispiel #9
0
    def _add_variant_gene_relationship(self, patient_var_map, gene_coordinate_map):
        """
        Connect each patient variant to its gene(s) of interest.

        It is unclear what the best approach is for connecting variants
        to genes.  In most cases has_affected_locus/GENO:0000418 is accurate;
        however, there are cases where a variant is in the intron of one gene
        and is purported to causally affect another gene down or upstream.
        In these cases we must first disambiguate which gene is the affected
        locus, and which gene(s) are predicted to be causally influenced
        (RO:0002566).

        UPDATE 8-30: In the latest dataset we no longer have 1-many mappings
        between variants and genes, but leaving this here in case we see
        these in the future.

        The logic followed here is:
          * exactly one gene of interest: treat the variant as an allele of
            that gene (has_affected_feature);
          * otherwise, if the mutation type mentions upstream/downstream:
            compare the variant position with the coordinates of every gene
            of interest to decide which gene contains the variant and which
            genes are merely neighbours;
          * anything else is left unconnected.

        :param patient_var_map: dict keyed by patient; each value maps a
            variant id to a dict that is read here for the keys
            'genes_of_interest', 'type', 'position', 'build', 'chromosome',
            'reference_allele' and 'variant_allele'
        :param gene_coordinate_map: dict keyed by the gene id returned from
            DipperUtil.get_ncbi_id_from_symbol(); each value is read for the
            keys 'start', 'end' and 'strand'
        :return: None
        """
        dipper_util = DipperUtil()
        model = Model(self.graph)
        for variants in patient_var_map.values():
            for variant_id, variant in variants.items():
                variant_bnode = self.make_id("{0}".format(variant_id), "_")
                genes_of_interest = variant['genes_of_interest']
                if len(genes_of_interest) == 1:
                    # Assume variant is variant allele of gene
                    gene = genes_of_interest[0]
                    gene_id = dipper_util.get_ncbi_id_from_symbol(gene)
                    self._add_gene_to_graph(
                        gene, variant_bnode, gene_id,
                        self.globaltt['has_affected_feature'])

                elif re.search(r'upstream|downstream', variant['type'], flags=re.I):
                    # Attempt to disambiguate.  Every gene keeps its own
                    # resolved id so that later graph calls never reuse an
                    # id belonging to a different gene or variant.
                    ref_gene = []         # genes whose span contains the variant
                    up_down_gene = []     # genes with coordinates elsewhere
                    unmatched_genes = []  # no id or no coordinates available
                    for gene in genes_of_interest:
                        gene_id = dipper_util.get_ncbi_id_from_symbol(gene)
                        if gene_id and gene_id in gene_coordinate_map:
                            coords = gene_coordinate_map[gene_id]
                            if coords['start'] <= variant['position'] <= coords['end']:
                                ref_gene.append({
                                    'symbol': gene,
                                    'id': gene_id,
                                    'strand': coords['strand'],
                                })
                            else:
                                up_down_gene.append((gene, gene_id))
                        else:
                            unmatched_genes.append((gene, gene_id))

                    # One distinct reference gene.  In some cases there are
                    # multiple instances of the same gene from dupe rows in
                    # the source, hence the all-entries-equal test.
                    # Credit http://stackoverflow.com/a/3844832
                    if ref_gene and ref_gene[1:] == ref_gene[:-1]:
                        self._add_gene_to_graph(
                            ref_gene[0]['symbol'], variant_bnode, ref_gene[0]['id'],
                            self.globaltt['has_affected_feature'])

                        # update label with gene; build label expects a list
                        variant_label = self._build_variant_label(
                            variant['build'], variant['chromosome'],
                            variant['position'], variant['reference_allele'],
                            variant['variant_allele'], [ref_gene[0]['symbol']])
                        model.addLabel(variant_bnode, variant_label)

                    # Check if reference genes are on different strands
                    elif len(ref_gene) == 2:
                        strands = [st['strand'] for st in ref_gene]
                        if "minus" in strands and "plus" in strands:
                            relation_key = 'has_affected_feature'
                        else:
                            LOG.warning(
                                "unable to map intron variant to gene coordinates: %s",
                                variant)
                            relation_key = 'causally_influences'
                        for r_gene in ref_gene:
                            self._add_gene_to_graph(
                                r_gene['symbol'], variant_bnode, r_gene['id'],
                                self.globaltt[relation_key])
                    elif re.search(r'intron', variant['type'], flags=re.I):
                        LOG.warning(
                            "unable to map intron variant to gene coordinates_2: %s",
                            variant)

                    for neighbor, neighbor_id in up_down_gene:
                        self._add_gene_to_graph(
                            neighbor, variant_bnode, neighbor_id,
                            self.globaltt['causally_influences'])
                    # Unmatched genes are likely because we cannot map to an
                    # NCBIGene or we do not have coordinate information; their
                    # id may therefore be empty.
                    # NOTE(review): assumes _add_gene_to_graph tolerates an
                    # empty gene id - confirm against its implementation.
                    for unmatched_gene, unmatched_id in unmatched_genes:
                        self._add_gene_to_graph(
                            unmatched_gene, variant_bnode, unmatched_id,
                            self.globaltt['causally_influences'])
Beispiel #10
0
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
        unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
        We may wish to subclass this later.

    """
    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph instance the reference is written to
        :param ref_id: identifier of the reference; when it starts with
            'http' it is also kept as the reference URL
        :param ref_type: class identifier for the reference; defaults to
            the global translation of 'document'
        :raises ValueError: if graph is not a Graph instance
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            # Exceptions do not interpolate logging-style arguments,
            # so the message has to be formatted before it is raised.
            raise ValueError("{} is not a graph".format(graph))

        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type
            # only IAO and SIO typed references are expected; others are
            # accepted but reported
            if ref_type[:4] not in ('IAO:', 'SIO:'):
                LOG.warning("Got Pub ref type of:  %s", ref_type)

        if ref_id is not None and ref_id[:4] == 'http':
            self.ref_url = ref_id

    def setTitle(self, title):
        """Set the title written by addRefToGraph()."""
        self.title = title

    def setYear(self, year):
        """Record the publication year (kept on the instance only)."""
        self.year = year

    def setType(self, reference_type):
        """Override the class identifier the reference is typed with."""
        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """

        :param author_list: Array of authors
        :return:
        """

        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append one author, starting a new list when none has been set yet.

        :param author: the author to append
        """
        if self.author_list is None:
            self.author_list = []
        self.author_list += [author]

    def setShortCitation(self, citation):
        """Set the citation used as the label in preference to the title."""
        self.short_citation = citation

    def addPage(self,
                subject_id,
                page_url,
                subject_category=None,
                page_category=None):
        """
        Link subject_id to page_url with the 'page' predicate.

        :param subject_id: subject of the triple
        :param page_url: object of the triple, added as a non-literal
        :param subject_category: forwarded to graph.addTriple
        :param page_category: forwarded to graph.addTriple as object_category
        """
        self.graph.addTriple(
            subject_id,
            self.globaltt['page'],  # foaf:page  not  <sio:web page>
            page_url,
            object_is_literal=False,  # URL is not a literal
            subject_category=subject_category,
            object_category=page_category)

    def addTitle(self, subject_id, title):
        """
        Attach a literal title to subject_id; None and '' are ignored.

        :param subject_id: subject of the triple
        :param title: title text
        """
        if title is not None and title != '':
            self.graph.addTriple(subject_id,
                                 self.globaltt['title'],
                                 title,
                                 object_is_literal=True)

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation when set, otherwise the title.
        A reference with a URL is typed and labelled on that URL; otherwise
        it is added as an individual under ref_id.  With neither identifier
        an error is logged and nothing is written.
        """

        cite = self.short_citation
        if cite is None and self.title is not None:
            cite = self.title

        if self.ref_url is not None:
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if cite is not None:
                self.model.addLabel(self.ref_url, cite)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, cite, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            LOG.error("You are missing an identifier for a reference.")
Beispiel #11
0
 def make_triples(self, source, package):
     """
     Write the triples for one drug record into the graph.

     For source 'drugbank' every entry of package['targets'] yields the
     drug-target action triple, the target label and the class axioms for
     the target, the action predicate and the drug.  For source
     'drugcentral' every entry of package['indications'] and of
     package['interactions'] yields the corresponding treatment or
     interaction triples plus labels, descriptions and class axioms.
     Any other source adds nothing.

     :param source: 'drugbank' or 'drugcentral'
     :param package: dict holding the record; read for 'unii',
         'drugbank_id', 'targets', 'indications' and 'interactions'
     :return: None
     """
     model = Model(self.graph)

     if source == 'drugbank':
         for tgt in package['targets']:
             drug = package['unii']
             action = tgt['action']
             protein = tgt['uniprot']
             # drug --action--> target protein
             model.addTriple(subject_id=drug, predicate_id=action, obj=protein)
             model.addLabel(subject_id=protein, label=tgt['name'])
             model.addTriple(
                 subject_id=protein,
                 predicate_id=self.globaltt['subclass_of'],
                 obj=self.globaltt['polypeptide'])
             # the DrugBank id and the UNII denote the same class
             model.addTriple(
                 subject_id=package['drugbank_id'],
                 predicate_id=self.globaltt['equivalent_class'],
                 obj=drug)
             model.addTriple(
                 subject_id=action,
                 predicate_id=self.globaltt['subPropertyOf'],
                 obj=self.globaltt['molecularly_interacts_with'])
             model.addTriple(
                 subject_id=drug,
                 predicate_id=self.globaltt['subclass_of'],
                 obj=self.globaltt['molecular entity'])
     elif source == 'drugcentral':
         for indic in package['indications']:
             drug = package['unii']
             treats = self.globaltt['is substance that treats']
             condition = indic['snomed_id']
             model.addTriple(subject_id=drug, predicate_id=treats, obj=condition)
             model.addTriple(
                 subject_id=drug,
                 predicate_id=self.globaltt['subclass_of'],
                 obj=self.globaltt['molecular entity'])
             model.addTriple(
                 subject_id=condition,
                 predicate_id=self.globaltt['subclass_of'],
                 obj=self.globaltt['disease'])
             model.addLabel(subject_id=condition, label=indic['snomed_name'])

         for inter in package['interactions']:
             drug = package['unii']
             interacts = self.globaltt['molecularly_interacts_with']
             protein = inter['uniprot']
             model.addTriple(subject_id=drug, predicate_id=interacts, obj=protein)
             model.addLabel(subject_id=protein, label=inter['target_name'])
             model.addTriple(
                 subject_id=drug,
                 predicate_id=self.globaltt['subclass_of'],
                 obj=self.globaltt['molecular entity'])
             model.addDescription(
                 subject_id=protein, description=inter['target_class'])
             model.addTriple(
                 subject_id=protein,
                 predicate_id=self.globaltt['subclass_of'],
                 obj=self.globaltt['polypeptide'])
Beispiel #12
0
    def _parse_patient_variants(self, file):
        """
        Add every patient's variants to the graph.

        The variant file is converted to a per-patient dict, the variants
        are linked to genes via _add_variant_gene_relationship(), and then
        for each patient an intrinsic genotype is created to which each
        variant is attached as a sequence alteration with a taxon, a label
        (unless one was already written while linking genes) and, when an
        rs id is present, a same-individual link to its dbSNP curie.
        Finally _add_variant_sameas_relationships() is run over the
        variants with the dbSNP map.

        :param file: file handler
        :return: None
        """
        patient_var_map = self._convert_variant_file_to_dict(file)
        gene_coordinate_map = self._parse_gene_coordinates(
            self.map_files['gene_coord_map'])
        rs_map = self._parse_rs_map_file(self.map_files['dbsnp_map'])

        genotype = Genotype(self.graph)
        model = Model(self.graph)

        self._add_variant_gene_relationship(patient_var_map, gene_coordinate_map)

        for patient, variants in patient_var_map.items():
            patient_curie = ':{0}'.format(patient)
            # make intrinsic genotype for each patient
            intrinsic_geno_bnode = self.make_id(
                "{0}-intrinsic-genotype".format(patient), "_")
            genotype_label = "{0} genotype".format(patient)
            genotype.addGenotype(
                intrinsic_geno_bnode, genotype_label,
                model.globaltt['intrinsic_genotype'])

            self.graph.addTriple(
                patient_curie, model.globaltt['has_genotype'], intrinsic_geno_bnode)
            for variant_id, variant in variants.items():
                build = variant['build']
                chromosome = variant['chromosome']
                position = variant['position']
                reference_allele = variant['reference_allele']
                variant_allele = variant['variant_allele']
                genes_of_interest = variant['genes_of_interest']
                rs_id = variant['rs_id']

                variant_bnode = self.make_id("{0}".format(variant_id), "_")
                single_gene = len(genes_of_interest) == 1

                # Pick the most specific label the available fields allow.
                if reference_allele and variant_allele and (position or single_gene):
                    # both alleles known, plus a position or one unambiguous gene
                    variant_label = self._build_variant_label(
                        build, chromosome, position, reference_allele,
                        variant_allele, genes_of_interest)
                elif position and single_gene:
                    # an allele is missing here, so only the locus is described
                    variant_label = "{0}{1}({2}):g.{3}".format(
                        build, chromosome, genes_of_interest[0], position)
                elif single_gene:
                    variant_label = 'variant of interest in {0} gene of patient' \
                        ' {1}'.format(genes_of_interest[0], patient)
                else:
                    variant_label = 'variant of interest in patient {0}'.format(patient)

                genotype.addSequenceAlteration(variant_bnode, None)
                # check if we have built the label
                # in _add_variant_gene_relationship()
                # (count is passed by keyword: the positional form is
                # deprecated in recent Python versions)
                labels = self.graph.objects(
                    BNode(re.sub(r'^_:', '', variant_bnode, count=1)),
                    RDFS['label'])

                if not list(labels):
                    model.addLabel(variant_bnode, variant_label)

                # The taxon key is spelled out in full; the previous
                # 'H**o sapiens' spelling was a text-censoring artifact.
                self.graph.addTriple(
                    variant_bnode, self.globaltt['in taxon'],
                    self.globaltt['Homo sapiens'])
                self.graph.addTriple(
                    intrinsic_geno_bnode, self.globaltt['has_variant_part'],
                    variant_bnode)
                if rs_id:
                    dbsnp_curie = 'dbSNP:{0}'.format(rs_id)
                    model.addSameIndividual(variant_bnode, dbsnp_curie)

        self._add_variant_sameas_relationships(patient_var_map, rs_map)
Beispiel #13
0
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
        unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
        We may wish to subclass this later.

    """

    # class identifiers a reference may be typed with
    ref_types = {
        'person': 'foaf:Person',
        'journal_article': 'IAO:0000013',
        'publication': 'IAO:0000311',  # book
        'document': 'IAO:0000310',  # document???
        'photograph': 'IAO:0000185',
        'webpage': 'SIO:000302',
    }

    # predicates used by addPage() and addTitle()
    annotation_properties = {
        'page': 'foaf:page',
        'title': 'dc:title'
    }

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph instance the reference is written to
        :param ref_id: identifier of the reference; when it starts with
            'http' it is also kept as the reference URL
        :param ref_type: class identifier for the reference; defaults to
            ref_types['document']
        :raises ValueError: if graph is not a Graph instance
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            # the message must be formatted with str.format(); looking up
            # a 'graph' attribute on the string raised AttributeError
            raise ValueError("{} is not a graph".format(graph))
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)

        if ref_type is None:
            self.ref_type = self.ref_types['document']
        else:
            self.ref_type = ref_type

        if ref_id is not None and re.match(r'http', ref_id):
            self.ref_url = ref_id

    def setTitle(self, title):
        """Set the title written by addRefToGraph()."""
        self.title = title

    def setYear(self, year):
        """Record the publication year (kept on the instance only)."""
        self.year = year

    def setType(self, reference_type):
        """Override the class identifier the reference is typed with."""
        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """

        :param author_list: Array of authors
        :return:
        """

        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append one author, starting a new list when none has been set yet.

        :param author: the author to append
        """
        if self.author_list is None:
            self.author_list = []
        self.author_list += [author]

    def setShortCitation(self, citation):
        """Set the citation used as the label in preference to the title."""
        self.short_citation = citation

    def addPage(self, subject_id, page_url):
        """
        Link subject_id to page_url with foaf:page (added as a literal).

        :param subject_id: subject of the triple
        :param page_url: object of the triple
        """
        self.graph.addTriple(
            subject_id, self.annotation_properties['page'],
            page_url, object_is_literal=True)

    def addTitle(self, subject_id, title):
        """
        Attach a literal dc:title to subject_id; None and '' are ignored
        so that no empty title literal is written.

        :param subject_id: subject of the triple
        :param title: title text
        """
        if title is not None and title != '':
            self.graph.addTriple(
                subject_id, self.annotation_properties['title'],
                title, object_is_literal=True)

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation when set, otherwise the title.
        A reference with a URL is typed and, when available, titled and
        labelled on that URL; otherwise it is added as an individual under
        ref_id.  With neither identifier an error is logged and nothing is
        written.
        """

        label = self.short_citation
        if label is None:
            label = self.title

        if self.ref_url is not None:
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            # a reference without citation or title gets no label at all
            if label is not None:
                self.model.addLabel(self.ref_url, label)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, label, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date (self.year)
        # and the authors (self.author_list)?
Beispiel #14
0
    def process_gaf(self, gaffile, limit, id_map=None):
        """
        Read a gzip-compressed, tab-delimited gene association file and
        add its annotations to the graph.

        For every accepted row the gene is labelled, typed, described and
        given synonyms and a taxon; an association between the gene and
        the GO term is built with its evidence code and references; and,
        for 'IMP' evidence with a With/From value, genotype-to-phenotype
        associations are derived from the With/From items.

        Rows are skipped when they are comments (leading '!'), when the
        required-field test fails, when the qualifier contains 'NOT', or
        when a UniProtKB id has no entry in id_map.  A row with the wrong
        number of columns terminates the process via exit(1).

        :param gaffile: path of the gzipped GAF file to read
        :param limit: when not None and not in test mode, stop once the
            reader's line number exceeds this value
        :param id_map: optional dict from UniProtKB accession to a
            'prefix:local_id' gene identifier
        :return: None
        """

        # test mode writes to a separate graph
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", gaffile)
        # counters for the id_map hit rate reported at the end
        uniprot_hit = 0
        uniprot_miss = 0
        col = self.gaf_columns

        with gzip.open(gaffile, 'rb') as csvfile:
            reader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                delimiter='\t',
                                quotechar='\"')
            for row in reader:
                # comments start with exclamation
                # NOTE(review): row[0][0] raises IndexError for an empty row
                # or an empty first field.
                if row[0][0] == '!':
                    continue

                if len(row) != len(col):
                    LOG.error(
                        "Wrong number of columns %i, expected ... got:\n\t%s",
                        len(col), row)
                    exit(1)

                dbase = row[col.index('DB')].strip()
                gene_num = row[col.index('DB_Object_ID')].strip()
                gene_symbol = row[col.index('DB_Object_Symbol')].strip()
                qualifier = row[col.index('Qualifier')]
                go_id = row[col.index('GO_ID')].strip()
                ref = row[col.index('DB:Reference')].strip()
                eco_symbol = row[col.index('Evidence Code')].strip()
                with_or_from = row[col.index('With (or) From')]
                aspect = row[col.index('Aspect')].strip()
                gene_name = row[col.index('DB_Object_Name')]
                gene_synonym = row[col.index('DB_Object_Synonym')]
                # object_type = row[col.index('DB_Object_Type')].strip()
                taxon = row[col.index('Taxon and Interacting taxon')].strip()
                # date = row[col.index('Date')].strip()
                # assigned_by = row[col.index('Assigned_By')].strip()
                # annotation_extension = row[col.index('Annotation_Extension')]
                # gene_product_form_id = row[col.index('Gene_Product_Form_ID')]

                # test for required fields
                # NOTE(review): the list holds the slice row[:10] (itself a
                # list) and the string row[12], so this is only true when
                # row[12] == ''; empty values among the first ten columns
                # are not detected.  Confirm which columns are meant to be
                # required before changing it.
                if '' in [row[:10], row[12]]:
                    LOG.error(
                        "Missing required part of annotation on row %i:\n%s",
                        reader.line_num, str(row[:-4]))
                    continue

                # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                # translate the source database name when a local mapping exists
                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                # NOTE(review): when dbase is 'UniProtKB' and id_map is None,
                # gene_id stays None for the rest of this iteration.
                if dbase == 'UniProtKB':
                    if id_map is not None:
                        # try/except much faster than checking
                        # for dict key membership
                        try:
                            gene_id = id_map[gene_num]
                            uniprotid = ':'.join((dbase, gene_num))
                            (dbase, gene_num) = gene_id.split(':')
                            uniprot_hit += 1
                        except KeyError:
                            # LOG.warning(
                            #   "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                            #    gene_num)
                            uniprot_miss += 1
                            continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                # in test mode keep only NCBIGene ids or listed test ids
                if self.test_mode and gene_id[:9] != 'NCBIGene:' and\
                        gene_num not in self.test_ids:
                    continue

                model.addLabel(gene_id, gene_symbol)
                model.addType(gene_id, self.globaltt['gene'])

                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    # synonyms are pipe separated
                    for syn in re.split(r'\|', gene_synonym):
                        syn = syn.strip()
                        if syn[:10] == 'UniProtKB:':
                            # a UniProtKB curie is linked as a gene product
                            # instead of being stored as a synonym
                            model.addTriple(gene_id,
                                            self.globaltt['has gene product'],
                                            syn)
                        elif re.fullmatch(graph.curie_regexp, syn) is not None and\
                                syn.split(':')[0] not in self.wont_prefix:
                            # looks like a curie: warn, but keep it as a synonym
                            syn = syn.strip()
                            LOG.warning(
                                'possible curie "%s" as a literal synomym for %s',
                                syn, gene_id)
                            if syn != '':
                                model.addSynonym(gene_id, syn)
                        elif syn != '':
                            model.addSynonym(gene_id, syn)

                # First taxon is for the gene, after the pipe are interacting taxa
                tax_curie = taxon.split('|')[0].replace('taxon', 'NCBITaxon')
                # this is a required field, but guard against an empty value
                if tax_curie:
                    geno.addTaxon(tax_curie, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = self.gaf_eco[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                # references are pipe separated; each is normalised to
                # 'prefix:local_id' using the last two colon-separated parts
                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[-2]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                # a result equal to the input means the aspect was not translated
                # NOTE(review): in this branch a relationship may be set, but
                # add_association_to_graph() is not called - confirm intent.
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to',
                                                   qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n",
                                str(assoc))

                # NOTE(review): this runs after add_association_to_graph()
                # above; confirm the description still reaches the graph.
                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                ########################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'[|,]', with_or_from)  # OR + AND
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for itm in withitems:
                        if itm == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                            LOG.warning("Skipping  %s from or with %s",
                                        uniprotid, itm)
                            continue
                        # sanity check/conversion on go curie prefix
                        (pfx, lclid) = itm.split(':')[-2:]  # last prefix wins
                        if pfx in self.localtt:
                            pfx = self.localtt[pfx]
                        itm = ':'.join((pfx, lclid))

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', itm):
                            targeted_gene_id = self.zfin.make_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(
                                itm, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', itm):
                            targeted_gene_id = self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(
                                itm, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, itm,
                                             phenotypeid)
                        # attach the same normalised references as above
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[-2]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(self.globaltt[
                                    'experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be the evidence for the GO assoc?

                # stop early once the requested number of lines has been read
                if not self.test_mode and limit is not None and \
                        reader.line_num > limit:
                    break
            # report how many UniProtKB rows could be mapped through id_map
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the idmapping_selected download",
                uniprot_per, uniprot_tot)
Beispiel #15
0
    def make_association(self, record):
        """
        Model one phenotype record as a gene-to-phenotype association.

        The record is annotated in place with two derived keys,
        'experiment_type' and 'pheno_obj', before anything is written
        to the graph.

        :param record: mapping for one row; the keys read directly here are
            'Experiment Type', 'Phenotype', 'SGDID', 'Gene Name' and
            'Reference' (self._make_description receives the whole record)
        :return: None
        """
        # -- prepare the record --
        # drop the parenthesised description, then look every comma
        # separated experiment type up in the APO term map
        type_terms = record['Experiment Type'].split('(')[0].split(',')
        record['experiment_type'] = list()
        for type_term in type_terms:
            type_term = type_term.strip()
            record['experiment_type'].append(
                {'id': self.apo_term_id[type_term], 'term': type_term})

        # a phenotype without a colon is purely descriptive,
        # so there is no quality term to look up for it
        phenotype = record['Phenotype']
        has_quality = ':' in phenotype
        if has_quality:
            pieces = phenotype.split(': ')
            entity_term = pieces[0]
            quality_term = pieces[1]
            entity_apo = self.apo_term_id[entity_term]
            quality_apo = self.apo_term_id[quality_term]
        else:
            entity_term = phenotype
            quality_term = None
            entity_apo = self.apo_term_id[phenotype]
            quality_apo = None
        record['pheno_obj'] = {
            'entity': {'term': entity_term, 'apo_id': entity_apo},
            'quality': {'term': quality_term, 'apo_id': quality_apo},
            'has_quality': has_quality,
        }

        # -- begin modeling --
        model = Model(self.graph)

        # subject and predicate of the association triple
        gene = 'SGD:{}'.format(record['SGDID'])
        relation = Model.object_properties['has_phenotype']

        if has_quality:
            # entity + quality are combined into one MONARCH identifier
            pheno_label = '{0}:{1}'.format(entity_term, quality_term)
            pheno_id = 'MONARCH:{0}{1}'.format(
                entity_apo.replace(':', '_'),
                quality_apo.replace(':', '_'))
        else:
            pheno_label = entity_term
            pheno_id = entity_apo

        g2p_assoc = Assoc(
            self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)
        if not has_quality:
            # descriptive phenotypes get an explicitly built association id
            assoc_id = g2p_assoc.make_association_id(
                definedby='yeastgenome.org', subject=gene,
                predicate=relation, object=pheno_id)
            g2p_assoc.set_association_id(assoc_id=assoc_id)

        # first write, so that the association id is minted
        g2p_assoc.add_association_to_graph()

        model.addLabel(subject_id=gene, label=record['Gene Name'])

        # the association triple itself
        model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

        # the phenotype is a subclass of UPHENO:0001001
        model.addTriple(
            subject_id=pheno_id,
            predicate_id=Model.object_properties['subclass_of'],
            obj='UPHENO:0001001')

        # phenotype label
        model.addLabel(subject_id=pheno_id, label=pheno_label)

        g2p_assoc.description = self._make_description(record)

        # references arrive as a '|' delimited string; spaces are removed
        references = record['Reference'].replace(' ', '').split('|')

        if references:
            # the first reference in the list is the source
            g2p_assoc.add_source(identifier=references[0])
            Reference(
                self.graph, references[0],
                Reference.ref_types['publication']
            ).addRefToGraph()

        # every further reference is declared equivalent to the first one
        for other_ref in references[1:]:
            model.addSameIndividual(sub=references[0], obj=other_ref)

        # experiment types are the evidence
        for evidence in record['experiment_type']:
            g2p_assoc.add_evidence(evidence['id'])
            model.addLabel(subject_id=evidence['id'], label=evidence['term'])

        # second write, now carrying description, source and evidence
        try:
            g2p_assoc.add_association_to_graph()
        except Exception as e:
            print(e)
Beispiel #16
0
    def _parse_patient_variants(self, file):
        """
        Add each patient's intrinsic genotype and its variants to the graph.

        The variant file is converted to a patient -> variant map, the
        variant/gene relationships are added, then every variant is attached
        to its patient's intrinsic genotype, labelled (unless a label already
        exists in the graph), typed with a taxon and, when an rs id is
        present, linked to its dbSNP curie.

        :param file: file handler
        :return: None
        """
        patient_var_map = self._convert_variant_file_to_dict(file)
        gene_coordinate_map = self._parse_gene_coordinates(
            self.map_files['gene_coord_map'])
        rs_map = self._parse_rs_map_file(self.map_files['dbsnp_map'])

        genotype = Genotype(self.graph)
        model = Model(self.graph)

        self._add_variant_gene_relationship(patient_var_map,
                                            gene_coordinate_map)

        for patient in patient_var_map:
            patient_curie = 'MONARCH:{0}'.format(patient)
            # make intrinsic genotype for each patient
            intrinsic_geno_bnode = self.make_id(
                "{0}-intrinsic-genotype".format(patient), "_")
            genotype_label = "{0} genotype".format(patient)
            genotype.addGenotype(intrinsic_geno_bnode, genotype_label,
                                 model.globaltt['intrinsic genotype'])

            self.graph.addTriple(patient_curie, model.globaltt['has_genotype'],
                                 intrinsic_geno_bnode)
            for variant_id, variant in patient_var_map[patient].items():
                build = variant['build']
                chromosome = variant['chromosome']
                position = variant['position']
                reference_allele = variant['reference_allele']
                variant_allele = variant['variant_allele']
                genes_of_interest = variant['genes_of_interest']
                rs_id = variant['rs_id']

                variant_bnode = self.make_id("{0}".format(variant_id), "_")

                # Pick the most specific label the available fields allow.
                if reference_allele and variant_allele \
                        and (position or len(genes_of_interest) == 1):
                    # both alleles known, plus a position or a single gene
                    variant_label = self._build_variant_label(
                        build, chromosome, position, reference_allele,
                        variant_allele, genes_of_interest)
                elif position and len(genes_of_interest) == 1:
                    # position known but at least one allele is missing
                    variant_label = "{0}{1}({2}):g.{3}".format(
                        build, chromosome, genes_of_interest[0], position)
                elif len(genes_of_interest) == 1:
                    variant_label = 'variant of interest in {0} gene of patient' \
                        ' {1}'.format(genes_of_interest[0], patient)
                else:
                    variant_label = 'variant of interest in patient {0}'.format(
                        patient)

                genotype.addSequenceAlteration(variant_bnode, None)
                # check if we have already built the label
                # in _add_variant_gene_relationship()
                labels = self.graph.objects(
                    BNode(re.sub(r'^_:', '', variant_bnode, count=1)),
                    RDFS['label'])

                if not list(labels):
                    model.addLabel(variant_bnode, variant_label)

                # The key must be spelled 'Homo sapiens'; the previous
                # 'H**o sapiens' was a mangled string that cannot match.
                self.graph.addTriple(variant_bnode, self.globaltt['in taxon'],
                                     self.globaltt['Homo sapiens'])
                self.graph.addTriple(intrinsic_geno_bnode,
                                     self.globaltt['has_variant_part'],
                                     variant_bnode)
                if rs_id:
                    dbsnp_curie = 'dbSNP:{0}'.format(rs_id)
                    model.addSameIndividual(variant_bnode, dbsnp_curie)

        self._add_variant_sameas_relationships(patient_var_map, rs_map)
        return
Beispiel #17
0
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
        unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
        We may wish to subclass this later.

    """
    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph instance the reference is written to
        :param ref_id: identifier of the reference; when it starts with
            'http' it is also kept as the reference URL
        :param ref_type: type of the reference; defaults to the
            'document' entry of the graph's global translation table
        :raises ValueError: if graph is not a Graph instance
        """
        if not isinstance(graph, Graph):
            # str.format (not attribute access) builds the message
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type

        if ref_id is not None and re.match(r'http', ref_id):
            self.ref_url = ref_id

    def setTitle(self, title):
        """Set the title of the reference."""
        self.title = title

    def setYear(self, year):
        """Set the year of the reference."""
        self.year = year

    def setType(self, reference_type):
        """Replace the type the reference was constructed with."""
        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """

        :param author_list: Array of authors
        :return:
        """
        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append one author to the author list.

        The list starts out as None, so it is created on first use
        instead of failing when no list has been set yet.
        """
        if self.author_list is None:
            self.author_list = []
        self.author_list += [author]

    def setShortCitation(self, citation):
        """Set the short citation, which is preferred over the title as label."""
        self.short_citation = citation

    def addPage(self, subject_id, page_url):
        """Add page_url as a literal 'page' value of subject_id."""
        self.graph.addTriple(subject_id,
                             self.globaltt['page'],
                             page_url,
                             object_is_literal=True)

    def addTitle(self, subject_id, title):
        """Add title as a literal 'title (dce)' value of subject_id."""
        self.graph.addTriple(subject_id,
                             self.globaltt['title (dce)'],
                             title,
                             object_is_literal=True)

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation when one is set, otherwise the title.
        A reference with a URL is typed and labelled under that URL; any
        other reference is added as an individual under its ref_id.
        An error is logged when neither identifier is available.
        """
        n = self.short_citation
        if n is None:
            n = self.title

        if self.ref_url is not None:
            # only write a title when there is one, as the ref_id branch does
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            self.model.addLabel(self.ref_url, n)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, n, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for a in self.author_list:
        #        gu.addTriple(
        #           g, self.ref_id, self.props['has_author'], a, True)
Beispiel #18
0
    def make_association(self, record):
        """
        Model one phenotype record as a gene-to-phenotype association.

        The record is annotated in place with two derived keys,
        'experiment_type' and 'pheno_obj', before anything is written
        to the graph.  All columns that are not modeled end up in the
        association's description as a ' | ' delimited list.

        :param record: mapping for one row, keyed by column name
        :return: None
        """
        # -- prepare the record --
        # drop the parenthesised description, then look every comma
        # separated experiment type up in the APO term map
        type_terms = record['Experiment Type'].split('(')[0].split(',')
        record['experiment_type'] = list()
        for type_term in type_terms:
            type_term = type_term.strip()
            record['experiment_type'].append(
                {'id': self.apo_term_id[type_term], 'term': type_term})

        # a phenotype without a colon is purely descriptive,
        # so there is no quality term to look up for it
        phenotype = record['Phenotype']
        has_quality = ':' in phenotype
        if has_quality:
            pieces = phenotype.split(': ')
            entity_term = pieces[0]
            quality_term = pieces[1]
            entity_apo = self.apo_term_id[entity_term]
            quality_apo = self.apo_term_id[quality_term]
        else:
            entity_term = phenotype
            quality_term = None
            entity_apo = self.apo_term_id[phenotype]
            quality_apo = None
        record['pheno_obj'] = {
            'entity': {'term': entity_term, 'apo_id': entity_apo},
            'quality': {'term': quality_term, 'apo_id': quality_apo},
            'has_quality': has_quality,
        }

        # -- begin modeling --
        model = Model(self.graph)

        # subject and predicate of the association triple
        gene = 'SGD:{}'.format(record['SGDID'])
        relation = Model.object_properties['has_phenotype']

        if has_quality:
            # entity + quality are combined into one MONARCH identifier
            pheno_label = '{0}:{1}'.format(entity_term, quality_term)
            pheno_id = 'MONARCH:{0}{1}'.format(
                entity_apo.replace(':', '_'),
                quality_apo.replace(':', '_'))
        else:
            pheno_label = entity_term
            pheno_id = entity_apo

        g2p_assoc = Assoc(
            self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)
        if not has_quality:
            # descriptive phenotypes get an explicitly built association id
            assoc_id = g2p_assoc.make_association_id(
                definedby='yeastgenome.org', subject=gene,
                predicate=relation, object=pheno_id)
            g2p_assoc.set_association_id(assoc_id=assoc_id)

        # first write, so that the association id is minted
        g2p_assoc.add_association_to_graph()

        model.addLabel(subject_id=gene, label=record['Gene Name'])

        # the association triple itself
        model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

        # the phenotype is a subclass of UPHENO:0001001
        model.addTriple(
            subject_id=pheno_id,
            predicate_id=Model.object_properties['subclass_of'],
            obj='UPHENO:0001001')

        # phenotype label
        model.addLabel(subject_id=pheno_id, label=pheno_label)

        # description: (label, record column) for every unmodeled field,
        # rendered as 'label: value' and joined with ' | '
        unmodeled_fields = (
            ('genomic_background', 'Strain Background'),
            ('allele', 'Allele'),
            ('chemical', 'Chemical'),
            ('condition', 'Condition'),
            ('details', 'Details'),
            ('feature_name', 'Feature Name'),
            ('gene_name', 'Gene Name'),
            ('mutant_type', 'Mutant Type'),
            ('reporter', 'Reporter'),
        )
        g2p_assoc.description = " | ".join(
            '{}: {}'.format(field_label, record[column])
            for field_label, column in unmodeled_fields)

        # references arrive as a '|' delimited string; spaces are removed
        references = record['Reference'].replace(' ', '').split('|')

        if references:
            # the first reference in the list is the source
            g2p_assoc.add_source(identifier=references[0])
            Reference(
                self.graph, references[0],
                Reference.ref_types['publication']
            ).addRefToGraph()

        # every further reference is declared equivalent to the first one
        for other_ref in references[1:]:
            model.addSameIndividual(sub=references[0], obj=other_ref)

        # experiment types are the evidence
        for evidence in record['experiment_type']:
            g2p_assoc.add_evidence(evidence['id'])
            model.addLabel(subject_id=evidence['id'], label=evidence['term'])

        # second write, now carrying description, source and evidence
        try:
            g2p_assoc.add_association_to_graph()
        except Exception as e:
            print(e)
Beispiel #19
0
    def _get_identifiers(self, limit):
        """
        This will process the id mapping file provided by Biogrid.
        The file has a very large header, which we scan past,
        then pull the identifiers, and make equivalence axioms

        Only rows whose organism is in the species filter are used: gene
        identifiers of an accepted type become equivalent classes of the
        BIOGRID id, and OFFICIAL_SYMBOL rows supply its label and gene type.

        :param limit: stop once more than this many species-matching rows
            have been counted; ignored in test mode or when None
        :return: None

        """

        LOG.info("getting identifier mapping")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['identifiers']['file']))

        # TODO align this species filter with the one above
        # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
        # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')

        # 'Homo sapiens' must be spelled out; the former 'H**o sapiens'
        # was a mangled string that no organism label could equal.
        speciesfilters = 'Homo sapiens,Mus musculus'.split(',')

        # TODO make these filters available as commandline options
        # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
        #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
        geneidtypefilters = \
            'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC,WormBase,XenBase,FlyBase'.split(',')
        # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'

        # the target graph does not change while reading, so pick it once
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)

        foundheader = False
        # context managers close the archive even if a row fails to parse
        with ZipFile(f, 'r') as myzip:
            # assume that the first entry is the item
            fname = myzip.namelist()[0]
            with myzip.open(fname, 'r') as csvfile:
                for line in csvfile:
                    line = line.decode()
                    # skip header lines
                    if not foundheader:
                        if re.match(r'BIOGRID_ID', line):
                            foundheader = True
                        continue

                    line = line.strip()
                    # BIOGRID_ID
                    # IDENTIFIER_VALUE
                    # IDENTIFIER_TYPE
                    # ORGANISM_OFFICIAL_NAME
                    # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                    (biogrid_num, id_num, id_type,
                     organism_label) = line.split('\t')

                    # skip any genes that don't match our test set
                    if self.test_mode \
                            and int(biogrid_num) not in self.biogrid_ids:
                        continue

                    # for each one of these,
                    # create the node and add equivalent classes
                    biogrid_id = 'BIOGRID:' + biogrid_num
                    prefix = self.localtt[id_type]

                    if organism_label.strip() in speciesfilters:
                        line_counter += 1
                        if prefix in geneidtypefilters:
                            mapped_id = ':'.join((prefix, id_num))
                            model.addEquivalentClass(biogrid_id, mapped_id)
                        # this symbol will only get attached
                        # to the biogrid class
                        elif id_type == 'OFFICIAL_SYMBOL':
                            model.addLabel(biogrid_id, id_num)
                            model.addType(biogrid_id, self.globaltt['gene'])

                        # elif (id_type == 'SYNONYM'):
                        #   FIXME - i am not sure these are synonyms, altids?
                        #   gu.addSynonym(g,biogrid_id,id_num)

                    if not self.test_mode and limit is not None \
                            and line_counter > limit:
                        break

        return
Beispiel #20
0
    def _add_variant_gene_relationship(self, patient_var_map,
                                       gene_coordinate_map):
        """
        Right now it is unclear the best approach on how to connect
        variants to genes.  In most cases has_affected_locus/GENO:0000418
        is accurate; however, there are cases where a variant is in the intron
        on one gene and is purported to causally affect another gene down or
        upstream.  In these cases we must first disambiguate which gene
        is the affected locus, and which gene(s) are predicated to be
        causally influenced by (RO:0002566)

        UPDATE 8-30: In the latest dataset we no longer have 1-many mappings
        between variants and genes, but leaving this here in case we see
        these in the future

        The logic followed here is:
        if mutation type contains downstream/upstream and more than one
        gene of interest, investigate coordinates of all genes to
        see if we can disambiguate which genes are which

        :param patient_var_map: patient -> {variant_id: variant} mapping
        :param gene_coordinate_map: gene id -> mapping with 'start', 'end'
            and 'strand' entries
        :return: None
        """
        # genotype = Genotype(self.graph)
        dipper_util = DipperUtil()
        model = Model(self.graph)
        # Note this could be compressed in someway to remove one level of for looping
        for patient in patient_var_map:
            for variant_id, variant in patient_var_map[patient].items():
                variant_bnode = self.make_id("{0}".format(variant_id), "_")
                genes_of_interest = variant['genes_of_interest']
                if len(genes_of_interest) == 1:
                    # Assume variant is variant allele of gene
                    gene = genes_of_interest[0]
                    gene_id = dipper_util.get_hgnc_id_from_symbol(gene)
                    self._add_gene_to_graph(
                        gene, variant_bnode, gene_id,
                        self.globaltt['has_affected_feature'])

                elif re.search(r'upstream|downstream',
                               variant['type'],
                               flags=re.I):
                    # Attempt to disambiguate
                    ref_gene = []         # genes whose span contains the variant
                    up_down_gene = []     # (symbol, id) of genes not containing it
                    unmatched_genes = []  # (symbol, id) without usable coordinates
                    for gene in genes_of_interest:
                        # Resolve the id of THIS gene.  Previously gene_id was
                        # never assigned in this branch, so it was unbound or
                        # left over from an earlier, unrelated variant.
                        gene_id = dipper_util.get_hgnc_id_from_symbol(gene)
                        if gene_id and gene_id in gene_coordinate_map:
                            coords = gene_coordinate_map[gene_id]
                            if coords['start'] <= variant['position'] \
                                    <= coords['end']:
                                ref_gene.append({
                                    'symbol': gene,
                                    'gene_id': gene_id,
                                    'strand': coords['strand'],
                                })
                            else:
                                up_down_gene.append((gene, gene_id))
                        else:
                            unmatched_genes.append((gene, gene_id))

                    # Exactly one reference gene, or several identical
                    # entries: in some cases there are multiple instances
                    # of same gene from dupe rows in the source
                    # Credit http://stackoverflow.com/a/3844832
                    if ref_gene and ref_gene[1:] == ref_gene[:-1]:
                        self._add_gene_to_graph(
                            ref_gene[0]['symbol'], variant_bnode,
                            ref_gene[0]['gene_id'],
                            self.globaltt['has_affected_feature'])

                        # update label with gene
                        # (build label function expects list)
                        gene_list = [ref_gene[0]['symbol']]
                        variant_label = self._build_variant_label(
                            variant['build'], variant['chromosome'],
                            variant['position'], variant['reference_allele'],
                            variant['variant_allele'], gene_list)
                        model.addLabel(variant_bnode, variant_label)

                    # Check if reference genes are on different strands
                    elif len(ref_gene) == 2:
                        strands = [st['strand'] for st in ref_gene]
                        if "minus" in strands and "plus" in strands:
                            for r_gene in ref_gene:
                                self._add_gene_to_graph(
                                    r_gene['symbol'], variant_bnode,
                                    r_gene['gene_id'],
                                    self.globaltt['has_affected_feature'])
                        else:
                            LOG.warning(
                                "unable to map intron variant to gene coordinates: %s",
                                variant)
                            for r_gene in ref_gene:
                                self._add_gene_to_graph(
                                    r_gene['symbol'], variant_bnode,
                                    r_gene['gene_id'],
                                    self.globaltt['causally_influences'])
                    elif re.search(r'intron', variant['type'], flags=re.I):
                        LOG.warning(
                            "unable to map intron variant to gene coordinates_2: %s",
                            variant)
                    for neighbor, neighbor_id in up_down_gene:
                        self._add_gene_to_graph(
                            neighbor, variant_bnode, neighbor_id,
                            self.globaltt['causally_influences'])
                    # Unmatched genes are likely because we cannot map to an NCBIGene
                    # or we do not have coordinate information
                    # NOTE(review): the id passed here may be falsy; confirm
                    # that _add_gene_to_graph copes with a missing gene id
                    for unmatched_gene, unmatched_id in unmatched_genes:
                        self._add_gene_to_graph(
                            unmatched_gene, variant_bnode, unmatched_id,
                            self.globaltt['causally_influences'])

        return
Beispiel #21
0
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
        unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
        We may wish to subclass this later.

    """

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph instance the reference is written to
        :param ref_id: identifier of the reference; when it starts with
            'http' it is also kept as the reference URL
        :param ref_type: type of the reference; defaults to the
            'document' entry of the graph's global translation table.
            A warning is logged for a type not prefixed 'IAO:' or 'SIO:'
        :raises ValueError: if graph is not a Graph instance
        """
        if not isinstance(graph, Graph):
            # exceptions do not interpolate logging-style arguments,
            # so the message is formatted before it is raised
            raise ValueError("%s is not a graph" % (graph,))
        self.graph = graph

        # assert ref_id is not None

        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type
            if ref_type[:4] not in ('IAO:', 'SIO:'):
                LOG.warning("Got Pub ref type of:  %s", ref_type)

        if ref_id is not None and ref_id[:4] == 'http':
            self.ref_url = ref_id

    def setTitle(self, title):
        """Set the title of the reference."""
        self.title = title

    def setYear(self, year):
        """Set the year of the reference."""
        self.year = year

    def setType(self, reference_type):
        """Replace the type the reference was constructed with."""
        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """

        :param author_list: Array of authors
        :return:
        """
        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append one author to the author list.

        The list starts out as None, so it is created on first use
        instead of failing when no list has been set yet.
        """
        if self.author_list is None:
            self.author_list = []
        self.author_list += [author]

    def setShortCitation(self, citation):
        """Set the short citation, which is preferred over the title as label."""
        self.short_citation = citation

    def addPage(self, subject_id, page_url):
        """Add page_url as a literal 'page' value of subject_id."""
        self.graph.addTriple(
            subject_id, self.globaltt['page'],  # foaf:page  not  <sio:web page>
            page_url, object_is_literal=True)

    def addTitle(self, subject_id, title):
        """Add title as a literal 'title (dce)' value; None and '' are skipped."""
        if title is not None and title != '':
            self.graph.addTriple(
                subject_id, self.globaltt['title (dce)'], title, object_is_literal=True)

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation when one is set, otherwise the title.
        A reference with a URL is typed under that URL and gets a title and
        a label only when those are available; any other reference is added
        as an individual under its ref_id.  An error is logged when neither
        identifier is available.
        """
        cite = self.short_citation
        if cite is None and self.title is not None:
            cite = self.title

        if self.ref_url is not None:
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if cite is not None:
                self.model.addLabel(self.ref_url, cite)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, cite, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            LOG.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for auth in self.author_list:
        #        gu.addTriple(
        #           graph, self.ref_id, self.props['has_author'], auth, True)
Beispiel #22
0
    def _process_haplotype(self, hap_id, hap_label, chrom_num, chrom_pos,
                           context, risk_allele_frequency, mapped_gene,
                           so_ontology):
        """
        Add a haplotype and its constituent SNPs to the graph.

        ``hap_label``, ``chrom_num``, ``chrom_pos``, ``context`` and
        ``mapped_gene`` are each split on ';' (optionally followed by one
        whitespace character) into per-SNP lists.  Every SNP is attached to
        the haplotype with ``has_variant_part``.  Per-SNP details and gene
        links are only added when the label, chromosome, position and
        context lists all have the same length.

        The triples are written to ``self.testgraph`` when
        ``self.test_mode`` is set, otherwise to ``self.graph``.

        :param hap_id: identifier of the haplotype node
        :param hap_label: ';'-separated SNP labels; the stripped value is
            also used as the haplotype label
        :param chrom_num: ';'-separated chromosome values, one per SNP
        :param chrom_pos: ';'-separated positions, one per SNP
        :param context: ';'-separated context terms, one per SNP; each is
            passed through ``self.resolve``
        :param risk_allele_frequency: used as the haplotype description
            unless it is '' or 'NR'
        :param mapped_gene: ';'-separated gene symbols
        :param so_ontology: object whose ``query`` method runs the SPARQL
            query built here (presumably a graph holding the Sequence
            Ontology -- confirm with the caller)
        :return: None
        """
        graph = self.testgraph if self.test_mode else self.graph
        geno = Genotype(graph)
        model = Model(graph)

        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency not in ['', 'NR']:
            hap_description = str(
                risk_allele_frequency) + ' [risk allele frequency]'

        model.addIndividualToGraph(hap_id, hap_label.strip(),
                                   self.globaltt['haplotype'], hap_description)
        # The key must be the real label; a masked spelling such as
        # "H**o sapiens" is not the label used for this taxon.
        geno.addTaxon(self.globaltt["Homo sapiens"], hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)

        # Not having four "PAX5" as a list might be better,
        # but de-duplicating here breaks unit tests.

        snp_curies = list()

        for snp in snp_labels:
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                LOG.info('cant find type for SNP in %s', snp)
                # make blank node
                snp_curie = self.make_id(snp, "_")
                model.addLabel(snp_curie, snp)
            elif snp_curie[0] == '_':  # arrived an unlabeled blanknode
                model.addLabel(snp_curie, snp)

            graph.addTriple(hap_id, self.globaltt['has_variant_part'],
                            snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        # check lengths of multiple lists
        length = len(snp_curies)
        if not all(
                len(lst) == length for lst in
                [snp_labels, chrom_nums, chrom_positions, context_list]):
            LOG.warning(
                "Incongruous data field(s) for haplotype %s \n "
                "will not add snp details", hap_label)
        else:

            variant_in_gene_count = 0
            for index, snp_curie in enumerate(snp_curies):
                snp_context = context_list[index]
                self._add_snp_to_graph(snp_curie, snp_labels[index],
                                       chrom_nums[index],
                                       chrom_positions[index],
                                       snp_context)

                # genes can only be paired with SNPs positionally
                # when there is exactly one gene entry per SNP
                if mapped_genes and len(mapped_genes) != len(snp_labels):
                    LOG.warning("More mapped genes than snps,"
                                " cannot disambiguate for\n%s\n%s",
                                mapped_genes, snp_labels)  # hap_label)
                else:
                    so_class = self.resolve(snp_context)
                    # is the context term a (transitive) subclass
                    # of gene_variant?
                    so_query = """
        SELECT ?variant_label
        WHERE {{
            {0} rdfs:subClassOf+ {1} ;
            rdfs:label ?variant_label .
        }}
                    """.format(so_class, self.globaltt['gene_variant'])

                    query_result = so_ontology.query(so_query)

                    gene_id = DipperUtil.get_hgnc_id_from_symbol(
                        mapped_genes[index])

                    if gene_id is not None and len(list(query_result)) == 1:
                        if snp_context in [
                                'upstream_gene_variant',
                                'downstream_gene_variant'
                        ]:
                            # up/downstream variants are linked with the
                            # context relation itself and are not counted
                            # as lying in the gene
                            graph.addTriple(snp_curie,
                                            self.resolve(snp_context),
                                            gene_id)
                        else:
                            geno.addAffectedLocus(snp_curie, gene_id)
                            variant_in_gene_count += 1

            # Separate in case we want to apply a different relation
            # If not this is redundant with triples added above
            if len(mapped_genes) == variant_in_gene_count and \
                    len(set(mapped_genes)) == 1:
                gene_id = DipperUtil.get_hgnc_id_from_symbol(mapped_genes[0])
                geno.addAffectedLocus(hap_id, gene_id)
Beispiel #23
0
    def make_association(self, record):
        """
        Construct the gene-to-phenotype association for one record.

        The record is modified in place: an 'experiment_type' list and a
        'pheno_obj' dict are added to it.  Terms are looked up in
        ``self.apo_term_id`` by subscript, so an unknown term raises
        whatever that mapping raises (KeyError for a plain dict).

        :param record: dict-like row; the keys read directly here are
            'Experiment Type', 'Phenotype', 'SGDID', 'Gene Name' and
            'Reference' (``self._make_description`` receives the whole
            record and may read more)
        :return: None
        """
        # local import so the block does not rely on a module-level logger
        import logging

        # prep record
        # drop the parenthesised description, then map each
        # comma-separated Experiment Type to its apo term
        experiment_types = record['Experiment Type'].split('(')[0].split(',')
        record['experiment_type'] = list()
        for exp_type in experiment_types:
            exp_type = exp_type.strip()
            record['experiment_type'].append({
                'id': self.apo_term_id[exp_type],
                'term': exp_type,
            })

        phenotype = record['Phenotype']
        pheno_obj = {
            'entity': {
                'term': None,
                'apo_id': None
            },
            'quality': {
                'term': None,
                'apo_id': None
            },
            # False: descriptive phenotype, don't look for a quality
            'has_quality': False
        }
        # Test for the same delimiter that is used for the split, so a
        # colon without a following space cannot cause an IndexError below.
        if ': ' in phenotype:
            pheno_obj['has_quality'] = True
            ent_qual = phenotype.split(': ')
            entity = ent_qual[0]
            quality = ent_qual[1]
            pheno_obj['entity']['term'] = entity
            pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
            pheno_obj['quality']['term'] = quality
            pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
        else:
            pheno_obj['entity']['term'] = phenotype
            pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
        record['pheno_obj'] = pheno_obj

        # begin modeling
        model = Model(self.graph)

        # define the triple
        gene = 'SGD:{}'.format(record['SGDID'])
        relation = self.globaltt['has phenotype']

        if pheno_obj['has_quality']:
            # entity + quality: build a combined label and a MONARCH id
            # from the two apo ids
            pheno_label = '{0}:{1}'.format(
                pheno_obj['entity']['term'],
                pheno_obj['quality']['term'])
            pheno_id = 'MONARCH:{0}{1}'.format(
                pheno_obj['entity']['apo_id'].replace(':', '_'),
                pheno_obj['quality']['apo_id'].replace(':', '_'))
            g2p_assoc = Assoc(self.graph,
                              self.name,
                              sub=gene,
                              obj=pheno_id,
                              pred=relation)
        else:
            pheno_label = pheno_obj['entity']['term']
            pheno_id = pheno_obj['entity']['apo_id']
            g2p_assoc = Assoc(self.graph,
                              self.name,
                              sub=gene,
                              obj=pheno_id,
                              pred=relation)
            # only the entity-only case sets an explicit association id
            assoc_id = g2p_assoc.make_association_id('yeastgenome.org', gene,
                                                     relation, pheno_id)
            g2p_assoc.set_association_id(assoc_id=assoc_id)

        # add to graph to mint assoc id
        g2p_assoc.add_association_to_graph()

        model.addLabel(subject_id=gene, label=record['Gene Name'])

        # add the association triple
        model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

        model.addTriple(subject_id=pheno_id,
                        predicate_id=self.globaltt['subclass_of'],
                        obj=self.globaltt['Phenotype'])

        # label the phenotype node
        model.addLabel(subject_id=pheno_id, label=pheno_label)

        g2p_assoc.description = self._make_description(record)

        # add the references ('|'-separated, spaces removed).
        # str.split always yields at least one item, so empty tokens are
        # dropped to keep an empty field from becoming an empty-string source.
        references = [
            ref for ref in record['Reference'].replace(' ', '').split('|')
            if ref
        ]

        #  created Ref prefix in curie map to route to proper reference URL in SGD
        if references:
            # make first ref in list the source
            primary_ref = references[0]
            g2p_assoc.add_source(identifier=primary_ref)
            ref_model = Reference(self.graph, primary_ref,
                                  self.globaltt['publication'])
            ref_model.addRefToGraph()

            # create equivalent source for any other refs in list
            for ref in references[1:]:
                model.addSameIndividual(sub=primary_ref, obj=ref)

        # add experiment type as evidence
        for exp_type in record['experiment_type']:
            g2p_assoc.add_evidence(exp_type['id'])
            model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

        # Second add, now that description, source and evidence are set.
        # Kept best-effort, but the failure is logged with its traceback
        # rather than printed to stdout.
        try:
            g2p_assoc.add_association_to_graph()
        except Exception:
            logging.getLogger(__name__).exception(
                "Could not add association %s -> %s to graph",
                gene, pheno_id)
        return