def _build_gene_disease_model(
        self,
        gene_id,
        relation_id,
        disease_id,
        variant_label,
        consequence_predicate=None,
        consequence_id=None,
        allelic_requirement=None,
        pmids=None):
    """
    Builds gene variant disease model

    When both ``consequence_predicate`` and ``consequence_id`` are given,
    a variant blank node (derived from ``variant_label``) is typed as a
    variant locus, linked to ``gene_id`` as its affected locus, and used as
    the subject of the disease association.  Otherwise the gene itself is
    the subject and, if supplied, the allelic requirement is attached to
    the association.

    :param gene_id: identifier of the gene
    :param relation_id: predicate relating the subject to the disease
    :param disease_id: identifier of the disease
    :param variant_label: label of the variant; also seeds the bnode id
    :param consequence_predicate: predicate linking variant to consequence
    :param consequence_id: identifier of the molecular consequence
    :param allelic_requirement: allelic requirement term, gene-level only
    :param pmids: publication ids stored as the association source
    :return: None
    """
    model = Model(self.graph)
    geno = Genotype(self.graph)
    sources = pmids if pmids is not None else []

    variant_bnode = self.make_id(variant_label, "_")
    is_variant = not (consequence_predicate is None or consequence_id is None)

    if is_variant:
        subject = variant_bnode
        model.addTriple(variant_bnode, consequence_predicate, consequence_id)
        # Hack to add labels to terms that
        # don't exist in an ontology
        if consequence_id.startswith(':'):
            model.addLabel(
                consequence_id, consequence_id.strip(':').replace('_', ' '))
        # Typically we would type the variant using the
        # molecular consequence, but these are not specific
        # enough for us to make mappings (see translation table)
        model.addIndividualToGraph(
            variant_bnode, variant_label, self.globaltt['variant_locus'])
        geno.addAffectedLocus(variant_bnode, gene_id)
        model.addBlankNodeAnnotation(variant_bnode)
    else:
        subject = gene_id

    assoc = G2PAssoc(self.graph, self.name, subject, disease_id, relation_id)
    assoc.source = sources
    assoc.add_association_to_graph()

    # the allelic requirement is only recorded for gene-level associations
    if is_variant or allelic_requirement is None:
        return

    model.addTriple(
        assoc.assoc_id,
        self.globaltt['has_allelic_requirement'],
        allelic_requirement)
    if allelic_requirement.startswith(':'):
        model.addLabel(
            allelic_requirement,
            allelic_requirement.strip(':').replace('_', ' '))
def make_triples(self, source, package):
    """
    Add the triples for one drug record to the graph.

    For ``source == 'drugbank'`` every entry of ``package['targets']``
    yields the drug-action-protein triple, the protein label and typing,
    the drugbank/UNII equivalence, and the typing of the action and drug.
    For ``source == 'drugcentral'`` every entry of
    ``package['indications']`` and ``package['interactions']`` yields the
    treats / interacts-with triple plus labels and typing.
    Any other ``source`` adds nothing.

    :param source: 'drugbank' or 'drugcentral'
    :param package: dict holding the drug record for that source
    :return: None
    """
    model = Model(self.graph)
    tt = self.globaltt

    if source == 'drugbank':
        for target in package['targets']:
            drug = package['unii']
            action = target['action']
            protein = target['uniprot']

            model.addTriple(subject_id=drug, predicate_id=action, obj=protein)
            model.addLabel(subject_id=protein, label=target['name'])
            model.addTriple(
                subject_id=protein,
                predicate_id=tt['subclass_of'],
                obj=tt['polypeptide'])
            model.addTriple(
                subject_id=package['drugbank_id'],
                predicate_id=tt['equivalent_class'],
                obj=drug)
            model.addTriple(
                subject_id=action,
                predicate_id=tt['subPropertyOf'],
                obj=tt['molecularly_interacts_with'])
            model.addTriple(
                subject_id=drug,
                predicate_id=tt['subclass_of'],
                obj=tt['molecular entity'])

    elif source == 'drugcentral':
        for indication in package['indications']:
            drug = package['unii']
            condition = indication['snomed_id']

            model.addTriple(
                subject_id=drug,
                predicate_id=tt['is substance that treats'],
                obj=condition)
            model.addTriple(
                subject_id=drug,
                predicate_id=tt['subclass_of'],
                obj=tt['molecular entity'])
            model.addTriple(
                subject_id=condition,
                predicate_id=tt['subclass_of'],
                obj=tt['disease'])
            model.addLabel(subject_id=condition, label=indication['snomed_name'])

        for interaction in package['interactions']:
            drug = package['unii']
            protein = interaction['uniprot']

            model.addTriple(
                subject_id=drug,
                predicate_id=tt['molecularly_interacts_with'],
                obj=protein)
            model.addLabel(subject_id=protein, label=interaction['target_name'])
            model.addTriple(
                subject_id=drug,
                predicate_id=tt['subclass_of'],
                obj=tt['molecular entity'])
            model.addDescription(
                subject_id=protein, description=interaction['target_class'])
            model.addTriple(
                subject_id=protein,
                predicate_id=tt['subclass_of'],
                obj=tt['polypeptide'])
def make_triples(self, source, package):
    """
    Add the triples for one drug record to the graph.

    For ``source == 'drugbank'`` every entry of ``package['targets']``
    yields the drug-action-protein triple, the protein label and typing
    (SO:0000104), the drugbank/UNII equivalence, the action as a
    sub-property of RO:0002436 and the drug typing (CHEBI:23367).
    For ``source == 'drugcentral'`` every entry of
    ``package['indications']`` and ``package['interactions']`` yields the
    RO:0002606 / RO:0002436 triple plus labels and typing.
    Any other ``source`` adds nothing.

    :param source: 'drugbank' or 'drugcentral'
    :param package: dict holding the drug record for that source
    :return: None
    """
    model = Model(self.graph)
    props = Model.object_properties

    if source == 'drugbank':
        for target in package['targets']:
            drug = package['unii']
            action = target['action']
            protein = target['uniprot']

            model.addTriple(subject_id=drug, predicate_id=action, obj=protein)
            model.addLabel(subject_id=protein, label=target['name'])
            model.addTriple(
                subject_id=protein,
                predicate_id=props['subclass_of'],
                obj='SO:0000104')
            model.addTriple(
                subject_id=package['drugbank_id'],
                predicate_id=props['equivalent_class'],
                obj=drug)
            model.addTriple(
                subject_id=action,
                predicate_id='rdfs:subPropertyOf',
                obj='RO:0002436')
            model.addTriple(
                subject_id=drug,
                predicate_id=props['subclass_of'],
                obj='CHEBI:23367')

    elif source == 'drugcentral':
        for indication in package['indications']:
            drug = package['unii']
            condition = indication['snomed_id']

            model.addTriple(
                subject_id=drug, predicate_id='RO:0002606', obj=condition)
            model.addTriple(
                subject_id=drug,
                predicate_id=props['subclass_of'],
                obj='CHEBI:23367')
            model.addTriple(
                subject_id=condition,
                predicate_id=props['subclass_of'],
                obj='DOID:4')
            model.addLabel(subject_id=condition, label=indication['snomed_name'])

        for interaction in package['interactions']:
            drug = package['unii']
            protein = interaction['uniprot']

            model.addTriple(
                subject_id=drug, predicate_id='RO:0002436', obj=protein)
            model.addLabel(subject_id=protein, label=interaction['target_name'])
            model.addTriple(
                subject_id=drug,
                predicate_id=props['subclass_of'],
                obj='CHEBI:23367')
            model.addDescription(
                subject_id=protein, description=interaction['target_class'])
            model.addTriple(
                subject_id=protein,
                predicate_id=props['subclass_of'],
                obj='SO:0000104')
def make_triples(self, source, package):
    """
    Add the triples for one drug record to the graph.

    For ``source == 'drugbank'`` every entry of ``package['targets']``
    yields the drug-action-protein triple, the protein label and typing
    (SO:0000104), the drugbank/UNII equivalence, the action as a
    sub-property of RO:0002436 and the drug typing (CHEBI:23367).
    For ``source == 'drugcentral'`` every entry of
    ``package['indications']`` and ``package['interactions']`` yields the
    RO:0002606 / RO:0002436 triple plus labels and typing.
    Any other ``source`` adds nothing.

    :param source: 'drugbank' or 'drugcentral'
    :param package: dict holding the drug record for that source
    :return: None
    """
    model = Model(self.graph)
    props = Model.object_properties

    if source == 'drugbank':
        for target in package['targets']:
            drug = package['unii']
            action = target['action']
            protein = target['uniprot']

            model.addTriple(subject_id=drug, predicate_id=action, obj=protein)
            model.addLabel(subject_id=protein, label=target['name'])
            model.addTriple(
                subject_id=protein,
                predicate_id=props['subclass_of'],
                obj='SO:0000104')
            model.addTriple(
                subject_id=package['drugbank_id'],
                predicate_id=props['equivalent_class'],
                obj=drug)
            model.addTriple(
                subject_id=action,
                predicate_id='rdfs:subPropertyOf',
                obj='RO:0002436')
            model.addTriple(
                subject_id=drug,
                predicate_id=props['subclass_of'],
                obj='CHEBI:23367')

    elif source == 'drugcentral':
        for indication in package['indications']:
            drug = package['unii']
            condition = indication['snomed_id']

            model.addTriple(
                subject_id=drug, predicate_id='RO:0002606', obj=condition)
            model.addTriple(
                subject_id=drug,
                predicate_id=props['subclass_of'],
                obj='CHEBI:23367')
            model.addTriple(
                subject_id=condition,
                predicate_id=props['subclass_of'],
                obj='DOID:4')
            model.addLabel(subject_id=condition, label=indication['snomed_name'])

        for interaction in package['interactions']:
            drug = package['unii']
            protein = interaction['uniprot']

            model.addTriple(
                subject_id=drug, predicate_id='RO:0002436', obj=protein)
            model.addLabel(subject_id=protein, label=interaction['target_name'])
            model.addTriple(
                subject_id=drug,
                predicate_id=props['subclass_of'],
                obj='CHEBI:23367')
            model.addDescription(
                subject_id=protein, description=interaction['target_class'])
            model.addTriple(
                subject_id=protein,
                predicate_id=props['subclass_of'],
                obj='SO:0000104')
def _parse_aeolus_data(self, document, or_limit=None):
    """
    Add the drug and its AEOLUS drug-outcome associations to the graph.

    The RXCUI and UNII curies are labelled with the drug name, declared
    the same individual, and the RXCUI curie is annotated with the InChI
    key.  One association is then created per outcome; when ``or_limit``
    is given, outcomes lacking a 'ror' value or with 'ror' below the
    limit are skipped.

    :param document: dict with 'aeolus' and 'unii' sub-documents
    :param or_limit: minimum 'ror' value for an outcome to be kept,
                     or None to keep every outcome
    :return: None
    """
    model = Model(self.graph)
    aeolus = document['aeolus']

    rxcui_curie = "RXCUI:{}".format(aeolus['rxcui'])
    uni_curie = "UNII:{}".format(aeolus['unii'])
    for curie in (rxcui_curie, uni_curie):
        model.addLabel(curie, aeolus['drug_name'])

    model.addSameIndividual(rxcui_curie, uni_curie)
    self.graph.addTriple(
        rxcui_curie,
        model.annotation_properties['inchi_key'],
        document['unii']['inchikey'],
        object_is_literal=True)

    # NOTE(review): with or_limit=None this text reads "... equal to None ..."
    cutoff_note = (
        "A proportional reporting ratio or odds "
        "ratio greater than or equal to {} in the "
        "AEOLUS data was the significance cut-off "
        "used for creating drug-outcome associations"
    ).format(or_limit)

    for outcome in aeolus['outcomes']:
        if or_limit is not None:
            if 'ror' not in outcome or not outcome['ror'] >= or_limit:
                continue

        assoc = Assoc(self.graph, self.name)

        meddra_curie = "MEDDRA:{}".format(outcome['code'])
        model.addLabel(meddra_curie, outcome['name'])

        assoc.sub = rxcui_curie
        assoc.obj = meddra_curie
        assoc.rel = Assoc.object_properties['causes_or_contributes']
        assoc.description = cutoff_note
        assoc.add_association_to_graph()
        assoc.add_predicate_object(
            Assoc.annotation_properties['probabalistic_quantifier'],
            outcome['ror'],
            'Literal')

        self._add_outcome_evidence(assoc.assoc_id, outcome)
        self._add_outcome_provenance(assoc.assoc_id, outcome)
def _parse_aeolus_data(self, document, or_limit=None):
    """
    Add the drug and its AEOLUS drug-outcome associations to the graph.

    The RXCUI and UNII curies are labelled with the drug name, declared
    the same individual, and the RXCUI curie is annotated with the InChI
    key.  One association is then created per outcome; when ``or_limit``
    is given, outcomes lacking a 'ror' value or with 'ror' below the
    limit are skipped.

    :param document: dict with 'aeolus' and 'unii' sub-documents
    :param or_limit: minimum 'ror' value for an outcome to be kept,
                     or None to keep every outcome
    :return: None
    """
    model = Model(self.graph)
    aeolus = document['aeolus']

    rxcui_curie = "RXCUI:{}".format(aeolus['rxcui'])
    uni_curie = "UNII:{}".format(aeolus['unii'])
    for curie in (rxcui_curie, uni_curie):
        model.addLabel(curie, aeolus['drug_name'])

    model.addSameIndividual(rxcui_curie, uni_curie)
    self.graph.addTriple(
        rxcui_curie,
        self.globaltt['inchi_key'],
        document['unii']['inchikey'],
        object_is_literal=True)

    # NOTE(review): with or_limit=None this text reads "... equal to None ..."
    cutoff_note = (
        "A proportional reporting ratio or odds "
        "ratio greater than or equal to {} in the "
        "AEOLUS data was the significance cut-off "
        "used for creating drug-outcome associations"
    ).format(or_limit)

    for outcome in aeolus['outcomes']:
        if or_limit is not None:
            if 'ror' not in outcome or not outcome['ror'] >= or_limit:
                continue

        assoc = Assoc(self.graph, self.name)

        meddra_curie = "MEDDRA:{}".format(outcome['code'])
        model.addLabel(meddra_curie, outcome['name'])

        assoc.sub = rxcui_curie
        assoc.obj = meddra_curie
        assoc.rel = self.globaltt['causes_or_contributes']
        assoc.description = cutoff_note
        assoc.add_association_to_graph()
        assoc.add_predicate_object(
            self.globaltt['probabalistic_quantifier'],
            outcome['ror'],
            'Literal')

        self._add_outcome_evidence(assoc.assoc_id, outcome)
        self._add_outcome_provenance(assoc.assoc_id, outcome)
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.
    """

    def __init__(
            self,
            graph,
            feature_id=None,
            label=None,
            feature_type=None,
            description=None,
            feature_category=None
    ):
        """
        :param graph: the Graph the feature's triples are written to
        :param feature_id: identifier of the feature
        :param label: label of the feature
        :param feature_type: type (class) of the feature
        :param description: free text description of the feature
        :param feature_category: category passed along with the feature
        :raises ValueError: if ``graph`` is not a Graph
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.gfxutl = GraphUtils(self.curie_map)
        self.fid = feature_id
        self.feature_category = feature_category
        self.label = label
        self.ftype = feature_type
        self.description = description
        # start/stop are location dicts built by _getLocation, or None
        self.start = None
        self.stop = None
        self.taxon = None

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None, position_types=None
    ):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        """
        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(
            coordinate, reference_id, strand, position_types)

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None, position_types=None
    ):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        """
        self.stop = self._getLocation(
            coordinate, reference_id, strand, position_types)

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return: the location dict

        """
        loc = {
            'coordinate': coordinate,
            'reference': reference_id,
            'type': [],
        }
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        # an explicitly empty type list gets the generic Position type
        if position_types == []:
            loc['type'].append(self.globaltt['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        Map a strand symbol to its translation-table term.
        :param strand: one of '+', '-', '.', or None (unknown)
        :return: the strand term, or None if unknown or unmappable
        """
        strand_id = None
        if strand == '+':
            strand_id = self.globaltt['plus_strand']
        elif strand == '-':
            strand_id = self.globaltt['minus_strand']
        elif strand == '.':
            strand_id = self.globaltt['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            LOG.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def _makeRegionName(self):
        """
        Compose the default region name
        ``<reference>-<start>-<stop>[-<strand>]`` from whichever of the
        start/stop locations is set.  An unknown coordinate is written
        as 'UN'.  Expects at least one of start/stop to be set.
        :return: the region name string
        """
        # if we know only one of the locations, take the reference from it
        anchor = self.start if self.start is not None else self.stop
        regionchr = re.sub(r'\w+\:_?', '', anchor['reference'])

        st = sp = 'UN'
        strand = None
        if self.start is not None and self.start['coordinate'] is not None:
            st = str(self.start['coordinate'])
            strand = self._getStrandStringFromPositionTypes(self.start['type'])
        if self.stop is not None and self.stop['coordinate'] is not None:
            sp = str(self.stop['coordinate'])
            # assume that the strand is the same for both start and stop,
            # so only fall back to the stop when the start did not tell us.
            # this will need to be fixed in the future
            if strand is None:
                strand = self._getStrandStringFromPositionTypes(
                    self.stop['type'])

        region_items = [regionchr, st, sp]
        if strand is not None:
            region_items.append(strand)
        return '-'.join(region_items)

    def addFeatureToGraph(
            self, add_region=True, region_id=None, feature_as_class=False,
            feature_category=None):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
        faldo:location region_id
        region_id a faldo:region
        faldo:begin start_position
        faldo:end end_position
        start_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id
        end_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param add_region [True]
        :param region_id [None]
        :param feature_as_class [False]
        :param feature_category: a biolink category CURIE for feature
        """
        if feature_category is None:
            feature_category = self.feature_category

        if feature_as_class:
            self.model.addClassToGraph(
                self.fid, self.label, self.ftype, self.description,
                class_category=feature_category)
        else:
            self.model.addIndividualToGraph(
                self.fid, self.label, self.ftype, self.description,
                ind_category=feature_category)

        # without any location there is nothing to build a region from
        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            if region_id is None:
                rid = self._makeRegionName()
                rid = re.sub(r'\w+\:', '', rid, count=1)  # replace the id prefix
                # blank node, bnode
                rid = rid + "-Region"
                curie = '_:' + self.gfxutl.digest_id(rid)
                self.model.addLabel(curie, rid)
                region_id = curie

            self.graph.addTriple(
                self.fid, self.globaltt['location'], region_id,
                subject_category=feature_category
            )
            self.model.addIndividualToGraph(
                region_id, None, self.globaltt['Region'])
        else:
            # the feature itself stands in for the region
            region_id = self.fid
            self.model.addType(region_id, self.globaltt['region'])

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self.addPositionToGraph(
                self.start['reference'], self.start['coordinate'],
                self.start['type'])
        if self.stop is not None:
            endp = self.addPositionToGraph(
                self.stop['reference'], self.stop['coordinate'],
                self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

    # {coordinate : integer, reference : reference_id, types = []}

    def _getStrandStringFromPositionTypes(self, tylist):
        """
        :param tylist: list of position type terms
        :return: 'plus', 'minus', 'both', or None when no strand term
                 is present in the list
        """
        strand = None
        if self.globaltt['plus_strand'] in tylist:
            strand = 'plus'
        elif self.globaltt['minus_strand'] in tylist:
            strand = 'minus'
        elif self.globaltt['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return: bnode_curie, or None when there is no reference
        """
        # blank node, bnode
        if reference is None:
            LOG.error("Trying to make position with no reference.")
            return None

        reference = re.sub(r'\w+\:', '', reference, count=1)
        if reference.startswith('_'):
            # in this case the reference is a bnode curie as well
            # ... this is a bad smell of over modeling
            reference = reference[1:]
        unique_words = reference
        if coordinate is not None:
            # just in case it isn't a string already
            unique_words = '-'.join((unique_words, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                unique_words = '-'.join((unique_words, tstring))

        curie = '_:' + self.gfxutl.digest_id(unique_words)

        # attach the wordage via a label
        # I want to see more of this (TEC 201905)
        # including a type should be mandatory as well
        self.model.addLabel(curie, unique_words)
        return curie

    def addRegionPositionToGraph(
            self, region_id, begin_position_id, end_position_id):
        """
        Link the region to its begin and end positions.
        A position id of None is silently skipped.
        :param region_id:
        :param begin_position_id:
        :param end_position_id:
        """
        if begin_position_id is not None:
            self.graph.addTriple(
                region_id, self.globaltt['begin'], begin_position_id)
        # else: no begin position specified for the region

        if end_position_id is not None:
            self.graph.addTriple(
                region_id, self.globaltt['end'], end_position_id)
        # else: no end position specified for the region

    def addPositionToGraph(
            self, reference_id, position, position_types=None, strand=None
    ):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.

        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param reference_id:
        :param position:
        :param position_types:
        :param strand:

        :return:  Identifier of the position created,
                  or None when no reference was given
        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if pos_id is None:
            # no reference, hence no identifier to hang any triple on
            return None

        if position is not None:
            self.graph.addTriple(
                pos_id,
                self.globaltt['position'],
                position,
                object_is_literal=True,
                literal_type="xsd:integer"
            )
        self.graph.addTriple(
            pos_id, self.globaltt['reference'], reference_id
        )
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)
        strnd = None
        if strand is not None:
            strnd = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                strnd = self._getStrandType(strand)
        if strnd is None and (position_types is None or position_types == []):
            strnd = self.globaltt['Position']

        if strnd is not None:
            self.model.addType(pos_id, strnd)

        return pos_id

    def addSubsequenceOfFeature(
            self, parentid, subject_category=None, object_category=None
    ):
        """
        This will add reciprocal triples like:
        feature <is subsequence of> parent
        parent has_subsequence feature
        :param parentid:
        :param subject_category: category of this feature
        :param object_category: category of the parent

        :return:

        """
        self.graph.addTriple(
            self.fid,
            self.globaltt['is subsequence of'],
            parentid,
            subject_category=subject_category,
            object_category=object_category
        )
        # this should be expected to be done in reasoning not ETL
        self.graph.addTriple(
            parentid,
            self.globaltt['has subsequence'],
            self.fid,
            subject_category=object_category,
            object_category=subject_category
        )

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param taxonid:

        :return:

        """
        self.taxon = taxonid
        self.graph.addTriple(
            self.fid,
            self.globaltt['in taxon'],
            self.taxon,
            subject_category=self.feature_category
        )

    def addFeatureProperty(self, property_type, feature_property):
        """
        Add the triple: feature property_type feature_property
        :param property_type: predicate of the triple
        :param feature_property: object of the triple
        """
        self.graph.addTriple(
            self.fid,
            property_type,
            feature_property,
            subject_category=self.feature_category
        )
def _build_gene_disease_model(
        self,
        gene_id,
        relation_id,
        disease_id,
        variant_label,
        consequence_predicate=None,
        consequence_id=None,
        allelic_requirement=None,
        pmids=None):
    """
    Builds gene variant disease model

    When both ``consequence_predicate`` and ``consequence_id`` are given,
    a variant blank node (derived from ``variant_label``) is typed as a
    variant locus, linked to ``gene_id`` as its affected locus, and used as
    the subject of the disease association.  Otherwise the gene itself is
    the subject and, if supplied, the allelic requirement is attached to
    the association.

    :param gene_id: identifier of the gene
    :param relation_id: predicate relating the subject to the disease
    :param disease_id: identifier of the disease
    :param variant_label: label of the variant; also seeds the bnode id
    :param consequence_predicate: predicate linking variant to consequence
    :param consequence_id: identifier of the molecular consequence
    :param allelic_requirement: allelic requirement term, gene-level only
    :param pmids: publication ids stored as the association source
    :return: None
    """
    model = Model(self.graph)
    geno = Genotype(self.graph)
    sources = pmids if pmids is not None else []

    variant_bnode = self.make_id(variant_label, "_")
    is_variant = not (consequence_predicate is None or consequence_id is None)

    if is_variant:
        subject = variant_bnode
        model.addTriple(variant_bnode, consequence_predicate, consequence_id)
        # Hack to add labels to terms that
        # don't exist in an ontology
        if consequence_id.startswith(':'):
            model.addLabel(
                consequence_id, consequence_id.strip(':').replace('_', ' '))
        # Typically we would type the variant using the
        # molecular consequence, but these are not specific
        # enough for us to make mappings (see translation table)
        model.addIndividualToGraph(
            variant_bnode, variant_label, self.globaltt['variant_locus'])
        geno.addAffectedLocus(variant_bnode, gene_id)
        model.addBlankNodeAnnotation(variant_bnode)
    else:
        subject = gene_id

    assoc = G2PAssoc(self.graph, self.name, subject, disease_id, relation_id)
    assoc.source = sources
    assoc.add_association_to_graph()

    # the allelic requirement is only recorded for gene-level associations
    if is_variant or allelic_requirement is None:
        return

    model.addTriple(
        assoc.assoc_id,
        self.globaltt['has_allelic_requirement'],
        allelic_requirement)
    if allelic_requirement.startswith(':'):
        model.addLabel(
            allelic_requirement,
            allelic_requirement.strip(':').replace('_', ' '))
def _add_variant_gene_relationship(self, patient_var_map, gene_coordinate_map):
    """
    Right now it is unclear the best approach on how to connect
    variants to genes.  In most cases has_affected_locus/GENO:0000418
    is accurate; however, there are cases where a variant is in the intron
    on one gene and is purported to causally affect another gene down or
    upstream.  In these cases we must first disambiguate which gene
    is the affected locus, and which gene(s) are predicated to be
    causally influenced by (RO:0002566)

    UPDATE 8-30: In the latest dataset we no longer have 1-many mappings
    between variants and genes, but leaving this here in case we see
    these in the future

    The logic followed here is:
    if mutation type contains downstream/upstream and more than one
    gene of interest, investigate coordinates of all genes to
    see if we can disambiguate which genes are which

    :param patient_var_map: dict of patient -> {variant_id: variant dict};
        each variant dict is read for 'genes_of_interest', 'type',
        'position', 'build', 'chromosome', 'reference_allele' and
        'variant_allele'
    :param gene_coordinate_map: dict of gene id -> dict with
        'start', 'end' and 'strand'
    :return: None
    """
    dipper_util = DipperUtil()
    model = Model(self.graph)
    affects = self.globaltt['has_affected_feature']
    influences = self.globaltt['causally_influences']

    for variants in patient_var_map.values():
        for variant_id, variant in variants.items():
            variant_bnode = self.make_id("{0}".format(variant_id), "_")
            genes_of_interest = variant['genes_of_interest']

            if len(genes_of_interest) == 1:
                # Assume variant is variant allele of gene
                gene = genes_of_interest[0]
                gene_id = dipper_util.get_ncbi_id_from_symbol(gene)
                self._add_gene_to_graph(gene, variant_bnode, gene_id, affects)
                continue

            if not re.search(
                    r'upstream|downstream', variant['type'], flags=re.I):
                continue

            # Attempt to disambiguate
            ref_gene = []         # genes whose coordinates contain the variant
            up_down_gene = []     # (symbol, id) of genes that do not
            unmatched_genes = []  # (symbol, id) without id or coordinates
            for gene in genes_of_interest:
                # resolve the id of *this* gene; it must not leak over
                # from a previously processed gene or variant
                gene_id = dipper_util.get_ncbi_id_from_symbol(gene)
                if gene_id and gene_id in gene_coordinate_map:
                    coords = gene_coordinate_map[gene_id]
                    if coords['start'] <= variant['position'] <= coords['end']:
                        ref_gene.append({
                            'symbol': gene,
                            'gene_id': gene_id,
                            'strand': coords['strand']
                        })
                    else:
                        up_down_gene.append((gene, gene_id))
                else:
                    unmatched_genes.append((gene, gene_id))

            # Exactly one reference gene, or - as happens with dupe rows
            # in the source - several instances of the very same gene
            # Credit http://stackoverflow.com/a/3844832
            if ref_gene and ref_gene[1:] == ref_gene[:-1]:
                the_gene = ref_gene[0]
                self._add_gene_to_graph(
                    the_gene['symbol'], variant_bnode,
                    the_gene['gene_id'], affects)

                # update label with gene; build label expects list
                variant_label = self._build_variant_label(
                    variant['build'], variant['chromosome'],
                    variant['position'], variant['reference_allele'],
                    variant['variant_allele'], [the_gene['symbol']])
                model.addLabel(variant_bnode, variant_label)

            # Check if reference genes are on different strands
            elif len(ref_gene) == 2:
                strands = [st['strand'] for st in ref_gene]
                if "minus" in strands and "plus" in strands:
                    relation = affects
                else:
                    LOG.warning(
                        "unable to map intron variant to gene coordinates: %s",
                        variant)
                    relation = influences
                for r_gene in ref_gene:
                    self._add_gene_to_graph(
                        r_gene['symbol'], variant_bnode,
                        r_gene['gene_id'], relation)

            elif re.search(r'intron', variant['type'], flags=re.I):
                LOG.warning(
                    "unable to map intron variant to gene coordinates_2: %s",
                    variant)

            for neighbor, neighbor_id in up_down_gene:
                self._add_gene_to_graph(
                    neighbor, variant_bnode, neighbor_id, influences)

            # Unmatched genes are likely because we cannot map to an NCBIGene
            # or we do not have coordinate information
            # NOTE(review): the id may be empty here; presumably
            # _add_gene_to_graph copes with that - confirm
            for unmatched_gene, unmatched_id in unmatched_genes:
                self._add_gene_to_graph(
                    unmatched_gene, variant_bnode, unmatched_id, influences)
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
    unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
    We may wish to subclass this later.

    """

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph the reference's triples are written to
        :param ref_id: identifier of the reference; when it starts with
                       'http' it is also kept as the reference's URL
        :param ref_type: type of the reference, defaults to 'document'
        :raises ValueError: if ``graph`` is not a Graph
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            # format the message here; exceptions do not interpolate args
            raise ValueError("{} is not a graph".format(graph))

        # assert ref_id is not None
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type
            if ref_type[:4] not in ('IAO:', 'SIO:'):
                LOG.warning("Got Pub ref type of: %s", ref_type)

        if ref_id is not None and ref_id[:4] == 'http':
            self.ref_url = ref_id

    def setTitle(self, title):
        self.title = title

    def setYear(self, year):
        self.year = year

    def setType(self, reference_type):
        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """
        :param author_list: Array of authors
        :return:
        """
        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append one author, starting a new list if none has been set yet.
        :param author:
        """
        if self.author_list is None:
            self.author_list = []
        self.author_list += [author]

    def setShortCitation(self, citation):
        self.short_citation = citation

    def addPage(
            self, subject_id, page_url, subject_category=None,
            page_category=None):
        """
        Add the triple: subject_id page page_url
        :param subject_id:
        :param page_url: added as a resource, not as a literal
        :param subject_category:
        :param page_category:
        """
        self.graph.addTriple(
            subject_id,
            self.globaltt['page'],  # foaf:page  not  <sio:web page>
            page_url,
            object_is_literal=False,  # URL is not a literal
            subject_category=subject_category,
            object_category=page_category)

    def addTitle(self, subject_id, title):
        """
        Add the title as a literal; a None or empty title is skipped.
        :param subject_id:
        :param title:
        """
        if title is not None and title != '':
            self.graph.addTriple(
                subject_id, self.globaltt['title'], title,
                object_is_literal=True)

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation, falling back to the title.
        A reference with a URL is typed (and titled/labelled) on the URL;
        otherwise it is added as an individual under its id.
        With neither, an error is logged and nothing is added.
        """
        cite = self.short_citation
        if cite is None and self.title is not None:
            cite = self.title

        if self.ref_url is not None:
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if cite is not None:
                self.model.addLabel(self.ref_url, cite)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, cite, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            LOG.error("You are missing an identifier for a reference.")
def make_triples(self, source, package):
    """
    Add the triples for one drug record to the graph.

    For ``source == 'drugbank'`` every entry of ``package['targets']``
    yields the drug-action-protein triple, the protein label and typing,
    the drugbank/UNII equivalence, and the typing of the action and drug.
    For ``source == 'drugcentral'`` every entry of
    ``package['indications']`` and ``package['interactions']`` yields the
    treats / interacts-with triple plus labels and typing.
    Any other ``source`` adds nothing.

    :param source: 'drugbank' or 'drugcentral'
    :param package: dict holding the drug record for that source
    :return: None
    """
    model = Model(self.graph)
    tt = self.globaltt

    if source == 'drugbank':
        for target in package['targets']:
            drug = package['unii']
            action = target['action']
            protein = target['uniprot']

            model.addTriple(subject_id=drug, predicate_id=action, obj=protein)
            model.addLabel(subject_id=protein, label=target['name'])
            model.addTriple(
                subject_id=protein,
                predicate_id=tt['subclass_of'],
                obj=tt['polypeptide'])
            model.addTriple(
                subject_id=package['drugbank_id'],
                predicate_id=tt['equivalent_class'],
                obj=drug)
            model.addTriple(
                subject_id=action,
                predicate_id=tt['subPropertyOf'],
                obj=tt['molecularly_interacts_with'])
            model.addTriple(
                subject_id=drug,
                predicate_id=tt['subclass_of'],
                obj=tt['molecular entity'])

    elif source == 'drugcentral':
        for indication in package['indications']:
            drug = package['unii']
            condition = indication['snomed_id']

            model.addTriple(
                subject_id=drug,
                predicate_id=tt['is substance that treats'],
                obj=condition)
            model.addTriple(
                subject_id=drug,
                predicate_id=tt['subclass_of'],
                obj=tt['molecular entity'])
            model.addTriple(
                subject_id=condition,
                predicate_id=tt['subclass_of'],
                obj=tt['disease'])
            model.addLabel(subject_id=condition, label=indication['snomed_name'])

        for interaction in package['interactions']:
            drug = package['unii']
            protein = interaction['uniprot']

            model.addTriple(
                subject_id=drug,
                predicate_id=tt['molecularly_interacts_with'],
                obj=protein)
            model.addLabel(subject_id=protein, label=interaction['target_name'])
            model.addTriple(
                subject_id=drug,
                predicate_id=tt['subclass_of'],
                obj=tt['molecular entity'])
            model.addDescription(
                subject_id=protein, description=interaction['target_class'])
            model.addTriple(
                subject_id=protein,
                predicate_id=tt['subclass_of'],
                obj=tt['polypeptide'])
def _parse_patient_variants(self, file):
    """
    Add every patient's variants to the graph as parts of that patient's
    intrinsic genotype.

    For each patient an intrinsic-genotype blank node is created and linked
    to the patient with has_genotype.  Each variant becomes a sequence
    alteration blank node that is placed in taxon Homo sapiens, attached to
    the genotype with has_variant_part and, when an rs id is present,
    declared the same individual as the dbSNP curie.  A label is only added
    when the node does not already carry one.

    :param file: file handler, passed to _convert_variant_file_to_dict
    :return: None
    """
    patient_var_map = self._convert_variant_file_to_dict(file)
    gene_coordinate_map = self._parse_gene_coordinates(
        self.map_files['gene_coord_map'])
    rs_map = self._parse_rs_map_file(self.map_files['dbsnp_map'])

    genotype = Genotype(self.graph)
    model = Model(self.graph)

    self._add_variant_gene_relationship(patient_var_map, gene_coordinate_map)

    for patient in patient_var_map:
        patient_curie = ':{0}'.format(patient)
        # make intrinsic genotype for each patient
        intrinsic_geno_bnode = self.make_id(
            "{0}-intrinsic-genotype".format(patient), "_")
        genotype_label = "{0} genotype".format(patient)
        genotype.addGenotype(
            intrinsic_geno_bnode, genotype_label,
            model.globaltt['intrinsic_genotype'])

        self.graph.addTriple(
            patient_curie, model.globaltt['has_genotype'], intrinsic_geno_bnode)

        for variant_id, variant in patient_var_map[patient].items():
            build = variant['build']
            chromosome = variant['chromosome']
            position = variant['position']
            reference_allele = variant['reference_allele']
            variant_allele = variant['variant_allele']
            genes_of_interest = variant['genes_of_interest']
            rs_id = variant['rs_id']

            variant_bnode = self.make_id("{0}".format(variant_id), "_")

            # Choose the most specific label the available fields allow.
            # maybe should have these look like the elif statements below
            if position and reference_allele and variant_allele:
                variant_label = self._build_variant_label(
                    build, chromosome, position, reference_allele,
                    variant_allele, genes_of_interest)
            elif not position and reference_allele and variant_allele \
                    and len(genes_of_interest) == 1:
                variant_label = self._build_variant_label(
                    build, chromosome, position, reference_allele,
                    variant_allele, genes_of_interest)
            elif position and (not reference_allele or not variant_allele) \
                    and len(genes_of_interest) == 1:
                variant_label = "{0}{1}({2}):g.{3}".format(
                    build, chromosome, genes_of_interest[0], position)
            elif len(genes_of_interest) == 1:
                variant_label = \
                    'variant of interest in {0} gene of patient {1}'.format(
                        genes_of_interest[0], patient)
            else:
                variant_label = \
                    'variant of interest in patient {0}'.format(patient)

            genotype.addSequenceAlteration(variant_bnode, None)

            # check if it we have built the label
            # in _add_variant_gene_relationship()
            # (the leading '_:' is stripped before the id is wrapped in a BNode)
            labels = self.graph.objects(
                BNode(re.sub(r'^_:', '', variant_bnode, count=1)),
                RDFS['label'])
            if not list(labels):
                model.addLabel(variant_bnode, variant_label)

            self.graph.addTriple(
                variant_bnode, self.globaltt['in taxon'],
                self.globaltt['Homo sapiens'])
            self.graph.addTriple(
                intrinsic_geno_bnode, self.globaltt['has_variant_part'],
                variant_bnode)

            if rs_id:
                dbsnp_curie = 'dbSNP:{0}'.format(rs_id)
                model.addSameIndividual(variant_bnode, dbsnp_curie)

    self._add_variant_sameas_relationships(patient_var_map, rs_map)
    return
class Reference:
    """
    To model references for associations
    (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
    unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
    We may wish to subclass this later.
    """

    ref_types = {
        'person': 'foaf:Person',
        'journal_article': 'IAO:0000013',
        'publication': 'IAO:0000311',  # book
        'document': 'IAO:0000310',  # document???
        'photograph': 'IAO:0000185',
        'webpage': 'SIO:000302',
    }

    annotation_properties = {
        'page': 'foaf:page',
        'title': 'dc:title'
    }

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph the reference is written to
        :param ref_id: identifier of the reference; when it starts with
            'http' it is also kept as ref_url
        :param ref_type: type of the reference, ref_types['document']
            when omitted
        :raises ValueError: if graph is not a Graph
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)

        if ref_type is None:
            self.ref_type = self.ref_types['document']
        else:
            self.ref_type = ref_type

        if ref_id is not None and re.match(r'http', ref_id):
            self.ref_url = ref_id

        return

    def setTitle(self, title):
        self.title = title
        return

    def setYear(self, year):
        self.year = year
        return

    def setType(self, reference_type):
        self.ref_type = reference_type
        return

    def setAuthorList(self, author_list):
        """
        :param author_list: Array of authors
        :return:
        """
        self.author_list = author_list
        return

    def addAuthor(self, author):
        """
        Append one author, starting a new list when none has been set yet.

        :param author: the author to append
        :return: None
        """
        if self.author_list is None:
            self.author_list = []
        self.author_list += [author]
        return

    def setShortCitation(self, citation):
        self.short_citation = citation
        return

    def addPage(self, subject_id, page_url):
        """Add page_url as a literal 'page' annotation of subject_id."""
        self.graph.addTriple(
            subject_id, self.annotation_properties['page'],
            page_url, object_is_literal=True)
        return

    def addTitle(self, subject_id, title):
        """Add title as a literal 'title' annotation of subject_id."""
        self.graph.addTriple(
            subject_id, self.annotation_properties['title'],
            title, object_is_literal=True)
        return

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation, falling back to the title.
        A reference identified by URL gets its type plus, when they are
        set, a title and a label; a reference identified by ref_id is
        added as an individual.  Without any identifier an error is logged.

        :return: None
        """
        n = self.short_citation
        if n is None:
            n = self.title

        if self.ref_url is not None:
            # only emit the title/label triples when there is a value for them
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if n is not None:
                self.model.addLabel(self.ref_url, n)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, n, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for a in self.author_list:
        #        gu.addTriple(
        #           g, self.ref_id, self.props['has_author'], a, True)
        return
def process_gaf(self, gaffile, limit, id_map=None):
    """
    Read a gzipped, tab-separated gene association file and add a
    gene-to-GO-term association to the graph for each usable row.

    Rows whose first field starts with '!' are comments.  Rows with a
    'NOT' qualifier are skipped, as are UniProtKB rows that have no entry
    in id_map.  For rows with evidence code 'IMP' and a non-empty
    With/From column, a G2PAssoc to the derived phenotype id
    (go_id + 'PHENOTYPE') is added per With/From item.

    :param gaffile: path of the gzipped association file
    :param limit: stop once reader.line_num exceeds this value
        (ignored in test mode or when None)
    :param id_map: optional dict from UniProtKB accession to a
        'prefix:local_id' gene curie
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", gaffile)
    uniprot_hit = 0
    uniprot_miss = 0
    col = self.gaf_columns

    with gzip.open(gaffile, 'rb') as csvfile:
        reader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"')
        for row in reader:
            # comments start with exclamation
            # NOTE(review): an empty row or empty first field raises
            # IndexError here
            if row[0][0] == '!':
                continue

            # a row of unexpected width aborts the whole run
            if len(row) != len(col):
                LOG.error(
                    "Wrong number of columns %i, expected ... got:\n\t%s",
                    len(col), row)
                exit(1)

            dbase = row[col.index('DB')].strip()
            gene_num = row[col.index('DB_Object_ID')].strip()
            gene_symbol = row[col.index('DB_Object_Symbol')].strip()
            qualifier = row[col.index('Qualifier')]
            go_id = row[col.index('GO_ID')].strip()
            ref = row[col.index('DB:Reference')].strip()
            eco_symbol = row[col.index('Evidence Code')].strip()
            with_or_from = row[col.index('With (or) From')]
            aspect = row[col.index('Aspect')].strip()
            gene_name = row[col.index('DB_Object_Name')]
            gene_synonym = row[col.index('DB_Object_Synonym')]
            # object_type = row[col.index('DB_Object_Type')].strip()
            taxon = row[col.index('Taxon and Interacting taxon')].strip()
            # date = row[col.index('Date')].strip()
            # assigned_by = row[col.index('Assigned_By')].strip()
            # annotation_extension = row[col.index('Annotation_Extension')]
            # gene_product_form_id = row[col.index('Gene_Product_Form_ID')]

            # test for required fields
            # NOTE(review): the list below holds a slice and a string, so
            # this is only true when row[12] == ''; the first ten columns
            # are never actually inspected
            if '' in [row[:10], row[12]]:
                LOG.error(
                    "Missing required part of annotation on row %i:\n%s",
                    reader.line_num, str(row[:-4]))
                continue

            # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            # translate the source database name when a local mapping exists
            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                # NOTE(review): with id_map None, gene_id stays None for
                # UniProtKB rows and is still used below
                if id_map is not None:
                    # try/except much faster than checking
                    # for dict key membership
                    try:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    except KeyError:
                        # LOG.warning(
                        #   "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            # in test mode keep only NCBIGene ids or genes in the test set
            if self.test_mode and gene_id[:9] != 'NCBIGene:' and\
                    gene_num not in self.test_ids:
                continue

            model.addLabel(gene_id, gene_symbol)
            model.addType(gene_id, self.globaltt['gene'])

            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in re.split(r'\|', gene_synonym):
                    syn = syn.strip()
                    if syn[:10] == 'UniProtKB:':
                        # a UniProtKB "synonym" is recorded as a gene product
                        model.addTriple(
                            gene_id, self.globaltt['has gene product'], syn)
                    elif re.fullmatch(graph.curie_regexp, syn) is not None and\
                            syn.split(':')[0] not in self.wont_prefix:
                        syn = syn.strip()
                        LOG.warning(
                            'possible curie "%s" as a literal synomym for %s',
                            syn, gene_id)
                        if syn != '':
                            model.addSynonym(gene_id, syn)
                    elif syn != '':
                        model.addSynonym(gene_id, syn)

            # First taxon is for the gene, after the pipe are interacting taxa
            tax_curie = taxon.split('|')[0].replace('taxon', 'NCBITaxon')
            # this is a required field but good to safe
            if tax_curie:
                geno.addTaxon(tax_curie, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            try:
                eco_id = self.gaf_eco[eco_symbol]
                assoc.add_evidence(eco_id)
            except KeyError:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            # the reference column may hold several pipe separated curies
            refs = re.split(r'\|', ref)
            for ref in refs:
                ref = ref.strip()
                if ref != '':
                    # NOTE(review): raises IndexError when the reference
                    # contains no ':'
                    prefix = ref.split(':')[-2]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref = ':'.join((prefix, ref.split(':')[-1]))
                    refg = Reference(graph, ref)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref)

            # TODO add the source of the annotations from assigned by?

            rel = self.resolve(aspect, mandatory=False)
            # resolve() handing the aspect back unchanged is treated as
            # "no mapping found"
            if rel is not None and aspect == rel:
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            # NOTE(review): this runs after add_association_to_graph()
            # above - confirm the description still reaches the graph
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used
            ########################################################################

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = re.split(r'[|,]', with_or_from)  # OR + AND
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for itm in withitems:
                    if itm == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                        LOG.warning(
                            "Skipping %s from or with %s", uniprotid, itm)
                        continue

                    # sanity check/conversion on go curie prefix
                    (pfx, lclid) = itm.split(':')[-2:]  # last prefix wins
                    if pfx in self.localtt:
                        pfx = self.localtt[pfx]
                    itm = ':'.join((pfx, lclid))

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', itm):
                        targeted_gene_id = self.zfin.make_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        # TODO PYLINT why is this needed?
                        # Redefinition of assoc type from
                        # dipper.models.assoc.Association.Assoc to
                        # dipper.models.assoc.G2PAssoc.G2PAssoc
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', itm):
                        targeted_gene_id = self.wbase.make_reagent_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, itm, phenotypeid)
                    # the phenotype association cites the same references
                    for ref in refs:
                        ref = ref.strip()
                        if ref != '':
                            prefix = ref.split(':')[-2]
                            if prefix in self.localtt:
                                prefix = self.localtt[prefix]
                            ref = ':'.join((prefix, ref.split(':')[-1]))
                            assoc.add_source(ref)
                            # experimental phenotypic evidence
                            assoc.add_evidence(self.globaltt[
                                'experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be the evidence for the GO assoc?

            # rows skipped with `continue` above never reach this check
            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break
        uniprot_tot = (uniprot_hit + uniprot_miss)
        uniprot_per = 0.0
        if uniprot_tot != 0:
            uniprot_per = 100.0 * uniprot_hit / uniprot_tot
        LOG.info(
            "Uniprot: %.2f%% of %i benefited from the idmapping_selected download",
            uniprot_per, uniprot_tot)
def make_association(self, record):
    """
    Build the gene-to-phenotype association for one record and add it,
    with its labels, reference and evidence, to the graph.

    The record is annotated in place with two derived keys:
    'experiment_type' (list of {'id', 'term'} dicts) and 'pheno_obj'.

    :param record: dict for one row; reads 'Experiment Type', 'Phenotype',
        'SGDID', 'Gene Name' and 'Reference'
    :return: None
    """
    # --- prep record -----------------------------------------------------
    # drop the parenthesised description, then map each experiment type
    # to its apo term
    raw_types = record['Experiment Type'].split('(')[0]
    evidence = record['experiment_type'] = []
    for name in raw_types.split(','):
        name = name.strip()
        evidence.append({'id': self.apo_term_id[name], 'term': name})

    phenotype = record['Phenotype']
    # a ':' marks an "entity: quality" phenotype; anything else is
    # descriptive and has no quality to look up
    has_quality = ':' in phenotype
    quality = None
    quality_id = None
    if has_quality:
        pieces = phenotype.split(': ')
        entity = pieces[0]
        quality = pieces[1]
        entity_id = self.apo_term_id[entity]
        quality_id = self.apo_term_id[quality]
    else:
        entity = phenotype
        entity_id = self.apo_term_id[phenotype]

    record['pheno_obj'] = {
        'entity': {'term': entity, 'apo_id': entity_id},
        'quality': {'term': quality, 'apo_id': quality_id},
        'has_quality': has_quality,
    }

    # --- begin modeling --------------------------------------------------
    model = Model(self.graph)

    gene = 'SGD:{}'.format(record['SGDID'])
    relation = Model.object_properties['has_phenotype']  # has phenotype

    if has_quality:
        pheno_label = '{0}:{1}'.format(entity, quality)
        pheno_id = 'MONARCH:{0}{1}'.format(
            entity_id.replace(':', '_'), quality_id.replace(':', '_'))
    else:
        pheno_label = entity
        pheno_id = entity_id

    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)

    assoc_id = g2p_assoc.make_association_id(
        definedby='yeastgenome.org', subject=gene, predicate=relation,
        object=pheno_id)
    g2p_assoc.set_association_id(assoc_id=assoc_id)

    # add to graph to mint assoc id
    g2p_assoc.add_association_to_graph()

    model.addLabel(subject_id=gene, label=record['Gene Name'])

    # the association triple itself
    model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

    # make pheno subclass of UPHENO:0001001
    model.addTriple(
        subject_id=pheno_id,
        predicate_id=Model.object_properties['subclass_of'],
        obj='UPHENO:0001001')

    # pheno label
    model.addLabel(subject_id=pheno_id, label=pheno_label)

    g2p_assoc.description = self._make_description(record)

    # --- references ------------------------------------------------------
    references = record['Reference'].replace(' ', '').split('|')

    # created RGDRef prefix in curie map to route to proper reference URL in RGD
    if references:
        # make first ref in list the source
        primary = references[0]
        g2p_assoc.add_source(identifier=primary)
        ref_model = Reference(
            self.graph, primary, Reference.ref_types['publication'])
        ref_model.addRefToGraph()

    # every further ref is declared equivalent to the first one
    for other in references[1:]:
        model.addSameIndividual(sub=references[0], obj=other)

    # --- experiment types as evidence ------------------------------------
    for item in record['experiment_type']:
        g2p_assoc.add_evidence(item['id'])
        model.addLabel(subject_id=item['id'], label=item['term'])

    try:
        g2p_assoc.add_association_to_graph()
    except Exception as e:
        print(e)
def _parse_patient_variants(self, file):
    """
    Add every patient's variants to the graph as parts of that patient's
    intrinsic genotype.

    For each patient an intrinsic-genotype blank node is created and linked
    to the patient ('MONARCH:<patient>') with has_genotype.  Each variant
    becomes a sequence alteration blank node that is placed in taxon
    Homo sapiens, attached to the genotype with has_variant_part and, when
    an rs id is present, declared the same individual as the dbSNP curie.
    A label is only added when the node does not already carry one.

    :param file: file handler, passed to _convert_variant_file_to_dict
    :return: None
    """
    patient_var_map = self._convert_variant_file_to_dict(file)
    gene_coordinate_map = self._parse_gene_coordinates(
        self.map_files['gene_coord_map'])
    rs_map = self._parse_rs_map_file(self.map_files['dbsnp_map'])

    genotype = Genotype(self.graph)
    model = Model(self.graph)

    self._add_variant_gene_relationship(patient_var_map, gene_coordinate_map)

    for patient in patient_var_map:
        patient_curie = 'MONARCH:{0}'.format(patient)
        # make intrinsic genotype for each patient
        intrinsic_geno_bnode = self.make_id(
            "{0}-intrinsic-genotype".format(patient), "_")
        genotype_label = "{0} genotype".format(patient)
        genotype.addGenotype(
            intrinsic_geno_bnode, genotype_label,
            model.globaltt['intrinsic genotype'])

        self.graph.addTriple(
            patient_curie, model.globaltt['has_genotype'], intrinsic_geno_bnode)

        for variant_id, variant in patient_var_map[patient].items():
            build = variant['build']
            chromosome = variant['chromosome']
            position = variant['position']
            reference_allele = variant['reference_allele']
            variant_allele = variant['variant_allele']
            genes_of_interest = variant['genes_of_interest']
            rs_id = variant['rs_id']

            variant_bnode = self.make_id("{0}".format(variant_id), "_")

            # Choose the most specific label the available fields allow.
            # maybe should have these look like the elif statements below
            if position and reference_allele and variant_allele:
                variant_label = self._build_variant_label(
                    build, chromosome, position, reference_allele,
                    variant_allele, genes_of_interest)
            elif not position and reference_allele and variant_allele \
                    and len(genes_of_interest) == 1:
                variant_label = self._build_variant_label(
                    build, chromosome, position, reference_allele,
                    variant_allele, genes_of_interest)
            elif position and (not reference_allele or not variant_allele) \
                    and len(genes_of_interest) == 1:
                variant_label = "{0}{1}({2}):g.{3}".format(
                    build, chromosome, genes_of_interest[0], position)
            elif len(genes_of_interest) == 1:
                variant_label = \
                    'variant of interest in {0} gene of patient {1}'.format(
                        genes_of_interest[0], patient)
            else:
                variant_label = \
                    'variant of interest in patient {0}'.format(patient)

            genotype.addSequenceAlteration(variant_bnode, None)

            # check if it we have built the label
            # in _add_variant_gene_relationship()
            # (the leading '_:' is stripped before the id is wrapped in a BNode)
            labels = self.graph.objects(
                BNode(re.sub(r'^_:', '', variant_bnode, count=1)),
                RDFS['label'])
            if not list(labels):
                model.addLabel(variant_bnode, variant_label)

            self.graph.addTriple(
                variant_bnode, self.globaltt['in taxon'],
                self.globaltt['Homo sapiens'])
            self.graph.addTriple(
                intrinsic_geno_bnode, self.globaltt['has_variant_part'],
                variant_bnode)

            if rs_id:
                dbsnp_curie = 'dbSNP:{0}'.format(rs_id)
                model.addSameIndividual(variant_bnode, dbsnp_curie)

    self._add_variant_sameas_relationships(patient_var_map, rs_map)
    return
class Reference:
    """
    To model references for associations
    (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
    unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
    We may wish to subclass this later.
    """

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph the reference is written to; its globaltt,
            globaltcid and curie_map are copied onto the instance
        :param ref_id: identifier of the reference; when it starts with
            'http' it is also kept as ref_url
        :param ref_type: type of the reference, globaltt['document']
            when omitted
        :raises ValueError: if graph is not a Graph
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type

        if ref_id is not None and re.match(r'http', ref_id):
            self.ref_url = ref_id

        return

    def setTitle(self, title):
        self.title = title
        return

    def setYear(self, year):
        self.year = year
        return

    def setType(self, reference_type):
        self.ref_type = reference_type
        return

    def setAuthorList(self, author_list):
        """
        :param author_list: Array of authors
        :return:
        """
        self.author_list = author_list
        return

    def addAuthor(self, author):
        """
        Append one author, starting a new list when none has been set yet.

        :param author: the author to append
        :return: None
        """
        if self.author_list is None:
            self.author_list = []
        self.author_list += [author]
        return

    def setShortCitation(self, citation):
        self.short_citation = citation
        return

    def addPage(self, subject_id, page_url):
        """Add page_url as a literal 'page' annotation of subject_id."""
        self.graph.addTriple(
            subject_id, self.globaltt['page'],
            page_url, object_is_literal=True)
        return

    def addTitle(self, subject_id, title):
        """Add title as a literal 'title (dce)' annotation of subject_id."""
        self.graph.addTriple(
            subject_id, self.globaltt['title (dce)'],
            title, object_is_literal=True)
        return

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation, falling back to the title.
        A reference identified by URL gets its type plus, when they are
        set, a title and a label; a reference identified by ref_id is
        added as an individual.  Without any identifier an error is logged.

        :return: None
        """
        n = self.short_citation
        if n is None:
            n = self.title

        if self.ref_url is not None:
            # only emit the title/label triples when there is a value for them
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if n is not None:
                self.model.addLabel(self.ref_url, n)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, n, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for a in self.author_list:
        #        gu.addTriple(
        #           g, self.ref_id, self.props['has_author'], a, True)
        return
def make_association(self, record):
    """
    Build the gene-to-phenotype association for one record and add it,
    with its labels, description, reference and evidence, to the graph.

    The record is annotated in place with two derived keys:
    'experiment_type' (list of {'id', 'term'} dicts) and 'pheno_obj'.

    :param record: dict for one row; reads 'Experiment Type', 'Phenotype',
        'SGDID', 'Gene Name', 'Reference' and the columns folded into the
        description
    :return: None
    """
    # --- prep record -----------------------------------------------------
    # drop the parenthesised description, then map each experiment type
    # to its apo term
    raw_types = record['Experiment Type'].split('(')[0]
    evidence = record['experiment_type'] = []
    for name in raw_types.split(','):
        name = name.strip()
        evidence.append({'id': self.apo_term_id[name], 'term': name})

    phenotype = record['Phenotype']
    # a ':' marks an "entity: quality" phenotype; anything else is
    # descriptive and has no quality to look up
    has_quality = ':' in phenotype
    quality = None
    quality_id = None
    if has_quality:
        pieces = phenotype.split(': ')
        entity = pieces[0]
        quality = pieces[1]
        entity_id = self.apo_term_id[entity]
        quality_id = self.apo_term_id[quality]
    else:
        entity = phenotype
        entity_id = self.apo_term_id[phenotype]

    record['pheno_obj'] = {
        'entity': {'term': entity, 'apo_id': entity_id},
        'quality': {'term': quality, 'apo_id': quality_id},
        'has_quality': has_quality,
    }

    # --- begin modeling --------------------------------------------------
    model = Model(self.graph)

    gene = 'SGD:{}'.format(record['SGDID'])
    relation = Model.object_properties['has_phenotype']  # has phenotype

    if has_quality:
        pheno_label = '{0}:{1}'.format(entity, quality)
        pheno_id = 'MONARCH:{0}{1}'.format(
            entity_id.replace(':', '_'), quality_id.replace(':', '_'))
    else:
        pheno_label = entity
        pheno_id = entity_id

    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)

    assoc_id = g2p_assoc.make_association_id(
        definedby='yeastgenome.org', subject=gene, predicate=relation,
        object=pheno_id)
    g2p_assoc.set_association_id(assoc_id=assoc_id)

    # add to graph to mint assoc id
    g2p_assoc.add_association_to_graph()

    model.addLabel(subject_id=gene, label=record['Gene Name'])

    # the association triple itself
    model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

    # make pheno subclass of UPHENO:0001001
    model.addTriple(
        subject_id=pheno_id,
        predicate_id=Model.object_properties['subclass_of'],
        obj='UPHENO:0001001')

    # pheno label
    model.addLabel(subject_id=pheno_id, label=pheno_label)

    # description: all the unmodeled data in a '|' delimited list
    unmodeled = (
        ('genomic_background: {}', 'Strain Background'),
        ('allele: {}', 'Allele'),
        ('chemical: {}', 'Chemical'),
        ('condition: {}', 'Condition'),
        ('details: {}', 'Details'),
        ('feature_name: {}', 'Feature Name'),
        ('gene_name: {}', 'Gene Name'),
        ('mutant_type: {}', 'Mutant Type'),
        ('reporter: {}', 'Reporter'),
    )
    g2p_assoc.description = " | ".join(
        [template.format(record[column]) for template, column in unmodeled])

    # --- references ------------------------------------------------------
    references = record['Reference'].replace(' ', '').split('|')

    # created RGDRef prefix in curie map to route to proper reference URL in RGD
    if references:
        # make first ref in list the source
        primary = references[0]
        g2p_assoc.add_source(identifier=primary)
        ref_model = Reference(
            self.graph, primary, Reference.ref_types['publication'])
        ref_model.addRefToGraph()

    # every further ref is declared equivalent to the first one
    for other in references[1:]:
        model.addSameIndividual(sub=references[0], obj=other)

    # --- experiment types as evidence ------------------------------------
    for item in record['experiment_type']:
        g2p_assoc.add_evidence(item['id'])
        model.addLabel(subject_id=item['id'], label=item['term'])

    try:
        g2p_assoc.add_association_to_graph()
    except Exception as e:
        print(e)
def _get_identifiers(self, limit):
    """
    This will process the id mapping file provided by Biogrid.
    The file has a very large header, which we scan past,
    then pull the identifiers, and make equivalence axioms

    Only rows whose organism is in the species filter are counted and
    modeled.  A row whose translated identifier type is in the gene id
    filter becomes an equivalent class of the BIOGRID class; an
    OFFICIAL_SYMBOL row becomes its label and types it as a gene.

    :param limit: stop once more than this many filtered rows were seen
        (ignored in test mode or when None)
    :return: None
    """
    LOG.info("getting identifier mapping")
    line_counter = 0
    zip_path = '/'.join((self.rawdir, self.files['identifiers']['file']))

    # TODO align this species filter with the one above
    # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
    # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')
    speciesfilters = 'Homo sapiens,Mus musculus'.split(',')

    # TODO make these filters available as commandline options
    # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
    #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
    geneidtypefilters = \
        'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC,WormBase,XenBase,FlyBase'.split(',')
    # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'

    # the target graph does not change while reading, so pick it once
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)

    foundheader = False
    # context managers close the archive even when a row fails to parse
    with ZipFile(zip_path, 'r') as myzip:
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        with myzip.open(fname, 'r') as csvfile:
            for raw_line in csvfile:
                line = raw_line.decode()
                # skip header lines; the column header row itself is
                # skipped as well
                if not foundheader:
                    if line.startswith('BIOGRID_ID'):
                        foundheader = True
                    continue

                # BIOGRID_ID
                # IDENTIFIER_VALUE
                # IDENTIFIER_TYPE
                # ORGANISM_OFFICIAL_NAME
                # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                (biogrid_num, id_num, id_type, organism_label) = \
                    line.strip().split('\t')

                # skip any genes that don't match our test set
                if self.test_mode and \
                        int(biogrid_num) not in self.biogrid_ids:
                    continue

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:' + biogrid_num
                prefix = self.localtt[id_type]

                if organism_label.strip() in speciesfilters:
                    line_counter += 1
                    if prefix in geneidtypefilters:
                        mapped_id = ':'.join((prefix, id_num))
                        model.addEquivalentClass(biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        model.addLabel(biogrid_id, id_num)
                        model.addType(biogrid_id, self.globaltt['gene'])
                    # elif (id_type == 'SYNONYM'):
                    #   FIXME - i am not sure these are synonyms, altids?
                    #   gu.addSynonym(g,biogrid_id,id_num)

                if not self.test_mode and limit is not None \
                        and line_counter > limit:
                    break

    return
def _add_variant_gene_relationship(self, patient_var_map, gene_coordinate_map):
    """
    Right now it is unclear the best approach on how to connect
    variants to genes.  In most cases has_affected_locus/GENO:0000418
    is accurate; however, there are cases where a variant is in the intron
    on one gene and is purported to causally affect another gene down or
    upstream.  In these cases we must first disambiguate which gene
    is the affected locus, and which gene(s) are predicated to be
    causally influenced by (RO:0002566)

    UPDATE 8-30: In the latest dataset we no longer have 1-many mappings
    between variants and genes, but leaving this here in case we see
    these in the future

    The logic followed here is:
    if mutation type contains downstream/upstream and more than one
    gene of interest, investigate coordinates of all genes to
    see if we can disambiguate which genes are which

    :param patient_var_map: dict of patient -> {variant_id: variant dict}
    :param gene_coordinate_map: dict of gene id -> dict with 'start',
        'end' and 'strand'
    :return: None
    """
    # genotype = Genotype(self.graph)
    dipper_util = DipperUtil()
    model = Model(self.graph)
    affects = self.globaltt['has_affected_feature']

    # Note this could be compressed in someway to remove one level of for looping
    for patient in patient_var_map:
        for variant_id, variant in patient_var_map[patient].items():
            variant_bnode = self.make_id("{0}".format(variant_id), "_")
            genes_of_interest = variant['genes_of_interest']

            if len(genes_of_interest) == 1:
                # Assume variant is variant allele of gene
                gene = genes_of_interest[0]
                gene_id = dipper_util.get_hgnc_id_from_symbol(gene)
                self._add_gene_to_graph(gene, variant_bnode, gene_id, affects)
                continue

            if not re.search(
                    r'upstream|downstream', variant['type'], flags=re.I):
                continue

            # Attempt to disambiguate.  Each gene is kept together with its
            # own id so that the right id is passed on further down.
            ref_gene = []
            up_down_gene = []
            unmatched_genes = []
            for gene in genes_of_interest:
                # NOTE(review): assumes gene_coordinate_map is keyed by the
                # id this lookup returns - confirm against the map's loader
                gene_id = dipper_util.get_hgnc_id_from_symbol(gene)
                if gene_id and gene_id in gene_coordinate_map:
                    coords = gene_coordinate_map[gene_id]
                    if coords['start'] <= variant['position'] <= coords['end']:
                        ref_gene.append({
                            'symbol': gene,
                            'id': gene_id,
                            'strand': coords['strand'],
                        })
                    else:
                        up_down_gene.append((gene, gene_id))
                else:
                    unmatched_genes.append((gene, gene_id))

            # A single reference gene, or several identical entries.
            # In some cases there are multiple instances
            # of same gene from dupe rows in the source
            # Credit http://stackoverflow.com/a/3844832
            if ref_gene and ref_gene[1:] == ref_gene[:-1]:
                self._add_gene_to_graph(
                    ref_gene[0]['symbol'], variant_bnode,
                    ref_gene[0]['id'], affects)

                # update label with gene; build label function expects list
                gene_list = [ref_gene[0]['symbol']]
                variant_label = self._build_variant_label(
                    variant['build'], variant['chromosome'],
                    variant['position'], variant['reference_allele'],
                    variant['variant_allele'], gene_list)
                model.addLabel(variant_bnode, variant_label)

            # Check if reference genes are on different strands
            elif len(ref_gene) == 2:
                strands = [st['strand'] for st in ref_gene]
                if "minus" in strands and "plus" in strands:
                    relation = affects
                else:
                    LOG.warning(
                        "unable to map intron variant to gene coordinates: %s",
                        variant)
                    relation = self.globaltt['causally_influences']
                for r_gene in ref_gene:
                    self._add_gene_to_graph(
                        r_gene['symbol'], variant_bnode, r_gene['id'], relation)

            elif re.search(r'intron', variant['type'], flags=re.I):
                LOG.warning(
                    "unable to map intron variant to gene coordinates_2: %s",
                    variant)

            for neighbor, neighbor_id in up_down_gene:
                self._add_gene_to_graph(
                    neighbor, variant_bnode, neighbor_id,
                    self.globaltt['causally_influences'])

            # Unmatched genes are likely because we cannot map to an NCBIGene
            # or we do not have coordinate information
            for unmatched_gene, unmatched_id in unmatched_genes:
                self._add_gene_to_graph(
                    unmatched_gene, variant_bnode, unmatched_id,
                    self.globaltt['causally_influences'])

    return
class Reference:
    """
    To model references for associations
    (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
    unless the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
    We may wish to subclass this later.
    """

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: Graph instance the reference triples are written to
        :param ref_id: identifier of the reference; when it starts with
            'http' it is also kept as the reference URL
        :param ref_type: class id used to type the reference,
            defaults to globaltt['document']
        :raises ValueError: if graph is not a Graph instance
        """
        if not isinstance(graph, Graph):
            # interpolate explicitly: exceptions do not apply
            # logging-style lazy formatting to their arguments
            raise ValueError("%s is not a graph" % (graph,))
        self.graph = graph

        # assert ref_id is not None
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type
            if ref_type[:4] not in ('IAO:', 'SIO:'):
                LOG.warning("Got Pub ref type of: %s", ref_type)

        if ref_id is not None and ref_id[:4] == 'http':
            self.ref_url = ref_id

    def setTitle(self, title):
        """Store the title to be written by addRefToGraph."""
        self.title = title

    def setYear(self, year):
        """Store the publication year (not yet written to the graph)."""
        self.year = year

    def setType(self, reference_type):
        """Override the class id used to type the reference."""
        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """
        :param author_list: Array of authors
        :return: None
        """
        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append one author, creating the author list on first use.

        :param author: author to append
        :return: None
        """
        if self.author_list is None:
            # the constructor leaves author_list unset
            self.author_list = []
        self.author_list.append(author)

    def setShortCitation(self, citation):
        """Store the short citation; it is preferred over the title as label."""
        self.short_citation = citation

    def addPage(self, subject_id, page_url):
        """Attach page_url to subject_id as a literal via globaltt['page']."""
        self.graph.addTriple(
            subject_id,
            self.globaltt['page'],  # foaf:page not <sio:web page>
            page_url,
            object_is_literal=True)

    def addTitle(self, subject_id, title):
        """Attach a non-empty title to subject_id as a literal."""
        if title is not None and title != '':
            self.graph.addTriple(
                subject_id,
                self.globaltt['title (dce)'],
                title,
                object_is_literal=True)

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation, falling back to the title.
        A reference with a URL is typed and labelled on that URL;
        otherwise it is added as an individual under its ref_id.
        An error is logged when neither identifier is available.

        :return: None
        """
        cite = self.short_citation
        if cite is None and self.title is not None:
            cite = self.title

        if self.ref_url is not None:
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if cite is not None:
                self.model.addLabel(self.ref_url, cite)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, cite, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            LOG.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # year and author_list are stored on the instance
        # but are not yet written to the graph
def _process_haplotype(
        self, hap_id, hap_label, chrom_num, chrom_pos, context,
        risk_allele_frequency, mapped_gene, so_ontology):
    """
    Add a haplotype and its component SNPs to the graph.

    hap_label, chrom_num, chrom_pos, context and mapped_gene are
    ';'-separated strings which are split into parallel lists.
    SNP details are only added when the label, chromosome, position and
    context lists all have the same length.

    :param hap_id: identifier of the haplotype
    :param hap_label: ';'-separated SNP labels, also used as haplotype label
    :param chrom_num: ';'-separated chromosome per SNP
    :param chrom_pos: ';'-separated position per SNP
    :param context: ';'-separated variant context term per SNP
    :param risk_allele_frequency: added as description unless '' or 'NR'
    :param mapped_gene: ';'-separated gene symbol per SNP
    :param so_ontology: object exposing query(sparql_string)
        (presumably an rdflib graph of the Sequence Ontology - confirm)
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    # add the feature to the graph
    hap_description = None
    if risk_allele_frequency not in ['', 'NR']:
        hap_description = str(
            risk_allele_frequency) + ' [risk allele frequency]'

    model.addIndividualToGraph(
        hap_id, hap_label.strip(), self.globaltt['haplotype'],
        hap_description)
    geno.addTaxon(self.globaltt["Homo sapiens"], hap_id)

    # every multi-valued column uses the same '; ' style separator
    split_field = re.compile(r';\s?').split
    snp_labels = split_field(hap_label)
    chrom_nums = split_field(chrom_num)
    chrom_positions = split_field(chrom_pos)
    context_list = split_field(context)
    mapped_genes = split_field(mapped_gene)

    # Not having four "PAX5" as a list might be better,
    # but it breaks unit tests
    # mapped_genes = list(set(mapped_genes))  # make uniq
    # snp_labels = list(set(snp_labels))  # make uniq

    snp_curies = []
    for snp in snp_labels:
        snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
        if snp_type is None:
            LOG.info('cant find type for SNP in %s', snp)
            # make blank node
            snp_curie = self.make_id(snp, "_")
            model.addLabel(snp_curie, snp)
        elif snp_curie[0] == '_':
            # arrived an unlabeled blanknode
            model.addLabel(snp_curie, snp)

        graph.addTriple(hap_id, self.globaltt['has_variant_part'], snp_curie)
        snp_curies.append(snp_curie)

    # courtesy http://stackoverflow.com/a/16720915
    # check lengths of multiple lists
    length = len(snp_curies)
    if not all(
            len(lst) == length
            for lst in [snp_labels, chrom_nums, chrom_positions,
                        context_list]):
        LOG.warning(
            "Incongruous data field(s) for haplotype %s \n "
            "will not add snp details", hap_label)
    else:
        variant_in_gene_count = 0
        for index, snp_curie in enumerate(snp_curies):
            snp_context = context_list[index]
            self._add_snp_to_graph(
                snp_curie, snp_labels[index], chrom_nums[index],
                chrom_positions[index], snp_context)

            if mapped_genes and len(mapped_genes) != len(snp_labels):
                LOG.warning(
                    "More mapped genes than snps,"
                    " cannot disambiguate for\n%s\n%s",
                    mapped_genes, snp_labels)  # hap_label)
                continue

            # the context term must resolve to exactly one labelled
            # subclass of 'gene_variant' before a gene is attached
            so_class = self.resolve(snp_context)
            so_query = """
                SELECT ?variant_label
                WHERE {{
                    {0} rdfs:subClassOf+ {1} ;
                    rdfs:label ?variant_label .
                }}
            """.format(so_class, self.globaltt['gene_variant'])
            query_result = so_ontology.query(so_query)

            gene_id = DipperUtil.get_hgnc_id_from_symbol(
                mapped_genes[index])

            if gene_id is not None and len(list(query_result)) == 1:
                if snp_context in [
                        'upstream_gene_variant',
                        'downstream_gene_variant']:
                    # up/downstream variants are linked with the
                    # context relation rather than as an affected locus
                    graph.addTriple(
                        snp_curie, self.resolve(snp_context), gene_id)
                else:
                    geno.addAffectedLocus(snp_curie, gene_id)
                    variant_in_gene_count += 1

        # Separate in case we want to apply a different relation
        # If not this is redundant with triples added above
        if len(mapped_genes) == variant_in_gene_count and \
                len(set(mapped_genes)) == 1:
            gene_id = DipperUtil.get_hgnc_id_from_symbol(mapped_genes[0])
            geno.addAffectedLocus(hap_id, gene_id)
def make_association(self, record):
    """
    Construct the gene to phenotype association for one record
    and add it to the graph.

    The record is modified in place: the keys 'experiment_type' and
    'pheno_obj' are added before the description is built from it.

    :param record: dict with the keys 'Experiment Type', 'Phenotype',
        'SGDID', 'Gene Name' and 'Reference' ('|'-separated ids)
    :return: None
    :raises KeyError: if an experiment type or phenotype term
        has no entry in self.apo_term_id
    """
    # prep record
    # remove description and map Experiment Type to apo term
    experiment_types = record['Experiment Type'].split('(')[0].split(',')
    record['experiment_type'] = [
        {'id': self.apo_term_id[exp_type], 'term': exp_type}
        for exp_type in (raw.strip() for raw in experiment_types)
    ]

    phenotype = record['Phenotype']
    pheno_obj = {
        'entity': {
            'term': None,
            'apo_id': None
        },
        'quality': {
            'term': None,
            'apo_id': None
        },
        # descriptive and don't bother looking for a quality
        'has_quality': False
    }
    if ':' in phenotype:
        # "<entity>: <quality>"
        pheno_obj['has_quality'] = True
        ent_qual = phenotype.split(': ')
        entity = ent_qual[0]
        quality = ent_qual[1]
        pheno_obj['entity']['term'] = entity
        pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
        pheno_obj['quality']['term'] = quality
        pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
    else:
        pheno_obj['entity']['term'] = phenotype
        pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
    record['pheno_obj'] = pheno_obj

    # begin modeling
    model = Model(self.graph)

    # define the triple
    gene = 'SGD:{}'.format(record['SGDID'])
    relation = self.globaltt['has phenotype']

    if pheno_obj['has_quality']:
        # entity + quality pairs get a minted id built from both apo ids
        pheno_label = '{0}:{1}'.format(
            pheno_obj['entity']['term'],
            pheno_obj['quality']['term'])
        pheno_id = 'MONARCH:{0}{1}'.format(
            pheno_obj['entity']['apo_id'].replace(':', '_'),
            pheno_obj['quality']['apo_id'].replace(':', '_'))
    else:
        pheno_label = pheno_obj['entity']['term']
        pheno_id = pheno_obj['entity']['apo_id']

    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)

    assoc_id = g2p_assoc.make_association_id(
        'yeastgenome.org', gene, relation, pheno_id)
    g2p_assoc.set_association_id(assoc_id=assoc_id)

    # add to graph to mint assoc id
    g2p_assoc.add_association_to_graph()

    model.addLabel(subject_id=gene, label=record['Gene Name'])

    # add the association triple
    model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

    model.addTriple(
        subject_id=pheno_id,
        predicate_id=self.globaltt['subclass_of'],
        obj=self.globaltt['Phenotype'])

    # label nodes
    # pheno label
    model.addLabel(subject_id=pheno_id, label=pheno_label)

    g2p_assoc.description = self._make_description(record)

    # add the references
    # str.split never returns an empty list, so drop empty ids explicitly;
    # otherwise a blank 'Reference' field yields a source with id ''
    references = [
        ref for ref in record['Reference'].replace(' ', '').split('|')
        if ref
    ]

    # created Ref prefix in curie map to route to proper reference URL in SGD
    if references:
        # make first ref in list the source
        g2p_assoc.add_source(identifier=references[0])
        ref_model = Reference(
            self.graph, references[0], self.globaltt['publication'])
        ref_model.addRefToGraph()

    # create equivalent source for any other refs in list
    for ref in references[1:]:
        model.addSameIndividual(sub=references[0], obj=ref)

    # add experiment type as evidence
    for exp_type in record['experiment_type']:
        g2p_assoc.add_evidence(exp_type['id'])
        model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

    try:
        g2p_assoc.add_association_to_graph()
    except Exception as e:
        # best effort: keep processing further records,
        # but report through the logger instead of stdout
        LOG.error(
            "Unable to add association %s to graph: %s", assoc_id, e)