def _add_snp_to_graph(
        self, snp_id, snp_label, chrom_num, chrom_pos, context,
        risk_allele_frequency=None):
    """
    Add one SNP to the graph as a human (NCBITaxon:9606) sequence feature.

    The SNP is recorded in ``self.id_location_map`` under its location
    curie when both chromosome and position are given, written to the
    graph as a ``Feature`` of type SNP, positioned on the GRCh38
    chromosome when a position is known, and typed with every variant
    type that ``self._map_variant_type`` can map from ``context``.

    :param snp_id: identifier of the SNP
    :param snp_label: label for the SNP (whitespace is stripped)
    :param chrom_num: chromosome, '' when unknown
    :param chrom_pos: position on the chromosome, '' when unknown
    :param context: ';'-separated list of variant context terms
    :param risk_allele_frequency: optional frequency; None, '' and 'NR'
        are treated as "not reported"
    :return: None
    """
    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)

    # register the location bucket (if any) before adding the snp to it
    location = None
    if chrom_num != '' and chrom_pos != '':
        location = self._make_location_curie(chrom_num, chrom_pos)
        if location not in self.id_location_map:
            self.id_location_map[location] = set()

    # ids of the form "<id>-<allele>" carry a variation; not modelled yet
    allele_suffix = re.search(r'-(.*)$', snp_id)
    if allele_suffix is not None \
            and re.match(r'[ATGC]', allele_suffix.group(1)):
        pass  # TODO add variation to snp

    if location is not None:
        self.id_location_map[location].add(snp_id)

    # the chromosome the feature sits on (human, GRCh38)
    chrom_id = makeChromID(chrom_num, 'GRCh38', 'CHR')

    # only a reported frequency becomes the feature description
    frequency_missing = (
        risk_allele_frequency is None
        or risk_allele_frequency == ''
        or risk_allele_frequency == 'NR')
    if frequency_missing:
        snp_description = None
    else:
        snp_description = ' '.join(
            (str(risk_allele_frequency), '[risk allele frequency]'))

    feature = Feature(
        graph, snp_id, snp_label.strip(), Feature.types['SNP'],
        snp_description)
    if chrom_num != '' and chrom_pos != '':
        # a snp is a single position: start and end coincide
        feature.addFeatureStartLocation(chrom_pos, chrom_id)
        feature.addFeatureEndLocation(chrom_pos, chrom_id)
    feature.addFeatureToGraph()
    feature.addTaxonToFeature('NCBITaxon:9606')
    # TODO consider adding allele frequency as property;
    # but would need background info to do that

    # add whatever descriptive variant types the context provides
    for term in re.split(r';', context):
        variant_type = self._map_variant_type(term.strip())
        if variant_type is None:
            continue
        model.addType(snp_id, variant_type)

    return
def _add_snp_to_graph(self, snp_id, snp_label, chrom_num, chrom_pos, context,
                      risk_allele_frequency=None):
    """
    Add one SNP to the graph as a human sequence feature.

    The SNP is recorded in ``self.id_location_map`` under its location
    curie when both chromosome and position are given, written to the
    graph as a ``Feature`` typed ``self.globaltt['SNP']``, positioned on
    the chromosome of the assembly named by
    ``self.localtt['reference assembly']`` when a position is known, and
    typed with every ``context`` term that ``self.resolve`` maps to
    something other than the term itself.

    :param snp_id: identifier of the SNP
    :param snp_label: label for the SNP (whitespace is stripped)
    :param chrom_num: chromosome, '' when unknown
    :param chrom_pos: position on the chromosome, '' when unknown
    :param context: ';'-separated list of variant context terms
    :param risk_allele_frequency: optional frequency; None, '' and 'NR'
        are treated as "not reported"
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)

    has_position = chrom_num != '' and chrom_pos != ''

    if has_position:
        location = self._make_location_curie(chrom_num, chrom_pos)
        if location not in self.id_location_map:
            self.id_location_map[location] = set()
    else:
        location = None

    alteration = re.search(r'-(.*)$', snp_id)
    if alteration is not None and re.match(r'[ATGC]', alteration.group(1)):
        # add variation to snp
        pass  # TODO

    if location is not None:
        self.id_location_map[location].add(snp_id)

    # create the chromosome
    chrom_id = makeChromID(
        chrom_num, self.localtt['reference assembly'], 'CHR')

    # add the feature to the graph;
    # only a reported frequency becomes the feature description
    snp_description = None
    if risk_allele_frequency is not None \
            and risk_allele_frequency != '' \
            and risk_allele_frequency != 'NR':
        snp_description = str(
            risk_allele_frequency) + ' [risk allele frequency]'

    feat = Feature(
        graph, snp_id, snp_label.strip(), self.globaltt['SNP'],
        snp_description)
    if has_position:
        # a snp is a single position: start and end coincide
        feat.addFeatureStartLocation(chrom_pos, chrom_id)
        feat.addFeatureEndLocation(chrom_pos, chrom_id)
    feat.addFeatureToGraph()
    # the taxon label is 'Homo sapiens'; the previously garbled key
    # ('H**o sapiens') cannot be a translation table entry
    feat.addTaxonToFeature(self.globaltt['Homo sapiens'])
    # TODO consider adding allele frequency as property;
    # but would need background info to do that

    # also want to add other descriptive info about
    # the variant from the context
    for ctx in re.split(r';', context):
        ctx = ctx.strip()
        cid = self.resolve(ctx, False)
        # presumably resolve() echoes the term back when it has no
        # mapping for it -- only a changed value is used as a type
        if cid != ctx:
            model.addType(snp_id, cid)

    return
def _add_snp_to_graph(
        self, snp_id, snp_label, chrom_num, chrom_pos, context,
        risk_allele_frequency=None):
    """
    Add one SNP to the graph as a human sequence feature.

    When both ``chrom_num`` and ``chrom_pos`` are non-empty the SNP is
    indexed in ``self.id_location_map`` by its location curie and the
    feature gets identical start and end locations on the chromosome of
    the assembly named by ``self.localtt['reference assembly']``.
    Each ';'-separated ``context`` term that ``self.resolve`` maps to
    something other than the term itself is added as a type of the SNP.

    :param snp_id: identifier of the SNP
    :param snp_label: label for the SNP (whitespace is stripped)
    :param chrom_num: chromosome, '' when unknown
    :param chrom_pos: position on the chromosome, '' when unknown
    :param context: ';'-separated list of variant context terms
    :param risk_allele_frequency: optional frequency; None, '' and 'NR'
        are treated as "not reported"
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)

    has_position = chrom_num != '' and chrom_pos != ''

    if has_position:
        location = self._make_location_curie(chrom_num, chrom_pos)
        if location not in self.id_location_map:
            self.id_location_map[location] = set()
    else:
        location = None

    alteration = re.search(r'-(.*)$', snp_id)
    if alteration is not None and re.match(r'[ATGC]', alteration.group(1)):
        # add variation to snp
        pass  # TODO

    if location is not None:
        self.id_location_map[location].add(snp_id)

    # create the chromosome
    chrom_id = makeChromID(
        chrom_num, self.localtt['reference assembly'], 'CHR')

    # add the feature to the graph;
    # the description is set only for a reported frequency
    snp_description = None
    if risk_allele_frequency is not None \
            and risk_allele_frequency != '' \
            and risk_allele_frequency != 'NR':
        snp_description = \
            str(risk_allele_frequency) + ' [risk allele frequency]'

    feat = Feature(
        graph, snp_id, snp_label.strip(), self.globaltt['SNP'],
        snp_description)
    if has_position:
        feat.addFeatureStartLocation(chrom_pos, chrom_id)
        feat.addFeatureEndLocation(chrom_pos, chrom_id)
    feat.addFeatureToGraph()
    # look the taxon up by its real label; the garbled key
    # 'H**o sapiens' would raise KeyError
    feat.addTaxonToFeature(self.globaltt['Homo sapiens'])
    # TODO consider adding allele frequency as property;
    # but would need background info to do that

    # also want to add other descriptive info about
    # the variant from the context
    for ctx in re.split(r';', context):
        ctx = ctx.strip()
        cid = self.resolve(ctx, False)
        # an unchanged value is skipped (assumed to mean "no mapping")
        if cid != ctx:
            model.addType(snp_id, cid)

    return
def _add_gene_anatomy_association(self, gene_id, anatomy_curie, rank):
    """
    Record that a gene is expressed in an anatomical entity, with its rank.

    The gene is typed as a gene, an 'expressed in' association between
    gene and anatomy term is added to the graph, and the rank is then
    attached to that association as an xsd:float 'has_quantifier' literal.

    :param gene_id: str Non curified ID (prefixed with "ENSEMBL:" here)
    :param anatomy_curie: str curified anatomy term
    :param rank: rank; may be a number or a string with thousands commas
    :return: None
    """
    association = Assoc(self.graph, self.name)
    model = Model(self.graph)

    gene_curie = "ENSEMBL:{}".format(gene_id)
    # rank may arrive as a float or as text such as "1,234.5";
    # go through str so the commas can be dropped either way
    rank_text = str(rank).replace(',', '')

    model.addType(gene_curie, self.globaltt['gene'])

    association.sub = gene_curie
    association.obj = anatomy_curie
    association.rel = self.globaltt['expressed in']
    association.add_association_to_graph()

    # the quantifier hangs off the association id, which only exists
    # once the association has been added to the graph
    association.add_predicate_object(
        self.globaltt['has_quantifier'], float(rank_text),
        'Literal', 'xsd:float')
def _process_pathway(self, row):
    """
    Turn one row of CTD_genes_pathways.tsv.gz into triples.

    The row's gene is typed as a gene, the pathway is added, and the
    gene is added to the pathway.  Rows for a handful of overly generic
    pathways are dropped, and KEGG pathway ids are rewritten
    (KEGG:12345 --> KEGG-path:map12345).

    Args:
        :param row (list): row of CTD data:
            gene symbol, gene id, pathway name, pathway id
    Returns:
        :return None
    """
    model = Model(self.graph)
    self._check_list_len(row, 4)
    _symbol, gene_id, pathway_name, pathway_id = row

    # in test mode only the genes on the test list are processed
    outside_test_set = \
        self.test_mode and (int(gene_id) not in self.test_geneids)
    if outside_test_set:
        return

    entrez_id = 'NCBIGene:' + gene_id

    # these are lame "pathways" like generic
    # "disease" and "developmental biology"
    too_generic = (
        'REACT:REACT_116125',  # disease
        "REACT:REACT_111045",  # developmental biology
        "REACT:REACT_200794",  # Mus musculus biological processes
        "REACT:REACT_13685",   # neuronal system ?
    )
    if pathway_id in too_generic:
        return

    # convert KEGG pathway ids... KEGG:12345 --> KEGG-path:map12345
    is_kegg = re.match(r'KEGG', pathway_id)
    if is_kegg:
        pathway_id = re.sub(r'KEGG:', 'KEGG-path:map', pathway_id)

    # just in case, add it as a class
    model.addType(entrez_id, self.globaltt['gene'])

    self.pathway.addPathway(pathway_id, pathway_name)
    self.pathway.addGeneToPathway(entrez_id, pathway_id)

    return
def _process_protein_links(
        self, dataframe, p2gene_map, taxon, limit=None, rank_min=700
):
    """
    Add gene-gene 'interacts with' triples for protein-protein links.

    Only rows whose 'combined_score' exceeds ``rank_min`` are used.
    Both proteins of a row are looked up in ``p2gene_map``; every
    combination of the mapped genes is typed as a gene and linked.
    Rows with a protein missing from the map are counted and skipped.

    :param dataframe: table with 'protein1', 'protein2' and
        'combined_score' columns
    :param p2gene_map: maps a protein id (without the taxon prefix)
        to an iterable of gene curies
    :param taxon: taxon number; '<taxon>.' is removed from protein ids
    :param limit: optional; stop after the first row whose index label
        is >= limit
    :param rank_min: exclusive lower bound on the combined score
    :return: None
    """
    model = Model(self.graph)
    high_scoring = dataframe[dataframe['combined_score'] > rank_min]
    unmapped_rows = 0

    for index, row in high_scoring.iterrows():
        # Check if proteins are in same species
        protein1 = row['protein1'].replace('{}.'.format(taxon), '')
        protein2 = row['protein2'].replace('{}.'.format(taxon), '')

        # Keep orientation the same since RO!"interacts with" is symmetric
        # TEC: symeteric expansion is the job of post processing not ingest
        if protein1 >= protein2:
            first, second = protein1, protein2
        else:
            first, second = protein2, protein1

        gene1_curies = None
        gene2_curies = None
        try:
            gene1_curies = p2gene_map[first]
            gene2_curies = p2gene_map[second]
        except KeyError:
            # at least one protein has no gene; the row yields no triples
            unmapped_rows += 1

        both_mapped = gene1_curies is not None and gene2_curies is not None
        if both_mapped:
            for gene1 in gene1_curies:
                for gene2 in gene2_curies:
                    model.addType(gene1, self.globaltt['gene'])
                    model.addType(gene2, self.globaltt['gene'])
                    self.graph.addTriple(
                        gene1, self.globaltt['interacts with'], gene2)

        if limit is not None and index >= limit:
            break

    LOG.info(
        "Finished parsing p-p interactions for %s, "
        "%i rows filtered out based on checking ensembl proteins",
        taxon, unmapped_rows)
def process_gaf(self, gaffile, limit, id_map=None):
    """
    Parse a gzipped, tab-separated GAF (gene association) file and add
    its annotations to the graph.

    For each non-comment, non-NOT row this:
      * works out the gene curie (UniProtKB ids are translated through
        ``id_map`` when one is given; rows without a mapping are skipped),
      * adds the gene's label, type, description, synonyms and taxon,
      * builds a gene-to-GO-term association with evidence and
        references and adds it to the graph when a predicate can be
        resolved from the aspect,
      * for 'IMP' evidence with a non-empty With/From column,
        additionally derives gene-to-phenotype associations.

    :param gaffile: path of the gzipped GAF file
    :param limit: stop once the reader's line number exceeds this value
        (ignored in test mode); None means no limit
    :param id_map: optional dict from UniProtKB accession to a
        '<prefix>:<id>' gene curie
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", gaffile)
    # how many UniProtKB rows could / could not be mapped through id_map
    uniprot_hit = 0
    uniprot_miss = 0

    # column names, in file order; used below to index into each row
    col = self.gaf_columns

    with gzip.open(gaffile, 'rb') as csvfile:
        reader = csv.reader(io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"')
        for row in reader:
            # comments start with exclamation
            # NOTE(review): an empty row or empty first field raises
            # IndexError here -- confirm inputs never contain blank lines
            if row[0][0] == '!':
                continue

            # a malformed row aborts the whole process
            if len(row) != len(col):
                LOG.error(
                    "Wrong number of columns %i, expected ... got:\n\t%s",
                    len(col), row)
                exit(1)

            dbase = row[col.index('DB')].strip()
            gene_num = row[col.index('DB_Object_ID')].strip()
            gene_symbol = row[col.index('DB_Object_Symbol')].strip()
            qualifier = row[col.index('Qualifier')]
            go_id = row[col.index('GO_ID')].strip()
            ref = row[col.index('DB:Reference')].strip()
            eco_symbol = row[col.index('Evidence Code')].strip()
            with_or_from = row[col.index('With (or) From')]
            aspect = row[col.index('Aspect')].strip()
            gene_name = row[col.index('DB_Object_Name')]
            gene_synonym = row[col.index('DB_Object_Synonym')]
            # object_type = row[col.index('DB_Object_Type')].strip()
            taxon = row[col.index('Taxon and Interacting taxon')].strip()
            # date = row[col.index('Date')].strip()
            # assigned_by = row[col.index('Assigned_By')].strip()
            # annotation_extension = row[col.index('Annotation_Extension')]
            # gene_product_form_id = row[col.index('Gene_Product_Form_ID')]

            # test for required fields
            # NOTE(review): the list holds row[:10] (itself a list) and
            # row[12], so '' can only ever match row[12]; the first ten
            # columns are not individually checked -- confirm intent
            if '' in [row[:10], row[12]]:
                LOG.error(
                    "Missing required part of annotation on row %i:\n%s",
                    reader.line_num, str(row[:-4]))
                continue

            # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            # normalise the database prefix through the local table
            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                # NOTE(review): with id_map None, gene_id stays None for
                # UniProtKB rows and is used below -- confirm callers
                # always pass id_map when UniProtKB rows are present
                if id_map is not None:
                    # try/except much faster than checking
                    # for dict key membership
                    try:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        # from here on dbase/gene_num describe the mapped gene
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    except KeyError:
                        # LOG.warning(
                        #   "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            # in test mode keep NCBIGene rows and rows on the test id list
            if self.test_mode and gene_id[:9] != 'NCBIGene:' and\
                    gene_num not in self.test_ids:
                continue

            model.addLabel(gene_id, gene_symbol)
            model.addType(gene_id, self.globaltt['gene'])

            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in re.split(r'\|', gene_synonym):
                    syn = syn.strip()
                    if syn[:10] == 'UniProtKB:':
                        # a UniProtKB "synonym" is linked as a gene product
                        model.addTriple(gene_id, self.globaltt['has gene product'], syn)
                    elif re.fullmatch(graph.curie_regexp, syn) is not None and\
                            syn.split(':')[0] not in self.wont_prefix:
                        # looks like a curie: warn, but still store it
                        # as a literal synonym
                        syn = syn.strip()
                        LOG.warning(
                            'possible curie "%s" as a literal synomym for %s',
                            syn, gene_id)
                        if syn != '':
                            model.addSynonym(gene_id, syn)
                    elif syn != '':
                        model.addSynonym(gene_id, syn)

            # First taxon is for the gene, after the pipe are interacting taxa
            tax_curie = taxon.split('|')[0].replace('taxon', 'NCBITaxon')
            # this is a required field but good to safe
            if tax_curie:
                geno.addTaxon(tax_curie, gene_id)

            # the gene -> GO term association
            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            # an unmapped evidence code is logged; the row is still processed
            try:
                eco_id = self.gaf_eco[eco_symbol]
                assoc.add_evidence(eco_id)
            except KeyError:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            refs = re.split(r'\|', ref)
            for ref in refs:
                ref = ref.strip()
                if ref != '':
                    # NOTE(review): [-2] raises IndexError for a
                    # reference without ':' -- confirm all refs are curies
                    prefix = ref.split(':')[-2]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref = ':'.join((prefix, ref.split(':')[-1]))
                    refg = Reference(graph, ref)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref)

            # TODO add the source of the annotations from assigned by?

            rel = self.resolve(aspect, mandatory=False)
            # rel equal to aspect is treated as "aspect was not translated"
            # (presumably resolve() echoes its input in that case)
            if rel is not None and aspect == rel:
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    # NOTE(review): the relationship is set, but this
                    # branch does not add the association to the graph
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            # NOTE(review): this runs after add_association_to_graph()
            # above, so the description may never reach the graph -- confirm
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used
            ########################################################################

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = re.split(r'[|,]', with_or_from)  # OR + AND
                # the phenotype id is the GO id with 'PHENOTYPE' appended
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for itm in withitems:
                    if itm == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                        LOG.warning("Skipping %s from or with %s", uniprotid, itm)
                        continue

                    # sanity check/conversion on go curie prefix
                    # NOTE(review): unpacking fails with ValueError when
                    # itm contains no ':' -- confirm items are curies
                    (pfx, lclid) = itm.split(':')[-2:]  # last prefix wins
                    if pfx in self.localtt:
                        pfx = self.localtt[pfx]
                    itm = ':'.join((pfx, lclid))

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', itm):
                        targeted_gene_id = self.zfin.make_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        # TODO PYLINT why is this needed?
                        # Redefinition of assoc type from
                        # dipper.models.assoc.Association.Assoc to
                        # dipper.models.assoc.G2PAssoc.G2PAssoc
                        assoc = G2PAssoc(graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', itm):
                        targeted_gene_id = self.wbase.make_reagent_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, itm, phenotypeid)
                    # the phenotype association reuses the row's references
                    for ref in refs:
                        ref = ref.strip()
                        if ref != '':
                            prefix = ref.split(':')[-2]
                            if prefix in self.localtt:
                                prefix = self.localtt[prefix]
                            ref = ':'.join((prefix, ref.split(':')[-1]))
                            assoc.add_source(ref)
                            # experimental phenotypic evidence
                            assoc.add_evidence(self.globaltt[
                                'experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be the evidence for the GO assoc?

            # the limit is compared with the reader's line number,
            # so comment and skipped lines count towards it
            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break

        # report how many UniProtKB rows were mappable through id_map
        uniprot_tot = (uniprot_hit + uniprot_miss)
        uniprot_per = 0.0
        if uniprot_tot != 0:
            uniprot_per = 100.0 * uniprot_hit / uniprot_tot
        LOG.info(
            "Uniprot: %.2f%% of %i benefited from the idmapping_selected download",
            uniprot_per, uniprot_tot)
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.

    An association is a subject/predicate/object statement plus a node
    (the association id) to which evidence, sources, provenance, dates,
    a description and a score can be attached.
    """

    assoc_types = {
        'association': 'OBAN:association'
    }

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
        'inchi_key': 'CHEBI:InChIKey',
        'probabalistic_quantifier': 'GENO:0000867'
    }

    object_properties = {
        'has disposition': 'RO:0000091',
        'has_phenotype': 'RO:0002200',
        'expressed_in': 'RO:0002206',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_predicate',
        'is_about': 'IAO:0000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance',
        'causes_or_contributes': 'RO:0003302'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004',
        'has_quantifier': 'GENO:0000866',
        'created_on': 'pav:createdOn'
    }

    # the union of the three property tables above
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
        """
        :param graph: the Graph the association is written to
        :param definedby: the (data) resource providing the annotation;
            it is part of the generated association id
        :param sub: subject identifier (optional, can be set later)
        :param obj: object identifier (optional, can be set later)
        :param pred: predicate identifier (optional, can be set later)
        :raises ValueError: if graph is not a Graph
        """
        if not isinstance(graph, Graph):
            # was '"{} is not a graph".graph', which raised
            # AttributeError instead of the intended ValueError
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph
        self.model = Model(self.graph)

        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []
        self.score = None
        self.score_type = None
        self.score_unit = None

        return

    def get_properties(self):
        """Return the combined property-name to curie table."""
        return self.properties

    def _is_valid(self):
        """
        Check that subject, object and relation are all set.

        :return: True
        :raises ValueError: naming the first missing part
        """
        # check if sub/obj/rel are none...throw error
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')

        return True

    def _add_basic_association_to_graph(self):
        """
        Write the statement, the association node and everything
        attached to it (description, evidence, sources, provenance,
        dates, score) to the graph.

        An association id is generated first if none has been set.

        :raises ValueError: if subject, object or relation is missing
        """
        if not self._is_valid():
            return

        # the plain statement itself
        self.graph.addTriple(self.sub, self.rel, self.obj)

        if self.assoc_id is None:
            self.set_association_id()

        # the association node that reifies the statement
        self.model.addType(self.assoc_id, self.assoc_types['association'])

        self.graph.addTriple(
            self.assoc_id, self.object_properties['has_subject'], self.sub)
        self.graph.addTriple(
            self.assoc_id, self.object_properties['has_object'], self.obj)
        self.graph.addTriple(
            self.assoc_id, self.object_properties['has_predicate'], self.rel)

        if self.description is not None:
            self.model.addDescription(self.assoc_id, self.description)

        if self.evidence is not None and len(self.evidence) > 0:
            for e in self.evidence:
                self.graph.addTriple(
                    self.assoc_id, self.object_properties['has_evidence'], e)

        if self.source is not None and len(self.source) > 0:
            for s in self.source:
                if re.match('http', s):
                    # TODO assume that the source is a publication?
                    # use Reference class here
                    # (a URL source is passed with the literal flag set)
                    self.graph.addTriple(
                        self.assoc_id, self.object_properties['has_source'],
                        s, True)
                else:
                    self.graph.addTriple(
                        self.assoc_id, self.object_properties['has_source'],
                        s)

        if self.provenance is not None and len(self.provenance) > 0:
            for p in self.provenance:
                self.graph.addTriple(
                    self.assoc_id, self.object_properties['has_provenance'],
                    p)

        if self.date is not None and len(self.date) > 0:
            for d in self.date:
                self.graph.addTriple(
                    object_is_literal=True,
                    subject_id=self.assoc_id,
                    predicate_id=self.datatype_properties['created_on'],
                    obj=d)

        if self.score is not None:
            self.graph.addTriple(
                self.assoc_id, self.properties['has_measurement'],
                self.score, True, 'xsd:float')
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

        return

    def add_association_to_graph(self):
        """Add this association (and all its attachments) to the graph."""
        self._add_basic_association_to_graph()

        return

    def add_predicate_object(self, predicate, object_node, object_type=None,
                             datatype=None):
        """
        Attach an extra predicate/object pair to the association node.

        Uses the current association id, which is only generated when
        the association is added to the graph (or set explicitly).

        :param predicate: predicate identifier
        :param object_node: object identifier or literal value
        :param object_type: 'Literal' to write object_node as a literal;
            anything else writes it as a node
        :param datatype: optional datatype for a literal
        """
        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(self.assoc_id, predicate,
                                     object_node, True, datatype)
            else:
                self.graph.addTriple(self.assoc_id, predicate,
                                     object_node, True)
        else:
            self.graph.addTriple(self.assoc_id, predicate,
                                 object_node, False)

        return

    # This isn't java, but if we must,
    # prefer use of property decorator
    def set_subject(self, identifier):
        """Set the subject of the association."""
        self.sub = identifier
        return

    def set_object(self, identifier):
        """Set the object of the association."""
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        """Set the predicate (relation) of the association."""
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        This will set the association ID based on the internal parts
        of the association.
        To be used in cases where an external association identifier
        should be used.

        :param assoc_id: external identifier; when None an id is
            generated from definedby, subject, relation and object
        :return:
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return

    def get_association_id(self):
        """Return the association id (None until set or generated)."""
        return self.assoc_id

    def set_description(self, description):
        """Set the free-text description of the association."""
        self.description = description

        return

    def set_score(self, score, unit=None, score_type=None):
        """Set the score of the association, with optional unit and type."""
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        None and blank identifiers are ignored.

        :param identifier:
        :return:
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        None and blank identifiers are ignored.
        TODO we need to greatly expand this function!

        :param identifier:
        :return:
        """
        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

        return

    def add_date(self, date):
        """Add a date to the association; None and blank are ignored."""
        if date is not None and date.strip() != '':
            self.date += [date]

        return

    def add_provenance(self, identifier):
        """Add a provenance id; None and blank identifiers are ignored."""
        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

        return

    @staticmethod
    def make_association_id(definedby, subject, predicate, object,
                            attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is None, it will convert it to blank.
        It effectively digests the string of concatenated values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appended to the ID.

        :param definedby: The (data) resource that provided the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes: optional list of further strings to hash
        :return: 'MONARCH:' followed by the first 16 hex digits of the
            sha1 of the '+'-joined parts
        """
        # note others available:
        #   md5(), sha1(), sha224(), sha256(), sha384(), and sha512()

        # putting definedby first,
        # as this will usually be the datasource providing the annotation
        # this will end up making the first few parts of the id
        # be the same for all annotations in that resource
        # (although the point of a digest is to render such details moot).

        items_to_hash = [definedby, subject, predicate, object]
        if attributes is not None:
            items_to_hash += attributes

        # a missing part contributes an empty string to the digest
        items_to_hash = ['' if val is None else val for val in items_to_hash]

        byte_string = '+'.join(items_to_hash).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(
            ('MONARCH', hashlib.sha1(byte_string).hexdigest()[0:16]))
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
    unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
    We may wish to subclass this later.

    """

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph the reference is written to
        :param ref_id: identifier of the reference; an id starting with
            'http' is also kept as the reference's URL
        :param ref_type: type of the reference; defaults to the global
            translation table's 'document'
        :raises ValueError: if graph is not a Graph
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            # format the message here: ValueError does not interpolate
            # logging-style '%s' arguments
            raise ValueError("{} is not a graph".format(graph))

        # assert ref_id is not None
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type
            # only IAO and SIO types are expected; anything else is
            # accepted but reported
            if ref_type[:4] not in ('IAO:', 'SIO:'):
                LOG.warning("Got Pub ref type of:  %s", ref_type)

        if ref_id is not None and ref_id[:4] == 'http':
            self.ref_url = ref_id

    def setTitle(self, title):
        """Set the title of the reference."""
        self.title = title

    def setYear(self, year):
        """Set the publication year of the reference."""
        self.year = year

    def setType(self, reference_type):
        """Set (override) the type of the reference."""
        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """
        :param author_list: Array of authors
        :return:
        """
        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append one author to the author list.

        The list starts out as None, so it is created on first use.

        :param author: the author to add
        """
        if self.author_list is None:
            self.author_list = []
        self.author_list.append(author)

    def setShortCitation(self, citation):
        """Set the short citation, used as the label when present."""
        self.short_citation = citation

    def addPage(self, subject_id, page_url, subject_category=None,
                page_category=None):
        """
        Link a subject to a web page.

        :param subject_id: the subject the page is about
        :param page_url: URL of the page (written as a node, not a literal)
        :param subject_category: optional category passed to the graph
        :param page_category: optional category passed to the graph
        """
        self.graph.addTriple(
            subject_id,
            self.globaltt['page'],  # foaf:page  not  <sio:web page>
            page_url,
            object_is_literal=False,  # URL is not a literal
            subject_category=subject_category,
            object_category=page_category)

    def addTitle(self, subject_id, title):
        """Add a title literal to the subject; None and '' are ignored."""
        if title is not None and title != '':
            self.graph.addTriple(
                subject_id, self.globaltt['title'], title,
                object_is_literal=True)

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation, falling back to the title.
        A reference with a URL is described on the URL; otherwise it is
        added as an individual under its id.  Without either an error
        is logged and nothing is written.
        """
        cite = self.short_citation
        if cite is None and self.title is not None:
            cite = self.title

        if self.ref_url is not None:
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if cite is not None:
                self.model.addLabel(self.ref_url, cite)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, cite, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            LOG.error("You are missing an identifier for a reference.")
class Genotype(): """ These methods provide convenient methods to add items related to a genotype and it's parts to a supplied graph. They follow the patterns set out in GENO https://github.com/monarch-initiative/GENO-ontology. For specific sequence features, we use the GenomicFeature class to create them. """ def __init__(self, graph): if isinstance(graph, Graph): self.graph = graph else: raise ValueError("{} is not a graph".format(graph)) self.model = Model(self.graph) self.globaltt = self.graph.globaltt self.globaltcid = self.graph.globaltcid self.curie_map = self.graph.curie_map return def addGenotype(self, genotype_id, genotype_label, genotype_type=None, genotype_description=None): """ If a genotype_type is not supplied, we will default to 'intrinsic_genotype' :param genotype_id: :param genotype_label: :param genotype_type: :param genotype_description: :return: """ if genotype_type is None: genotype_type = self.globaltt['intrinsic_genotype'] self.model.addIndividualToGraph(genotype_id, genotype_label, genotype_type, genotype_description) return def addAllele(self, allele_id, allele_label, allele_type=None, allele_description=None): """ Make an allele object. If no allele_type is added, it will default to a geno:allele :param allele_id: curie for allele (required) :param allele_label: label for allele (required) :param allele_type: id for an allele type (optional, recommended SO or GENO class) :param allele_description: a free-text description of the allele :return: """ # TODO should we accept a list of allele types? if allele_type is None: allele_type = self.globaltt['allele'] # TODO is this a good idea? 
self.model.addIndividualToGraph(allele_id, allele_label, allele_type, allele_description) return def addGene(self, gene_id, gene_label=None, gene_type=None, gene_description=None): ''' genes are classes ''' if gene_type is None: gene_type = self.globaltt['gene'] self.model.addClassToGraph(gene_id, gene_label, gene_type, gene_description) return def addConstruct(self, construct_id, construct_label, construct_type=None, construct_description=None): # TODO add base type for construct # if (constrcut_type is None): # constrcut_type=self.construct_base_type self.model.addIndividualToGraph(construct_id, construct_label, construct_type, construct_description) return def addDerivesFrom(self, child_id, parent_id): """ We add a derives_from relationship between the child and parent id. Examples of uses include between: an allele and a construct or strain here, a cell line and it's parent genotype. Adding the parent and child to the graph should happen outside of this function call to ensure graph integrity. :param child_id: :param parent_id: :return: """ self.graph.addTriple(child_id, self.globaltt['derives_from'], parent_id) return def addSequenceDerivesFrom(self, child_id, parent_id): self.graph.addTriple(child_id, self.globaltt['sequence_derives_from'], parent_id) return def addAlleleOfGene(self, allele_id, gene_id, rel_id=None): """ We make the assumption here that if the relationship is not provided, it is a GENO:is_allele_of. Here, the allele should be a variant_locus, not a sequence alteration. :param allele_id: :param gene_id: :param rel_id: :return: """ if rel_id is None: rel_id = self.globaltt["is_allele_of"] self.graph.addTriple(allele_id, rel_id, gene_id) return def addAffectedLocus(self, allele_id, gene_id, rel_id=None): """ We make the assumption here that if the relationship is not provided, it is a GENO:has_affected_feature. Here, the allele should be a variant_locus, not a sequence alteration. 
:param allele_id: :param gene_id: :param rel_id: :return: """ if rel_id is None: rel_id = self.globaltt['has_affected_feature'] self.graph.addTriple(allele_id, rel_id, gene_id) return def addGeneProduct(self, sequence_id, product_id, product_label=None, product_type=None): """ Add gene/variant/allele has_gene_product relationship Can be used to either describe a gene to transcript relationship or gene to protein :param sequence_id: :param product_id: :param product_label: :param product_type: :return: """ if product_label is not None and product_type is not None: self.model.addIndividualToGraph(product_id, product_label, product_type) self.graph.addTriple(sequence_id, self.globaltt['has gene product'], product_id) return def addPolypeptide(self, polypeptide_id, polypeptide_label=None, transcript_id=None, polypeptide_type=None): """ :param polypeptide_id: :param polypeptide_label: :param polypeptide_type: :param transcript_id: :return: """ if polypeptide_type is None: polypeptide_type = self.globaltt['polypeptide'] self.model.addIndividualToGraph(polypeptide_id, polypeptide_label, polypeptide_type) if transcript_id is not None: self.graph.addTriple(transcript_id, self.globaltt['translates_to'], polypeptide_id) return def addPartsToVSLC(self, vslc_id, allele1_id, allele2_id, zygosity_id=None, allele1_rel=None, allele2_rel=None): """ Here we add the parts to the VSLC. While traditionally alleles (reference or variant loci) are traditionally added, you can add any node (such as sequence_alterations for unlocated variations) to a vslc if they are known to be paired. However, if a sequence_alteration's loci is unknown, it probably should be added directly to the GVC. 
:param vslc_id: :param allele1_id: :param allele2_id: :param zygosity_id: :param allele1_rel: :param allele2_rel: :return: """ # vslc has parts allele1/allele2 if allele1_id is not None: self.addParts(allele1_id, vslc_id, allele1_rel) if allele2_id is not None and allele2_id.strip() != '': self.addParts(allele2_id, vslc_id, allele2_rel) # figure out zygosity if it's not supplied if zygosity_id is None: if allele1_id == allele2_id: zygosity_id = self.globaltt['homozygous'] else: zygosity_id = self.globaltt['heterozygous'] if zygosity_id is not None: self.graph.addTriple(vslc_id, self.globaltt['has_zygosity'], zygosity_id) return def addVSLCtoParent(self, vslc_id, parent_id): """ The VSLC can either be added to a genotype or to a GVC. The vslc is added as a part of the parent. :param vslc_id: :param parent_id: :return: """ self.addParts(vslc_id, parent_id, self.globaltt['has_variant_part']) return def addParts(self, part_id, parent_id, part_relationship=None): """ This will add a has_part (or subproperty) relationship between a parent_id and the supplied part. By default the relationship will be BFO:has_part, but any relationship could be given here. 
:param part_id: :param parent_id: :param part_relationship: :return: """ if part_relationship is None: part_relationship = self.globaltt['has_part'] # Fail loudly if parent or child identifiers are None if parent_id is None: raise TypeError('Attempt to pass None as parent') elif part_id is None: raise TypeError('Attempt to pass None as child') elif part_relationship is None: part_relationship = self.globaltt['has_part'] self.graph.addTriple(parent_id, part_relationship, part_id) return def addSequenceAlteration(self, sa_id, sa_label, sa_type=None, sa_description=None): if sa_type is None: sa_type = self.globaltt['sequence_alteration'] self.model.addIndividualToGraph(sa_id, sa_label, sa_type, sa_description) return def addSequenceAlterationToVariantLocus(self, sa_id, vl_id): self.addParts(sa_id, vl_id, self.globaltt['has_variant_part']) return def addGenomicBackground(self, background_id, background_label, background_type=None, background_description=None): if background_type is None: background_type = self.globaltt['genomic_background'] self.model.addIndividualToGraph(background_id, background_label, background_type, background_description) return def addGenomicBackgroundToGenotype(self, background_id, genotype_id, background_type=None): if background_type is None: background_type = self.globaltt['genomic_background'] self.model.addType(background_id, background_type) self.addParts(background_id, genotype_id, self.globaltt['has_reference_part']) return def addTaxon(self, taxon_id, genopart_id): """ The supplied geno part will have the specified taxon added with RO:in_taxon relation. Generally the taxon is associated with a genomic_background, but could be added to any genotype part (including a gene, regulatory element, or sequence alteration). 
:param taxon_id: :param genopart_id: :return: """ self.graph.addTriple(genopart_id, self.globaltt['in taxon'], taxon_id) return def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id): # for example, add a morphant reagent thingy to the genotype, # assuming it's a extrinsic_genotype self.graph.addTriple(genotype_id, self.globaltt['has_variant_part'], reagent_id) return def addGeneTargetingReagent(self, reagent_id, reagent_label, reagent_type, gene_id, description=None): """ Here, a gene-targeting reagent is added. The actual targets of this reagent should be added separately. :param reagent_id: :param reagent_label: :param reagent_type: :return: """ # TODO add default type to reagent_type self.model.addIndividualToGraph(reagent_id, reagent_label, reagent_type, description) self.graph.addTriple(reagent_id, self.globaltt['targets_gene'], gene_id) return def addReagentTargetedGene(self, reagent_id, gene_id, targeted_gene_id=None, targeted_gene_label=None, description=None): """ This will create the instance of a gene that is targeted by a molecular reagent (such as a morpholino or rnai). If an instance id is not supplied, we will create it as an anonymous individual which is of the type GENO:reagent_targeted_gene. We will also add the targets relationship between the reagent and gene class. 
<targeted_gene_id> a GENO:reagent_targeted_gene rdf:label targeted_gene_label dc:description description <reagent_id> GENO:targets_gene <gene_id> :param reagent_id: :param gene_id: :param targeted_gene_id: :return: """ # akin to a variant locus if targeted_gene_id is None: targeted_gene_id = '_' + gene_id + '-' + reagent_id targeted_gene_id = targeted_gene_id.replace(":", "") self.model.addIndividualToGraph(targeted_gene_id, targeted_gene_label, self.globaltt['reagent_targeted_gene'], description) if gene_id is not None: self.graph.addTriple(targeted_gene_id, self.globaltt['is_expression_variant_of'], gene_id) self.graph.addTriple(targeted_gene_id, self.globaltt['is_targeted_by'], reagent_id) return def addTargetedGeneSubregion(self, tgs_id, tgs_label, tgs_type=None, tgs_description=None): if tgs_type is None: tgs_type = self.globaltt['targeted_gene_subregion'] self.model.addIndividualToGraph(tgs_id, tgs_label, tgs_type, tgs_description) def addMemberOfPopulation(self, member_id, population_id): self.graph.addTriple(population_id, self.globaltt['has_member_with_allelotype'], member_id) return def addTargetedGeneComplement(self, tgc_id, tgc_label, tgc_type=None, tgc_description=None): if tgc_type is None: tgc_type = self.globaltt['targeted_gene_complement'] self.model.addIndividualToGraph(tgc_id, tgc_label, tgc_type, tgc_description) return def addGenome(self, taxon_num, taxon_label=None, genome_id=None): ncbitaxon = 'NCBITaxon:' + taxon_num if taxon_label is None: if ncbitaxon in self.globaltcid: taxon_label = self.globaltcid[ncbitaxon] else: logging.warning('Add ' + ncbitaxon + ' to global translation table') taxon_label = taxon_num elif ncbitaxon in self.globaltcid and taxon_label != self.globaltcid[ ncbitaxon]: logging.warning('"' + self.globaltcid[ncbitaxon] + '" may need updating from "' + taxon_label + '" in global translation table') logging.warning( '"' + taxon_label + '": " ' + self.globaltcid[ncbitaxon] + '"' + ' may need to be added to a local 
translation table') genome_label = taxon_label + ' genome' if genome_id is None: genome_id = self.makeGenomeID(taxon_num) self.model.addClassToGraph(genome_id, genome_label, self.globaltt['genome']) return def addReferenceGenome(self, build_id, build_label, taxon_id): genome_id = self.makeGenomeID(taxon_id) self.model.addIndividualToGraph(build_id, build_label, self.globaltt['reference_genome']) self.model.addType(build_id, genome_id) if re.match(r'[0-9]+', taxon_id): taxon_id = 'NCBITaxon:' + taxon_id self.addTaxon(taxon_id, build_id) return @staticmethod def makeGenomeID(taxon_id): # scrub off the taxon prefix. put it in base space # TODO: revisit as yet another BNODE? # should never be called if a real genome iri exists # should create the opaque bode and label together # genome_id = re.sub(r'.*\:', '_:', taxon_id) + 'genome' genome_id = '_:' + taxon_id + 'genome' return genome_id def addChromosome(self, chrom, tax_id, tax_label=None, build_id=None, build_label=None): """ if it's just the chromosome, add it as an instance of a SO:chromosome, and add it to the genome. If a build is included, punn the chromosome as a subclass of SO:chromsome, and make the build-specific chromosome an instance of the supplied chr. The chr then becomes part of the build or genome. 
""" family = Family(self.graph) # first, make the chromosome class, at the taxon level chr_id = makeChromID(str(chrom), tax_id) if tax_label is not None: chr_label = makeChromLabel(chrom, tax_label) else: chr_label = makeChromLabel(chrom) genome_id = self.makeGenomeID(tax_id) self.model.addClassToGraph(chr_id, chr_label, self.globaltt['chromosome']) self.addTaxon(tax_id, genome_id) # add the taxon to the genome if build_id is not None: # the build-specific chromosome chrinbuild_id = makeChromID(chrom, build_id) if build_label is None: build_label = build_id chrinbuild_label = makeChromLabel(chrom, build_label) # add the build-specific chromosome as an instance of the chr class self.model.addIndividualToGraph(chrinbuild_id, chrinbuild_label, chr_id) # add the build-specific chromosome # as a member of the build (both ways) family.addMember(build_id, chrinbuild_id) family.addMemberOf(chrinbuild_id, build_id) return def addChromosomeClass(self, chrom_num, taxon_id, taxon_label): taxon = re.sub('NCBITaxon:', '', taxon_id) # the chrom class (generic) id chrom_class_id = makeChromID(chrom_num, taxon, 'CHR') chrom_class_label = makeChromLabel(chrom_num, taxon_label) self.model.addClassToGraph(chrom_class_id, chrom_class_label, self.globaltt['chromosome']) return def addChromosomeInstance(self, chr_num, reference_id, reference_label, chr_type=None): """ Add the supplied chromosome as an instance within the given reference :param chr_num: :param reference_id: for example, a build id like UCSC:hg19 :param reference_label: :param chr_type: this is the class that this is an instance of. 
typically a genome-specific chr :return: """ family = Family(self.graph) chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH') chr_label = makeChromLabel(str(chr_num), reference_label) self.model.addIndividualToGraph(chr_id, chr_label, self.globaltt['chromosome']) if chr_type is not None: self.model.addType(chr_id, chr_type) # add the build-specific chromosome # as a member of the build (both ways) family.addMember(reference_id, chr_id) family.addMemberOf(chr_id, reference_id) # usage dependent, todo: ommit return @staticmethod def make_variant_locus_label(gene_label, allele_label): if gene_label is None: gene_label = '' label = gene_label.strip() + '<' + allele_label.strip() + '>' return label def make_vslc_label(self, gene_label, allele1_label, allele2_label): """ Make a Variant Single Locus Complement (VSLC) in monarch-style. :param gene_label: :param allele1_label: :param allele2_label: :return: """ vslc_label = '' if gene_label is None and allele1_label is None and allele2_label is None: LOG.error("Not enough info to make vslc label") return None top = self.make_variant_locus_label(gene_label, allele1_label) bottom = '' if allele2_label is not None: bottom = self.make_variant_locus_label(gene_label, allele2_label) vslc_label = '/'.join((top, bottom)) return vslc_label def make_experimental_model_with_genotype(self, genotype_id, genotype_label, taxon_id, taxon_label): animal_id = '-'.join((taxon_id, 'with', genotype_id)) animal_id = re.sub(r':', '', animal_id) animal_id = '_:' + animal_id animal_label = ' '.join((genotype_label, taxon_label)) self.model.addIndividualToGraph(animal_id, animal_label, taxon_id) self.graph.addTriple(animal_id, self.globaltt['has_genotype'], genotype_id) return animal_id
class Dataset:
    """
    Produce the metadata about a dataset, following the example laid out here:
    http://htmlpreview.github.io/?
    https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
    (mind the wrap)

    On construction the dataset is typed as a dctypes:Dataset and its title,
    identifier, landing page and (when supplied) license, rights and
    description are written to the graph.
    """

    def __init__(self, identifier, title, url, description=None,
                 license_url=None, data_rights=None, graph_type=None,
                 file_handle=None):
        """
        :param identifier: local name of the dataset; ':' is prepended
            to form the dataset node id
        :param title: value for dct:title
        :param url: value for foaf:page
        :param description: optional description of the dataset
        :param license_url: optional value for dct:license
        :param data_rights: optional literal value for dct:rights
        :param graph_type: None, 'streamed_graph' or 'rdf_graph'
        :param file_handle: passed to StreamedGraph for 'streamed_graph'
        :raises ValueError: if graph_type is not one of the known values
        """
        if graph_type is None:
            self.graph = RDFGraph(None, identifier)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph()
        else:
            # without this, self.graph stays unset and the failure shows up
            # later as an unrelated AttributeError
            raise ValueError(
                "Unknown graph_type: {!r}".format(graph_type))

        self.model = Model(self.graph)
        self.identifier = ':' + identifier
        self.version = None
        self.date_issued = None

        # The date_accessed value is later used as a literal of properties
        # such as dct:issued, which needs to conform to xsd:dateTime format.
        # TODO ... we need to have a talk about typed literals and SPARQL
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license = license_url
        self.model.addType(self.identifier, 'dctypes:Dataset')
        self.graph.addTriple(self.identifier, 'dct:title', title, True)
        self.graph.addTriple(
            self.identifier, 'dct:identifier', identifier,
            object_is_literal=True)
        self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(
                self.identifier, 'dct:license', license_url)
        else:
            logger.debug('No license provided.')

        if data_rights is not None:
            self.graph.addTriple(
                self.identifier, 'dct:rights', data_rights,
                object_is_literal=True)
        else:
            logger.debug('No rights provided.')

        if description is not None:
            self.model.addDescription(self.identifier, description)

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
            should use the other set_* for version and date

        as of 2016-10-20  used in:
        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py             99:
        dipper/sources/BioGrid.py        100:
        dipper/sources/MGI.py            255:
        dipper/sources/EOM.py             93:
        dipper/sources/Coriell.py        200:
        dipper/sources/MMRRC.py           77:

        # TODO set as deprecated

        If a date is given it is recorded as the issue date.  The version
        is taken from version_id when given, otherwise from the date.
        When neither is given an error is logged and nothing is written.

        :param date_issued:
        :param version_id:
        :return:
        """
        if date_issued is None and version_id is None:
            logger.error("date or version not set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        # set the version exactly once; an explicit version id wins
        # over the date
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)

        logger.info("set version to %s", self.version)

    def set_date_issued(self, date_issued):
        """
        Record the issue date and add it to the graph as dct:issued.

        :param date_issued:
        :return:
        """
        self.date_issued = date_issued
        self.graph.addTriple(
            self.identifier, 'dct:issued', date_issued,
            object_is_literal=True)
        logger.info("setting date to %s", date_issued)

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)

        :param date_issued:
        :return:
        """
        if date_issued is not None:
            version_date = date_issued
        elif self.date_issued is not None:
            version_date = self.date_issued
        else:
            version_date = self.date_accessed
            logger.info(
                "No date supplied for setting version; "
                "using download timestamp for date_issued")

        logger.info("setting version by date")
        self.set_version_by_num(version_date)

    def set_version_by_num(self, version_num):
        """
        Set the version node to <identifier><version_num> and describe it
        with dct:isVersionOf and pav:version.  Unless the version is the
        download timestamp itself, a second node named after that timestamp
        is added as a version of the version node.

        :param version_num: version of the resource; non-string values
            are stringified to build the version node id
        :return:
        """
        # str() so that a date/number version does not raise TypeError
        self.version = self.identifier + str(version_num)
        self.graph.addTriple(self.version, 'dct:isVersionOf', self.identifier)
        self.graph.addTriple(self.version, 'pav:version', version_num,
                             object_is_literal=True)

        logger.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(
                dipperized_version, 'dct:isVersionOf', self.version)
            self.graph.addTriple(
                dipperized_version, 'pav:version', self.date_accessed,
                object_is_literal=True)
            self.graph.addTriple(
                dipperized_version, 'dct:issued', self.date_accessed,
                object_is_literal=True, literal_type="xsd:dateTime")

    def setFileAccessUrl(self, url, is_object_literal=False):
        """
        Add the location of the data as dcat:accessURL.

        :param url:
        :param is_object_literal: whether url is written as a literal
        :return:
        """
        self.graph.addTriple(self.identifier, 'dcat:accessURL',
                             url, is_object_literal)

    def getGraph(self):
        """
        :return: the graph holding the dataset description
        """
        return self.graph

    def set_license(self, license):
        """
        Store the license on the object; nothing is written to the graph.

        :param license:
        :return:
        """
        self.license = license

    def get_license(self):
        """
        :return: the stored license
        """
        return self.license

    def set_citation(self, citation_id):
        """
        Remember a citation id; nothing is written to the graph yet.

        :param citation_id:
        :return:
        """
        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)
class Genotype():
    """
    These methods provide convenient methods
    to add items related to a genotype and its parts to a supplied graph.
    They follow the patterns set out in
    GENO https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features,
    we use the GenomicFeature class to create them.

    """

    # special genotype parts mapped to their
    # GENO and SO classes that we explicitly reference here
    genoparts = {
        'intrinsic_genotype': 'GENO:0000000',
        'extrinsic_genotype': 'GENO:0000524',
        'effective_genotype': 'GENO:0000525',
        'sex_qualified_genotype': 'GENO:0000645',
        'male_genotype': 'GENO:0000646',
        'female_genotype': 'GENO:0000647',
        'genomic_background': 'GENO:0000611',
        'unspecified_genomic_background': 'GENO:0000649',
        'genomic_variation_complement': 'GENO:0000009',
        'karyotype_variation_complement': 'GENO:0000644',
        'variant_single_locus_complement': 'GENO:0000030',
        'variant_locus': 'GENO:0000002',
        'reference_locus': 'GENO:0000036',
        'allele': 'GENO:0000512',
        'gene': 'SO:0000704',
        'QTL': 'SO:0000771',
        'transgene': 'SO:0000902',  # not really used any more
        'transgenic_insertion': 'SO:0001218',
        'pseudogene': 'SO:0000336',
        'cytogenetic marker': 'SO:0000341',
        'sequence_feature': 'SO:0000110',
        'sequence_alteration': 'SO:0001059',
        'insertion': 'SO:0000667',
        'deletion': 'SO:0000159',
        'substitution': 'SO:1000002',
        'duplication': 'SO:1000035',
        'translocation': 'SO:0000199',
        'inversion': 'SO:1000036',
        'tandem_duplication': 'SO:1000173',
        'point_mutation': 'SO:1000008',
        'population': 'PCO:0000001',  # population
        'family': 'PCO:0000020',  # family
        'wildtype': 'GENO:0000511',
        'reagent_targeted_gene': 'GENO:0000504',
        'targeted_gene_subregion': 'GENO:0000534',
        'targeted_gene_complement': 'GENO:0000527',
        'biological_region': 'SO:0001411',
        'missense_variant': 'SO:0001583',
        'transcript': 'SO:0000233',
        'polypeptide': 'SO:0000104',
        'cDNA': 'SO:0000756',
        'sequence_variant_causing_loss_of_function_of_polypeptide':
            'SO:1000118',
        'sequence_variant_causing_gain_of_function_of_polypeptide':
            'SO:1000125',
        'sequence_variant_causing_inactive_catalytic_site': 'SO:1000120',
        'sequence_variant_affecting_polypeptide_function': 'SO:1000117',
        'regulatory_transgene_feature': 'GENO:0000637',
        'coding_transgene_feature': 'GENO:0000638',
        'protein_coding_gene': 'SO:0001217',
        'ncRNA_gene': 'SO:0001263',
        'RNAi_reagent': 'SO:0000337',
        'heritable_phenotypic_marker': 'SO:0001500'
    }

    object_properties = {
        'is_mutant_of': 'GENO:0000440',
        'derives_from': 'RO:0001000',
        'has_alternate_part': 'GENO:0000382',
        'has_reference_part': 'GENO:0000385',
        'has_sex_agnostic_genotype_part': 'GENO:0000650',
        'in_taxon': 'RO:0002162',
        'has_zygosity': 'GENO:0000608',
        # is_seq_var_inst_of links a alternate locus (instance)
        # to a gene (class)
        'is_sequence_variant_instance_of': 'GENO:0000408',
        'targets_instance_of': 'GENO:0000414',
        'is_reference_instance_of': 'GENO:0000610',
        'has_part': 'BFO:0000051',
        # use has_member_with_allelotype when relating populations
        'has_member_with_allelotype': 'GENO:0000225',
        'is_allelotype_of': 'GENO:0000206',
        'has_genotype': 'GENO:0000222',
        'has_phenotype': 'RO:0002200',
        'has_gene_product': 'RO:0002205',
        'translates_to': 'RO:0002513',
        'is_targeted_expression_variant_of': 'GENO:0000443',
        'is_transgene_variant_of': 'GENO:0000444',
        'has_variant_part': 'GENO:0000382',
        # targeted_by isa between a (reagent-targeted gene) and a morpholino
        'targeted_by': 'GENO:0000634',
        # FIXME should derives_sequence_from_gene just be subsequence of?
        'derives_sequence_from_gene': 'GENO:0000639',
        'has_affected_locus': 'GENO:0000418'
    }

    annotation_properties = {
        # TODO change properties with
        # https://github.com/monarch-initiative/GENO-ontology/issues/21
        # FIXME
        # reference_nucleotide, reference_amino_acid, altered_nucleotide
        # results_in_amino_acid_change are FIXME Made up terms
        'reference_nucleotide': 'GENO:reference_nucleotide',
        'reference_amino_acid': 'GENO:reference_amino_acid',
        'altered_nucleotide': 'GENO:altered_nucleotide',
        'results_in_amino_acid_change': 'GENO:results_in_amino_acid_change'
    }

    zygosity = {
        'homoplasmic': 'GENO:0000602',
        'heterozygous': 'GENO:0000135',
        'indeterminate': 'GENO:0000137',
        'heteroplasmic': 'GENO:0000603',
        'hemizygous-y': 'GENO:0000604',
        'hemizygous-x': 'GENO:0000605',
        'homozygous': 'GENO:0000136',
        'hemizygous': 'GENO:0000606',
        'complex_heterozygous': 'GENO:0000402',
        'simple_heterozygous': 'GENO:0000458'
    }

    # object and annotation properties merged into one lookup table
    properties = object_properties.copy()
    properties.update(annotation_properties)

    def __init__(self, graph):
        """
        :param graph: the Graph instance that all triples are written to
        :raises ValueError: if graph is not a Graph
        """
        if not isinstance(graph, Graph):
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph
        self.model = Model(self.graph)

    def addGenotype(
            self, genotype_id, genotype_label, genotype_type=None,
            genotype_description=None):
        """
        If a genotype_type is not supplied,
        we will default to 'intrinsic_genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:

        """
        if genotype_type is None:
            genotype_type = self.genoparts['intrinsic_genotype']

        self.model.addIndividualToGraph(
            genotype_id, genotype_label, genotype_type, genotype_description)

    def addAllele(
            self, allele_id, allele_label, allele_type=None,
            allele_description=None):
        """
        Make an allele object.
        If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type (optional,
        recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:

        """
        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.genoparts['allele']  # TODO is this a good idea?
        self.model.addIndividualToGraph(
            allele_id, allele_label, allele_type, allele_description)

    def addGene(
            self, gene_id, gene_label, gene_type=None,
            gene_description=None):
        """
        Add a gene as a class; the type defaults to SO gene.

        :param gene_id:
        :param gene_label:
        :param gene_type:
        :param gene_description:
        :return:
        """
        if gene_type is None:
            gene_type = self.genoparts['gene']
        # genes are classes
        self.model.addClassToGraph(
            gene_id, gene_label, gene_type, gene_description)

    def addConstruct(self, construct_id, construct_label,
                     construct_type=None, construct_description=None):
        """
        Add a construct as an individual.  No default type is applied.

        :param construct_id:
        :param construct_label:
        :param construct_type:
        :param construct_description:
        :return:
        """
        # TODO add base type for construct
        # if (constrcut_type is None):
        #    constrcut_type=self.construct_base_type
        self.model.addIndividualToGraph(construct_id, construct_label,
                                        construct_type, construct_description)

    def addDerivesFrom(self, child_id, parent_id):
        """
        We add a derives_from relationship between the child and parent id.
        Examples of uses include between:
        an allele and a construct or strain here,
        a cell line and it's parent genotype.  Adding the parent and child to
        the graph should happen outside of this function call to ensure graph
        integrity.
        :param child_id:
        :param parent_id:
        :return:

        """
        self.graph.addTriple(
            child_id, self.properties['derives_from'], parent_id)

    def addSequenceDerivesFrom(self, child_id, parent_id):
        """
        Add a derives_sequence_from_gene relationship
        from the child to the parent.

        :param child_id:
        :param parent_id:
        :return:
        """
        self.graph.addTriple(
            child_id, self.properties['derives_sequence_from_gene'], parent_id)

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:is_sequence_variant_instance_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.properties['is_sequence_variant_instance_of']
        self.graph.addTriple(allele_id, rel_id, gene_id)

    def addAffectedLocus(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:has_affected_locus.

        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.properties['has_affected_locus']
        self.graph.addTriple(allele_id, rel_id, gene_id)

    def addGeneProduct(
            self, sequence_id, product_id,
            product_label=None, product_type=None):
        """
        Add gene/variant/allele has_gene_product relationship
        Can be used to either describe a gene to transcript relationship
        or gene to protein
        :param sequence_id:
        :param product_id:
        :param product_label:
        :param product_type:
        :return:

        """
        # the product itself is only declared when both label and type
        # are known; the relationship is always added
        if product_label is not None and product_type is not None:
            self.model.addIndividualToGraph(
                product_id, product_label, product_type)
        self.graph.addTriple(
            sequence_id, self.properties['has_gene_product'], product_id)

    def addPolypeptide(
            self, polypeptide_id, polypeptide_label=None,
            transcript_id=None, polypeptide_type=None, ):
        """
        Add a polypeptide individual and, when a transcript is given,
        a translates_to relationship from the transcript to it.

        :param polypeptide_id:
        :param polypeptide_label:
        :param polypeptide_type:
        :param transcript_id:
        :return:

        """
        if polypeptide_type is None:
            polypeptide_type = self.genoparts['polypeptide']
        self.model.addIndividualToGraph(
            polypeptide_id, polypeptide_label, polypeptide_type)
        if transcript_id is not None:
            self.graph.addTriple(
                transcript_id, self.properties['translates_to'],
                polypeptide_id)

    def addPartsToVSLC(
            self, vslc_id, allele1_id, allele2_id, zygosity_id=None,
            allele1_rel=None, allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While traditionally alleles
        (reference or variant loci) are traditionally added, you can add any
        node (such as sequence_alterations for unlocated variations) to a vslc
        if they are known to be paired.  However, if a sequence_alteration's
        loci is unknown, it probably should be added directly to the GVC.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id:
        :param allele1_rel:
        :param allele2_rel:
        :return:

        """
        # vslc has parts allele1/allele2
        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.zygosity['homozygous']
            else:
                zygosity_id = self.zygosity['heterozygous']

        # zygosity_id is always set by now
        self.graph.addTriple(
            vslc_id, self.properties['has_zygosity'], zygosity_id)

    def addVSLCtoParent(self, vslc_id, parent_id):
        """
        The VSLC can either be added to a genotype or to a GVC.
        The vslc is added as a part of the parent.
        :param vslc_id:
        :param parent_id:
        :return:
        """
        self.addParts(
            vslc_id, parent_id, self.properties['has_alternate_part'])

    def addParts(self, part_id, parent_id, part_relationship=None):
        """
        This will add a has_part (or subproperty) relationship between
        a parent_id and the supplied part.
        By default the relationship will be BFO:has_part,
        but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :return:

        """
        if part_relationship is None:
            part_relationship = self.properties['has_part']

        self.graph.addTriple(parent_id, part_relationship, part_id)

    def addSequenceAlteration(
            self, sa_id, sa_label, sa_type=None, sa_description=None):
        """
        Add a sequence alteration individual;
        the type defaults to SO sequence_alteration.

        :param sa_id:
        :param sa_label:
        :param sa_type:
        :param sa_description:
        :return:
        """
        if sa_type is None:
            sa_type = self.genoparts['sequence_alteration']
        self.model.addIndividualToGraph(
            sa_id, sa_label, sa_type, sa_description)

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        """
        Add the sequence alteration as an alternate part of the variant locus.

        :param sa_id:
        :param vl_id:
        :return:
        """
        self.addParts(sa_id, vl_id, self.properties['has_alternate_part'])

    def addGenomicBackground(
            self, background_id, background_label,
            background_type=None, background_description=None):
        """
        Add a genomic background individual;
        the type defaults to GENO genomic_background.

        :param background_id:
        :param background_label:
        :param background_type:
        :param background_description:
        :return:
        """
        if background_type is None:
            background_type = self.genoparts['genomic_background']
        self.model.addIndividualToGraph(
            background_id, background_label, background_type,
            background_description)

    def addGenomicBackgroundToGenotype(
            self, background_id, genotype_id, background_type=None):
        """
        Type the background and add it as a reference part of the genotype.

        :param background_id:
        :param genotype_id:
        :param background_type: defaults to GENO genomic_background
        :return:
        """
        if background_type is None:
            background_type = self.genoparts['genomic_background']
        self.model.addType(background_id, background_type)
        self.addParts(background_id, genotype_id,
                      self.object_properties['has_reference_part'])

    def addTaxon(self, taxon_id, genopart_id):
        """
        The supplied geno part will have the specified taxon added with
        RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background,
        but could be added to any genotype part (including a gene,
        regulatory element, or sequence alteration).
        :param taxon_id:
        :param genopart_id:

        :return:

        """
        self.graph.addTriple(
            genopart_id, self.properties['in_taxon'], taxon_id)

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        """
        Add the reagent as a variant part of the genotype.

        :param reagent_id:
        :param genotype_id:
        :return:
        """
        # for example, add a morphant reagent thingy to the genotype,
        # assuming it's a extrinsic_genotype
        self.graph.addTriple(
            genotype_id, self.properties['has_variant_part'], reagent_id)

    def addGeneTargetingReagent(
            self, reagent_id, reagent_label, reagent_type, gene_id,
            description=None):
        """
        Here, a gene-targeting reagent is added.
        The actual targets of this reagent should be added separately.
        :param reagent_id:
        :param reagent_label:
        :param reagent_type:

        :return:

        """
        # TODO add default type to reagent_type
        self.model.addIndividualToGraph(
            reagent_id, reagent_label, reagent_type, description)

        self.graph.addTriple(
            reagent_id, self.object_properties['targets_instance_of'], gene_id)

    def addReagentTargetedGene(
            self, reagent_id, gene_id, targeted_gene_id=None,
            targeted_gene_label=None, description=None):
        """
        This will create the instance of a gene that is targeted by a molecular
        reagent (such as a morpholino or rnai).
        If an instance id is not supplied,
        we will create it as an anonymous individual which is of the type
        GENO:reagent_targeted_gene.
        We will also add the targets relationship between the reagent and
        gene class.

        <targeted_gene_id> a GENO:reagent_targeted_gene
        rdf:label targeted_gene_label
        dc:description description
        <reagent_id> GENO:targets_instance_of <gene_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id:
        :return:

        """
        # akin to a variant locus
        if targeted_gene_id is None:
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
            targeted_gene_id = targeted_gene_id.replace(":", "")
        self.model.addIndividualToGraph(
            targeted_gene_id, targeted_gene_label,
            self.genoparts['reagent_targeted_gene'], description)

        if gene_id is not None:
            self.graph.addTriple(
                targeted_gene_id,
                self.object_properties['is_targeted_expression_variant_of'],
                gene_id)

        self.graph.addTriple(
            targeted_gene_id, self.properties['targeted_by'], reagent_id)

    def addTargetedGeneSubregion(
            self, tgs_id, tgs_label, tgs_type=None, tgs_description=None):
        """
        Add a targeted gene subregion individual;
        the type defaults to GENO targeted_gene_subregion.

        :param tgs_id:
        :param tgs_label:
        :param tgs_type:
        :param tgs_description:
        :return:
        """
        if tgs_type is None:
            tgs_type = self.genoparts['targeted_gene_subregion']
        self.model.addIndividualToGraph(
            tgs_id, tgs_label, tgs_type, tgs_description)

    def addMemberOfPopulation(self, member_id, population_id):
        """
        Relate the population to the member with has_member_with_allelotype.

        :param member_id:
        :param population_id:
        :return:
        """
        self.graph.addTriple(
            population_id,
            self.properties['has_member_with_allelotype'],
            member_id)

    def addTargetedGeneComplement(
            self, tgc_id, tgc_label, tgc_type=None, tgc_description=None):
        """
        Add a targeted gene complement individual;
        the type defaults to GENO targeted_gene_complement.

        :param tgc_id:
        :param tgc_label:
        :param tgc_type:
        :param tgc_description:
        :return:
        """
        if tgc_type is None:
            tgc_type = self.genoparts['targeted_gene_complement']
        self.model.addIndividualToGraph(
            tgc_id, tgc_label, tgc_type, tgc_description)

    def addGenome(self, taxon_id, taxon_label=None):
        """
        Add the genome of a taxon as a class labelled '<taxon> genome'.

        :param taxon_id:
        :param taxon_label: defaults to the taxon id
        :return:
        """
        if taxon_label is None:
            taxon_label = taxon_id
        genome_label = taxon_label+' genome'
        genome_id = self.makeGenomeID(taxon_id)
        self.model.addClassToGraph(
            genome_id, genome_label, Feature.types['genome'])

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        """
        Add a build as a reference genome individual that is also typed
        with the taxon's genome, and attach the taxon to it.

        :param build_id:
        :param build_label:
        :param taxon_id:
        :return:
        """
        genome_id = self.makeGenomeID(taxon_id)
        self.model.addIndividualToGraph(
            build_id, build_label, Feature.types['reference_genome'])
        self.model.addType(build_id, genome_id)
        self.addTaxon(taxon_id, build_id)

    def makeGenomeID(self, taxon_id):
        """
        Build the genome id for a taxon: everything up to and including
        the last ':' is replaced by ':' and 'genome' is appended.

        :param taxon_id:
        :return: the genome id
        """
        # scrub off the taxon prefix.  put it in base space
        # TODO: revisit as BNODE?

        genome_id = re.sub(r'.*\:', ':', taxon_id) + 'genome'
        return genome_id

    def addChromosome(
            self, chr, tax_id, tax_label=None, build_id=None,
            build_label=None):
        """
        if it's just the chromosome, add it as an instance of a SO:chromosome,
        and add it to the genome. If a build is included,
        punn the chromosome as a subclass of SO:chromsome, and make the
        build-specific chromosome an instance of the supplied chr.
        The chr then becomes part of the build or genome.

        :param chr: chromosome name or number
        :param tax_id:
        :param tax_label:
        :param build_id:
        :param build_label: defaults to the build id
        :return:
        """
        # the family writes to the same graph as everything else here,
        # as in addChromosomeInstance
        family = Family(self.graph)
        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(str(chr), tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chr, tax_label)
        else:
            chr_label = makeChromLabel(chr)
        genome_id = self.makeGenomeID(tax_id)
        self.model.addClassToGraph(
            chr_id, chr_label, Feature.types['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            # the build-specific chromosome
            chrinbuild_id = makeChromID(chr, build_id)
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chr, build_label)
            # add the build-specific chromosome as an instance of the chr class

            self.model.addIndividualToGraph(
                chrinbuild_id, chrinbuild_label, chr_id)

            # add the build-specific chromosome
            # as a member of the build (both ways)
            family.addMember(build_id, chrinbuild_id)
            family.addMemberOf(chrinbuild_id, build_id)

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        """
        Add the generic (taxon-level) chromosome as a class.

        :param chrom_num:
        :param taxon_id: a leading 'NCBITaxon:' is removed to build the id
        :param taxon_label:
        :return:
        """
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        # the chrom class (generic) id
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.model.addClassToGraph(
            chrom_class_id, chrom_class_label, Feature.types['chromosome'])

    def addChromosomeInstance(
            self, chr_num, reference_id, reference_label, chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference
        :param chr_num:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.
        typically a genome-specific chr

        :return:

        """
        family = Family(self.graph)
        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.model.addIndividualToGraph(
            chr_id, chr_label, Feature.types['chromosome'])
        if chr_type is not None:
            self.model.addType(chr_id, chr_type)

        # add the build-specific chromosome
        # as a member of the build  (both ways)
        family.addMember(reference_id, chr_id)
        family.addMemberOf(chr_id, reference_id)

    def make_variant_locus_label(self, gene_label, allele_label):
        """
        Build a '<gene><<allele>>' style label, e.g. 'shh<tq252>'.
        A missing gene label is treated as empty.

        :param gene_label:
        :param allele_label:
        :return: the label
        """
        if gene_label is None:
            gene_label = ''
        label = gene_label.strip()+'<' + allele_label.strip() + '>'

        return label

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) in monarch-style.
        :param gene_label:
        :param allele1_label:
        :param allele2_label:
        :return: the label, or None when all three inputs are None
        """
        if gene_label is None and \
                allele1_label is None and allele2_label is None:
            logger.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(gene_label, allele2_label)

        return '/'.join((top, bottom))

    def make_experimental_model_with_genotype(
            self, genotype_id, genotype_label, taxon_id, taxon_label):
        """
        Add an anonymous individual of the given taxon that has the
        given genotype.

        :param genotype_id:
        :param genotype_label:
        :param taxon_id:
        :param taxon_label:
        :return: the id of the individual that was added
        """
        animal_id = '-'.join((taxon_id, 'with', genotype_id))
        animal_id = re.sub(r':', '', animal_id)
        animal_id = '_:'+animal_id

        animal_label = ' '.join((genotype_label, taxon_label))
        self.model.addIndividualToGraph(animal_id, animal_label, taxon_id)
        self.graph.addTriple(
            animal_id, Genotype.object_properties['has_genotype'], genotype_id)
        return animal_id
class Reference:
    """
    Model a reference for associations (journal articles, books, etc.).

    By default a reference is typed as a "document"
    (``globaltt['document']``) unless another type is passed to the
    constructor or set with ``setType``.

    If a short citation is set it is used as the individual's label;
    otherwise the title is used.

    We may wish to subclass this later.
    """

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph instance the reference is written to
        :param ref_id: identifier of the reference; when it starts with
            'http' it is also stored as the reference URL
        :param ref_type: type of the reference,
            defaults to ``globaltt['document']``
        :raises ValueError: if ``graph`` is not a Graph instance
        """
        if not isinstance(graph, Graph):
            # the message was previously built with a bogus attribute access
            # ("...".graph) which raised AttributeError instead of ValueError
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph

        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type

        # an identifier that looks like a URL doubles as the reference URL
        if ref_id is not None and re.match(r'http', ref_id):
            self.ref_url = ref_id
        return

    def setTitle(self, title):
        """Set the title of the reference."""
        self.title = title
        return

    def setYear(self, year):
        """Set the publication year of the reference."""
        self.year = year
        return

    def setType(self, reference_type):
        """Override the type the reference will be given in the graph."""
        self.ref_type = reference_type
        return

    def setAuthorList(self, author_list):
        """
        :param author_list: Array of authors
        :return:
        """
        self.author_list = author_list
        return

    def addAuthor(self, author):
        """
        Append a single author to the author list.

        The list is created on first use, because the constructor leaves
        ``author_list`` as None until ``setAuthorList`` is called.

        :param author: the author to add
        """
        if self.author_list is None:
            self.author_list = []
        self.author_list += [author]
        return

    def setShortCitation(self, citation):
        """Set the short citation, used in preference to the title as label."""
        self.short_citation = citation
        return

    def addPage(self, subject_id, page_url):
        """Add a ``globaltt['page']`` triple with page_url as a literal."""
        self.graph.addTriple(
            subject_id, self.globaltt['page'], page_url,
            object_is_literal=True)
        return

    def addTitle(self, subject_id, title):
        """Add a ``globaltt['title (dce)']`` triple with title as a literal."""
        self.graph.addTriple(
            subject_id, self.globaltt['title (dce)'], title,
            object_is_literal=True)
        return

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        The label is the short citation when set, otherwise the title.
        A reference with a URL is typed and labelled under that URL;
        otherwise it is added as an individual under its ref_id.
        When neither identifier is available an error is logged
        and nothing is written.
        """
        label = self.short_citation
        if label is None:
            label = self.title

        if self.ref_url is not None:
            # only emit a title triple when there is a title,
            # as the ref_id branch below does
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            self.model.addLabel(self.ref_url, label)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(
                self.ref_id, label, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # TODO the year and the author list are stored on the object
        #      but are not yet written to the graph
        return
def _get_orthologs(self, src_key, limit):
    """
    Process one pairwise orthology file, creating orthology associations
    based on the orthology code given in each row.
    This currently assumes that each of the orthology files is identically
    formatted. Relationships are made between genes here.

    There is also a nominal amount of identifier re-formatting:
        MGI:MGI --> MGI
        Ensembl --> ENSEMBL

    We skip any genes where we don't know how to map the gene identifiers.
    For example, Gene:Huwe1 for RAT is not an identifier, so we skip any
    mappings to this identifier. Often there are two entries for the
    same gene (based on equivalent Uniprot id), and so we are not actually
    losing any information.

    When self.tax_ids is set, a row is kept if AT LEAST ONE gene of the
    pair belongs to a taxon in self.tax_ids; rows where neither gene does
    are dropped. No taxon filtering happens when self.tax_ids is None.

    Genes are also added to a grouping class defined with a PANTHER id.

    Triples:
        <gene1_id> RO:orthologous <gene2_id>
        <assoc_id> :hasSubject <gene1_id>
        <assoc_id> :hasObject <gene2_id>
        <assoc_id> :hasPredicate <RO:orthologous>
        <assoc_id> dcterms:evidence ECO:phylogenetic_evidence

        <panther_id> rdf:type DATA:gene_family
        <panther_id> RO:has_member <gene1_id>
        <panther_id> RO:has_member <gene2_id>

    :param src_key: key into self.files; also used as the name of the
        member to read from inside the gzipped tar archive
    :param limit: stop once more than this many rows have passed the
        taxon filter (rows later dropped for unmappable gene ids still
        count); None means no limit
    :return: None
    """
    LOG.info("reading orthologs")

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    unprocessed_gene_ids = set()

    src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
    col = self.files[src_key]['columns']

    # column positions do not change between rows: look them up once.
    # 'Common ancestor for the orthologs' (ancestor taxons) is unused.
    gene_col = col.index('Gene')
    ortholog_col = col.index('Ortholog')
    type_col = col.index('Type of ortholog')
    panther_col = col.index('Panther Ortholog ID')

    matchcounter = 0

    # the archive is opened as a context manager so the handle is released
    # even when parsing fails part way through
    with tarfile.open(src_file, 'r:gz') as reader:
        LOG.info("Parsing %s", src_key)
        with reader.extractfile(src_key) as csvfile:
            # there are no comments or headers
            for line in csvfile:
                # example row:
                # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83
                # MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6
                # LDO  Euarchontoglires  PTHR15964
                row = line.decode().split('\t')
                thing1 = row[gene_col].strip()
                thing2 = row[ortholog_col].strip()
                orthology_type = row[type_col].strip()
                panther_id = row[panther_col].strip()

                (species_a, gene_a, protein_a) = thing1.split('|')
                (species_b, gene_b, protein_b) = thing2.split('|')

                # for testing skip entries without homolog relationships
                # to test ids
                # NOTE(review): 'UniProtKB=' is ten characters, so [9:]
                # keeps the '='; confirm against the format of test_ids
                if self.test_mode and not (
                        protein_a[9:] in self.test_ids or
                        protein_b[9:] in self.test_ids):
                    continue

                # map the species abbreviations to ncbi taxon id numbers
                taxon_a = self.resolve(species_a).split(':')[1].strip()
                taxon_b = self.resolve(species_b).split(':')[1].strip()

                # ### taxon filter
                # drop the row only when NEITHER gene is in the taxid list,
                # i.e. keep associations where gene1 OR gene2 is in the list
                if (self.tax_ids is not None
                        and taxon_a not in self.tax_ids
                        and taxon_b not in self.tax_ids):
                    continue
                matchcounter += 1
                if limit is not None and matchcounter > limit:
                    break
                # ### end code block for filtering on taxon

                # fix the gene identifiers
                gene_a = re.sub(r'=', ':', gene_a)
                gene_b = re.sub(r'=', ':', gene_b)

                clean_gene = self._clean_up_gene_id(gene_a, species_a)
                if clean_gene is None:
                    unprocessed_gene_ids.add(gene_a)
                    continue
                gene_a = clean_gene

                clean_gene = self._clean_up_gene_id(gene_b, species_b)
                if clean_gene is None:
                    unprocessed_gene_ids.add(gene_b)
                    continue
                gene_b = clean_gene

                rel = self.resolve(orthology_type)
                evidence_id = self.globaltt['phylogenetic evidence']

                # add the association and relevant nodes to graph
                assoc = OrthologyAssoc(graph, self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence_id)

                # add genes to graph;
                # assume labels will be taken care of elsewhere
                model.addType(gene_a, self.globaltt['gene'])
                model.addType(gene_b, self.globaltt['gene'])

                # might as well add the taxon info for completeness
                graph.addTriple(
                    gene_a, self.globaltt['in taxon'], 'NCBITaxon:' + taxon_a)
                graph.addTriple(
                    gene_b, self.globaltt['in taxon'], 'NCBITaxon:' + taxon_b)

                assoc.add_association_to_graph(
                    blv.terms['GeneToGeneHomologyAssociation'])

                # note this is incomplete...
                # it won't construct the full family hierarchy,
                # just the top-grouping
                assoc.add_gene_family_to_graph('PANTHER:' + panther_id)

                # A former trailing check on a per-line counter was removed:
                # the counter was never incremented, so it could never fire.
                # The limit is enforced on matchcounter above.

    LOG.info("finished processing %s", src_file)
    LOG.warning(
        "The following gene ids were unable to be processed: %s",
        str(unprocessed_gene_ids))
class Dataset:
    """
    This class produces metadata about a dataset that is compliant with the
    HCLS dataset specification:
    https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/#s4_4

    Summary level: The summary level provides a description of a dataset that
    is independent of a specific version or format.
    (e.g. the Monarch ingest of CTD)
    CURIE for this is something like MonarchData:[SOURCE IDENTIFIER]

    Version level: The version level captures version-specific characteristics
    of a dataset. (e.g. the 01-02-2018 ingest of CTD)
    CURIE for this is something like
    MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP]

    Distribution level: The distribution level captures metadata about a
    specific form and version of a dataset
    (e.g. turtle file for 01-02-2018 ingest of CTD).
    There is a [distribution level resource] for each different downloadable
    file we emit, i.e. one for the TTL file, one for the ntriples file, etc.
    CURIE for this is like
    MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].ttl
    or
    MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].nt
    or
    MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].[whatever file format]

    We write out at least the following triples:

    SUMMARY LEVEL TRIPLES:
    [summary level resource] - rdf:type -> dctypes:Dataset
    [summary level resource] - dc:title -> title (literal)
    [summary level resource] - dc:description -> description (literal)
                                            (use docstring from Source class)
    [summary level resource] - dc:source -> [source web page, e.g. omim.org]
    [summary level resource] - schema:logo -> [source logo IRI]
    [summary level resource] - dc:publisher -> monarchinitiative.org
    n.b: about summary level resource triples:
    -- HCLS spec says we "should" link to our logo and web page, but I'm not,
    because it would confuse the issue of whether we are pointing to our
    logo/page or the logo/page of the data source for this ingest. Same below
    for [version level resource] and [distribution level resource] - I'm not
    linking to our page/logo down there either.
    - spec says we "should" include summary level triples describing Update
    frequency and SPARQL endpoint but I'm omitting this for now, because these
    are not clearly defined at the moment

    VERSION LEVEL TRIPLES:
    [version level resource] - rdf:type -> dctypes:Dataset
    [version level resource] - dc:title -> version title (literal)
    [version level resource] - dc:description -> version description (literal)
    [version level resource] - dc:created -> ingest timestamp
                                                        [ISO 8601 compliant]
    [version level resource] - pav:version -> ingest timestamp
                                                        (same one above)
    [version level resource] - dc:creator -> monarchinitiative.org
    [version level resource] - dc:publisher -> monarchinitiative.org
    [version level resource] - dc:isVersionOf -> [summary level resource]
    [version level resource] - dc:source -> [source file 1 IRI]
    [version level resource] - dc:source -> [source file 2 IRI]
    ...

    [source file 1 IRI] - pav:retrievedOn -> [download date timestamp]
    [source file 1 IRI] - pav:version -> [source version (if set, optional)]
    [source file 2 IRI] - pav:retrievedOn -> [download date timestamp]
    [source file 2 IRI] - pav:version -> [source version (if set, optional)]
    ...

    [version level resource] - pav:createdWith -> [Dipper github URI]
    [version level resource] - void:dataset -> [distribution level resource]

    [version level resource] - cito:citesAsAuthority -> [citation id 1]
    [version level resource] - cito:citesAsAuthority -> [citation id 2]
    [version level resource] - cito:citesAsAuthority -> [citation id 3]

    n.b: about version level resource triples:
    - spec says we "should" include Date of issue/dc:issued triple, but I'm
    not because it is redundant with this triple above:
    [version level resource] - dc:created -> time stamp
    and would introduce ambiguity and confusion if the two disagree. Same
    below for [distribution level resource] - dc:created -> time stamp below
    Also omitting:
      - triples linking to our logo and page, see above.
      - License/dc:license triple, because we will make this triple via the
        [distribution level resource] below
      - Language/dc:language triple b/c it seems superfluous. Same below for
        [distribution level resource] - no language triple.
    - [version level resource] - pav:version triple is also a bit redundant
    with the pav:version triple below, but the spec requires both these triples
    - I'm omitting the [version level resource] -> pav:previousVersion because
    Dipper doesn't know this info for certain at run time. Same below for
    [distribution level resource] - pav:previousVersion.

    DISTRIBUTION LEVEL TRIPLES:
    [distribution level resource] - rdf:type -> dctypes:Dataset
    [distribution level resource] - rdf:type -> dcat:Distribution
    [distribution level resource] - dc:title -> distribution title (literal)
    [distribution level resource] - dc:description -> distribution description
                                                                        (lit.)
    [distribution level resource] - dc:created -> ingest timestamp
                                                        [ISO 8601 compliant]
    [distribution level resource] - pav:version -> ingest timestamp
                                                        (same as above)
    [distribution level resource] - dc:creator -> monarchinitiative.org
    [distribution level resource] - dc:publisher -> monarchinitiative.org
    [distribution level resource] - dc:license -> [license info, if available
                    otherwise indicate unknown]
    [distribution level resource] - dc:rights -> [data rights IRI]
    [distribution level resource] - pav:createdWith -> [Dipper github URI]
    [distribution level resource] - dc:format -> [IRI of ttl|nt|whatever spec]
    [distribution level resource] - dcat:downloadURL -> [ttl|nt URI]
    [distribution level resource] - void:triples -> [triples count (literal)]
    [distribution level resource] - void:entities -> [entities count (literal)]
    [distribution level resource] - void:distinctSubjects -> [subject count
                                                                    (literal)]
    [distribution level resource] - void:distinctObjects -> [object count
                                                                    (literal)]
    [distribution level resource] - void:properties -> [properties count
                                                                    (literal)]
    ...

    n.b: about distribution level resource triples:
    - omitting Vocabularies used/void:vocabulary and Standards
    used/dc:conformTo triples, because they are described in the ttl file
    - also omitting Example identifier/idot:exampleIdentifier and
    Example resource/void:exampleResource, because we don't really have
    one canonical example of either - they're all very different.
    - [distribution level resource] - dc:created should have the exact same
    time stamp as this triple above:
    [version level resource] - dc:created -> time stamp
    - this [distribution level resource] - pav:version triple should have the
    same object as [version level resource] - pav:version triple above
    - Data source provenance/dc:source triples are above in the
    [version level resource]
    - omitting Byte size/dc:byteSize, RDF File URL/void:dataDump, and
    Linkset/void:subset triples because they probably aren't necessary for MI
    right now
    - these triples "should" be emitted, but we will do this in a later
    iteration:
    # of classes        void:classPartition IRI
    # of literals       void:classPartition IRI
    # of RDF graphs     void:classPartition IRI

    Note: Do not use blank nodes in the dataset graph. This dataset graph is
    added to the main Dipper graph in Source.write() like so

        $ mainGraph = mainGraph + datasetGraph

    which apparently in theory could lead to blank node ID collisions between
    the two graphs.

    Note also that this implementation currently does not support producing
    metadata for StreamedGraph graphs (see dipper/graph/StreamedGraph.py).
    StreamedGraph is currently not being used for any ingests, so this isn't
    a problem. There was talk of using StreamedGraph for a rewrite/refactor
    of the Clinvar ingest, which would probably require adding support here
    for StreamedGraph's.
    """

    def __init__(
            self,
            identifier,
            data_release_version,
            ingest_name,
            ingest_title,
            ingest_url,
            ingest_logo=None,
            ingest_description=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',     # rdf_graph, streamed_graph
            file_handle=None,
            distribution_type='ttl',
            dataset_curie_prefix='MonarchArchive'
    ):
        """
        Build the dataset graph and immediately populate it with the
        summary, version and distribution level triples.

        :param identifier: source identifier, joined to the CURIE prefix
        :param data_release_version: release version string;
            today's date as YYYYMMDD is used when None
        :param ingest_name: name of the ingest, used in the download URL
        :param ingest_title: title; defaults to '<prefix>:<identifier>'
        :param ingest_url: web page of the source (optional)
        :param ingest_logo: logo file name appended to the
            'MonarchLogoRepo' entry of the curie map (optional)
        :param ingest_description: description of the ingest (optional)
        :param license_url: license IRI (optional)
        :param data_rights: data rights IRI (optional)
        :param graph_type: 'rdf_graph', 'streamed_graph' or None
        :param file_handle: passed on to StreamedGraph
        :param distribution_type: file extension of the distribution
        :param dataset_curie_prefix: CURIE prefix for the dataset resources
        :raises ValueError: if graph_type is not one of the supported values
        """
        graph_name = ':'.join([dataset_curie_prefix, identifier])

        # the first positional argument is passed through exactly as before
        # (None for graph_type None, True otherwise)
        if graph_type is None:
            self.graph = RDFGraph(None, graph_name)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(
                True, graph_name, file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True, graph_name)
        else:
            # previously self.graph was silently left unset, which only
            # surfaced later as an opaque AttributeError
            raise ValueError(
                "unsupported graph_type: {!r}".format(graph_type))

        if data_release_version is not None:
            self.data_release_version = data_release_version
        else:
            self.data_release_version = datetime.today().strftime("%Y%m%d")

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.identifier = graph_name
        self.citation = set()

        self.ingest_name = ingest_name
        self.ingest_title = ingest_title
        if self.ingest_title is None:
            self.ingest_title = graph_name

        self.ingest_url = ingest_url

        # ingest_logo is optional: only build the logo IRI when one is given
        # (concatenating the default None used to raise TypeError)
        if ingest_logo is not None:
            self.ingest_logo = \
                self.curie_map.get('MonarchLogoRepo') + ingest_logo
        else:
            self.ingest_logo = None

        self.ingest_description = ingest_description

        self.date_issued = None

        self.license_url = license_url
        self.data_rights = data_rights
        self.distribution_type = distribution_type

        # set HCLS resource CURIEs
        self.summary_level_curie = ':'.join(
            [dataset_curie_prefix, '#' + identifier])
        self.version_level_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/#' + identifier
        self.distribution_level_turtle_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/rdf/' + \
            identifier + "." + self.distribution_type

        # The following might seem a little odd, but we need to set
        # downloadURLs this way in order for them to point to where they will
        # end up in archive.MI.org as of Sept 2019. URL is:
        # https://archive.MI.org/[release version]/[dist type]/[source].[dist type]
        self.download_url = \
            self.curie_map.get("MonarchArchive") + \
            self.data_release_version + \
            "/rdf/" + self.ingest_name + "." + self.distribution_type

        self._set_summary_level_triples()
        self._set_version_level_triples()
        self._set_distribution_level_triples()

    def _set_summary_level_triples(self):
        """Write the summary level triples to the dataset graph."""
        self.model.addType(self.summary_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(
            self.summary_level_curie, self.globaltt['title'],
            self.ingest_title, True)
        self.model.addTriple(
            self.summary_level_curie, self.globaltt['Publisher'],
            self.curie_map.get(""))
        # the logo is optional, skip the triple when there is none
        if self.ingest_logo is not None:
            self.model.addTriple(
                self.summary_level_curie, "schema:logo", self.ingest_logo)
        self.graph.addTriple(
            self.summary_level_curie, self.globaltt['identifier'],
            self.summary_level_curie)
        if self.ingest_url is not None:
            self.graph.addTriple(
                self.summary_level_curie, self.globaltt["Source"],
                self.ingest_url)
        if self.ingest_description is not None:
            self.model.addDescription(
                self.summary_level_curie, self.ingest_description)

    def _set_version_level_triples(self):
        """Write the version level triples to the dataset graph."""
        self.model.addType(self.version_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['title'],
            self.ingest_title + " Monarch version " +
            self.data_release_version,
            True)
        if self.ingest_description is not None:
            self.model.addDescription(
                self.version_level_curie, self.ingest_description)
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['Date Created'],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date))
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['creator'],
            self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['Publisher'],
            self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['isVersionOf'],
            self.summary_level_curie, object_is_literal=False)
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['distribution'],
            self.distribution_level_turtle_curie, object_is_literal=False)

    def _set_distribution_level_triples(self):
        """
        Write the distribution level triples to the dataset graph,
        then declare the dataset as an ontology.
        """
        self.model.addType(
            self.distribution_level_turtle_curie, self.globaltt['Dataset'])
        self.model.addType(
            self.distribution_level_turtle_curie,
            self.globaltt['Distribution'])
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['title'],
            self.ingest_title + " distribution " + self.distribution_type,
            True)
        if self.ingest_description is not None:
            self.model.addDescription(
                self.distribution_level_turtle_curie, self.ingest_description)
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(
            self.distribution_level_turtle_curie,
            self.globaltt['Date Created'],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date))
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['creator'],
            self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['Publisher'],
            self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(
            self.distribution_level_turtle_curie,
            self.globaltt['created_with'],
            "https://github.com/monarch-initiative/dipper")
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['format'],
            "https://www.w3.org/TR/turtle/")
        self.graph.addTriple(
            self.distribution_level_turtle_curie,
            self.globaltt['downloadURL'],
            self.download_url)
        if self.license_url is None:
            self.graph.addTriple(
                self.distribution_level_turtle_curie,
                self.globaltt['license'],
                'https://project-open-data.cio.gov/unknown-license/')
        else:
            self.graph.addTriple(
                self.distribution_level_turtle_curie,
                self.globaltt['license'],
                self.license_url)

        if self.data_rights is not None:
            self.graph.addTriple(
                self.distribution_level_turtle_curie,
                self.globaltt['rights'],
                self.data_rights)

        self._declare_as_ontology()

    def set_ingest_source_file_version_num(self, file_iri, version):
        """
        This method sets the version of a remote file or resource that is used
        in the ingest. It writes this triple:

        file_iri - 'pav:version' -> version

        Version is an untyped literal

        Note: if your version is a date or timestamp, use
        set_ingest_source_file_version_date() instead

        :param file_iri: a remote file or resource used in ingest
        :param version: a number or string (e.g. v1.2.3) that the source
            (OMIM, CTD) uses to refer to this version of the file/resource
            used during the ingest
        :return: None
        """
        self.graph.addTriple(
            file_iri, self.globaltt['version'], version,
            object_is_literal=True)

    def set_ingest_source_file_version_date(
            self, file_iri, date, datatype=XSD.date):
        """
        This method sets the version that the source (OMIM, CTD, whatever)
        uses to refer to this version of the remote file/resource that was
        used in the ingest

        It writes this triple:

        file_iri - 'pav:version' -> date or timestamp

        Version is added as a literal of datatype XSD date by default

        Note: if file_iri was retrieved using get_files(), then the following
        triple was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: a date in YYYYMMDD format that the source (OMIM, CTD)
            uses to refer to this version of the file/resource used during
            the ingest. You can add a timestamp as a version by using a
            different datatype (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        :return: None
        """
        self.graph.addTriple(
            file_iri, self.globaltt['version'], date,
            object_is_literal=True, literal_type=datatype)

    def set_ingest_source_file_version_retrieved_on(
            self, file_iri, date, datatype=XSD.date):
        """
        This method sets the date on which a remote file/resource
        (from OMIM, CTD, etc) was retrieved.

        It writes this triple:

        file_iri - 'pav:retrievedOn' -> date or timestamp

        The date is added as a literal of datatype XSD date by default

        Note: if file_iri was retrieved using get_files(), then the following
        triple was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: the date, in YYYYMMDD format, on which the file/resource
            was retrieved. You can add a timestamp instead by using a
            different datatype (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        :return: None
        """
        self.graph.addTriple(
            file_iri, self.globaltt['retrieved_on'], date,
            object_is_literal=True, literal_type=datatype)

    def set_ingest_source(self, url, predicate=None, is_object_literal=False):
        """
        This method writes a triple to the dataset graph indicating that the
        ingest used a file or resource at [url] during the ingest.

        Triple emitted is version_level_curie dc:source [url]

        This triple is likely to be redundant if Source.get_files() is used to
        retrieve the remote files/resources, since this triple should also be
        emitted as files/resources are being retrieved. This method is
        provided as a convenience method for sources that do their own
        downloading of files.

        :param url: a remote resource used as a source during ingest
        :param predicate: the predicate to use for the triple ["dc:source"]
            from spec (https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/)
            "Use dc:source when the source dataset was used in whole or in
            part. Use pav:retrievedFrom when the source dataset was used in
            whole and was not modified from its original distribution. Use
            prov:wasDerivedFrom when the source dataset was in whole or in
            part and was modified from its original distribution."
        :param is_object_literal: whether url is written as a literal
        :return: None
        """
        if predicate is None:
            predicate = self.globaltt["Source"]
        self.graph.addTriple(
            self.version_level_curie, predicate, url,
            object_is_literal=is_object_literal,
            subject_category=blv.terms['DataSetVersion'])

    def get_graph(self):
        """
        This method returns the dataset graph

        :return: dataset graph
        """
        return self.graph

    def get_license(self):
        """
        This method returns the license info

        :return: license info
        """
        return self.license_url

    def set_citation(self, citation_id):
        """
        This method adds the [citation_id] argument to the set of citations,
        and also adds a triple indicating that
        version level cito:citesAsAuthority [citation_id]

        :param citation_id: identifier of the citation
        :return: None
        """
        self.citation.add(citation_id)
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['citesAsAuthority'],
            citation_id)

    def _declare_as_ontology(self, version_info=None):
        """
        Declare the summary level IRI as an ontology, and also make triple
        summary level IRI - version_iri -> version level IRI

        When version_info is given, it is attached as OWL version info
        to the distribution level IRI.

        TEC: I am not convinced dipper reformatting external data as RDF
        triples makes an OWL ontology (nor that it should be considered a
        goal).

        Proper ontologies are built by ontologists. Dipper reformats data
        and annotates/decorates it with a minimal set of carefully arranged
        terms drawn from multiple proper ontologies.
        Which allows the whole (dipper's RDF triples and parent ontologies)
        to function as a single ontology we can reason over when combined
        in a store such as SciGraph.

        Including more than the minimal ontological terms in dipper's RDF
        output constitutes a liability as it allows greater divergence
        between dipper artifacts and the proper ontologies.

        :param version_info: a string describing version info for the ontology
        :return: None
        """
        model = Model(self.graph)
        model.addOntologyDeclaration(self.summary_level_curie)
        model.addOWLVersionIRI(
            self.summary_level_curie, self.version_level_curie)
        if version_info is not None:
            model.addOWLVersionInfo(
                self.distribution_level_turtle_curie, version_info)

    @staticmethod
    def make_id(long_string, prefix='MONARCH'):
        """
        A method to create DETERMINISTIC identifiers
        based on a string's digest. currently implemented with sha1
        Duplicated from Source.py to avoid circular imports.

        :param long_string: string to use to generate identifier
        :param prefix: prefix to prepend to identifier [Monarch]
        :return: a Monarch identifier
        """
        return ':'.join((prefix, Dataset.hash_id(long_string)))

    @staticmethod
    def hash_id(word):  # same as graph/GraphUtils.digest_id(wordage)
        """
        Given a string, make a hash
        Duplicated from Source.py.

        The result is the letter 'b' followed by characters 1 to 19 of the
        sha1 hex digest of the utf-8 encoded string (20 characters total).

        :param word: str string to be hashed
        :return: hash of id
        """
        return 'b' + hashlib.sha1(word.encode('utf-8')).hexdigest()[1:20]
def _process_data(self, raw, limit=None):
    """
    Read the gzipped IMPC genotype-phenotype CSV and add it to the graph.

    For every data row (the header row is skipped) this builds, in order:
    the gene / allele / sequence alteration, the ES-cell colony and its
    indeterminate-zygosity genotype, the sex-agnostic genotype with its
    VSLC and genomic background, the sex-qualified genotype, and finally
    the genotype-to-phenotype association with its evidence, provenance
    and a free-text description.

    In test mode the test graph is used, rows whose marker is not in
    self.test_ids are skipped, and ``limit`` is ignored.

    :param raw: path of the gzipped, comma separated input file
    :param limit: stop once more than this many rows have been read
        (None means no limit)
    :return: None
    """
    logger.info("Processing Data from %s", raw)

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)

    geno = Genotype(g)
    line_counter = 0

    impc_map = self.open_and_parse_yaml(self.map_files['impc_map'])
    impress_map = json.loads(
        self.fetch_from_url(
            self.map_files['impress_map']).read().decode('utf-8'))

    # Add the taxon as a class
    taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
    model.addClassToGraph(taxon_id, None)

    # with open(raw, 'r', encoding="utf8") as csvfile:
    with gzip.open(raw, 'rt') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            line_counter += 1

            # unpacking every column also enforces the expected row width
            (marker_accession_id, marker_symbol, phenotyping_center,
             colony, sex, zygosity, allele_accession_id, allele_symbol,
             allele_name, strain_accession_id, strain_name, project_name,
             project_fullname, pipeline_name, pipeline_stable_id,
             procedure_stable_id, procedure_name, parameter_stable_id,
             parameter_name, top_level_mp_term_id,
             top_level_mp_term_name, mp_term_id, mp_term_name, p_value,
             percentage_change, effect_size, statistical_method,
             resource_name) = row

            if self.testMode and marker_accession_id not in self.test_ids:
                continue

            # ##### cleanup some of the identifiers ######
            zygosity_id = self._map_zygosity(zygosity)

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_:' + re.sub(r'\W+', '_', colony)

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = \
                    '_:IMPC-'+re.sub(r':', '', allele_accession_id)

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                strain_accession_id = '_:' + strain_accession_id
            elif not re.match(r'MGI', strain_accession_id):
                logger.info("Found a strange strain accession...%s",
                            strain_accession_id)
                strain_accession_id = 'IMPC:' + strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.
            sequence_alteration_id = variant_locus_id = None
            variant_locus_name = sequence_alteration_name = None

            # extract out what's within the <> to get the symbol
            if re.match(r'.*<.*>', allele_symbol):
                sequence_alteration_name = \
                    re.match(r'.*<(.*)>', allele_symbol).group(1)
            else:
                sequence_alteration_name = allele_symbol

            if marker_accession_id is not None and \
                    marker_accession_id == '':
                logger.warning("Marker unspecified on row %d", line_counter)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                variant_locus_name = allele_symbol
                variant_locus_type = geno.genoparts['variant_locus']
                geno.addGene(marker_accession_id, marker_symbol,
                             geno.genoparts['gene'])
                geno.addAllele(variant_locus_id, variant_locus_name,
                               variant_locus_type, None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                sequence_alteration_id = \
                    '_:seqalt'+re.sub(r':', '', allele_accession_id)
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)

            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(sequence_alteration_id,
                                       sequence_alteration_name)

            # #############    BUILD THE COLONY    #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele,
            # with unknown zygosity
            stem_cell_class = 'ERO:0002002'
            model.addIndividualToGraph(colony_id, colony, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and it's relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = \
                '_:'+re.sub(r':', '',
                            allele_accession_id+geno.zygosity['indeterminate'])
            vslc_colony_label = allele_symbol + '/<?>'
            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(allele_accession_id, colony_genotype_id,
                          geno.object_properties['has_alternate_part'])
            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                geno.zygosity['indeterminate'],
                geno.object_properties['has_alternate_part'])
            g.addTriple(colony_id,
                        geno.object_properties['has_genotype'],
                        colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = \
                self.make_id(
                    (colony_id + phenotyping_center + zygosity +
                     strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol
            allele2_label = '<?>'
            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                allele2_id = None
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = geno.object_properties['has_alternate_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                allele2_id = None
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                allele2_id = None
            else:
                logger.warning("found unknown zygosity %s", zygosity)
                # NOTE(review): this abandons the rest of the file on the
                # first unknown zygosity; confirm that is intended rather
                # than skipping just this row.
                break
            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC
            # the marker is None when the row had no marker accession;
            # leave it out of the id instead of failing in str.join
            vslc_id = '-'.join(
                part for part in
                (marker_accession_id, allele_accession_id, zygosity)
                if part is not None)
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:' + vslc_id
            model.addIndividualToGraph(
                vslc_id, vslc_name,
                geno.genoparts['variant_single_locus_complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                geno.object_properties['has_alternate_part'],
                allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            model.addType(
                vslc_id,
                Genotype.genoparts['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(genomic_background_id, strain_name,
                                 geno.genoparts['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = \
                    strain_name + '-' + phenotyping_center + '-' + colony
                pheno_center_strain_id = \
                    '-'.join((re.sub(r':', '', genomic_background_id),
                              re.sub(r'\s', '_', phenotyping_center),
                              re.sub(r'\W+', '', colony)))
                if not re.match(r'^_', pheno_center_strain_id):
                    pheno_center_strain_id = '_:' + pheno_center_strain_id

                geno.addGenotype(pheno_center_strain_id,
                                 pheno_center_strain_label,
                                 geno.genoparts['genomic_background'])
                geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                            genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name+' ['+pheno_center_strain_label+']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                geno.addTaxon(taxon_id, pheno_center_strain_id)
            # this is redundant, but i'll keep in in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = \
                self.make_id(
                    (colony_id + phenotyping_center + zygosity +
                     strain_accession_id+sex))
            sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'
            if sex == 'male':
                sq_type_id = geno.genoparts['male_genotype']
            elif sex == 'female':
                sq_type_id = geno.genoparts['female_genotype']
            else:
                sq_type_id = geno.genoparts['sex_qualified_genotype']

            geno.addGenotype(sex_qualified_genotype_id,
                             sex_qualified_genotype_label, sq_type_id)
            geno.addParts(genotype_id, sex_qualified_genotype_id,
                          geno.object_properties['has_alternate_part'])

            if genomic_background_id is not None and \
                    genomic_background_id != '':
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender

            phenotype_id = mp_term_id

            # it seems that sometimes phenotype ids are missing.
            # indicate here
            if phenotype_id is None or phenotype_id == '':
                logger.warning("No phenotype id specified for row %d: %s",
                               line_counter, str(row))
                continue
            # hard coded ECO code
            eco_id = "ECO:0000015"

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested

            assoc = G2PAssoc(g, self.name, sex_qualified_genotype_id,
                             phenotype_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id

            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            # add a free-text description; if either number cannot be
            # parsed as a float, both are reported exactly as given
            try:
                effect_text = str(round(float(effect_size), 5))
                p_text = "{:.4e}".format(float(p_value))
            except ValueError:
                effect_text = str(effect_size)
                p_text = "{0}".format(p_value)
            description = \
                ' '.join((mp_term_name, 'phenotype determined by',
                          phenotyping_center, 'in an',
                          procedure_name, 'assay where',
                          parameter_name.strip(),
                          'was measured with an effect_size of',
                          effect_text,
                          '(p =', p_text, ').'))

            study_bnode = \
                self._add_study_provenance(
                    impc_map, impress_map, phenotyping_center, colony,
                    project_fullname, pipeline_name, pipeline_stable_id,
                    procedure_stable_id, procedure_name,
                    parameter_stable_id, parameter_name,
                    statistical_method, resource_name)

            evidence_line_bnode = \
                self._add_evidence(
                    assoc_id, eco_id, impc_map, p_value,
                    percentage_change, effect_size, study_bnode)

            self._add_assertion_provenance(assoc_id,
                                           evidence_line_bnode, impc_map)

            model.addDescription(evidence_line_bnode, description)

            # resource_id = resource_name
            # assoc.addSource(g, assoc_id, resource_id)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    return
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.

    An association reifies the triple (sub, rel, obj): when it is added to
    the graph, the plain triple is written together with an association
    node that points at its subject, predicate and object and carries any
    description, evidence, sources, provenance, dates and score that were
    set on this object.
    """

    assoc_types = {'association': 'OBAN:association'}

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
        'inchi_key': 'CHEBI:InChIKey',
        'probabalistic_quantifier': 'GENO:0000867'
    }

    object_properties = {
        'has disposition': 'RO:0000091',
        'has_phenotype': 'RO:0002200',
        'expressed_in': 'RO:0002206',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_predicate',
        'is_about': 'IAO:0000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance',
        'causes_or_contributes': 'RO:0003302'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004',
        'has_quantifier': 'GENO:0000866',
        'created_on': 'pav:createdOn'
    }

    # union of the three property maps above
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
        """
        :param graph: the Graph the association will be written to
        :param definedby: the (data) resource providing the association;
            it is part of the generated association id
        :param sub: subject identifier
        :param obj: object identifier
        :param pred: predicate (relationship) identifier
        :raises ValueError: if graph is not a Graph instance
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            # was `"...".graph`, which raised AttributeError on str
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)

        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []

        self.score = None
        self.score_type = None
        self.score_unit = None

        return

    def get_properties(self):
        """Return the combined annotation/object/datatype property map."""
        return self.properties

    def _is_valid(self):
        """
        Check that subject, object and relation have all been set.

        :return: True when all three are set
        :raises ValueError: naming the first part that is missing
        """
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')

        return True

    def _add_basic_association_to_graph(self):
        """
        Write the triple and its association node to the graph.

        The association id is generated from the parts of the association
        if none was set beforehand.

        :return: None
        :raises ValueError: if subject, object or relation is missing
        """
        if not self._is_valid():
            return

        self.graph.addTriple(self.sub, self.rel, self.obj)

        if self.assoc_id is None:
            self.set_association_id()

        self.model.addType(self.assoc_id, self.assoc_types['association'])

        self.graph.addTriple(self.assoc_id,
                             self.object_properties['has_subject'],
                             self.sub)
        self.graph.addTriple(self.assoc_id,
                             self.object_properties['has_object'],
                             self.obj)
        self.graph.addTriple(self.assoc_id,
                             self.object_properties['has_predicate'],
                             self.rel)

        if self.description is not None:
            self.model.addDescription(self.assoc_id, self.description)

        if self.evidence:
            for evidence_id in self.evidence:
                self.graph.addTriple(self.assoc_id,
                                     self.object_properties['has_evidence'],
                                     evidence_id)

        if self.source:
            for source_id in self.source:
                if re.match('http', source_id):
                    # TODO assume that the source is a publication?
                    # use Reference class here
                    # sources starting with http are written as literals
                    self.graph.addTriple(self.assoc_id,
                                         self.object_properties['has_source'],
                                         source_id, True)
                else:
                    self.graph.addTriple(self.assoc_id,
                                         self.object_properties['has_source'],
                                         source_id)

        if self.provenance:
            for prov_id in self.provenance:
                self.graph.addTriple(self.assoc_id,
                                     self.object_properties['has_provenance'],
                                     prov_id)

        if self.date:
            for created in self.date:
                self.graph.addTriple(
                    object_is_literal=True,
                    subject_id=self.assoc_id,
                    predicate_id=self.datatype_properties['created_on'],
                    obj=created)

        if self.score is not None:
            self.graph.addTriple(self.assoc_id,
                                 self.properties['has_measurement'],
                                 self.score, True, 'xsd:float')
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

        return

    def add_association_to_graph(self):
        """Add this association to the graph (see the basic variant)."""
        self._add_basic_association_to_graph()

        return

    def add_predicate_object(self, predicate, object_node,
                             object_type=None, datatype=None):
        """
        Add an extra triple whose subject is the association node.

        :param predicate: predicate of the triple
        :param object_node: object of the triple
        :param object_type: pass 'Literal' to write the object as a literal
        :param datatype: optional datatype, only used for literals
        :return: None
        """
        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(self.assoc_id, predicate,
                                     object_node, True, datatype)
            else:
                self.graph.addTriple(self.assoc_id, predicate,
                                     object_node, True)
        else:
            self.graph.addTriple(self.assoc_id, predicate,
                                 object_node, False)

        return

    # This isn't java, but if we must,
    # prefer use of property decorator
    def set_subject(self, identifier):
        """Set the subject of the association."""
        self.sub = identifier
        return

    def set_object(self, identifier):
        """Set the object of the association."""
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        """Set the predicate (relationship) of the association."""
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        Set the association id.

        With no argument the id is generated from definedby, subject,
        relation and object. Pass assoc_id when an external association
        identifier should be used instead.

        :param assoc_id: external identifier to use, or None to generate
        :return: None
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(self.definedby,
                                                     self.sub,
                                                     self.rel,
                                                     self.obj)
        else:
            self.assoc_id = assoc_id

        return

    def get_association_id(self):
        """Return the association id (None until it has been set)."""
        return self.assoc_id

    def set_description(self, description):
        """Set the free-text description of the association."""
        self.description = description

        return

    def set_score(self, score, unit=None, score_type=None):
        """
        Record a score for the association.

        :param score: the score value
        :param unit: optional unit of the score
        :param score_type: optional type of the score
        :return: None
        """
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        None and blank identifiers are ignored.

        :param identifier: evidence code
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        None and blank identifiers are ignored.
        TODO we need to greatly expand this function!

        :param identifier: source identifier
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

        return

    def add_date(self, date):
        """Add a date string to the association; None/blank are ignored."""
        if date is not None and date.strip() != '':
            self.date += [date]

        return

    def add_provenance(self, identifier):
        """Add a provenance identifier; None/blank are ignored."""
        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

        return

    @staticmethod
    def make_association_id(definedby, subject, predicate, object,
                            attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is None, it will be converted to blank.
        It effectively digests the string of concatenated values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appended to the ID.

        :param definedby: The (data) resource that provided the annotation
        :param subject: subject identifier
        :param predicate: predicate identifier
        :param object: object identifier
        :param attributes: optional list of extra strings to hash
        :return: 'MONARCH:' followed by the first 16 hex characters of the
            sha1 digest of the '+'-joined items
        """
        # note others available:
        #   md5(), sha1(), sha224(), sha256(), sha384(), and sha512()

        # putting definedby first,
        # as this will usually be the datasource providing the annotation
        # this will end up making the first few parts of the id
        # be the same for all annotations in that resource
        # (although the point of a digest is to render such details moot).

        items_to_hash = [definedby, subject, predicate, object]
        if attributes is not None:
            items_to_hash += attributes

        items_to_hash = ['' if val is None else val for val in items_to_hash]

        byte_string = '+'.join(items_to_hash).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(
            ('MONARCH', hashlib.sha1(byte_string).hexdigest()[0:16]))
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.

    An association reifies the triple (sub, rel, obj): when it is added to
    the graph, the plain triple is written together with an association
    node that points at its subject, predicate and object and carries any
    description, evidence, sources, provenance, dates and score that were
    set on this object.
    """

    def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
        """
        :param graph: the Graph the association will be written to; its
            globaltt, globaltcid and curie_map are reused by this object
        :param definedby: the (data) resource providing the association;
            it is part of the generated association id
        :param sub: subject identifier
        :param obj: object identifier
        :param pred: predicate identifier
        :raises ValueError: if graph is not a Graph instance
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []

        self.score = None
        self.score_type = None
        self.score_unit = None

        return

    def _is_valid(self):
        """
        Check that the association has a subject, object and predicate and
        that subject and predicate look like a curie or IRI.

        The text before the first ':' of subject and predicate must be a
        key of self.curie_map or one of '_', 'http', 'https', 'ftp'.

        :return: True when every check passes
        :raises ValueError: describing the first failed check, followed by
            the subject, predicate and object in angle brackets
        """
        # ValueError does not interpolate %-style arguments itself,
        # so the triple is formatted once here and appended to each message
        triple_text = ' <%s> <%s> <%s>' % (self.sub, self.rel, self.obj)
        allowed_schemes = ('_', 'http', 'https', 'ftp')

        # check if sub/obj/rel are none...raise error
        if self.sub is None:
            raise ValueError(
                'No subject set for this association' + triple_text)
        if self.obj is None:
            raise ValueError(
                'No object set for this association' + triple_text)
        if self.rel is None:
            raise ValueError(
                'No predicate set for this association' + triple_text)
        # Are subject & predicate, either a curie or IRI
        pfx = self.sub.split(':')[0]
        if pfx not in self.curie_map and pfx not in allowed_schemes:
            raise ValueError(
                'Invalid Subject for this association' + triple_text)
        pfx = self.rel.split(':')[0]
        if pfx not in self.curie_map and pfx not in allowed_schemes:
            raise ValueError(
                'Invalid Predicate for this association' + triple_text)
        return True

    def add_association_to_graph(self):
        """
        Write the triple and its association node to the graph.

        The association id is generated from the parts of the association
        if none was set beforehand.

        :return: None
        :raises ValueError: if the association fails _is_valid
        """
        if not self._is_valid():
            return

        self.graph.addTriple(self.sub, self.rel, self.obj)

        if self.assoc_id is None:
            self.set_association_id()

        assert self.assoc_id is not None

        self.model.addType(self.assoc_id, self.model.globaltt['association'])

        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has subject'], self.sub)
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has object'], self.obj)
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has predicate'],
            self.rel)

        if self.description is not None:
            self.model.addDescription(self.assoc_id, self.description)

        if self.evidence:
            for evi in self.evidence:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['has evidence'], evi)

        if self.source:
            for src in self.source:
                # TODO assume that the source is a publication?
                # use Reference class
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['source'], src)

        if self.provenance:
            for prov in self.provenance:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['has_provenance'], prov)

        if self.date:
            for dat in self.date:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['created_on'], dat,
                    object_is_literal=True)

        if self.score is not None:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['has measurement value'],
                self.score, True, 'xsd:float')
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

        return

    def add_predicate_object(
            self, predicate, object_node, object_type=None, datatype=None):
        """
        Add an extra triple whose subject is the association node.

        :param predicate: predicate of the triple
        :param object_node: object of the triple
        :param object_type: pass 'Literal' to write the object as a literal
        :param datatype: optional datatype, only used for literals
        :return: None
        """
        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True, datatype)
            else:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True)
        else:
            self.graph.addTriple(
                self.assoc_id, predicate, object_node, False)

        return

    # This isn't java, but predecessors favored the use of property decorators
    # and CamelCase and ...
    def set_subject(self, identifier):
        """Set the subject of the association."""
        self.sub = identifier
        return

    def set_object(self, identifier):
        """Set the object of the association."""
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        """Set the predicate (relationship) of the association."""
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        Set the association id.

        With no argument the id is generated from definedby, subject,
        relation and object. Pass assoc_id when an external association
        identifier should be used instead.

        :param assoc_id: external identifier to use, or None to generate
        :return: the association id now in effect
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return self.assoc_id

    def get_association_id(self):
        """
        Return the association id, generating it first if it is unset.

        :return: the association id
        """
        if self.assoc_id is None:
            self.set_association_id()

        return self.assoc_id

    def set_description(self, description):
        """Set the free-text description of the association."""
        self.description = description

        return

    def set_score(self, score, unit=None, score_type=None):
        """
        Record a score for the association.

        :param score: the score value
        :param unit: optional unit of the score
        :param score_type: optional type of the score
        :return: None
        """
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        None and blank identifiers are ignored.

        :param identifier: evidence code
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        None and blank identifiers are ignored.
        TODO we need to greatly expand this function!

        :param identifier: source identifier
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

        return

    def add_date(self, date):
        """Add a date string to the association; None/blank are ignored."""
        if date is not None and date.strip() != '':
            self.date += [date]

        return

    def add_provenance(self, identifier):
        """Add a provenance identifier; None/blank are ignored."""
        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

        return

    @staticmethod
    def make_association_id(definedby, sub, pred, obj, attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association.
        Items that are None are left out before hashing.
        It effectively digests the string of concatenated values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appended to the ID.

        Note this is equivalent to a RDF blank node

        :param definedby: The (data) resource that provided the annotation
        :param sub: subject identifier
        :param pred: predicate identifier
        :param obj: object identifier
        :param attributes: optional list of extra strings to hash
        :return: 'MONARCH:' followed by GraphUtils.digest_id of the
            '+'-joined items
        """
        items_to_hash = [definedby, sub, pred, obj]
        if attributes is not None and len(attributes) > 0:
            items_to_hash += attributes

        # NOTE(review): dropping None (rather than blanking it) means the
        # position of a missing part is not reflected in the id
        items_to_hash = [x for x in items_to_hash if x is not None]

        assoc_id = ':'.join(
            ('MONARCH', GraphUtils.digest_id('+'.join(items_to_hash))))
        assert assoc_id is not None
        return assoc_id
def _process_data(self, raw, limit=None):
    """
    Read the gzipped IMPC genotype-phenotype CSV and add it to the graph.

    For every data row (the header row is skipped) this builds, in order:
    the gene / allele / sequence alteration, the ES-cell colony and its
    indeterminate-zygosity genotype, the sex-agnostic genotype with its
    VSLC and genomic background, the sex-qualified genotype, and finally
    the genotype-to-phenotype association with its sex specificity,
    evidence, provenance and a free-text description.

    In test mode the test graph is used, rows whose marker is not in
    self.gene_ids are skipped, and ``limit`` is ignored.

    :param raw: path of the gzipped, comma separated input file
    :param limit: stop once more than this many rows have been read
        (None means no limit)
    :return: None
    """
    LOG.info("Processing Data from %s", raw)

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)
    line_counter = 0

    # Add the taxon as a class
    taxon_id = self.globaltt['Mus musculus']
    model.addClassToGraph(taxon_id, None)

    # with open(raw, 'r', encoding="utf8") as csvfile:
    with gzip.open(raw, 'rt') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            line_counter += 1
            # unpacking every column also enforces the expected row width
            (
                marker_accession_id, marker_symbol, phenotyping_center,
                colony_raw, sex, zygosity, allele_accession_id,
                allele_symbol, allele_name, strain_accession_id,
                strain_name, project_name, project_fullname,
                pipeline_name, pipeline_stable_id, procedure_stable_id,
                procedure_name, parameter_stable_id, parameter_name,
                top_level_mp_term_id, top_level_mp_term_name, mp_term_id,
                mp_term_name, p_value, percentage_change, effect_size,
                statistical_method, resource_name
            ) = row

            if self.test_mode and marker_accession_id not in self.gene_ids:
                continue

            # ##### cleanup some of the identifiers ######
            zygosity = zygosity.strip()
            zygosity_id = self.resolve(zygosity)
            # resolve() handing the label back unchanged means no mapping
            if zygosity_id == zygosity:
                LOG.warning(
                    "Zygosity '%s' unmapped. detting to indeterminate",
                    zygosity)
                zygosity_id = self.globaltt['indeterminate']

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = '_:IMPC-'+re.sub(
                    r':', '', allele_accession_id)

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                # TODO blank nodes do not maintain identifiers
                strain_accession_id = '_:' + strain_accession_id

            elif not re.match(r'MGI', strain_accession_id):
                LOG.info(
                    "Found a strange strain accession...%s",
                    strain_accession_id)
                strain_accession_id = 'IMPC:'+strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.
            sequence_alteration_id = variant_locus_id = None
            variant_locus_name = sequence_alteration_name = None

            # extract out what's within the <> to get the symbol
            if re.match(r'.*<.*>', allele_symbol):
                sequence_alteration_name = re.match(
                    r'.*<(.*)>', allele_symbol).group(1)
            else:
                sequence_alteration_name = allele_symbol

            if marker_accession_id is not None and marker_accession_id == '':
                LOG.warning("Marker unspecified on row %d", line_counter)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                variant_locus_name = allele_symbol
                variant_locus_type = self.globaltt['variant_locus']
                geno.addGene(
                    marker_accession_id, marker_symbol,
                    self.globaltt['gene'])

                geno.addAllele(
                    variant_locus_id, variant_locus_name,
                    variant_locus_type, None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                # TAG bnode
                sequence_alteration_id = '_:seqalt' + re.sub(
                    r':', '', allele_accession_id)
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)

            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(
                sequence_alteration_id, sequence_alteration_name)

            # #############    BUILD THE COLONY    #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele, with unknown zygosity

            stem_cell_class = self.globaltt['embryonic stem cell line']

            # (colony_id is always a '_:'-prefixed string at this point)
            model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and it's relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = '_:'+re.sub(
                r':', '',
                allele_accession_id + self.globaltt['indeterminate'])
            vslc_colony_label = allele_symbol + '/<?>'
            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(
                allele_accession_id, colony_genotype_id,
                self.globaltt['has_variant_part'])

            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                self.globaltt['indeterminate'],
                self.globaltt['has_variant_part'])
            graph.addTriple(
                colony_id, self.globaltt['has_genotype'],
                colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = self.make_id(
                (colony_id + phenotyping_center + zygosity +
                 strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol
            allele2_label = '<?>'
            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                allele2_id = None
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = self.globaltt['has_variant_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                allele2_id = None
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                allele2_id = None
            else:
                LOG.warning("found unknown zygosity %s", zygosity)
                # NOTE(review): this abandons the rest of the file on the
                # first unknown zygosity; confirm that is intended rather
                # than skipping just this row.
                break
            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC
            # the marker is None when the row had no marker accession;
            # leave it out of the id instead of failing in str.join
            vslc_id = '-'.join(
                part for part in
                (marker_accession_id, allele_accession_id, zygosity)
                if part is not None)
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:'+vslc_id
            model.addIndividualToGraph(
                vslc_id, vslc_name,
                self.globaltt['variant single locus complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                self.globaltt['has_variant_part'], allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            model.addType(
                vslc_id, self.globaltt['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(
                    genomic_background_id, strain_name,
                    self.globaltt['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = strain_name + '-' + \
                    phenotyping_center + '-' + colony_raw
                pheno_center_strain_id = '-'.join((
                    re.sub(r':', '', genomic_background_id),
                    re.sub(r'\s', '_', phenotyping_center),
                    re.sub(r'\W+', '', colony_raw)))
                if not re.match(r'^_', pheno_center_strain_id):
                    # Tag bnode
                    pheno_center_strain_id = '_:' + pheno_center_strain_id

                geno.addGenotype(
                    pheno_center_strain_id, pheno_center_strain_label,
                    self.globaltt['genomic_background'])
                geno.addSequenceDerivesFrom(
                    pheno_center_strain_id, genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name + ' [' + pheno_center_strain_label + ']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                geno.addTaxon(taxon_id, pheno_center_strain_id)
            # this is redundant, but i'll keep in in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = \
                self.make_id((
                    colony_id + phenotyping_center + zygosity +
                    strain_accession_id + sex))
            sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

            sq_type_id = self.resolve(sex, False)

            # resolve() handing the label back unchanged means no mapping
            if sq_type_id == sex:
                sq_type_id = self.globaltt['intrinsic_genotype']
                LOG.warning(
                    "Unknown sex qualifier %s, adding as intrinsic_genotype",
                    sex)

            geno.addGenotype(
                sex_qualified_genotype_id, sex_qualified_genotype_label,
                sq_type_id)
            geno.addParts(
                genotype_id, sex_qualified_genotype_id,
                self.globaltt['has_variant_part'])

            if genomic_background_id is not None and \
                    genomic_background_id != '':
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender

            phenotype_id = mp_term_id

            # it seems that sometimes phenotype ids are missing.
            # indicate here
            if phenotype_id is None or phenotype_id == '':
                LOG.warning(
                    "No phenotype id specified for row %d: %s",
                    line_counter, str(row))
                continue
            # hard coded ECO code
            eco_id = self.globaltt['mutant phenotype evidence']

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested

            assoc = G2PAssoc(
                graph, self.name, sex_qualified_genotype_id, phenotype_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id

            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            model._addSexSpecificity(assoc_id, self.resolve(sex))

            # add a free-text description; if either number cannot be
            # parsed as a float, both are reported exactly as given
            try:
                effect_text = str(round(float(effect_size), 5))
                p_text = "{:.4e}".format(float(p_value))
            except ValueError:
                effect_text = str(effect_size)
                p_text = "{0}".format(p_value)
            description = ' '.join((
                mp_term_name, 'phenotype determined by',
                phenotyping_center, 'in an',
                procedure_name, 'assay where',
                parameter_name.strip(),
                'was measured with an effect_size of',
                effect_text,
                '(p =', p_text, ').'))

            study_bnode = self._add_study_provenance(
                phenotyping_center, colony_raw, project_fullname,
                pipeline_name, pipeline_stable_id, procedure_stable_id,
                procedure_name, parameter_stable_id, parameter_name,
                statistical_method, resource_name, line_counter)

            evidence_line_bnode = self._add_evidence(
                assoc_id, eco_id, p_value, percentage_change, effect_size,
                study_bnode)

            self._add_assertion_provenance(assoc_id, evidence_line_bnode)

            model.addDescription(evidence_line_bnode, description)

            # resource_id = resource_name
            # assoc.addSource(graph, assoc_id, resource_id)

            if not self.test_mode and \
                    limit is not None and line_counter > limit:
                break

    return
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features.  At the moment,
    RO:has_subsequence is the default relationship between the regions,
    but this should be tested/verified.

    A feature is built up in memory (id, label, type, optional start/stop
    locations) and only written to the graph when ``addFeatureToGraph``
    (or one of the other ``add*`` methods) is called.

    TODO: the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO: this will need to be extended to properly deal with
    fuzzy positions in faldo.
    """

    def __init__(
            self, graph, feature_id=None, label=None, feature_type=None,
            description=None):
        """
        :param graph: the Graph instance all triples are written to
        :param feature_id: identifier (curie) of the feature
        :param label: label of the feature
        :param feature_type: type identifier of the feature
        :param description: optional free-text description
        :raises ValueError: if ``graph`` is not a Graph instance
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.fid = feature_id
        self.label = label
        self.ftype = feature_type
        self.description = description
        # location dicts as produced by _getLocation(), or None when unknown
        self.start = None
        self.stop = None
        self.taxon = None
        return

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None, position_types=None):
        """
        Adds coordinate details for the start of this feature.
        Nothing is written to the graph here; the location is only stored.

        :param coordinate:
        :param reference_id:
        :param strand: one of '+', '-', '.' or None
        :param position_types: optional list of position type identifiers
        :return:
        """
        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(
            coordinate, reference_id, strand, position_types)
        return

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None, position_types=None):
        """
        Adds the coordinate details for the end of this feature.
        Nothing is written to the graph here; the location is only stored.

        :param coordinate:
        :param reference_id:
        :param strand: one of '+', '-', '.' or None
        :param position_types: optional list of position type identifiers
        :return:
        """
        self.stop = self._getLocation(
            coordinate, reference_id, strand, position_types)
        return

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array

        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return: dict with the keys 'coordinate', 'reference' and 'type'
        """
        loc = {
            'coordinate': coordinate,
            'reference': reference_id,
            'type': [],
        }
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        # an explicitly empty list of types means "generic position"
        if position_types == []:
            loc['type'].append(self.globaltt['Position'])
        return loc

    def _getStrandType(self, strand):
        """
        Map a strand symbol to its identifier in the global translation table.

        :param strand: '+', '-', '.' or None
        :return: the mapped strand identifier, or None when the strand is
            None or cannot be mapped (the latter is logged as a warning)
        """
        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.globaltt['plus_strand']
        elif strand == '-':
            strand_id = self.globaltt['minus_strand']
        elif strand == '.':
            strand_id = self.globaltt['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            LOG.warning("strand type could not be mapped: %s", str(strand))
        return strand_id

    def addFeatureToGraph(
            self, add_region=True, region_id=None, feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
            faldo:location region_id
        region_id a faldo:region
            faldo:begin start_position
            faldo:end end_position
        start_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id
        end_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param add_region: when True (and at least one location is known)
            a separate region node is created and linked to the feature;
            otherwise the feature itself is typed as a region
        :param region_id: identifier to use for the region; generated from
            the reference, coordinates and strand when None
        :param feature_as_class: add the feature as a class
            instead of as an individual
        :return:
        """
        if feature_as_class:
            self.model.addClassToGraph(
                self.fid, self.label, self.ftype, self.description)
        else:
            self.model.addIndividualToGraph(
                self.fid, self.label, self.ftype, self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions.
            # Only one of start/stop may be known, so take the reference
            # from whichever location exists (start preferred); at least one
            # is set here because add_region was cleared above otherwise.
            anchor = self.start if self.start is not None else self.stop
            regionchr = re.sub(r'\w+\:_?', '', anchor['reference'])
            if region_id is None:
                # in case the values are undefined
                # if we know only one of the coordinates,
                # then we'll add an "unknown" other.
                st = sp = 'UN'
                strand = None
                if self.start is not None \
                        and self.start['coordinate'] is not None:
                    st = str(self.start['coordinate'])
                    strand = self._getStrandStringFromPositionTypes(
                        self.start['type'])
                if self.stop is not None \
                        and self.stop['coordinate'] is not None:
                    sp = str(self.stop['coordinate'])
                    # NOTE(review): the stop strand is only consulted when
                    # the start already yielded one (and then replaces it);
                    # `is None` may have been intended.  Left unchanged
                    # because it affects the generated region ids - confirm.
                    if strand is not None:
                        strand = self._getStrandStringFromPositionTypes(
                            self.stop['type'])
                # assume that the strand is the same for both start and stop.
                # this will need to be fixed in the future
                region_items = [regionchr, st, sp]
                if strand is not None:
                    region_items += [strand]
                region_id = '-'.join(region_items)
                rid = region_id
                # replace the id prefix
                rid = re.sub(r'\w+\:', '', rid, count=1)
                rid = '_:' + rid + "-Region"
                region_id = rid

            self.graph.addTriple(
                self.fid, self.globaltt['location'], region_id)
            self.model.addIndividualToGraph(
                region_id, None, self.globaltt['Region'])
        else:
            region_id = self.fid
            self.model.addType(region_id, self.globaltt['region'])

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(
                self.start['reference'], self.start['coordinate'],
                self.start['type'])
            self.addPositionToGraph(
                self.start['reference'], self.start['coordinate'],
                self.start['type'])

        if self.stop is not None:
            endp = self._makePositionId(
                self.stop['reference'], self.stop['coordinate'],
                self.stop['type'])
            self.addPositionToGraph(
                self.stop['reference'], self.stop['coordinate'],
                self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}
        return

    def _getStrandStringFromPositionTypes(self, tylist):
        """
        :param tylist: collection of position type identifiers
        :return: 'plus', 'minus' or 'both' for the first strand type found
            in ``tylist`` (checked in that order), otherwise None
        """
        if self.globaltt['plus_strand'] in tylist:
            return 'plus'
        if self.globaltt['minus_strand'] in tylist:
            return 'minus'
        if self.globaltt['both_strand'] in tylist:
            return 'both'
        # it is stranded, but we don't know what it is
        return None

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.

        :param reference:
        :param coordinate:
        :param types:
        :return: a '_:'-prefixed identifier built from the reference, the
            coordinate and the strand, or None when there is no reference
        """
        if reference is None:
            LOG.error("Trying to make position with no reference.")
            return None

        curie = '_:'
        # strip the first id prefix from the reference
        reference = re.sub(r'\w+\:', '', reference, count=1)
        if re.match(r'^_', reference):
            # this is in the case if the reference is a bnode
            reference = re.sub(r'^_', '', reference)
        curie += reference
        if coordinate is not None:
            # just in case it isn't a string already
            curie = '-'.join((curie, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                curie = '-'.join((curie, tstring))

        return curie

    def addRegionPositionToGraph(
            self, region_id, begin_position_id, end_position_id):
        """
        Link a region to its begin and/or end position.
        A position id of None is silently skipped.

        :param region_id:
        :param begin_position_id:
        :param end_position_id:
        :return:
        """
        if begin_position_id is not None:
            self.graph.addTriple(
                region_id, self.globaltt['begin'], begin_position_id)
        if end_position_id is not None:
            self.graph.addTriple(
                region_id, self.globaltt['end'], end_position_id)
        return

    def addPositionToGraph(
            self, reference_id, position, position_types=None, strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.

        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param reference_id:
        :param position:
        :param position_types:
        :param strand:
        :return: Identifier of the position created
        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if position is not None:
            self.graph.addTriple(
                pos_id, self.globaltt['position'], position,
                object_is_literal=True, literal_type="xsd:integer")
        self.graph.addTriple(pos_id, self.globaltt['reference'], reference_id)
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)
        strnd = None
        if strand is not None:
            strnd = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                strnd = self._getStrandType(strand)
        # else:
        #    strnd = self.globaltt['both_strand']
        if strnd is None and (position_types is None or position_types == []):
            strnd = self.globaltt['Position']

        if strnd is not None:
            self.model.addType(pos_id, strnd)

        return pos_id

    def addSubsequenceOfFeature(self, parentid):
        """
        This will add reciprocal triples like:
        feature <is subsequence of> parent
        parent has_subsequence feature

        :param parentid:
        :return:
        """
        self.graph.addTriple(
            self.fid, self.globaltt['is subsequence of'], parentid)
        # this should be expected to be done in reasoning not ETL
        self.graph.addTriple(
            parentid, self.globaltt['has subsequence'], self.fid)
        return

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid

        :param taxonid:
        :return:
        """
        self.taxon = taxonid
        self.graph.addTriple(self.fid, self.globaltt['in taxon'], self.taxon)
        return

    def addFeatureProperty(self, property_type, feature_property):
        """
        Add the triple: feature property_type feature_property

        :param property_type: the predicate
        :param feature_property: the object
        :return:
        """
        self.graph.addTriple(self.fid, property_type, feature_property)
        return
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.

    The association is assembled in memory (subject, predicate, object,
    evidence, sources, dates, provenance, score) and written to the graph
    by ``add_association_to_graph``.
    """

    def __init__(
            self,
            graph,
            definedby,
            sub=None,
            obj=None,
            pred=None,
            subject_category=None,
            object_category=None
    ):
        """
        :param graph: the Graph instance all triples are written to
        :param definedby: the (data) resource that provided the annotation;
            it is part of the generated association id
        :param sub: subject identifier
        :param obj: object identifier
        :param pred: predicate identifier
        :param subject_category: passed through to ``graph.addTriple``
        :param object_category: passed through to ``graph.addTriple``
        :raises ValueError: if ``graph`` is not a Graph instance
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.subject_category = subject_category
        self.object_category = object_category
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []

        self.score = None
        self.score_type = None
        self.score_unit = None

    def _is_valid(self):
        """
        Check that subject, object and predicate are set, and that the
        subject and predicate start with a known curie prefix, a blank-node
        prefix or an http/https/ftp scheme.

        :return: True when the association passes every check
        :raises ValueError: describing the first failed check
        """
        # The exception messages are interpolated here; passing the values
        # as extra ValueError arguments (logging style) would leave the
        # '%s' placeholders unfilled.
        triple = (self.sub, self.rel, self.obj)

        # check if sub/obj/rel are none...raise error
        if self.sub is None:
            raise ValueError(
                'No subject set for this association <%s> <%s> <%s>' % triple)
        if self.obj is None:
            raise ValueError(
                'No object set for this association <%s> <%s> <%s>' % triple)
        if self.rel is None:
            raise ValueError(
                'No predicate set for this association <%s> <%s> <%s>'
                % triple)

        # Are subject & predicate, either a curie or IRI
        other_prefixes = ['_', 'http', 'https', 'ftp']
        pfx = self.sub.split(':')[0]
        if pfx not in self.curie_map.keys() and pfx not in other_prefixes:
            raise ValueError(
                'Invalid Subject for this association <%s> <%s> <%s>'
                % triple)
        pfx = self.rel.split(':')[0]
        if pfx not in self.curie_map.keys() and pfx not in other_prefixes:
            raise ValueError(
                'Invalid Predicate for this association <%s> <%s> <%s>'
                % triple)
        return True

    def add_association_to_graph(self, association_category=None):
        """
        Write the subject-predicate-object triple and the association node
        (with its subject/object/predicate links, and any description,
        evidence, sources, provenance, dates and score) to the graph.

        The association id is generated first if it has not been set.

        :param association_category: optional category added to the
            association node
        """
        # Assume null and iri checks happen downstream;
        # _is_valid() is deliberately not called here.
        self.graph.addTriple(
            self.sub, self.rel, self.obj,
            subject_category=self.subject_category,
            object_category=self.object_category)

        if self.assoc_id is None:
            self.set_association_id()

        # assert self.assoc_id is not None

        self.model.addType(self.assoc_id, self.model.globaltt['association'])

        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has subject'], self.sub
        )
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has object'], self.obj
        )
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has predicate'], self.rel
        )

        if association_category is not None:
            self.graph.addTriple(
                self.assoc_id, blv.terms['category'], association_category
            )

        if self.description:
            self.model.addDescription(self.assoc_id, self.description)

        for evi in self.evidence:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['has evidence'], evi)

        for src in self.source:
            # TODO assume that the source is a publication? use Reference class
            self.graph.addTriple(self.assoc_id, self.globaltt['Source'], src)

        for prov in self.provenance:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['has_provenance'], prov)

        for dat in self.date:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['created_on'], dat,
                object_is_literal=True
            )

        if self.score is not None:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['has measurement value'],
                self.score, True, 'xsd:float'
            )
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

    def add_predicate_object(
            self, predicate, object_node, object_type=None, datatype=None):
        """
        Add an arbitrary predicate/object pair to the association node.

        :param predicate:
        :param object_node:
        :param object_type: 'Literal' to add ``object_node`` as a literal;
            anything else adds it as a non-literal
        :param datatype: optional literal datatype (only used for literals)
        """
        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True, datatype
                )
            else:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True)
        else:
            self.graph.addTriple(self.assoc_id, predicate, object_node, False)

    # This isn't java, but predecessors favored the use of property decorators
    # and CamelCase and ...
    def set_subject(self, identifier):
        self.sub = identifier

    def set_object(self, identifier):
        self.obj = identifier

    def set_relationship(self, identifier):
        self.rel = identifier

    def set_association_id(self, assoc_id=None):
        """
        Set the association id.  Without an argument the id is generated
        from the internal parts of the association (definedby, subject,
        predicate, object).  Pass ``assoc_id`` in cases where
        an external association identifier should be used.

        :param assoc_id: external identifier to use, or None to generate one
        :return: the association id that was set
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id
        return self.assoc_id

    def get_association_id(self):
        """
        :return: the association id, generating it first if it is not set
        """
        if self.assoc_id is None:
            self.set_association_id()
        return self.assoc_id

    def set_description(self, description):
        self.description = description

    def set_score(self, score, unit=None, score_type=None):
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list).
        None and blank identifiers are ignored.

        :param identifier:
        :return:
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list).
        None and blank identifiers are ignored.
        TODO we need to greatly expand this function!

        :param identifier:
        :return:
        """
        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

    def add_date(self, date):
        """
        Add a date to the association (maintained as a list).
        None and blank values are ignored.
        """
        if date is not None and date.strip() != '':
            self.date += [date]

    def add_provenance(self, identifier):
        """
        Add a provenance identifier to the association (maintained as a
        list).  None and blank identifiers are ignored.
        """
        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

    @staticmethod
    def make_association_id(definedby, sub, pred, obj, attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association.
        Items that are None are left out; the remaining items are joined
        with '+' and the resulting string is digested.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appended to the items before digesting.

        Note this is equivalent to a RDF blank node

        :param definedby: The (data) resource that provided the annotation
        :param sub: subject
        :param pred: predicate
        :param obj: object
        :param attributes: optional list of additional items to digest
        :return: 'MONARCH:' followed by the digest
        """
        items_to_hash = [definedby, sub, pred, obj]
        if attributes is not None and len(attributes) > 0:
            items_to_hash += attributes

        items_to_hash = [x for x in items_to_hash if x is not None]

        assoc_id = ':'.join(
            ('MONARCH', GraphUtils.digest_id('+'.join(items_to_hash))))
        # assert assoc_id is not None
        return assoc_id
class Dataset:
    """
    this will produce the metadata about a dataset
    following the example laid out here:
    http://htmlpreview.github.io/?
    https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
    (mind the wrap)

    The constructor creates its own graph and immediately writes the basic
    dataset description (type, title, identifier, page, license, rights,
    description) to it; the version triples are added by the
    ``set*`` version methods.
    """

    def __init__(
            self,
            identifier,   # name? should be Archive url via Source
            title,
            url,
            ingest_desc=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',     # rdf_graph, streamed_graph
            file_handle=None):
        """
        :param identifier: identifier of the dataset
        :param title: title of the dataset; the identifier is used if None
        :param url: page of the dataset (skipped if None)
        :param ingest_desc: optional description of the dataset
        :param license_url: optional license url
        :param data_rights: optional rights statement (added as a literal)
        :param graph_type: None, 'rdf_graph' or 'streamed_graph'
        :param file_handle: passed to StreamedGraph for 'streamed_graph'
        :raises ValueError: if ``graph_type`` is not one of the known values
        """
        if graph_type is None:
            self.graph = RDFGraph(None, identifier)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(
                True, identifier, file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True, identifier)
        else:
            # previously self.graph was simply left unset, which surfaced
            # as an AttributeError on the next line
            raise ValueError(
                "unknown graph_type {!r}; expected None, 'rdf_graph' "
                "or 'streamed_graph'".format(graph_type))

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        # TODO: move hard coded curies to translation table calls
        self.identifier = identifier
        if title is None:
            self.title = identifier
        else:
            self.title = title
        self.version = None
        self.date_issued = None

        # The date_accessed value is later used as a literal of properties
        # such as dcterms:issued, which needs to conform to the
        # xsd:dateTime format.
        # TODO ... we need to have a talk about typed literals and SPARQL
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license_url = license_url
        self.model.addType(self.identifier, 'dctypes:Dataset')
        # use self.title so the identifier fallback above takes effect
        self.graph.addTriple(
            self.identifier, 'dcterms:title', self.title, True)
        self.graph.addTriple(
            self.identifier, 'dcterms:identifier', identifier, True)
        if url is not None:
            self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo  <uri>
        # TODO add the license info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(
                self.identifier, 'dcterms:license', license_url)
        else:
            LOG.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(
                self.identifier, 'dcterms:rights',
                data_rights, object_is_literal=True)
        else:
            LOG.debug('No rights provided.')

        if ingest_desc is not None:
            self.model.addDescription(self.identifier, ingest_desc)
        return

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
            should use the other set_* for version and date

        as of 2016-10-20  used in:

        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py             99:
        dipper/sources/BioGrid.py        100:
        dipper/sources/MGI.py            255:
        dipper/sources/EOM.py             93:
        dipper/sources/Coriell.py        200:
        dipper/sources/MMRRC.py           77:

        # TODO set as deprecated

        When ``version_id`` is given it becomes the version; otherwise the
        version is derived from ``date_issued``.  If neither is given an
        error is logged and nothing is set.

        :param date_issued:
        :param version_id:
        :return:
        """
        if date_issued is None and version_id is None:
            LOG.error("date or version not set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        # set the version exactly once, so the version triples
        # are not written to the graph twice
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)

        LOG.info("set version to %s", self.version)
        return

    def set_date_issued(self, date_issued):
        """
        Store the issue date and add it to the graph as dcterms:issued.

        :param date_issued:
        :return:
        """
        self.date_issued = date_issued
        self.graph.addTriple(
            self.identifier, 'dcterms:issued', date_issued,
            object_is_literal=True)
        LOG.info("setting date to %s", date_issued)
        return

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)

        :param date_issued:
        :return:
        """
        if date_issued is not None:
            dat = date_issued
        elif self.date_issued is not None:
            dat = self.date_issued
        else:
            dat = self.date_accessed
            LOG.info(
                "No date supplied, using download timestamp for date_issued")
        LOG.info("setting version by date to: %s", dat)
        self.set_version_by_num(dat)
        return

    def set_version_by_num(self, version_num):
        """
        Set the version identifier to the dataset identifier followed by
        ``version_num`` and add the version triples to the graph.
        Unless ``version_num`` equals the access timestamp, a second,
        timestamp-based version node is added as well.

        :param version_num: version label; it is converted with ``str()``
            when building the identifier, the literal is added as given
        :return:
        """
        self.version = self.identifier + str(version_num)
        self.graph.addTriple(
            self.version, 'dcterms:isVersionOf', self.identifier)
        self.graph.addTriple(
            self.version, 'pav:version', version_num, object_is_literal=True)
        LOG.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(
                dipperized_version, 'dcterms:isVersionOf',
                "MonarchData:" + self.identifier + ".ttl")  # fix suffix
            self.graph.addTriple(
                dipperized_version, 'pav:version',
                self.date_accessed, object_is_literal=True)
            self.graph.addTriple(
                dipperized_version, 'dcterms:issued',
                self.date_accessed, object_is_literal=True,
                literal_type="xsd:dateTime")
        return

    def setFileAccessUrl(self, url, is_object_literal=False):
        """
        Add a dcat:accessURL triple for the dataset.

        :param url:
        :param is_object_literal: add the url as a literal
        """
        self.graph.addTriple(
            self.identifier, 'dcat:accessURL', url, is_object_literal)

    def getGraph(self):
        """:return: the graph holding the dataset description"""
        return self.graph

    def set_license(self, license_url):
        """Store the license url (nothing is written to the graph here)."""
        self.license_url = license_url
        return

    def get_license(self):
        """:return: the stored license url"""
        return self.license_url

    def set_citation(self, citation_id):
        """Remember a citation id (nothing is written to the graph yet)."""
        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)
        return
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
    unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
    We may wish to subclass this later.

    """

    ref_types = {
        'person': 'foaf:Person',
        'journal_article': 'IAO:0000013',
        'publication': 'IAO:0000311',  # book
        'document': 'IAO:0000310',  # document???
        'photograph': 'IAO:0000185',
        'webpage': 'SIO:000302',
    }

    annotation_properties = {
        'page': 'foaf:page',
        'title': 'dc:title'
    }

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph instance all triples are written to
        :param ref_id: identifier of the reference; when it starts with
            'http' it is also used as the reference url
        :param ref_type: type identifier; defaults to the 'document' type
        :raises ValueError: if ``graph`` is not a Graph instance
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)

        if ref_type is None:
            self.ref_type = self.ref_types['document']
        else:
            self.ref_type = ref_type

        if ref_id is not None and re.match(r'http', ref_id):
            self.ref_url = ref_id

        return

    def setTitle(self, title):
        self.title = title
        return

    def setYear(self, year):
        self.year = year
        return

    def setType(self, reference_type):
        self.ref_type = reference_type
        return

    def setAuthorList(self, author_list):
        """
        :param author_list: Array of authors
        :return:
        """
        self.author_list = author_list
        return

    def addAuthor(self, author):
        """
        Append an author to the author list,
        creating the list first if none has been set yet.

        :param author:
        :return:
        """
        if self.author_list is None:
            self.author_list = []
        self.author_list.append(author)
        return

    def setShortCitation(self, citation):
        self.short_citation = citation
        return

    def addPage(self, subject_id, page_url):
        """
        Add a page (as a literal) to the given subject.

        :param subject_id:
        :param page_url:
        :return:
        """
        self.graph.addTriple(
            subject_id, self.annotation_properties['page'],
            page_url, object_is_literal=True)
        return

    def addTitle(self, subject_id, title):
        """
        Add a title (as a literal) to the given subject.

        :param subject_id:
        :param title:
        :return:
        """
        self.graph.addTriple(
            subject_id, self.annotation_properties['title'],
            title, object_is_literal=True)
        return

    def addRefToGraph(self):
        """
        Write the reference to the graph.  The label is the short citation,
        falling back to the title.  A reference with a url is typed and
        labelled under that url; otherwise it is added as an individual
        under its id.  Without any identifier only an error is logged.
        The title triple is only added when a title is set.

        :return:
        """
        ref_label = self.short_citation
        if ref_label is None:
            ref_label = self.title

        if self.ref_url is not None:
            # same guard as in the ref_id branch: no title triple for None
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            self.model.addLabel(self.ref_url, ref_label)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(
                self.ref_id, ref_label, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for a in self.author_list:
        #        gu.addTriple(
        #           g, self.ref_id, self.props['has_author'], a, True)
        return
class Feature(): """ Dealing with genomic features here. By default they are all faldo:Regions. We use SO for typing genomic features. At the moment, RO:has_subsequence is the default relationship between the regions, but this should be tested/verified. TODO: the graph additions are in the addXToFeature functions, but should be separated. TODO: this will need to be extended to properly deal with fuzzy positions in faldo. """ def __init__( self, graph, feature_id=None, label=None, feature_type=None, description=None, feature_category=None ): if isinstance(graph, Graph): self.graph = graph else: raise ValueError("{} is not a graph".format(graph)) self.model = Model(self.graph) self.globaltt = self.graph.globaltt self.globaltcid = self.graph.globaltcid self.curie_map = self.graph.curie_map self.gfxutl = GraphUtils(self.curie_map) self.fid = feature_id self.feature_category = feature_category self.label = label self.ftype = feature_type self.description = description self.start = None self.stop = None self.taxon = None def addFeatureStartLocation( self, coordinate, reference_id, strand=None, position_types=None ): """ Adds coordinate details for the start of this feature. 
:param coordinate: :param reference_id: :param strand: :param position_types: """ # make an object for the start, which has: # {coordinate : integer, reference : reference_id, types = []} self.start = self._getLocation(coordinate, reference_id, strand, position_types) def addFeatureEndLocation( self, coordinate, reference_id, strand=None, position_types=None ): """ Adds the coordinate details for the end of this feature :param coordinate: :param reference_id: :param strand: """ self.stop = self._getLocation(coordinate, reference_id, strand, position_types) def _getLocation(self, coordinate, reference_id, strand, position_types): """ Make an object for the location, which has: {coordinate : integer, reference : reference_id, types = []} where the strand is indicated in the type array :param coordinate: :param reference_id: :param strand: :param position_types: """ loc = {} loc['coordinate'] = coordinate loc['reference'] = reference_id loc['type'] = [] strand_id = self._getStrandType(strand) if strand_id is not None: loc['type'].append(strand_id) if position_types is not None: loc['type'] += position_types if position_types == []: loc['type'].append(self.globaltt['Position']) return loc def _getStrandType(self, strand): """ :param strand: """ strand_id = None if strand == '+': strand_id = self.globaltt['plus_strand'] elif strand == '-': strand_id = self.globaltt['minus_strand'] elif strand == '.': strand_id = self.globaltt['both_strand'] elif strand is None: # assume this is Unknown pass else: LOG.warning("strand type could not be mapped: %s", str(strand)) return strand_id def addFeatureToGraph( self, add_region=True, region_id=None, feature_as_class=False, feature_category=None): """ We make the assumption here that all features are instances. 
The features are located on a region, which begins and ends with faldo:Position The feature locations leverage the Faldo model, which has a general structure like: Triples: feature_id a feature_type (individual) faldo:location region_id region_id a faldo:region faldo:begin start_position faldo:end end_position start_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position) faldo:position Integer(numeric position) faldo:reference reference_id end_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position) faldo:position Integer(numeric position) faldo:reference reference_id :param add_region [True] :param region_id [None] :param feature_as_class [False] :param feature_category: a biolink category CURIE for feature """ if feature_category is None: feature_category = self.feature_category if feature_as_class: self.model.addClassToGraph( self.fid, self.label, self.ftype, self.description, class_category=feature_category) else: self.model.addIndividualToGraph( self.fid, self.label, self.ftype, self.description, ind_category=feature_category) if self.start is None and self.stop is None: add_region = False if add_region: # create a region that has the begin/end positions regionchr = re.sub(r'\w+\:_?', '', self.start['reference']) if region_id is None: # in case the values are undefined # if we know only one of the coordinates, # then we'll add an "unknown" other. st = sp = 'UN' strand = None if self.start is not None and self.start['coordinate'] is not None: st = str(self.start['coordinate']) strand = self._getStrandStringFromPositionTypes(self.start['type']) if self.stop is not None and self.stop['coordinate'] is not None: sp = str(self.stop['coordinate']) if strand is not None: strand = self._getStrandStringFromPositionTypes( self.stop['type']) # assume that the strand is the same for both start and stop. 
# this will need to be fixed in the future region_items = [regionchr, st, sp] if strand is not None: region_items += [strand] region_id = '-'.join(region_items) rid = region_id rid = re.sub(r'\w+\:', '', rid, 1) # replace the id prefix # blank node, bnode rid = rid + "-Region" curie = '_:' + self.gfxutl.digest_id(rid) self.model.addLabel(curie, rid) region_id = curie self.graph.addTriple( self.fid, self.globaltt['location'], region_id, subject_category=feature_category ) self.model.addIndividualToGraph(region_id, None, self.globaltt['Region']) else: region_id = self.fid self.model.addType(region_id, self.globaltt['region']) # add the start/end positions to the region beginp = endp = None if self.start is not None: beginp = self._makePositionId( self.start['reference'], self.start['coordinate'], self.start['type']) self.addPositionToGraph( self.start['reference'], self.start['coordinate'], self.start['type'], ) if self.stop is not None: endp = self._makePositionId( self.stop['reference'], self.stop['coordinate'], self.stop['type']) self.addPositionToGraph( self.stop['reference'], self.stop['coordinate'], self.stop['type']) self.addRegionPositionToGraph(region_id, beginp, endp) # {coordinate : integer, reference : reference_id, types = []} def _getStrandStringFromPositionTypes(self, tylist): strand = None if self.globaltt['plus_strand'] in tylist: strand = 'plus' elif self.globaltt['minus_strand'] in tylist: strand = 'minus' elif self.globaltt['both_strand'] in tylist: strand = 'both' else: strand = None # it is stranded, but we don't know what it is return strand def _makePositionId(self, reference, coordinate, types=None): """ Note that positions should have a reference (we will enforce). Only exact positions need a coordinate. 
:param reference: :param coordinate: :param types: :return: bnode_curie """ # blank node, bnode if reference is None: LOG.error("Trying to make position with no reference.") return None reference = re.sub(r'\w+\:', '', reference, 1) if reference[0] == '_': # in this case the reference is a bnode curie as well # ... this is a bad smell of over modleing reference = reference[1:] unique_words = reference if coordinate is not None: # just in case it isn't a string already unique_words = '-'.join((unique_words, str(coordinate))) if types is not None: tstring = self._getStrandStringFromPositionTypes(types) if tstring is not None: unique_words = '-'.join((unique_words, tstring)) curie = '_:' + self.gfxutl.digest_id(unique_words) # attach the wordage via a label # I want to see more of this (TEC 201905) # including a type should be mandatory as well self.model.addLabel(curie, unique_words) return curie def addRegionPositionToGraph(self, region_id, begin_position_id, end_position_id): if begin_position_id is None: pass # LOG.warn("No begin position specified for region %s", region_id) else: self.graph.addTriple(region_id, self.globaltt['begin'], begin_position_id) if end_position_id is None: pass # LOG.warn("No end position specified for region %s", region_id) else: self.graph.addTriple(region_id, self.globaltt['end'], end_position_id) def addPositionToGraph( self, reference_id, position, position_types=None, strand=None ): """ Add the positional information to the graph, following the faldo model. We assume that if the strand is None, we give it a generic "Position" only. 
Triples: my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position) faldo:position Integer(numeric position) faldo:reference reference_id :param graph: :param reference_id: :param position: :param position_types: :param strand: :return: Identifier of the position created """ pos_id = self._makePositionId(reference_id, position, position_types) if position is not None: self.graph.addTriple( pos_id, self.globaltt['position'], position, object_is_literal=True, literal_type="xsd:integer" ) self.graph.addTriple( pos_id, self.globaltt['reference'], reference_id ) if position_types is not None: for pos_type in position_types: self.model.addType(pos_id, pos_type) strnd = None if strand is not None: strnd = strand if not re.match(r'faldo', strand): # not already mapped to faldo, so expect we need to map it strnd = self._getStrandType(strand) # else: # strnd = self.globaltt['both_strand'] if strnd is None and (position_types is None or position_types == []): strnd = self.globaltt['Position'] if strnd is not None: self.model.addType(pos_id, strnd) return pos_id def addSubsequenceOfFeature( self, parentid, subject_category=None, object_category=None ): """ This will add reciprocal triples like: feature <is subsequence of> parent parent has_subsequence feature :param graph: :param parentid: :return: """ self.graph.addTriple( self.fid, self.globaltt['is subsequence of'], parentid, subject_category=subject_category, object_category=object_category ) # this should be expected to be done in reasoning not ETL self.graph.addTriple( parentid, self.globaltt['has subsequence'], self.fid, subject_category=object_category, object_category=subject_category ) def addTaxonToFeature(self, taxonid): """ Given the taxon id, this will add the following triple: feature in_taxon taxonid :param graph: :param taxonid: :return: """ self.taxon = taxonid self.graph.addTriple( self.fid, self.globaltt['in taxon'], self.taxon, subject_category=self.feature_category ) def 
addFeatureProperty(self, property_type, feature_property): self.graph.addTriple( self.fid, property_type, feature_property, subject_category=self.feature_category )
def _parse_g2p_file(self, limit=None):
    """
    Parse the gene to XPO (phenotype) file, currently custom for Monarch.

    Every data row yields a gene (typed and given a taxon) plus one
    gene-to-phenotype association, optionally carrying an evidence code
    and a source that is typed as a journal article.

    :param limit: stop once the csv reader has consumed more than this many
        lines; ignored when it is None or when running in test mode
    :return: None
    :raises ValueError: if one of the required column names is missing from
        the configured column list
    """
    src_key = 'g2p_assertions'
    geno = Genotype(self.graph)
    model = Model(self.graph)

    columns = self.files[src_key]['columns']
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))

    # Resolve the column positions once rather than once per row.
    # Columns present in the file but not used here:
    #   SUBJECT_TAXON_LABEL, OBJECT_LABEL, RELATION_LABEL,
    #   EVIDENCE_LABEL, IS_DEFINED_BY, QUALIFIER
    subject_col = columns.index('SUBJECT')
    subject_label_col = columns.index('SUBJECT_LABEL')
    subject_taxon_col = columns.index('SUBJECT_TAXON')
    object_col = columns.index('OBJECT')
    relation_col = columns.index('RELATION')
    evidence_col = columns.index('EVIDENCE')
    source_col = columns.index('SOURCE')

    LOG.info("Processing Gene to XPO associations")

    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile)

        # File has headers
        header = next(reader)
        if not self.check_fileheader(columns, header):
            # A changed header is deliberately non-fatal, but should not
            # pass unnoticed.
            LOG.warning("Unexpected header in %s; parsing anyway", raw)

        for row in reader:
            gene = row[subject_col]
            gene_label = row[subject_label_col]
            gene_taxon = row[subject_taxon_col]
            phenotype_curie = row[object_col]
            relation = row[relation_col]
            evidence = row[evidence_col]
            source = row[source_col]

            gene_curie = 'Xenbase:' + gene
            # the relation arrives as e.g. PREFIX_1234; make it a curie
            relation_curie = relation.replace('_', ':')

            geno.addGene(gene_curie, gene_label)
            geno.addTaxon(gene_taxon, gene_curie)

            assoc = G2PAssoc(
                self.graph,
                self.name,
                entity_id=gene_curie,
                phenotype_id=phenotype_curie,
                rel=relation_curie
            )

            if evidence:
                assoc.add_evidence(evidence)

            if source:
                model.addType(source, self.globaltt['journal article'])
                assoc.add_source(source)

            assoc.add_association_to_graph()

            if not self.test_mode and limit is not None \
                    and reader.line_num > limit:
                break
def _get_identifiers(self, limit):
    """
    Process the id mapping file provided by Biogrid.

    The file has a very large header, which we scan past; we then pull the
    identifiers and make equivalence axioms between each BIOGRID id and the
    gene identifiers mapped to it.

    :param limit: stop once more than this many rows of the accepted
        species have been seen; ignored when None or in test mode
    :return: None
    """
    LOG.info("getting identifier mapping")
    line_counter = 0
    zip_path = '/'.join((self.rawdir, self.files['identifiers']['file']))

    # TODO align this species filter with the one above
    # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
    # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')
    # NOTE: the first entry previously read 'H**o sapiens', which can never
    # equal an organism name in the file, so human rows were always dropped.
    speciesfilters = 'Homo sapiens,Mus musculus'.split(',')

    # TODO make these filters available as commandline options
    # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
    #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
    geneidtypefilters = \
        'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC,WormBase,XenBase,FlyBase'.split(',')
    # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'

    # The target graph depends only on the run mode, so choose it (and
    # build its model) once instead of for every line of the file.
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)

    # Context managers make sure the archive is closed even if a row fails.
    with ZipFile(zip_path, 'r') as myzip:
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        foundheader = False

        with myzip.open(fname, 'r') as csvfile:
            for raw_line in csvfile:
                line = raw_line.decode()

                # skip header lines (up to and including the column row)
                if not foundheader:
                    if line.startswith('BIOGRID_ID'):
                        foundheader = True
                    continue

                # BIOGRID_ID
                # IDENTIFIER_VALUE
                # IDENTIFIER_TYPE
                # ORGANISM_OFFICIAL_NAME
                # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                (biogrid_num, id_num, id_type, organism_label) = \
                    line.strip().split('\t')

                # in test mode, skip any genes that don't match our test set
                if self.test_mode and int(biogrid_num) not in self.biogrid_ids:
                    continue

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:' + biogrid_num
                prefix = self.localtt[id_type]

                if organism_label.strip() in speciesfilters:
                    line_counter += 1
                    if prefix in geneidtypefilters:
                        mapped_id = ':'.join((prefix, id_num))
                        model.addEquivalentClass(biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        model.addLabel(biogrid_id, id_num)
                        model.addType(biogrid_id, self.globaltt['gene'])
                    # elif (id_type == 'SYNONYM'):
                    #   FIXME - i am not sure these are synonyms, altids?
                    #   gu.addSynonym(g,biogrid_id,id_num)

                if not self.test_mode and limit is not None \
                        and line_counter > limit:
                    break

    return
class Dataset:
    """
    Produce the metadata about a dataset, following the example laid out here:
    http://htmlpreview.github.io/?
    https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
    (mind the wrap)
    """

    def __init__(self, identifier, title, url, description=None,
                 license_url=None, data_rights=None, graph_type=None,
                 file_handle=None):
        """
        Create the dataset node and write its basic descriptive triples.

        :param identifier: local dataset identifier (stored with a ':' prefix)
        :param title: literal used for dct:title
        :param url: object of foaf:page
        :param description: optional description of the dataset
        :param license_url: optional object of dct:license
        :param data_rights: optional literal for dct:rights
        :param graph_type: None or 'rdf_graph' for an RDFGraph,
            'streamed_graph' for a StreamedGraph
        :param file_handle: passed to StreamedGraph when streaming
        :raises ValueError: if graph_type is not one of the known values
        """
        if graph_type is None or graph_type == 'rdf_graph':
            self.graph = RDFGraph()
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, file_handle=file_handle)
        else:
            # previously fell through and failed later on a missing
            # self.graph attribute
            raise ValueError(
                "unknown graph_type: {!r}".format(graph_type))

        self.model = Model(self.graph)
        self.identifier = ':' + identifier
        self.version = None
        self.date_issued = None

        # The date_accessed value is later used as an object literal of
        # properties such as dct:issued, which needs to conform to the
        # xsd:dateTime format.
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        self.citation = set()
        self.license = license_url

        self.model.addType(self.identifier, 'dctypes:Dataset')
        self.graph.addTriple(self.identifier, 'dct:title', title, True)
        self.graph.addTriple(
            self.identifier, 'dct:identifier', identifier,
            object_is_literal=True)
        self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(self.identifier, 'dct:license', license_url)
        else:
            logger.debug('No license provided.')

        if data_rights is not None:
            self.graph.addTriple(
                self.identifier, 'dct:rights', data_rights,
                object_is_literal=True)
        else:
            logger.debug('No rights provided.')

        if description is not None:
            self.model.addDescription(self.identifier, description)

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
            should use the other set_* for version and date

        as of 2016-10-20  used in:

        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py             99:
        dipper/sources/BioGrid.py        100:
        dipper/sources/MGI.py            255:
        dipper/sources/EOM.py             93:
        dipper/sources/Coriell.py        200:
        dipper/sources/MMRRC.py           77:

        # TODO set as deprecated

        The issue date is recorded when given. The version is taken from
        version_id when given, otherwise it is derived from the date.

        :param date_issued:
        :param version_id:
        :return:
        """
        if date_issued is None and version_id is None:
            logger.error("date or version not set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        # Set the version exactly once; the old control flow wrote the
        # version triples twice when only version_id was supplied.
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)

        logger.info("set version to %s", self.version)

    def set_date_issued(self, date_issued):
        """Record date_issued and write it as the dataset's dct:issued."""
        self.date_issued = date_issued
        self.graph.addTriple(
            self.identifier, 'dct:issued', date_issued,
            object_is_literal=True)
        logger.info("setting date to %s", date_issued)

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)

        :param date_issued:
        :return:
        """
        if date_issued is not None:
            version_date = date_issued
        elif self.date_issued is not None:
            version_date = self.date_issued
        else:
            version_date = self.date_accessed
            logger.info(
                "No date supplied for setting version; "
                "using download timestamp for date_issued")

        logger.info("setting version by date")
        self.set_version_by_num(version_date)

    def set_version_by_num(self, version_num):
        """
        Create the version node (dataset identifier + version_num) and link
        it to the dataset; unless the version is the access timestamp itself,
        also describe the locally generated version of that version.

        :param version_num: version string appended to the identifier
        :return:
        """
        self.version = self.identifier + version_num
        self.graph.addTriple(self.version, 'dct:isVersionOf', self.identifier)
        self.graph.addTriple(
            self.version, 'pav:version', version_num, object_is_literal=True)

        logger.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(
                dipperized_version, 'dct:isVersionOf', self.version)
            self.graph.addTriple(
                dipperized_version, 'pav:version', self.date_accessed,
                object_is_literal=True)
            self.graph.addTriple(
                dipperized_version, 'dct:issued', self.date_accessed,
                object_is_literal=True, literal_type="xsd:dateTime")

    def setFileAccessUrl(self, url, is_object_literal=False):
        """Attach url to the dataset as its dcat:accessURL."""
        self.graph.addTriple(
            self.identifier, 'dcat:accessURL', url, is_object_literal)

    def getGraph(self):
        """Return the graph that holds the dataset description."""
        return self.graph

    def set_license(self, license):
        """Store the license; no triple is written here."""
        self.license = license

    def get_license(self):
        """Return the stored license (may be None)."""
        return self.license

    def set_citation(self, citation_id):
        """Remember citation_id; it is not yet written to the graph."""
        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.
    """

    object_properties = {
        'location': 'faldo:location',
        'begin': 'faldo:begin',
        'end': 'faldo:end',
        'reference': 'faldo:reference',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205',
        'is_about': 'IAO:0000136',
        'has_subsequence': 'RO:0002524',
        'is_subsequence_of': 'RO:0002525',
        'has_staining_intensity': 'GENO:0000207',
        'upstream_of_sequence_of': 'RO:0002528',
        'downstream_of_sequence_of': 'RO:0002529'
    }

    data_properties = {
        'position': 'faldo:position',
    }

    annotation_properties = {}

    # every property, whatever its kind, in a single lookup table
    properties = object_properties.copy()
    properties.update(data_properties)
    properties.update(annotation_properties)

    types = {
        'region': 'faldo:Region',
        # big P for Position type.  little p for position property
        'Position': 'faldo:Position',
        'FuzzyPosition': 'faldo:FuzzyPosition',
        'chromosome': 'SO:0000340',
        'chromosome_arm': 'SO:0000105',
        'chromosome_band': 'SO:0000341',
        'chromosome_part': 'SO:0000830',
        'long_chromosome_arm': 'GENO:0000629',
        'short_chromosome_arm': 'GENO:0000628',
        'chromosome_region': 'GENO:0000614',
        'chromosome_subband': 'GENO:0000616',
        'centromere': 'SO:0000577',
        'plus_strand': 'faldo:PlusStrandPosition',
        'minus_strand': 'faldo:MinusStrandPosition',
        'both_strand': 'faldo:BothStrandPosition',
        # FIXME - score is not a good solution, too generic
        'score': 'SO:0001685',
        'reference_genome': 'SO:0001505',
        'genome': 'SO:0001026',
        'assembly_component': 'SO:0000143',
        'SNP': 'SO:0000694',
        'haplotype': 'GENO:0000871',

        # the following are sequence attributes:
        'band_intensity': 'GENO:0000618',
        'gneg': 'GENO:0000620',
        'gpos': 'GENO:0000619',
        'gpos100': 'GENO:0000622',
        'gpos75': 'GENO:0000623',
        'gpos50': 'GENO:0000624',
        'gpos25': 'GENO:0000625',
        'gvar': 'GENO:0000621',
        'gpos33': 'GENO:0000633',
        'gpos66': 'GENO:0000632'
    }

    def __init__(self, graph, feature_id=None, label=None,
                 feature_type=None, description=None):
        """
        :param graph: the Graph the feature is written to
        :param feature_id: identifier of the feature
        :param label: label of the feature
        :param feature_type: type of the feature
        :param description: description of the feature
        :raises ValueError: if graph is not a Graph
        """
        if not isinstance(graph, Graph):
            # the message used to be built with `.graph` instead of
            # `.format(graph)`, raising AttributeError instead
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph
        self.model = Model(self.graph)
        self.id = feature_id
        self.label = label
        self.type = feature_type
        self.description = description
        self.start = None
        self.stop = None

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """
        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(
            coordinate, reference_id, strand, position_types)

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """
        self.stop = self._getLocation(
            coordinate, reference_id, strand, position_types)

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return: the location dict

        """
        loc_types = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc_types.append(strand_id)
        if position_types is not None:
            loc_types += position_types
        # an explicitly empty type list means "plain position"
        if position_types == []:
            loc_types.append(self.types['Position'])

        return {
            'coordinate': coordinate,
            'reference': reference_id,
            'type': loc_types,
        }

    def _getStrandType(self, strand):
        """
        Map a strand symbol to its faldo position type.

        :param strand: '+', '-', '.' or None
        :return: the faldo type, or None when the strand is None or unmapped
        """
        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.types['plus_strand']
        elif strand == '-':
            strand_id = self.types['minus_strand']
        elif strand == '.':
            strand_id = self.types['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            logger.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def _makeRegionId(self):
        """
        Build the blank-node id of the region delimited by start and stop.

        An unknown coordinate is written as 'UN'. The reference is taken
        from the start location, or from the stop location when no start
        was set. At least one of the two must be set.

        :return: the region bnode id, e.g. '_:chr1-100-200-plus-Region'
        """
        anchor = self.start if self.start is not None else self.stop
        regionchr = re.sub(r'\w+\:_?', '', anchor['reference'])

        # in case the values are undefined
        # if we know only one of the coordinates,
        # then we'll add an "unknown" other.
        st = sp = 'UN'
        strand = None
        if self.start is not None and self.start['coordinate'] is not None:
            st = str(self.start['coordinate'])
            strand = self._getStrandStringFromPositionTypes(
                self.start['type'])
        if self.stop is not None and self.stop['coordinate'] is not None:
            sp = str(self.stop['coordinate'])
            if strand is not None:
                strand = self._getStrandStringFromPositionTypes(
                    self.stop['type'])
        # assume that the strand is the same for both start and stop.
        # this will need to be fixed in the future
        region_items = [regionchr, st, sp]
        if strand is not None:
            region_items.append(strand)

        rid = '-'.join(region_items)
        rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
        return '_:' + rid + "-Region"

    def addFeatureToGraph(
            self, add_region=True, region_id=None, feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
            which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
            faldo:location region_id
        region_id a faldo:region
            faldo:begin start_position
            faldo:end end_position
        start_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id
        end_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param add_region: when True (and a location is known), locate the
            feature on a separate region node; otherwise the feature itself
            is typed as the region
        :param region_id: id of the region node; generated when None
        :param feature_as_class: add the feature as a class rather than
            as an individual
        :return:

        """
        if feature_as_class:
            self.model.addClassToGraph(
                self.id, self.label, self.type, self.description)
        else:
            self.model.addIndividualToGraph(
                self.id, self.label, self.type, self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            if region_id is None:
                region_id = self._makeRegionId()

            self.graph.addTriple(
                self.id, self.properties['location'], region_id)
            self.model.addIndividualToGraph(
                region_id, None, self.types['region'])
        else:
            region_id = self.id
            self.model.addType(region_id, self.types['region'])

        # add the start/end positions to the region
        # (addPositionToGraph returns the id it created)
        beginp = endp = None
        if self.start is not None:
            beginp = self.addPositionToGraph(
                self.start['reference'], self.start['coordinate'],
                self.start['type'])
        if self.stop is not None:
            endp = self.addPositionToGraph(
                self.stop['reference'], self.stop['coordinate'],
                self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

    def _getStrandStringFromPositionTypes(self, tylist):
        """
        :param tylist: list of position types
        :return: 'plus', 'minus' or 'both' for the first strand type found
            in tylist (checked in that order), else None
        """
        if self.types['plus_strand'] in tylist:
            return 'plus'
        if self.types['minus_strand'] in tylist:
            return 'minus'
        if self.types['both_strand'] in tylist:
            return 'both'
        # it is stranded, but we don't know what it is
        return None

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return: the position bnode id, or None when reference is None
        """
        if reference is None:
            logger.error("Trying to make position with no reference.")
            return None

        reference = re.sub(r'\w+\:', '', reference, 1)
        if reference.startswith('_'):
            # this is in the case if the reference is a bnode
            reference = reference[1:]
        curie = '_:' + reference
        if coordinate is not None:
            # just in case it isn't a string already
            curie = '-'.join((curie, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                curie = '-'.join((curie, tstring))

        return curie

    def addRegionPositionToGraph(
            self, region_id, begin_position_id, end_position_id):
        """Link the region to whichever of its begin/end positions is set."""
        # a missing begin or end position is silently left out
        if begin_position_id is not None:
            self.graph.addTriple(
                region_id, self.properties['begin'], begin_position_id)
        if end_position_id is not None:
            self.graph.addTriple(
                region_id, self.properties['end'], end_position_id)

    def addPositionToGraph(
            self, reference_id, position, position_types=None, strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.

        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param reference_id:
        :param position:
        :param position_types:
        :param strand:

        :return:  Identifier of the position created

        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if position is not None:
            self.graph.addTriple(
                pos_id, self.properties['position'], position,
                object_is_literal=True, literal_type="xsd:integer")
        self.graph.addTriple(
            pos_id, self.properties['reference'], reference_id)
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)

        strand_type = None
        if strand is not None:
            strand_type = strand
            if not strand.startswith('faldo'):
                # not already mapped to faldo, so expect we need to map it
                strand_type = self._getStrandType(strand)
        # else:
        #    strand_type = self.types['both_strand']
        if strand_type is None and (
                position_types is None or position_types == []):
            strand_type = self.types['Position']

        if strand_type is not None:
            self.model.addType(pos_id, strand_type)

        return pos_id

    def addSubsequenceOfFeature(self, parentid):
        """
        This will add reciprocal triples like:
        feature is_subsequence_of parent
        parent has_subsequence feature
        :param parentid:

        :return:

        """
        self.graph.addTriple(
            self.id, self.properties['is_subsequence_of'], parentid)
        self.graph.addTriple(
            parentid, self.properties['has_subsequence'], self.id)

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param taxonid:

        :return:

        """
        # TEC: should taxon be set in __init__()?
        self.taxon = taxonid
        self.graph.addTriple(
            self.id, Assoc.properties['in_taxon'], self.taxon)

    def addFeatureProperty(self, property_type, property):
        """Add the triple: feature property_type property."""
        self.graph.addTriple(self.id, property_type, property)
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.
    """

    object_properties = {
        'location': 'faldo:location',
        'begin': 'faldo:begin',
        'end': 'faldo:end',
        'reference': 'faldo:reference',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205',
        'is_about': 'IAO:0000136',
        'has_subsequence': 'RO:0002524',
        'is_subsequence_of': 'RO:0002525',
        'has_staining_intensity': 'GENO:0000207',
        'upstream_of_sequence_of': 'RO:0002528',
        'downstream_of_sequence_of': 'RO:0002529'
    }

    data_properties = {
        'position': 'faldo:position',
    }

    annotation_properties = {}

    # every property, whatever its kind, in a single lookup table
    properties = object_properties.copy()
    properties.update(data_properties)
    properties.update(annotation_properties)

    types = {
        'region': 'faldo:Region',
        # big P for Position type.  little p for position property
        'Position': 'faldo:Position',
        'FuzzyPosition': 'faldo:FuzzyPosition',
        'chromosome': 'SO:0000340',
        'chromosome_arm': 'SO:0000105',
        'chromosome_band': 'SO:0000341',
        'chromosome_part': 'SO:0000830',
        'long_chromosome_arm': 'GENO:0000629',
        'short_chromosome_arm': 'GENO:0000628',
        'chromosome_region': 'GENO:0000614',
        'chromosome_subband': 'GENO:0000616',
        'centromere': 'SO:0000577',
        'plus_strand': 'faldo:PlusStrandPosition',
        'minus_strand': 'faldo:MinusStrandPosition',
        'both_strand': 'faldo:BothStrandPosition',
        # FIXME - score is not a good solution, too generic
        'score': 'SO:0001685',
        'reference_genome': 'SO:0001505',
        'genome': 'SO:0001026',
        'assembly_component': 'SO:0000143',
        'SNP': 'SO:0000694',
        'haplotype': 'GENO:0000871',

        # the following are sequence attributes:
        'band_intensity': 'GENO:0000618',
        'gneg': 'GENO:0000620',
        'gpos': 'GENO:0000619',
        'gpos100': 'GENO:0000622',
        'gpos75': 'GENO:0000623',
        'gpos50': 'GENO:0000624',
        'gpos25': 'GENO:0000625',
        'gvar': 'GENO:0000621',
        'gpos33': 'GENO:0000633',
        'gpos66': 'GENO:0000632'
    }

    def __init__(self, graph, feature_id=None, label=None,
                 feature_type=None, description=None):
        """
        :param graph: the Graph the feature is written to
        :param feature_id: identifier of the feature
        :param label: label of the feature
        :param feature_type: type of the feature
        :param description: description of the feature
        :raises ValueError: if graph is not a Graph
        """
        if not isinstance(graph, Graph):
            # the message used to be built with `.graph` instead of
            # `.format(graph)`, raising AttributeError instead
            raise ValueError("{} is not a graph".format(graph))
        self.graph = graph
        self.model = Model(self.graph)
        self.id = feature_id
        self.label = label
        self.type = feature_type
        self.description = description
        self.start = None
        self.stop = None

    def addFeatureStartLocation(self, coordinate, reference_id, strand=None,
                                position_types=None):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """
        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(
            coordinate, reference_id, strand, position_types)

    def addFeatureEndLocation(self, coordinate, reference_id, strand=None,
                              position_types=None):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """
        self.stop = self._getLocation(
            coordinate, reference_id, strand, position_types)

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return: the location dict

        """
        loc_types = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc_types.append(strand_id)
        if position_types is not None:
            loc_types += position_types
        # an explicitly empty type list means "plain position"
        if position_types == []:
            loc_types.append(self.types['Position'])

        return {
            'coordinate': coordinate,
            'reference': reference_id,
            'type': loc_types,
        }

    def _getStrandType(self, strand):
        """
        Map a strand symbol to its faldo position type.

        :param strand: '+', '-', '.' or None
        :return: the faldo type, or None when the strand is None or unmapped
        """
        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.types['plus_strand']
        elif strand == '-':
            strand_id = self.types['minus_strand']
        elif strand == '.':
            strand_id = self.types['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            logger.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def _makeRegionId(self):
        """
        Build the blank-node id of the region delimited by start and stop.

        An unknown coordinate is written as 'UN'. The reference is taken
        from the start location, or from the stop location when no start
        was set. At least one of the two must be set.

        :return: the region bnode id, e.g. '_:chr1-100-200-plus-Region'
        """
        anchor = self.start if self.start is not None else self.stop
        regionchr = re.sub(r'\w+\:_?', '', anchor['reference'])

        # in case the values are undefined
        # if we know only one of the coordinates,
        # then we'll add an "unknown" other.
        st = sp = 'UN'
        strand = None
        if self.start is not None and self.start['coordinate'] is not None:
            st = str(self.start['coordinate'])
            strand = self._getStrandStringFromPositionTypes(
                self.start['type'])
        if self.stop is not None and self.stop['coordinate'] is not None:
            sp = str(self.stop['coordinate'])
            if strand is not None:
                strand = self._getStrandStringFromPositionTypes(
                    self.stop['type'])
        # assume that the strand is the same for both start and stop.
        # this will need to be fixed in the future
        region_items = [regionchr, st, sp]
        if strand is not None:
            region_items.append(strand)

        rid = '-'.join(region_items)
        rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
        return '_:' + rid + "-Region"

    def addFeatureToGraph(self, add_region=True, region_id=None,
                          feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
            which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
            faldo:location region_id
        region_id a faldo:region
            faldo:begin start_position
            faldo:end end_position
        start_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id
        end_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param add_region: when True (and a location is known), locate the
            feature on a separate region node; otherwise the feature itself
            is typed as the region
        :param region_id: id of the region node; generated when None
        :param feature_as_class: add the feature as a class rather than
            as an individual
        :return:

        """
        if feature_as_class:
            self.model.addClassToGraph(
                self.id, self.label, self.type, self.description)
        else:
            self.model.addIndividualToGraph(
                self.id, self.label, self.type, self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            if region_id is None:
                region_id = self._makeRegionId()

            self.graph.addTriple(
                self.id, self.properties['location'], region_id)
            self.model.addIndividualToGraph(
                region_id, None, self.types['region'])
        else:
            region_id = self.id
            self.model.addType(region_id, self.types['region'])

        # add the start/end positions to the region
        # (addPositionToGraph returns the id it created)
        beginp = endp = None
        if self.start is not None:
            beginp = self.addPositionToGraph(
                self.start['reference'], self.start['coordinate'],
                self.start['type'])
        if self.stop is not None:
            endp = self.addPositionToGraph(
                self.stop['reference'], self.stop['coordinate'],
                self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

    def _getStrandStringFromPositionTypes(self, tylist):
        """
        :param tylist: list of position types
        :return: 'plus', 'minus' or 'both' for the first strand type found
            in tylist (checked in that order), else None
        """
        if self.types['plus_strand'] in tylist:
            return 'plus'
        if self.types['minus_strand'] in tylist:
            return 'minus'
        if self.types['both_strand'] in tylist:
            return 'both'
        # it is stranded, but we don't know what it is
        return None

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return: the position bnode id, or None when reference is None
        """
        if reference is None:
            logger.error("Trying to make position with no reference.")
            return None

        reference = re.sub(r'\w+\:', '', reference, 1)
        if reference.startswith('_'):
            # this is in the case if the reference is a bnode
            reference = reference[1:]
        curie = '_:' + reference
        if coordinate is not None:
            # just in case it isn't a string already
            curie = '-'.join((curie, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                curie = '-'.join((curie, tstring))

        return curie

    def addRegionPositionToGraph(self, region_id, begin_position_id,
                                 end_position_id):
        """Link the region to whichever of its begin/end positions is set."""
        # a missing begin or end position is silently left out
        if begin_position_id is not None:
            self.graph.addTriple(
                region_id, self.properties['begin'], begin_position_id)
        if end_position_id is not None:
            self.graph.addTriple(
                region_id, self.properties['end'], end_position_id)

    def addPositionToGraph(self, reference_id, position, position_types=None,
                           strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.

        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param reference_id:
        :param position:
        :param position_types:
        :param strand:

        :return:  Identifier of the position created

        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if position is not None:
            self.graph.addTriple(
                pos_id, self.properties['position'], position,
                object_is_literal=True, literal_type="xsd:integer")
        self.graph.addTriple(
            pos_id, self.properties['reference'], reference_id)
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)

        strand_type = None
        if strand is not None:
            strand_type = strand
            if not strand.startswith('faldo'):
                # not already mapped to faldo, so expect we need to map it
                strand_type = self._getStrandType(strand)
        # else:
        #    strand_type = self.types['both_strand']
        if strand_type is None and (
                position_types is None or position_types == []):
            strand_type = self.types['Position']

        if strand_type is not None:
            self.model.addType(pos_id, strand_type)

        return pos_id

    def addSubsequenceOfFeature(self, parentid):
        """
        This will add reciprocal triples like:
        feature is_subsequence_of parent
        parent has_subsequence feature
        :param parentid:

        :return:

        """
        self.graph.addTriple(
            self.id, self.properties['is_subsequence_of'], parentid)
        self.graph.addTriple(
            parentid, self.properties['has_subsequence'], self.id)

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param taxonid:

        :return:

        """
        # TEC: should taxon be set in __init__()?
        self.taxon = taxonid
        self.graph.addTriple(
            self.id, Assoc.properties['in_taxon'], self.taxon)

    def addFeatureProperty(self, property_type, property):
        """Add the triple: feature property_type property."""
        self.graph.addTriple(self.id, property_type, property)
class Reference:
    """
    To model references for associations
    (such as journal articles, books, etc.).

    By default, references are typed as "documents"
    (``globaltt['document']``) unless a type is given.

    When the reference is written to the graph its label is the short
    citation if one is set, otherwise the title (if any).

    We may wish to subclass this later.
    """

    def __init__(self, graph, ref_id=None, ref_type=None):
        """
        :param graph: the Graph the reference triples are written to
        :param ref_id: identifier of the reference; when it starts with
            'http' it is also kept as the reference URL
        :param ref_type: class identifier for the reference;
            defaults to ``globaltt['document']``
        :raises ValueError: if ``graph`` is not a Graph instance
        """
        if not isinstance(graph, Graph):
            # interpolate the message; passing ``graph`` as a second
            # argument would leave a tuple as the exception payload
            raise ValueError("%s is not a graph" % (graph,))
        self.graph = graph

        # assert ref_id is not None
        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type
            # only a warning: the unexpected type is still used
            if ref_type[:4] not in ('IAO:', 'SIO:'):
                LOG.warning("Got Pub ref type of: %s", ref_type)

        if ref_id is not None and ref_id.startswith('http'):
            self.ref_url = ref_id

    def setTitle(self, title):
        """Remember the title; it is written by addRefToGraph()."""
        self.title = title

    def setYear(self, year):
        """Remember the publication year (not yet written to the graph)."""
        self.year = year

    def setType(self, reference_type):
        """Override the class identifier used to type the reference."""
        self.ref_type = reference_type

    def setAuthorList(self, author_list):
        """
        :param author_list: Array of authors
        :return:
        """
        self.author_list = author_list

    def addAuthor(self, author):
        """
        Append one author, starting a new list if none was set yet.

        :param author: the author to add
        """
        if self.author_list is None:
            # author_list starts out as None in __init__
            self.author_list = []
        self.author_list.append(author)

    def setShortCitation(self, citation):
        """Remember the short citation, preferred over the title as label."""
        self.short_citation = citation

    def addPage(self, subject_id, page_url):
        """
        Add the triple: subject_id page "page_url" (as a literal).

        :param subject_id: subject of the triple
        :param page_url: URL, written as a literal
        """
        self.graph.addTriple(
            subject_id,
            self.globaltt['page'],  # foaf:page  not  <sio:web page>
            page_url,
            object_is_literal=True)

    def addTitle(self, subject_id, title):
        """
        Add the triple: subject_id title "title" (as a literal).
        Nothing is added when the title is None or empty.

        :param subject_id: subject of the triple
        :param title: title text
        """
        if title is not None and title != '':
            self.graph.addTriple(
                subject_id, self.globaltt['title (dce)'], title,
                object_is_literal=True)

    def addRefToGraph(self):
        """
        Write the reference to the graph.

        A reference with a URL gets its title, type and label attached
        to the URL. Otherwise a reference with an id is added as a
        labelled, typed individual plus its title. With neither, an
        error is logged and nothing is written.
        """
        # the label is the short citation, falling back to the title
        cite = self.short_citation
        if cite is None and self.title is not None:
            cite = self.title

        if self.ref_url is not None:
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if cite is not None:
                self.model.addLabel(self.ref_url, cite)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, cite, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            LOG.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for auth in self.author_list:
        #        gu.addTriple(
        #           graph, self.ref_id, self.props['has_author'], auth, True)
def _process_data(self, source, limit=None):
    """
    This function will process the data files from Coriell.
    We make the assumption that any alleles listed are variants
    (alternates to w.t.)

    Triples: (examples)

    :NIGMSrepository
        a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

    line_id
        a CL_0000057,  #fibroblast line
        derives_from patient_id
        part_of :NIGMSrepository
        RO:model_of OMIM:disease_id

    patient id
        a foaf:person,
        label: "fibroblast from patient 12345 with disease X"
        member_of family_id  #what is the right thing here?
        SIO:race EFO:caucasian  #subclass of EFO:0001799
        in_taxon NCBITaxon:9606
        dc:description Literal(remark)
        RO:has_phenotype OMIM:disease_id
        GENO:has_genotype genotype_id

    family_id
        a owl:NamedIndividual
        foaf:page
         "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

    genotype_id
        a intrinsic_genotype
        GENO:has_alternate_part allelic_variant_id
        we don't necessarily know much about the genotype,
        other than the allelic variant. also there's the sex here

    pub_id mentions cell_line_id

    :param source: key into self.files; its 'file' entry names the csv
        under self.rawdir and its 'columns' entry lists the expected
        (lower-cased) header
    :param limit: when not None (and not in test mode) stop once the
        line counter exceeds this value
    :return: None
    :raises AssertionError: if the file header differs from the
        expected columns (an empty file counts as a mismatch)
    """
    raw = '/'.join((self.rawdir, self.files[source]['file']))

    LOG.info("Processing Data from %s", raw)

    if self.testMode:      # set the graph to build
        graph = self.testgraph
    else:
        graph = self.graph

    family = Family(graph)
    model = Model(graph)

    line_counter = 1
    geno = Genotype(graph)
    diputil = DipperUtil()
    col = self.files[source]['columns']
    # affords access with
    # x = row[col.index('x')].strip()

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
        # we can keep a close watch on changing file formats
        fileheader = next(filereader, None)
        # an empty file yields no header; report it as a header mismatch
        # instead of failing on None
        fileheader = [c.lower() for c in fileheader or []]
        if col != fileheader:  # assert
            LOG.error('Expected %s to have columns: %s', raw, col)
            LOG.error('But Found %s to have columns: %s', raw, fileheader)
            raise AssertionError('Incomming data headers have changed.')

        for row in filereader:
            line_counter += 1
            if len(row) != len(col):
                LOG.warning(
                    'Expected %i values but find %i in row %i',
                    len(col), len(row), line_counter)
                continue

            # (catalog_id, description, omim_number, sample_type,
            # cell_line_available, dna_in_stock, dna_ref, gender, age,
            # race, ethnicity, affected, karyotype, relprob, mutation,
            # gene, family_id, collection, url, cat_remark, pubmed_ids,
            # family_member, variant_id, dbsnp_id, species) = row

            # example:
            # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
            #       ,Female,26 YR,Caucasian,,,,
            # parent,,,39,NIGMS Human Genetic Cell Repository,
            # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
            # 46;XX; clinically normal mother of a child with Hurler syndrome;
            #       proband not in Repository,,
            # 2,,18343,Homo sapiens

            catalog_id = row[col.index('catalog_id')].strip()

            if self.testMode and catalog_id not in self.test_lines:
                # skip rows not in our test lines, when in test mode
                continue

            # ###########    BUILD REQUIRED VARIABLES    ###########

            # Make the cell line ID
            cell_line_id = 'Coriell:' + catalog_id
            # Map the cell/sample type
            cell_type = self.resolve(row[col.index('sample_type')].strip())
            # on fail cell_type = self.globaltt['cell'] ?

            # Make a cell line label
            collection = row[col.index('collection')].strip()
            line_label = collection.partition(' ')[0] + '-' + catalog_id

            # Map the repository/collection
            repository = self.localtt[collection]

            # patients are uniquely identified by one of:
            # dbsnp id (which is == an individual haplotype)
            # family id + family member (if present) OR
            # probands are usually family member zero
            # cell line id
            # since some patients have >1 cell line derived from them,
            # we must make sure that the genotype is attached to
            # the patient, and can be inferred to the cell line
            # examples of repeated patients are:
            #   famid=1159, member=1; fam=152,member=1

            # Make the patient ID

            # make an anonymous patient
            patient_id = '_:person'
            fam_id = row[col.index('fam')].strip()
            fammember = row[col.index('fammember')].strip()
            if fam_id != '':
                patient_id = '-'.join((patient_id, fam_id, fammember))
            else:
                # make an anonymous patient
                patient_id = '-'.join((patient_id, catalog_id))

            # properties of the individual patients:  sex, family id,
            # member/relproband, description descriptions are
            # really long and ugly SCREAMING text, so need to clean up
            # the control cases are so odd with this labeling scheme;
            # but we'll deal with it as-is for now.
            description = row[col.index('description')].strip()
            short_desc = (description.split(';')[0]).capitalize()

            gender = row[col.index('gender')].strip().lower()
            affected = row[col.index('affected')].strip()
            relprob = row[col.index('relprob')].strip()

            if affected == '':
                affected = 'unspecified'
            elif affected in self.localtt:
                affected = self.localtt[affected]
            else:
                LOG.warning(
                    'Novel Affected status %s at row: %i of %s',
                    affected, line_counter, raw)
            patient_label = ' '.join((affected, gender, relprob))
            if relprob == 'proband':
                patient_label = ' '.join((
                    patient_label.strip(), 'with', short_desc))
            else:
                patient_label = ' '.join((
                    patient_label.strip(), 'of proband with', short_desc))

            # #############    BUILD THE CELL LINE    #############

            # Adding the cell line as a typed individual.
            cell_line_reagent_id = self.globaltt['cell line']

            model.addIndividualToGraph(
                cell_line_id, line_label, cell_line_reagent_id)

            # add the equivalent id == dna_ref
            dna_ref = row[col.index('dna_ref')].strip()
            if dna_ref != '' and dna_ref != catalog_id:
                equiv_cell_line = 'Coriell:' + dna_ref
                # some of the equivalent ids are not defined
                # in the source data; so add them
                model.addIndividualToGraph(
                    equiv_cell_line, None, cell_line_reagent_id)
                model.addSameIndividual(cell_line_id, equiv_cell_line)

            # Cell line derives from patient
            geno.addDerivesFrom(cell_line_id, patient_id)
            geno.addDerivesFrom(cell_line_id, cell_type)

            # Cell line a member of repository
            family.addMember(repository, cell_line_id)

            cat_remark = row[col.index('cat_remark')].strip()

            if cat_remark != '':
                model.addDescription(cell_line_id, cat_remark)

            # Cell age_at_sampling
            # TODO add the age nodes when modeled properly in #78
            # if (age != ''):
            #    this would give a BNode that is an instance of Age.
            #    but i don't know how to connect
            #    the age node to the cell line? we need to ask @mbrush
            #    age_id = '_'+re.sub('\s+','_',age)
            #    gu.addIndividualToGraph(
            #       graph,age_id,age,self.globaltt['age'])
            #    gu.addTriple(
            #       graph,age_id,self.globaltt['has measurement value'],age,
            #       True)

            # #############    BUILD THE PATIENT    #############

            # Add the patient ID as an individual.
            model.addPerson(patient_id, patient_label)
            # TODO map relationship to proband as a class
            # (what ontology?)

            # Add race of patient
            # FIXME: Adjust for subcategories based on ethnicity field
            # EDIT: There are 743 different entries for ethnicity...
            # Too many to map?
            # Add ethnicity as literal in addition to the mapped race?
            # Adjust the ethnicity txt (if using)
            # to initial capitalization to remove ALLCAPS

            # TODO race should go into the individual's background
            # and abstracted out to the Genotype class punting for now.
            # if race != '':
            #    mapped_race = self.resolve(race)
            #    if mapped_race is not None:
            #        gu.addTriple(
            #           g,patient_id,self.globaltt['race'], mapped_race)
            #        model.addSubClass(
            #           mapped_race,self.globaltt['ethnic_group'])

            # #############    BUILD THE FAMILY    #############

            # Add triples for family_id, if present.
            if fam_id != '':
                family_comp_id = 'CoriellFamily:' + fam_id

                family_label = ' '.join(
                    ('Family of proband with', short_desc))

                # Add the family ID as a named individual
                model.addIndividualToGraph(
                    family_comp_id, family_label, self.globaltt['family'])

                # Add the patient as a member of the family
                family.addMemberOf(patient_id, family_comp_id)

            # #############    BUILD THE GENOTYPE   #############

            # the important things to pay attention to here are:
            # karyotype = chr rearrangements  (somatic?)
            # mutation = protein-level mutation as a label,
            # often from omim
            # gene = gene symbol - TODO get id
            # variant_id = omim variant ids (; delimited)
            # dbsnp_id = snp individual ids = full genotype?

            # note GM00633 is a good example of chromosomal variation
            # - do we have enough to capture this?
            # GM00325 has both abnormal karyotype and variation

            # make an assumption that if the taxon is blank,
            # that it is human!
            species = row[col.index('species')].strip()
            if species is None or species == '':
                species = 'Homo sapiens'
            taxon = self.resolve(species)

            # if there's a dbSNP id,
            # this is actually the individual's genotype
            genotype_id = None
            genotype_label = None

            dbsnp_id = row[col.index('dbsnp_id')].strip()
            if dbsnp_id != '':
                genotype_id = 'dbSNPIndividual:' + dbsnp_id

            # per-row state; reset so nothing leaks from an earlier row
            omim_map = {}   # will hold gene/locus id to variant list
            gvc_id = None
            gvc_label = None

            # some of the karyotypes are encoded
            # with terrible hidden codes. remove them here
            # i've seen a <98> character
            karyotype = row[col.index('karyotype')].strip()
            karyotype = diputil.remove_control_characters(karyotype)
            karyotype_id = None
            if karyotype.strip() != '':
                karyotype_id = '_:' + re.sub(
                    'MONARCH:', '', self.make_id(karyotype))
                # add karyotype as karyotype_variation_complement
                model.addIndividualToGraph(
                    karyotype_id, karyotype,
                    self.globaltt['karyotype_variation_complement'])
                # TODO break down the karyotype into parts
                # and map into GENO. depends on #77

                # place the karyotype in a location(s).
                karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                    karyotype)
                for chrom in karyo_chrs:
                    chr_id = makeChromID(chrom, taxon, 'CHR')
                    # add an anonymous sequence feature,
                    # each located on chr
                    karyotype_feature_id = '-'.join((karyotype_id, chrom))
                    karyotype_feature_label = \
                        'some karyotype alteration on chr' + str(chrom)
                    feat = Feature(
                        graph, karyotype_feature_id,
                        karyotype_feature_label,
                        self.globaltt['sequence_alteration'])
                    feat.addFeatureStartLocation(None, chr_id)
                    feat.addFeatureToGraph()
                    geno.addParts(
                        karyotype_feature_id, karyotype_id,
                        self.globaltt['has_variant_part'])

            gene = row[col.index('gene')].strip()
            mutation = row[col.index('mutation')].strip()
            # the variant label only exists when a gene is given; start
            # from None each row so a previous row's label is never reused
            vl = None
            if gene != '':
                vl = gene + '(' + mutation + ')'

            # fix the variant_id so it's always in the same order
            variant_id = row[col.index('variant_id')].strip()
            vids = variant_id.split(';')
            variant_id = ';'.join(sorted(set(vids)))

            if karyotype.strip() != '' and not self._is_normal_karyotype(
                    karyotype):

                gvc_id = karyotype_id
                if variant_id != '':
                    gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                        + re.sub(r'\w*:', '', karyotype_id)
                if mutation.strip() != '' and vl is not None:
                    gvc_label = '; '.join((vl, karyotype))
                else:
                    gvc_label = karyotype
            elif variant_id.strip() != '':
                gvc_id = '_:' + variant_id.replace(';', '-')
                gvc_label = vl   # may be None when no gene is given
            else:
                # wildtype?
                pass

            # add the karyotype to the gvc.
            # use reference if normal karyotype
            karyo_rel = self.globaltt['has_variant_part']
            if self._is_normal_karyotype(karyotype):
                karyo_rel = self.globaltt['has_reference_part']
            if karyotype_id is not None \
                    and not self._is_normal_karyotype(karyotype) \
                    and gvc_id is not None and karyotype_id != gvc_id:
                geno.addParts(karyotype_id, gvc_id, karyo_rel)

            if variant_id.strip() != '':
                # split the variants & add them as part of the genotype
                # we don't necessarily know their zygosity,
                # just that they are part of the genotype variant ids
                # are from OMIM, so prefix as such we assume that the
                # sequence alts will be defined in OMIM not here
                # TODO sort the variant_id list, if the omim prefix is
                # the same, then assume it's the locus make a hashmap
                # of the omim id to variant id list;
                # then build the genotype hashmap is also useful for
                # removing the "genes" from the list of "phenotypes"

                for var in variant_id.split(';'):
                    # handle omim-style and odd var ids
                    # like 610661.p.R401X
                    mch = re.match(r'(\d+)\.+(.*)', var.strip())
                    if mch is None:
                        # e.g. the empty piece left by a trailing ';';
                        # skip it rather than filing it under the
                        # previous (or a missing) locus
                        LOG.warning(
                            'Skipping unparsable variant id "%s" '
                            'at row: %i of %s', var, line_counter, raw)
                        continue
                    (locus_num, var_num) = mch.groups()
                    omim_map.setdefault(locus_num, []).append(var_num)

                for omim in omim_map:
                    # gene_id = 'OMIM:' + omim  # TODO unused
                    vslc_id = '_:' + '-'.join(
                        [omim + '.' + a for a in omim_map.get(omim)])
                    vslc_label = vl
                    # we don't really know the zygosity of
                    # the alleles at all.
                    # so the vslcs are just a pot of them
                    model.addIndividualToGraph(
                        vslc_id, vslc_label,
                        self.globaltt['variant single locus complement'])
                    for var in omim_map.get(omim):
                        # this is actually a sequence alt
                        allele1_id = 'OMIM:' + omim + '.' + var
                        geno.addSequenceAlteration(allele1_id, None)

                        # assume that the sa -> var_loc -> gene
                        # is taken care of in OMIM
                        geno.addPartsToVSLC(
                            vslc_id, allele1_id, None,
                            self.globaltt['indeterminate'],
                            self.globaltt['has_variant_part'])

                    if vslc_id != gvc_id:
                        geno.addVSLCtoParent(vslc_id, gvc_id)

            if affected == 'unaffected':
                # let's just say that this person is wildtype
                model.addType(patient_id, self.globaltt['wildtype'])
            elif genotype_id is None:
                # make an anonymous genotype id (aka blank node)
                genotype_id = '_:geno' + catalog_id.strip()

            # add the gvc
            if gvc_id is not None:
                model.addIndividualToGraph(
                    gvc_id, gvc_label,
                    self.globaltt['genomic_variation_complement'])

                # add the gvc to the genotype
                if genotype_id is not None:
                    if affected == 'unaffected':
                        rel = self.globaltt['has_reference_part']
                    else:
                        rel = self.globaltt['has_variant_part']
                    geno.addParts(gvc_id, genotype_id, rel)

                if karyotype_id is not None \
                        and self._is_normal_karyotype(karyotype):
                    if gvc_label is not None and gvc_label != '':
                        genotype_label = '; '.join((gvc_label, karyotype))
                    elif karyotype is not None:
                        genotype_label = karyotype
                    if genotype_id is None:
                        genotype_id = karyotype_id
                    else:
                        geno.addParts(
                            karyotype_id, genotype_id,
                            self.globaltt['has_reference_part'])
                else:
                    genotype_label = gvc_label
                # use the catalog id as the background
                background = '[' + catalog_id.strip() + ']'
                if genotype_label:
                    genotype_label += ' ' + background
                else:
                    # no gene/variant label was available for this row
                    genotype_label = background

            if genotype_id is not None and gvc_id is not None:
                # only add the genotype if it has some parts
                geno.addGenotype(
                    genotype_id, genotype_label,
                    self.globaltt['intrinsic_genotype'])
                geno.addTaxon(taxon, genotype_id)
                # add that the patient has the genotype
                # TODO check if the genotype belongs to
                # the cell line or to the patient
                graph.addTriple(
                    patient_id, self.globaltt['has_genotype'], genotype_id)
            else:
                geno.addTaxon(taxon, patient_id)

            # TODO: Add sex/gender  (as part of the karyotype?)
            # = row[col.index('')].strip()
            # #############    DEAL WITH THE DISEASES   #############
            omim_num = row[col.index('omim_num')].strip()

            # we associate the disease to the patient
            if affected == 'affected' and omim_num != '':
                for d in omim_num.split(';'):
                    # compare and build the id from the same, stripped
                    # value; skip blank pieces (e.g. trailing ';')
                    d = d.strip()
                    if d == '':
                        continue
                    # if the omim number is in omim_map,
                    # then it is a gene not a pheno

                    # TEC - another place to use the mimTitle omim
                    # classifier omia & genereviews are using
                    if d in omim_map:
                        LOG.info('drop gene %s from disease list', d)
                        continue

                    disease_id = 'OMIM:' + d
                    # assume the label is taken care of in OMIM
                    model.addClassToGraph(disease_id, None)

                    # add the association:
                    #   the patient has the disease
                    assoc = G2PAssoc(
                        graph, self.name, patient_id, disease_id)
                    assoc.add_association_to_graph()

                    # this line is a model of this disease
                    # TODO abstract out model into
                    # it's own association class?
                    graph.addTriple(
                        cell_line_id, self.globaltt['is model of'],
                        disease_id)

            # #############    ADD PUBLICATIONS   #############
            pubmed_ids = row[col.index('pubmed_ids')].strip()
            if pubmed_ids != '':
                for s in pubmed_ids.split(';'):
                    s = s.strip()
                    if s == '':
                        # a blank piece would give the bare id 'PMID:'
                        continue
                    pubmed_id = 'PMID:' + s
                    ref = Reference(graph, pubmed_id)
                    ref.setType(self.globaltt['journal article'])
                    ref.addRefToGraph()
                    graph.addTriple(
                        pubmed_id, self.globaltt['mentions'], cell_line_id)

            if not self.testMode and (
                    limit is not None and line_counter > limit):
                break
    return
def _process_data(self, src_key, limit=None):
    """
    This function will process the data files from Coriell.
    We make the assumption that any alleles listed are variants
    (alternates to w.t.)

    Triples: (examples)

    :NIGMSrepository
        a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

    line_id
        a CL_0000057,  #fibroblast line
        derives_from patient_id
        part_of :NIGMSrepository
        RO:model_of OMIM:disease_id

    patient id
        a foaf:person,
        label: "fibroblast from patient 12345 with disease X"
        member_of family_id  #what is the right thing here?
        SIO:race EFO:caucasian  #subclass of EFO:0001799
        in_taxon NCBITaxon:9606
        dc:description Literal(remark)
        RO:has_phenotype OMIM:disease_id
        GENO:has_genotype genotype_id

    family_id
        a owl:NamedIndividual
        foaf:page
         "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

    genotype_id
        a intrinsic_genotype
        GENO:has_alternate_part allelic_variant_id
        we don't necessarily know much about the genotype,
        other than the allelic variant. also there's the sex here

    pub_id mentions cell_line_id

    :param src_key: key into self.files; its 'file' entry names the csv
        under self.rawdir and its 'columns' entry lists the expected
        (lower-cased) header
    :param limit: when not None (and not in test mode) stop once the
        line counter exceeds this value
    :return: None
    :raises AssertionError: if the file header differs from the
        expected columns (an empty file counts as a mismatch)
    """
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))

    LOG.info("Processing Data from %s", raw)

    if self.test_mode:      # set the graph to build
        graph = self.testgraph
    else:
        graph = self.graph

    family = Family(graph)
    model = Model(graph)

    line_counter = 1
    geno = Genotype(graph)
    diputil = DipperUtil()
    col = self.files[src_key]['columns']
    # affords access with
    # x = row[col.index('x')].strip()

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
        # we can keep a close watch on changing file formats
        fileheader = next(filereader, None)
        # an empty file yields no header; report it as a header mismatch
        # instead of failing on None
        fileheader = [c.lower() for c in fileheader or []]
        if col != fileheader:  # assert
            LOG.error('Expected %s to have columns: %s', raw, col)
            LOG.error('But Found %s to have columns: %s', raw, fileheader)
            raise AssertionError('Incomming data headers have changed.')

        for row in filereader:
            line_counter += 1
            if len(row) != len(col):
                LOG.warning(
                    'Expected %i values but find %i in row %i',
                    len(col), len(row), line_counter)
                continue

            # (catalog_id, description, omim_number, sample_type,
            # cell_line_available, dna_in_stock, dna_ref, gender, age,
            # race, ethnicity, affected, karyotype, relprob, mutation,
            # gene, family_id, collection, url, cat_remark, pubmed_ids,
            # family_member, variant_id, dbsnp_id, species) = row

            # example:
            # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
            #       ,Female,26 YR,Caucasian,,,,
            # parent,,,39,NIGMS Human Genetic Cell Repository,
            # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
            # 46;XX; clinically normal mother of a child with Hurler syndrome;
            #       proband not in Repository,,
            # 2,,18343,Homo sapiens

            catalog_id = row[col.index('catalog_id')].strip()

            if self.test_mode and catalog_id not in self.test_lines:
                # skip rows not in our test lines, when in test mode
                continue

            # ###########    BUILD REQUIRED VARIABLES    ###########

            # Make the cell line ID
            cell_line_id = 'Coriell:' + catalog_id
            # Map the cell/sample type
            cell_type = self.resolve(row[col.index('sample_type')].strip())
            # on fail cell_type = self.globaltt['cell'] ?

            # Make a cell line label
            collection = row[col.index('collection')].strip()
            line_label = collection.partition(' ')[0] + '-' + catalog_id

            # Map the repository/collection
            repository = self.localtt[collection]

            # patients are uniquely identified by one of:
            # dbsnp id (which is == an individual haplotype)
            # family id + family member (if present) OR
            # probands are usually family member zero
            # cell line id
            # since some patients have >1 cell line derived from them,
            # we must make sure that the genotype is attached to
            # the patient, and can be inferred to the cell line
            # examples of repeated patients are:
            #   famid=1159, member=1; fam=152,member=1

            # Make the patient ID

            # make an anonymous patient
            patient_id = '_:person'
            fam_id = row[col.index('fam')].strip()
            fammember = row[col.index('fammember')].strip()
            if fam_id != '':
                patient_id = '-'.join((patient_id, fam_id, fammember))
            else:
                # make an anonymous patient
                patient_id = '-'.join((patient_id, catalog_id))

            # properties of the individual patients:  sex, family id,
            # member/relproband, description descriptions are
            # really long and ugly SCREAMING text, so need to clean up
            # the control cases are so odd with this labeling scheme;
            # but we'll deal with it as-is for now.
            description = row[col.index('description')].strip()
            short_desc = (description.split(';')[0]).capitalize()

            gender = row[col.index('gender')].strip().lower()
            affected = row[col.index('affected')].strip()
            relprob = row[col.index('relprob')].strip()

            if affected == '':
                affected = 'unspecified'
            elif affected in self.localtt:
                affected = self.localtt[affected]
            else:
                LOG.warning(
                    'Novel Affected status %s at row: %i of %s',
                    affected, line_counter, raw)
            patient_label = ' '.join((affected, gender, relprob))
            if relprob == 'proband':
                patient_label = ' '.join((
                    patient_label.strip(), 'with', short_desc))
            else:
                patient_label = ' '.join((
                    patient_label.strip(), 'of proband with', short_desc))

            # #############    BUILD THE CELL LINE    #############

            # Adding the cell line as a typed individual.
            cell_line_reagent_id = self.globaltt['cell line']

            model.addIndividualToGraph(
                cell_line_id, line_label, cell_line_reagent_id)

            # add the equivalent id == dna_ref
            dna_ref = row[col.index('dna_ref')].strip()
            if dna_ref != '' and dna_ref != catalog_id:
                equiv_cell_line = 'Coriell:' + dna_ref
                # some of the equivalent ids are not defined
                # in the source data; so add them
                model.addIndividualToGraph(
                    equiv_cell_line, None, cell_line_reagent_id)
                model.addSameIndividual(cell_line_id, equiv_cell_line)

            # Cell line derives from patient
            geno.addDerivesFrom(cell_line_id, patient_id)
            geno.addDerivesFrom(cell_line_id, cell_type)

            # Cell line a member of repository
            family.addMember(repository, cell_line_id)

            cat_remark = row[col.index('cat_remark')].strip()

            if cat_remark != '':
                model.addDescription(cell_line_id, cat_remark)

            # Cell age_at_sampling
            # TODO add the age nodes when modeled properly in #78
            # if (age != ''):
            #    this would give a BNode that is an instance of Age.
            #    but i don't know how to connect
            #    the age node to the cell line? we need to ask @mbrush
            #    age_id = '_'+re.sub('\s+','_',age)
            #    gu.addIndividualToGraph(
            #       graph,age_id,age,self.globaltt['age'])
            #    gu.addTriple(
            #       graph,age_id,self.globaltt['has measurement value'],age,
            #       True)

            # #############    BUILD THE PATIENT    #############

            # Add the patient ID as an individual.
            model.addPerson(patient_id, patient_label)
            # TODO map relationship to proband as a class
            # (what ontology?)

            # Add race of patient
            # FIXME: Adjust for subcategories based on ethnicity field
            # EDIT: There are 743 different entries for ethnicity...
            # Too many to map?
            # Add ethnicity as literal in addition to the mapped race?
            # Adjust the ethnicity txt (if using)
            # to initial capitalization to remove ALLCAPS

            # TODO race should go into the individual's background
            # and abstracted out to the Genotype class punting for now.
            # if race != '':
            #    mapped_race = self.resolve(race)
            #    if mapped_race is not None:
            #        gu.addTriple(
            #           g,patient_id,self.globaltt['race'], mapped_race)
            #        model.addSubClass(
            #           mapped_race,self.globaltt['ethnic_group'])

            # #############    BUILD THE FAMILY    #############

            # Add triples for family_id, if present.
            if fam_id != '':
                family_comp_id = 'CoriellFamily:' + fam_id

                family_label = ' '.join(
                    ('Family of proband with', short_desc))

                # Add the family ID as a named individual
                model.addIndividualToGraph(
                    family_comp_id, family_label, self.globaltt['family'])

                # Add the patient as a member of the family
                family.addMemberOf(patient_id, family_comp_id)

            # #############    BUILD THE GENOTYPE   #############

            # the important things to pay attention to here are:
            # karyotype = chr rearrangements  (somatic?)
            # mutation = protein-level mutation as a label,
            # often from omim
            # gene = gene symbol - TODO get id
            # variant_id = omim variant ids (; delimited)
            # dbsnp_id = snp individual ids = full genotype?

            # note GM00633 is a good example of chromosomal variation
            # - do we have enough to capture this?
            # GM00325 has both abnormal karyotype and variation

            # make an assumption that if the taxon is blank,
            # that it is human!
            species = row[col.index('species')].strip()
            if species is None or species == '':
                species = 'Homo sapiens'
            taxon = self.resolve(species)

            # if there's a dbSNP id,
            # this is actually the individual's genotype
            genotype_id = None
            genotype_label = None

            dbsnp_id = row[col.index('dbsnp_id')].strip()
            if dbsnp_id != '':
                genotype_id = 'dbSNPIndividual:' + dbsnp_id

            # per-row state; reset so nothing leaks from an earlier row
            omim_map = {}   # will hold gene/locus id to variant list
            gvc_id = None
            gvc_label = None

            # some of the karyotypes are encoded
            # with terrible hidden codes. remove them here
            # i've seen a <98> character
            karyotype = row[col.index('karyotype')].strip()
            karyotype = diputil.remove_control_characters(karyotype)
            karyotype_id = None
            if karyotype.strip() != '':
                karyotype_id = '_:' + re.sub(
                    'MONARCH:', '', self.make_id(karyotype))
                # add karyotype as karyotype_variation_complement
                model.addIndividualToGraph(
                    karyotype_id, karyotype,
                    self.globaltt['karyotype_variation_complement'])
                # TODO break down the karyotype into parts
                # and map into GENO. depends on #77

                # place the karyotype in a location(s).
                karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                    karyotype)
                for chrom in karyo_chrs:
                    chr_id = makeChromID(chrom, taxon, 'CHR')
                    # add an anonymous sequence feature,
                    # each located on chr
                    karyotype_feature_id = '-'.join((karyotype_id, chrom))
                    karyotype_feature_label = \
                        'some karyotype alteration on chr' + str(chrom)
                    feat = Feature(
                        graph, karyotype_feature_id,
                        karyotype_feature_label,
                        self.globaltt['sequence_alteration'])
                    feat.addFeatureStartLocation(None, chr_id)
                    feat.addFeatureToGraph()
                    geno.addParts(
                        karyotype_feature_id, karyotype_id,
                        self.globaltt['has_variant_part'])

            gene = row[col.index('gene')].strip()
            mutation = row[col.index('mutation')].strip()
            # the variant label only exists when a gene is given; start
            # from None each row so a previous row's label is never reused
            varl = None
            if gene != '':
                varl = gene + '(' + mutation + ')'

            # fix the variant_id so it's always in the same order
            variant_id = row[col.index('variant_id')].strip()
            vids = variant_id.split(';')
            variant_id = ';'.join(sorted(set(vids)))

            if karyotype.strip() != '' and not self._is_normal_karyotype(
                    karyotype):

                gvc_id = karyotype_id
                if variant_id != '':
                    gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                        + re.sub(r'\w*:', '', karyotype_id)
                if mutation.strip() != '' and varl is not None:
                    gvc_label = '; '.join((varl, karyotype))
                else:
                    gvc_label = karyotype
            elif variant_id.strip() != '':
                gvc_id = '_:' + variant_id.replace(';', '-')
                gvc_label = varl   # may be None when no gene is given
            else:
                # wildtype?
                pass

            # add the karyotype to the gvc.
            # use reference if normal karyotype
            karyo_rel = self.globaltt['has_variant_part']
            if self._is_normal_karyotype(karyotype):
                karyo_rel = self.globaltt['has_reference_part']
            if karyotype_id is not None \
                    and not self._is_normal_karyotype(karyotype) \
                    and gvc_id is not None and karyotype_id != gvc_id:
                geno.addParts(karyotype_id, gvc_id, karyo_rel)

            if variant_id.strip() != '':
                # split the variants & add them as part of the genotype
                # we don't necessarily know their zygosity,
                # just that they are part of the genotype variant ids
                # are from OMIM, so prefix as such we assume that the
                # sequence alts will be defined in OMIM not here
                # TODO sort the variant_id list, if the omim prefix is
                # the same, then assume it's the locus make a hashmap
                # of the omim id to variant id list;
                # then build the genotype hashmap is also useful for
                # removing the "genes" from the list of "phenotypes"

                for var in variant_id.split(';'):
                    # handle omim-style and odd var ids
                    # like 610661.p.R401X
                    mch = re.match(r'(\d+)\.+(.*)', var.strip())
                    if mch is None:
                        # e.g. the empty piece left by a trailing ';';
                        # skip it rather than filing it under the
                        # previous (or a missing) locus
                        LOG.warning(
                            'Skipping unparsable variant id "%s" '
                            'at row: %i of %s', var, line_counter, raw)
                        continue
                    (locus_num, var_num) = mch.groups()
                    omim_map.setdefault(locus_num, []).append(var_num)

                for omim in omim_map:
                    # gene_id = 'OMIM:' + omim  # TODO unused
                    vslc_id = '_:' + '-'.join(
                        [omim + '.' + a for a in omim_map.get(omim)])
                    vslc_label = varl
                    # we don't really know the zygosity of
                    # the alleles at all.
                    # so the vslcs are just a pot of them
                    model.addIndividualToGraph(
                        vslc_id, vslc_label,
                        self.globaltt['variant single locus complement'])
                    for var in omim_map.get(omim):
                        # this is actually a sequence alt
                        allele1_id = 'OMIM:' + omim + '.' + var
                        geno.addSequenceAlteration(allele1_id, None)

                        # assume that the sa -> var_loc -> gene
                        # is taken care of in OMIM
                        geno.addPartsToVSLC(
                            vslc_id, allele1_id, None,
                            self.globaltt['indeterminate'],
                            self.globaltt['has_variant_part'])

                    if vslc_id != gvc_id:
                        geno.addVSLCtoParent(vslc_id, gvc_id)

            if affected == 'unaffected':
                # let's just say that this person is wildtype
                model.addType(patient_id, self.globaltt['wildtype'])
            elif genotype_id is None:
                # make an anonymous genotype id (aka blank node)
                genotype_id = '_:geno' + catalog_id.strip()

            # add the gvc
            if gvc_id is not None:
                model.addIndividualToGraph(
                    gvc_id, gvc_label,
                    self.globaltt['genomic_variation_complement'])

                # add the gvc to the genotype
                if genotype_id is not None:
                    if affected == 'unaffected':
                        rel = self.globaltt['has_reference_part']
                    else:
                        rel = self.globaltt['has_variant_part']
                    geno.addParts(gvc_id, genotype_id, rel)

                if karyotype_id is not None \
                        and self._is_normal_karyotype(karyotype):
                    if gvc_label is not None and gvc_label != '':
                        genotype_label = '; '.join((gvc_label, karyotype))
                    elif karyotype is not None:
                        genotype_label = karyotype
                    if genotype_id is None:
                        genotype_id = karyotype_id
                    else:
                        geno.addParts(
                            karyotype_id, genotype_id,
                            self.globaltt['has_reference_part'])
                else:
                    genotype_label = gvc_label
                # use the catalog id as the background
                background = '[' + catalog_id.strip() + ']'
                if genotype_label:
                    genotype_label += ' ' + background
                else:
                    # no gene/variant label was available for this row
                    genotype_label = background

            if genotype_id is not None and gvc_id is not None:
                # only add the genotype if it has some parts
                geno.addGenotype(
                    genotype_id, genotype_label,
                    self.globaltt['intrinsic_genotype'])
                geno.addTaxon(taxon, genotype_id)
                # add that the patient has the genotype
                # TODO check if the genotype belongs to
                # the cell line or to the patient
                graph.addTriple(
                    patient_id, self.globaltt['has_genotype'], genotype_id)
            else:
                geno.addTaxon(taxon, patient_id)

            # TODO: Add sex/gender  (as part of the karyotype?)
            # = row[col.index('')].strip()
            # #############    DEAL WITH THE DISEASES   #############
            omim_num = row[col.index('omim_num')].strip()

            # we associate the disease to the patient
            if affected == 'affected' and omim_num != '':
                for disease in omim_num.split(';'):
                    # compare and build the id from the same, stripped
                    # value; skip blank pieces (e.g. trailing ';')
                    disease = disease.strip()
                    if disease == '':
                        continue
                    # if the omim number is in omim_map,
                    # then it is a gene not a pheno

                    # TEC - another place to use the mimTitle omim
                    # classifier omia & genereviews are using
                    if disease in omim_map:
                        LOG.info('drop gene %s from disease list', disease)
                        continue

                    disease_id = 'OMIM:' + disease
                    # assume the label is taken care of in OMIM
                    model.addClassToGraph(disease_id, None)

                    # add the association:
                    #   the patient has the disease
                    assoc = G2PAssoc(
                        graph, self.name, patient_id, disease_id)
                    assoc.add_association_to_graph()

                    # this line is a model of this disease
                    # TODO abstract out model into
                    # it's own association class?
                    graph.addTriple(
                        cell_line_id, self.globaltt['is model of'],
                        disease_id)

            # #############    ADD PUBLICATIONS   #############
            pubmed_ids = row[col.index('pubmed_ids')].strip()
            if pubmed_ids != '':
                for pmid in pubmed_ids.split(';'):
                    pmid = pmid.strip()
                    if pmid == '':
                        # a blank piece would give the bare id 'PMID:'
                        continue
                    pubmed_id = 'PMID:' + pmid
                    ref = Reference(graph, pubmed_id)
                    ref.setType(self.globaltt['journal article'])
                    ref.addRefToGraph()
                    graph.addTriple(
                        pubmed_id, self.globaltt['mentions'], cell_line_id)

            if not self.test_mode and (
                    limit is not None and line_counter > limit):
                break
    return
def _process_data(self, raw, limit=None):
    """
    Read the gzip-compressed, comma-separated genotype/phenotype file at
    ``raw`` and add what each row describes to the graph.

    The test graph is used when ``self.test_mode`` is set, otherwise the
    main graph.  For every data row this adds, in order:

    * the gene, allele (variant locus) and sequence alteration,
    * the colony as an embryonic stem cell line with a genotype of
      indeterminate zygosity,
    * a sex-agnostic genotype with its VSLC and (when a strain accession
      is present) a phenotyping-center-specific genomic background,
    * a sex-qualified genotype, and
    * a genotype-to-phenotype association with evidence, provenance and
      a free-text description.

    In test mode, rows whose marker is not in ``self.gene_ids`` are
    skipped.  Rows without an ``mp_term_id`` still contribute their
    genotype triples but no association.

    :param raw: path of the gzipped CSV file; the first line is treated
        as a header and checked with ``self.check_fileheader``.
    :param limit: when not None (and not in test mode), stop after a row
        whose ``reader.line_num`` exceeds this value.
    :return: None
    """
    LOG.info("Processing Data from %s", raw)

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)

    # Add the taxon as a class
    taxon_id = self.globaltt['Mus musculus']
    model.addClassToGraph(taxon_id, None)

    col = self.files['all']['columns']
    with gzip.open(raw, 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        row = next(reader)  # presumed header
        if not self.check_fileheader(col, row):
            # A header mismatch is not treated as fatal here; parsing
            # carries on using the configured column order.
            pass

        for row in reader:
            # Columns present in the file but not read here: allele_name,
            # project_name, top_level_mp_term_id, top_level_mp_term_name.
            marker_accession_id = row[col.index('marker_accession_id')].strip()
            marker_symbol = row[col.index('marker_symbol')].strip()
            phenotyping_center = row[col.index('phenotyping_center')].strip()
            colony_raw = row[col.index('colony_id')].strip()
            sex = row[col.index('sex')].strip()
            zygosity = row[col.index('zygosity')].strip()
            allele_accession_id = row[col.index('allele_accession_id')].strip()
            allele_symbol = row[col.index('allele_symbol')].strip()
            strain_accession_id = row[col.index('strain_accession_id')].strip()
            strain_name = row[col.index('strain_name')].strip()
            project_fullname = row[col.index('project_fullname')].strip()
            pipeline_name = row[col.index('pipeline_name')].strip()
            pipeline_stable_id = row[col.index('pipeline_stable_id')].strip()
            procedure_stable_id = row[col.index('procedure_stable_id')].strip()
            procedure_name = row[col.index('procedure_name')].strip()
            parameter_stable_id = row[col.index('parameter_stable_id')].strip()
            parameter_name = row[col.index('parameter_name')].strip()
            mp_term_id = row[col.index('mp_term_id')].strip()
            mp_term_name = row[col.index('mp_term_name')].strip()
            p_value = row[col.index('p_value')].strip()
            percentage_change = row[col.index('percentage_change')].strip()
            effect_size = row[col.index('effect_size')].strip()
            statistical_method = row[col.index('statistical_method')].strip()
            resource_name = row[col.index('resource_name')].strip()

            if self.test_mode and marker_accession_id not in self.gene_ids:
                continue

            # ##### cleanup some of the identifiers ######
            zygosity_id = self.resolve(zygosity)
            if zygosity_id == zygosity:
                LOG.warning(
                    "Zygosity '%s' unmapped. setting to indeterminate",
                    zygosity)
                zygosity_id = self.globaltt['indeterminate']

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = '_:IMPC-' + re.sub(
                    r':', '', allele_accession_id)

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                # TODO blank nodes do not maintain identifiers
                strain_accession_id = '_:' + strain_accession_id
            elif not re.match(r'MGI', strain_accession_id):
                LOG.info(
                    "Found a strange strain accession...%s",
                    strain_accession_id)
                strain_accession_id = 'IMPC:' + strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.

            # extract out what's within the <> to get the symbol;
            # fall back to the whole allele symbol when there is no <...>
            bracketed = re.match(r'.*<(.*)>', allele_symbol)
            if bracketed is not None:
                sequence_alteration_name = bracketed.group(1)
            else:
                sequence_alteration_name = allele_symbol

            if marker_accession_id == '':
                LOG.warning("Marker unspecified on row %d", reader.line_num)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                geno.addGene(
                    marker_accession_id, marker_symbol, self.globaltt['gene'])
                geno.addAllele(
                    variant_locus_id, allele_symbol,
                    self.globaltt['variant_locus'], None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                # TAG bnode
                sequence_alteration_id = '_:seqalt' + re.sub(
                    r':', '', allele_accession_id)
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)
            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(
                sequence_alteration_id, sequence_alteration_name)

            # #############    BUILD THE COLONY    #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele,
            # with unknown zygosity
            stem_cell_class = self.globaltt['embryonic stem cell line']
            # colony_id is always a string at this point (built by
            # concatenation above), so no None check is needed.
            model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and its relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = '_:' + re.sub(
                r':', '',
                allele_accession_id + self.globaltt['indeterminate'])
            vslc_colony_label = allele_symbol + '/<?>'

            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(
                allele_accession_id, colony_genotype_id,
                self.globaltt['has_variant_part'])
            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                self.globaltt['indeterminate'],
                self.globaltt['has_variant_part'])
            graph.addTriple(
                colony_id, self.globaltt['has_genotype'], colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = self.make_id(
                (colony_id + phenotyping_center + zygosity +
                 strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol

            # Making VSLC labels from the various parts,
            # can change later if desired.
            # Only the homozygote case has a second allele id/relation;
            # the others keep allele2_id and allele2_rel as None.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = self.globaltt['has_variant_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
            else:
                LOG.warning("found unknown zygosity %s", zygosity)
                # NOTE(review): this stops processing of ALL remaining
                # rows, not just this one -- confirm whether `continue`
                # was intended.
                break
            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC
            # marker_accession_id is None when the row had no marker;
            # str.join() raises TypeError on None, so leave that part out.
            vslc_id = '-'.join(
                part
                for part in (marker_accession_id, allele_accession_id, zygosity)
                if part is not None)
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:' + vslc_id
            model.addIndividualToGraph(
                vslc_id, vslc_name,
                self.globaltt['variant single locus complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                self.globaltt['has_variant_part'], allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            model.addType(
                vslc_id, self.globaltt['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(
                    genomic_background_id, strain_name,
                    self.globaltt['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = strain_name + '-' + \
                    phenotyping_center + '-' + colony_raw
                pheno_center_strain_id = '-'.join((
                    re.sub(r':', '', genomic_background_id),
                    re.sub(r'\s', '_', phenotyping_center),
                    re.sub(r'\W+', '', colony_raw)))
                if not re.match(r'^_', pheno_center_strain_id):
                    # Tag bnode
                    pheno_center_strain_id = '_:' + pheno_center_strain_id

                geno.addGenotype(
                    pheno_center_strain_id, pheno_center_strain_label,
                    self.globaltt['genomic_background'])
                geno.addSequenceDerivesFrom(
                    pheno_center_strain_id, genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name + ' [' + pheno_center_strain_label + ']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                geno.addTaxon(taxon_id, pheno_center_strain_id)

            # this is redundant, but I'll keep it in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = self.make_id(
                (colony_id + phenotyping_center + zygosity +
                 strain_accession_id + sex))
            sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

            # resolve() handing the input back unchanged is treated as
            # "no mapping for this sex value"
            sq_type_id = self.resolve(sex, False)
            if sq_type_id == sex:
                sq_type_id = self.globaltt['intrinsic_genotype']
                LOG.warning(
                    "Unknown sex qualifier %s, adding as intrinsic_genotype",
                    sex)

            geno.addGenotype(
                sex_qualified_genotype_id, sex_qualified_genotype_label,
                sq_type_id)
            geno.addParts(
                genotype_id, sex_qualified_genotype_id,
                self.globaltt['has_variant_part'])

            if genomic_background_id is not None and \
                    genomic_background_id != '':
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender

            # sometimes phenotype ids are missing.  (about 711 early 2020)
            if mp_term_id is None or mp_term_id == '':
                LOG.warning(
                    "No phenotype id specified for row %d", reader.line_num)
                continue

            # hard coded ECO code
            eco_id = self.globaltt['mutant phenotype evidence']

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested
            assoc = G2PAssoc(
                graph, self.name, sex_qualified_genotype_id, mp_term_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id
            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            model._addSexSpecificity(assoc_id, self.resolve(sex))

            # add a free-text description; when effect_size or p_value is
            # not parseable as a float, fall back to the raw strings
            try:
                description = ' '.join((
                    mp_term_name, 'phenotype determined by',
                    phenotyping_center, 'in an',
                    procedure_name, 'assay where',
                    parameter_name.strip(),
                    'was measured with an effect_size of',
                    str(round(float(effect_size), 5)),
                    '(p =', "{:.4e}".format(float(p_value)), ').'))
            except ValueError:
                description = ' '.join((
                    mp_term_name, 'phenotype determined by',
                    phenotyping_center, 'in an',
                    procedure_name, 'assay where',
                    parameter_name.strip(),
                    'was measured with an effect_size of',
                    str(effect_size),
                    '(p =', "{0}".format(p_value), ').'))

            study_bnode = self._add_study_provenance(
                phenotyping_center, colony_raw, project_fullname,
                pipeline_name, pipeline_stable_id, procedure_stable_id,
                procedure_name, parameter_stable_id, parameter_name,
                statistical_method, resource_name)

            evidence_line_bnode = self._add_evidence(
                assoc_id, eco_id, p_value, percentage_change, effect_size,
                study_bnode)

            self._add_assertion_provenance(assoc_id, evidence_line_bnode)

            model.addDescription(evidence_line_bnode, description)

            # resource_id = resource_name
            # assoc.addSource(graph, assoc_id, resource_id)

            if not self.test_mode and \
                    limit is not None and reader.line_num > limit:
                break
def parse(self, limit=None):
    """
    Query the MouseMine web service for mammalian-phenotype annotations
    on sequence features and add them to ``self.graph``.

    One query is issued per two-digit MGI id prefix (``MGI:10*`` through
    ``MGI:99*``), restricted to the organism ``self.txid``.  For every
    returned row the subject is typed as a gene, given its taxon, and
    linked to the phenotype term through a ``G2PAssoc`` carrying
    'experimental phenotypic evidence'; when the row has a PubMed id the
    publication is added as a journal-article reference and as the
    source of the association.

    :param limit: when truthy, stop once this many id prefixes have been
        processed (it counts prefixes, not result rows).
    :return: None
    """
    model = Model(self.graph)
    geno = Genotype(self.graph)

    # The service client and the logger levels do not depend on the id
    # prefix, so set them up once rather than on every iteration.
    service = Service("http://www.mousemine.org/mousemine/service")
    logging.getLogger('Model').setLevel(logging.ERROR)
    logging.getLogger('JSONIterator').setLevel(logging.ERROR)

    count = 0
    for num in range(10, 100):
        fuzzy_gene = "MGI:{0}*".format(num)
        gene = "MGI:{0}".format(num)

        query = service.new_query("OntologyAnnotation")
        query.add_constraint("subject", "SequenceFeature")
        query.add_constraint("ontologyTerm", "MPTerm")
        query.add_view(
            "subject.primaryIdentifier", "subject.symbol",
            "subject.sequenceOntologyTerm.name",
            "ontologyTerm.identifier", "ontologyTerm.name",
            "evidence.publications.pubMedId",
            "evidence.comments.type", "evidence.comments.description")
        query.add_constraint(
            "subject.organism.taxonId", "=", self.txid, code="A")
        query.add_constraint("subject", "LOOKUP", fuzzy_gene, code="B")
        query.add_constraint(
            "subject.primaryIdentifier", "CONTAINS", gene, code="C")
        query.outerjoin("evidence.comments")

        for row in query.rows():
            mgi_curie = row["subject.primaryIdentifier"]
            mp_curie = row["ontologyTerm.identifier"]
            pubmed_num = row["evidence.publications.pubMedId"]
            pub_curie = "PMID:{0}".format(pubmed_num)

            model.addType(mgi_curie, self.globaltt['gene'])
            geno.addTaxon('NCBITaxon:' + self.txid, mgi_curie)

            assoc = G2PAssoc(self.graph, self.name, mgi_curie, mp_curie)
            # Only cite a publication when the row actually carries one.
            if pubmed_num:
                reference = Reference(
                    self.graph, pub_curie, self.globaltt['journal article'])
                reference.addRefToGraph()
                assoc.add_source(pub_curie)

            assoc.add_evidence(
                self.globaltt['experimental phenotypic evidence'])
            assoc.add_association_to_graph()

        # Progress report every tenth prefix.  The numbers logged are
        # the running prefix counter (starting at 0), not the literal
        # MGI prefix values (which start at 10).
        if count and count % 10 == 0:
            count_from = count - 10
            LOG.info(
                "%s processed ids from MGI:%i* to MGI:%i*",
                datetime.datetime.now(), count_from, count)

        count += 1
        if limit and count >= limit:
            break