def _add_deprecated_snp(self, snp_id, snp_id_current, merged, chrom_num, chrom_pos):
    """
    Record whether a SNP was merged into another rs id and pick the leader.

    When ``merged`` is ``'1'`` and a current id is given, the old ``snp_id``
    is deprecated in favour of the current rs id, the current id is
    remembered under its location in ``self.id_location_map``, and the
    current id is made the clique leader.  Otherwise ``snp_id`` itself is
    made the leader.

    :param snp_id: curie of the SNP as reported in the source row
    :param snp_id_current: current rs id, with or without the ``rs`` prefix
    :param merged: ``'1'`` when the source marks the SNP as merged
    :param chrom_num: passed to ``self._make_location_curie``
    :param chrom_pos: passed to ``self._make_location_curie``
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    location = self._make_location_curie(chrom_num, chrom_pos)

    # strip once so the emptiness test and the curie use the same value
    current = snp_id_current.strip()

    # add deprecation information
    if merged == '1' and current != '':
        # get the current rs_id
        current_rs_id = 'dbSNP:'
        if not re.match(r'rs', current):
            current_rs_id += 'rs'
        current_rs_id += current

        if location is not None:
            # set(some_string) would split the id into single characters,
            # so start from an empty set and add the whole id
            if location not in self.id_location_map:
                self.id_location_map[location] = set()
            self.id_location_map[location].add(current_rs_id)

        model.addDeprecatedIndividual(snp_id, current_rs_id)
        # TODO check on this
        # should we add the annotations to the current
        # or orig?
        model.makeLeader(current_rs_id)
    else:
        model.makeLeader(snp_id)
def parse(self, limit=None):
    """
    Turn the cleaned ZFIN gene-to-phenotype file into G2P associations.

    Loads the ZP mapping file into a ``ZFIN`` helper, then reads the
    tab-separated ``g2p_clean`` file.  Every row must have exactly 26
    fields (unpacking raises ``ValueError`` otherwise).  Each gene is made
    a clique leader; when the row maps to a ZP id an association is added,
    with the publication as source when one is given.

    :param limit: accepted for interface compatibility; not used here
    """
    zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
    model = Model(self.graph)
    zp_file = '/'.join((self.rawdir, self.files['zpmap']['file']))
    g2p_file = '/'.join((self.rawdir, self.files['g2p_clean']['file']))
    zfin_parser.zp_map = zfin_parser._load_zp_mappings(zp_file)

    with open(g2p_file, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            # the second relationship column pair is (id, label); the
            # original named both targets pc_rel2_id, which let the label
            # overwrite the id
            (internal_id, symbol, gene_id, subterm1_id, subterm1_label,
             pc_rel_id, pc_rel_label, superterm1_id, superterm1_label,
             quality_id, quality_name, modifier, subterm2_id,
             subterm2_label, pc_rel2_id, pc_rel2_label, superterm2_id,
             superterm2_label, fish_id, fish_label, start_stage, end_stage,
             environment, pub_id, figure_id, unknown_field) = row

            zp_id = zfin_parser._map_sextuple_to_phenotype(
                superterm1_id, subterm1_id, quality_id,
                superterm2_id, subterm2_id, modifier)

            gene_curie = "ZFIN:{0}".format(gene_id)
            model.makeLeader(gene_curie)

            if not zp_id:
                continue

            assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
            if pub_id:
                # only build the publication curie when there is one
                pub_curie = "ZFIN:{0}".format(pub_id)
                reference = Reference(
                    self.graph, pub_curie, Reference.ref_types['document'])
                reference.addRefToGraph()
                assoc.add_source(pub_curie)
            assoc.add_evidence('ECO:0000059')
            assoc.add_association_to_graph()
def _add_deprecated_snp(
        self, snp_id, snp_id_current, merged, chrom_num, chrom_pos):
    """
    Deprecate a merged SNP in favour of its current rs id.

    If ``merged`` is ``'1'`` and ``snp_id_current`` is non-blank, the
    current rs curie is built (adding the ``rs`` prefix when missing),
    stored in ``self.id_location_map`` under the SNP location, the old id
    is deprecated against it, and the current id becomes the clique leader.
    In every other case ``snp_id`` becomes the leader.

    :param snp_id: curie of the SNP as reported in the source row
    :param snp_id_current: current rs id, with or without the ``rs`` prefix
    :param merged: ``'1'`` when the source marks the SNP as merged
    :param chrom_num: passed to ``self._make_location_curie``
    :param chrom_pos: passed to ``self._make_location_curie``
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    location = self._make_location_curie(chrom_num, chrom_pos)

    # one stripped copy serves both the blank test and the curie
    current = snp_id_current.strip()

    if merged != '1' or current == '':
        model.makeLeader(snp_id)
        return

    # get the current rs_id
    if re.match(r'rs', current):
        current_rs_id = 'dbSNP:' + current
    else:
        current_rs_id = 'dbSNP:rs' + current

    if location is not None:
        # a set built directly from the string would hold its characters,
        # not the id, so create it empty and add the id as one element
        if location not in self.id_location_map:
            self.id_location_map[location] = set()
        self.id_location_map[location].add(current_rs_id)

    model.addDeprecatedIndividual(snp_id, current_rs_id)
    # TODO check on this
    # should we add the annotations to the current
    # or orig?
    model.makeLeader(current_rs_id)
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Link a gene to the identifiers listed in its dbxref string.

    ``xrefs`` is a ``|``-separated list such as
    ``MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479``.
    Each entry is split at its last colon into prefix and local id; the
    prefix is translated through ``self.localtt`` when present there.

    The clique-leader map is read from ``self.resources['clique_leader']``
    (/resources/clique_leader.yaml) to decide whether the taxon's preferred
    id space matches the xref prefix or the gene's own prefix.
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])
    model = Model(self.testgraph if self.test_mode else self.graph)
    skipped_prefixes = ['Vega', 'IMGT/GENE-DB', 'Araport']

    for raw_xref in xrefs.strip().split('|'):
        head, _, local_id = raw_xref.rpartition(':')
        prefix = head.strip()
        if prefix in self.localtt:
            prefix = self.localtt[prefix]
        curie = ':'.join([prefix, local_id])

        # entries without any prefix are ignored
        if prefix == '':
            continue

        if prefix == 'HPRD':
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.globaltt['has gene product'], curie)
            continue

        # skip some of these for now based on curie prefix
        if prefix in skipped_prefixes:
            continue

        if prefix == 'ENSEMBL':
            model.addXref(gene_id, curie)

        if prefix == 'OMIM':
            if curie in self.omim_replaced:
                # prefer a replacement id that is typed as a gene;
                # the last matching one wins
                for omim in self.omim_replaced[curie]:
                    if omim in self.omim_type and \
                            self.omim_type[omim] == self.globaltt['gene']:
                        curie = omim
            if curie in self.omim_type and \
                    self.omim_type[curie] != self.globaltt['gene']:
                continue

        try:
            if self.class_or_indiv.get(gene_id) != 'C':
                model.addSameIndividual(gene_id, curie)
            else:
                model.addEquivalentClass(gene_id, curie)
                if taxon in clique_map:
                    leader_space = clique_map[taxon]
                    if leader_space == prefix:
                        model.makeLeader(curie)
                    elif leader_space == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
        except AssertionError as err:
            LOG.warning("Error parsing %s: %s", gene_id, err)
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Add equivalentClass / sameAs relationships for a gene's dbxrefs.

    The external resource map behind ``self.resources['clique_leader']``
    (/resources/clique_leader.yaml) says which id space leads the clique
    for a given taxon.

    Example ``xrefs`` value:
    ``MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696``
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])
    if self.test_mode:
        target_graph = self.testgraph
    else:
        target_graph = self.graph
    model = Model(target_graph)
    ignored = ['Vega', 'IMGT/GENE-DB', 'Araport']

    for entry in xrefs.strip().split('|'):
        pieces = entry.split(':')
        # everything before the last colon is the prefix
        prefix = ':'.join(pieces[:-1]).strip()
        if prefix in self.localtt:
            prefix = self.localtt[prefix]
        xref_curie = ':'.join((prefix, pieces[-1]))

        if prefix != '':
            if prefix == 'HPRD':  # proteins are not == genes.
                model.addTriple(
                    gene_id, self.globaltt['has gene product'], xref_curie)
                continue
            # skip some of these for now based on curie prefix
            if prefix in ignored:
                continue

            if prefix == 'ENSEMBL':
                model.addXref(gene_id, xref_curie)

            if prefix == 'OMIM':
                if xref_curie in self.omim_replaced:
                    replacements = self.omim_replaced[xref_curie]
                    for candidate in replacements:
                        is_gene = candidate in self.omim_type and \
                            self.omim_type[candidate] == self.globaltt['gene']
                        if is_gene:
                            xref_curie = candidate
                not_a_gene = xref_curie in self.omim_type and \
                    self.omim_type[xref_curie] != self.globaltt['gene']
                if not_a_gene:
                    continue

            try:
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addEquivalentClass(gene_id, xref_curie)
                    if taxon in clique_map:
                        if clique_map[taxon] == prefix:
                            model.makeLeader(xref_curie)
                        elif clique_map[taxon] == gene_id.split(':')[0]:
                            model.makeLeader(gene_id)
                else:
                    model.addSameIndividual(gene_id, xref_curie)
            except AssertionError as err:
                LOG.warning("Error parsing %s: %s", gene_id, err)
def _add_deprecated_snp(self, snp_id, snp_id_current, merged, chrom_num, chrom_pos):
    """
    Deprecate a merged SNP against its current rs id and set the leader.

    When ``merged`` is ``'1'`` and ``snp_id_current`` is not empty, the
    current curie ``dbSNP:rs<snp_id_current>`` is recorded under the SNP
    location in ``self.id_location_map``, ``snp_id`` is deprecated in its
    favour (as a ``SequenceVariant``), and the current id is made the
    clique leader.  Otherwise ``snp_id`` is made the leader.

    :param snp_id: curie of the SNP as reported in the source row
    :param snp_id_current: current rs number (without the ``rs`` prefix)
    :param merged: ``'1'`` when the source marks the SNP as merged
    :param chrom_num: passed to ``self._make_location_curie``
    :param chrom_pos: passed to ``self._make_location_curie``
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    location = self._make_location_curie(chrom_num, chrom_pos)

    # add deprecation information
    if merged == '1' and snp_id_current != '':
        current_rs_id = 'dbSNP:rs' + snp_id_current

        if location is not None:
            # start from an empty set: set(current_rs_id) would store the
            # individual characters of the id instead of the id itself
            if location not in self.id_location_map:
                self.id_location_map[location] = set()
            self.id_location_map[location].add(current_rs_id)

        model.addDeprecatedIndividual(
            snp_id, current_rs_id,
            old_id_category=blv.terms['SequenceVariant'])
        # TODO check on this
        # should we add the annotations to the current
        # or orig?
        model.makeLeader(current_rs_id)
    else:
        model.makeLeader(snp_id)
def parse(self, limit=None):
    """
    Read the cleaned ZFIN gene-to-phenotype file and add associations.

    The ZP mappings are loaded into a ``ZFIN`` helper first.  Each row of
    the tab-separated ``g2p_clean`` file must hold exactly 26 fields
    (otherwise the unpacking raises ``ValueError``).  The gene is always
    made a clique leader; an association is added only when the row maps
    to a ZP id, and the publication is attached as its source when present.

    :param limit: accepted for interface compatibility; not used here
    """
    zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
    model = Model(self.graph)
    zp_file = '/'.join((self.rawdir, self.files['zpmap']['file']))
    g2p_file = '/'.join((self.rawdir, self.files['g2p_clean']['file']))
    zfin_parser.zp_map = zfin_parser._load_zp_mappings(zp_file)

    with open(g2p_file, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            # pc_rel2_id used to be listed twice here, so the label
            # column overwrote the id; the second target is the label
            (internal_id, symbol, gene_id, subterm1_id, subterm1_label,
             pc_rel_id, pc_rel_label, superterm1_id, superterm1_label,
             quality_id, quality_name, modifier, subterm2_id,
             subterm2_label, pc_rel2_id, pc_rel2_label, superterm2_id,
             superterm2_label, fish_id, fish_label, start_stage, end_stage,
             environment, pub_id, figure_id, unknown_field) = row

            zp_id = zfin_parser._map_sextuple_to_phenotype(
                superterm1_id, subterm1_id, quality_id,
                superterm2_id, subterm2_id, modifier)

            gene_curie = "ZFIN:{0}".format(gene_id)
            model.makeLeader(gene_curie)

            if zp_id:
                assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
                if pub_id:
                    # the publication curie is only needed on this path
                    pub_curie = "ZFIN:{0}".format(pub_id)
                    reference = Reference(
                        self.graph, pub_curie,
                        Reference.ref_types['document'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)
                assoc.add_evidence('ECO:0000059')
                assoc.add_association_to_graph()
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships

    Uses external resource map located in
    /resources/clique_leader.yaml to determine
    if an ID space is a clique leader

    :param xrefs: ``|``-separated xref string, e.g.
        ``MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479``
    :param gene_id: curie of the gene the xrefs belong to
    :param taxon: taxon number as a string (used as given for the
        taxon-specific xref table and as ``int(taxon)`` for the clique map)
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

    if self.testMode:
        graph = self.testgraph
    else:
        graph = self.graph

    filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']

    # These will be made xrefs
    taxon_spec_xref_filters = {'10090': ['ENSEMBL'], '9606': ['ENSEMBL']}
    taxon_spec_filters = taxon_spec_xref_filters.get(taxon, [])

    model = Model(graph)

    # deal with the xrefs
    for ref in xrefs.strip().split('|'):
        xref_curie = self._cleanup_id(ref)
        if xref_curie is None or xref_curie.strip() == '':
            continue

        if xref_curie.startswith('HPRD'):
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.properties['has_gene_product'], xref_curie)
            continue

        prefix = xref_curie.split(':')[0]

        # skip some of these for now
        if prefix in filter_out:
            continue

        # compare against the prefix list chosen for this taxon; the
        # dict of all taxa is keyed by taxon number and can never contain
        # an id-space prefix
        if prefix in taxon_spec_filters:
            model.addXref(gene_id, xref_curie)

        if xref_curie.startswith('OMIM') and \
                DipperUtil.is_omim_disease(xref_curie):
            continue

        try:
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addEquivalentClass(gene_id, xref_curie)
                if int(taxon) in clique_map:
                    if clique_map[int(taxon)] == prefix:
                        model.makeLeader(xref_curie)
                    elif clique_map[int(taxon)] == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
            else:
                model.addSameIndividual(gene_id, xref_curie)
        except AssertionError as e:
            # Logger.warn is a deprecated alias of Logger.warning
            logger.warning("Error parsing %s: %s", gene_id, e)
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships

    Uses external resource map located in
    /resources/clique_leader.yaml to determine
    if an ID space is a clique leader

    :param xrefs: ``|``-separated xref string, e.g.
        ``MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479``
    :param gene_id: curie of the gene the xrefs belong to
    :param taxon: taxon number as a string; ``int(taxon)`` is used to
        look up the clique map
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

    if self.testMode:
        graph = self.testgraph
    else:
        graph = self.graph

    filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']

    # extra prefixes to drop for particular taxa
    taxon_spec_filters = {'10090': ['ENSEMBL']}
    if taxon in taxon_spec_filters:
        filter_out += taxon_spec_filters[taxon]

    model = Model(graph)

    # deal with the xrefs
    for ref in xrefs.strip().split('|'):
        xref_curie = self._cleanup_id(ref)
        if xref_curie is None or xref_curie.strip() == '':
            continue

        if re.match(r'HPRD', xref_curie):
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.properties['has_gene_product'], xref_curie)
            continue

        # skip some of these for now
        if xref_curie.split(':')[0] in filter_out:
            continue

        if re.match(r'^OMIM', xref_curie):
            if DipperUtil.is_omim_disease(xref_curie):
                continue

        try:
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addEquivalentClass(gene_id, xref_curie)
                if int(taxon) in clique_map:
                    if clique_map[int(taxon)] == xref_curie.split(':')[0]:
                        model.makeLeader(xref_curie)
                    elif clique_map[int(taxon)] == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
            else:
                model.addSameIndividual(gene_id, xref_curie)
        except AssertionError as e:
            # Logger.warn is a deprecated alias; let logging format lazily
            logger.warning("Error parsing %s: %s", gene_id, e)
def parse(self, limit=None):
    """
    Read the cleaned ZFIN geno-to-pheno file and add G2P associations.

    Column positions are taken from ``self.files['g2p_clean']['columns']``.
    Rows whose length differs from that column list are reported with a
    warning but still processed.  Rows whose ``Phenotype Tag`` is not
    ``abnormal`` are skipped.  For the rest the gene is made a clique
    leader and, when the octuple maps to a ZP id, an association is added
    with the publication (if any) as its source.

    :param limit: accepted for interface compatibility; not used here
    """
    zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
    model = Model(self.graph)

    src_key = 'zpmap'  # keep same-as zfin.files[key]
    zfin_parser.zp_map = zfin_parser._load_zp_mappings(src_key)

    src_key = 'g2p_clean'
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("Processing clean Geno to Pheno from file: %s", raw)

    col = self.files[src_key]['columns']
    collen = len(col)

    # Resolve the needed column positions once instead of scanning the
    # column list again for every row.  Columns that are present in the
    # file but not used here: ID, Gene Symbol, the subterm/superterm and
    # relationship names, Phenotype Keyword Name, Fish ID,
    # Fish Display Name, Start/End Stage ID, Fish Environment ID, Figure ID.
    gene_at = col.index('Gene ID')
    subterm1_at = col.index('Affected Structure or Process 1 subterm ID')
    pc_rel_at = col.index('Post-composed Relationship ID')
    superterm1_at = col.index(
        'Affected Structure or Process 1 superterm ID')
    quality_at = col.index('Phenotype Keyword ID')
    modifier_at = col.index('Phenotype Tag')
    subterm2_at = col.index('Affected Structure or Process 2 subterm ID')
    pc_rel2_at = col.index('Post-composed Relationship (rel) ID')
    superterm2_at = col.index(
        'Affected Structure or Process 2 superterm ID')
    pub_at = col.index('Publication ID')

    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in reader:
            if len(row) != collen:
                LOG.warning(
                    'Row: %i has unexpected format', reader.line_num)

            # gene, first subterm and second relationship ids are used
            # as read; the other ids are stripped
            gene_id = row[gene_at]
            subterm1_id = row[subterm1_at]
            pc_rel_id = row[pc_rel_at].strip()
            superterm1_id = row[superterm1_at].strip()
            quality_id = row[quality_at].strip()
            modifier = row[modifier_at].strip()
            subterm2_id = row[subterm2_at].strip()
            pc_rel2_id = row[pc_rel2_at]
            superterm2_id = row[superterm2_at].strip()
            pub_id = row[pub_at].strip()

            if modifier != 'abnormal':
                LOG.warning(
                    "skipping phenotype with modifier %s != abnormal ",
                    modifier)
                continue

            zp_id = zfin_parser._map_octuple_to_phenotype(
                subterm1_id, pc_rel_id, superterm1_id, quality_id,
                subterm2_id, pc_rel2_id, superterm2_id, modifier)

            gene_curie = "ZFIN:{0}".format(gene_id)
            model.makeLeader(gene_curie)
            pub_curie = "ZFIN:{0}".format(pub_id)

            if zp_id:
                assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
                if pub_id:
                    reference = Reference(
                        self.graph, pub_curie, self.globaltt['document'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)
                assoc.add_evidence(
                    self.globaltt['experimental phenotypic evidence'])
                assoc.add_association_to_graph()
def _process_genes(self, limit=None):
    """
    Load the HGNC complete gene set file into the graph.

    Reads the tab-separated ``genes`` file; every row (header included)
    must have 49 fields.  For each gene row the class is added when the
    locus type resolves to something other than itself, withdrawn genes
    are deprecated and all others made clique leaders, NCBIGene / ENSEMBL /
    non-disease OMIM equivalents are added, PubMed ids are attached with
    ``is_about``, and the gene is placed on its chromosome band (or whole
    chromosome) when the location field can be parsed.

    :param limit: outside test mode, stop once more than this many
        lines have been read
    """
    graph = self.testgraph if self.testMode else self.graph

    geno = Genotype(graph)
    model = Model(graph)
    raw = '/'.join((self.rawdir, self.files['genes']['file']))

    # location looks like 10p11.2, 17q25, "1 not on reference assembly",
    # 10q24.1-q24.3 or "11q11 alternate reference locus"
    chr_regex = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_regex = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    logger.info("Processing HGNC genes")

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n .
        for line_counter, row in enumerate(filereader, start=1):
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id, ensembl_gene_id,
             vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
             pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
             homeodb, snornabase, bioparadigms_slc, orphanet,
             pseudogene_org, horde_id, merops, imgt, iuphar,
             kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
             intermediate_filament_db, rna_central_ids) = row

            # skip header
            if line_counter <= 1:
                continue

            if self.testMode and entrez_id != '' and \
                    int(entrez_id) not in self.gene_ids:
                continue

            if name == '':
                name = None

            gene_type_id = self.resolve(locus_type, False)  # withdrawn -> None?
            if gene_type_id != locus_type:
                model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)
            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
            if omim_id != '' and "|" not in omim_id:
                omim_curie = 'OMIM:' + omim_id
                if not DipperUtil.is_omim_disease(omim_curie):
                    model.addEquivalentClass(hgnc_id, omim_curie)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            if pubmed_id != '':
                for pmid in pubmed_id.strip().split('|'):
                    if pmid != '':
                        graph.addTriple(
                            'PMID:' + pmid.strip(),
                            self.globaltt['is_about'], hgnc_id)

            # add chr location
            chr_match = chr_regex.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_regex.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is None:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)
                else:
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band = chrom + band_match.group(1)
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break
    # end loop through file
    return
def _get_variants(self, limit):
    """
    Currently loops through the variant_summary file.

    Reads the gzipped, tab-separated ClinVar ``variant_summary`` file line
    by line (it is not unzipped on disk), skipping ``#`` comment lines.
    Each data line is expected to have 29 columns.  For every variant it
    adds the reference genome build, the chromosome (or cytogenetic band),
    the variant feature itself as clique leader, HGVS synonyms, dbSNP /
    dbVar equivalents, RCV xrefs, the gene and a variant locus, disease
    associations from the phenotype id list, and selected "other ids".

    In test mode only rows matching ``self.gene_ids``,
    ``self.variant_ids`` or ``self.disease_ids`` are processed.

    :param limit: outside test mode, stop once more than this many data
        lines have been read
    :return: None

    """

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    model = Model(g)

    geno = Genotype(g)
    # NOTE(review): this Feature is never used; the name ``f`` is rebound
    # by the ``with`` statement below and again for every row.
    f = Feature(g, None, None, None)

    # add the taxon and the genome
    tax_num = '9606'  # HARDCODE
    tax_id = 'NCBITaxon:' + tax_num
    tax_label = 'Human'
    model.addClassToGraph(tax_id, None)
    geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

    # not unzipping the file
    logger.info("Processing Variant records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            # skip comments
            line = line.decode().strip()
            if re.match(r'^#', line):
                continue

            # AlleleID               integer value as stored in the AlleleID field in ClinVar (//Measure/@ID in the XML)
            # Type                   character, the type of variation
            # Name                   character, the preferred name for the variation
            # GeneID                 integer, GeneID in NCBI's Gene database
            # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
            # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
            #                          for the mapping between the terms listed here and the integers in the .VCF files, see
            #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
            # RS# (dbSNP)            integer, rs# in dbSNP
            # nsv (dbVar)            character, the NSV identifier for the region in dbVar
            # RCVaccession           character, list of RCV accessions that report this variant
            # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
            # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
            # Origin                 character, list of all allelic origins for this variation
            # Assembly               character, name of the assembly on which locations are based
            # Chromosome             character, chromosomal location
            # Start                  integer, starting location, in pter->qter orientation
            # Stop                   integer, end location, in pter->qter orientation
            # Cytogenetic            character, ISCN band
            # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
            #                            and their relationship to the star graphics ClinVar displays on its web pages,
            #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
            # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
            # HGVS(p.)               character, RefSeq protein-based HGVS expression
            # NumberSubmitters       integer, number of submissions with this variant
            # LastEvaluated          datetime, the latest time any submitter reported clinical significance
            # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
            #                            (NOTE: if ACMG, not a specific to the allele but to the Gene)
            # OtherIDs               character, list of other identifiers or sources of information about this variant
            # VariantID              integer, the value used to build the URL for the current default report,
            #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
            #
            # a crude check that there's an expected number of cols.
            # if not, error out because something changed.
            # (the mismatch is only logged here; the tuple unpacking just
            # below is what actually raises on a wrong column count)
            num_cols = len(line.split('\t'))
            expected_numcols = 29
            if num_cols != expected_numcols:
                logger.error(
                    "Unexpected number of columns in raw file " +
                    "(%d actual vs %d expected)",
                    num_cols, expected_numcols)

            (allele_num, allele_type, allele_name, gene_num, gene_symbol,
             clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
             tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
             stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
             number_of_submitters, last_eval, guidelines, other_ids,
             variant_num, reference_allele, alternate_allele, categories,
             ChromosomeAccession) = line.split('\t')

            # ###set filter=None in init if you don't want to have a filter
            # if self.filter is not None:
            #    if ((self.filter == 'taxids' and\
            #            (int(tax_num) not in self.tax_ids)) or\
            #            (self.filter == 'geneids' and\
            #             (int(gene_num) not in self.gene_ids))):
            #        continue
            # #### end filter

            line_counter += 1

            pheno_list = []
            if phenotype_ids != '-':
                # trim any leading/trailing semicolons/commas
                phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                pheno_list = re.split(r'[,;]', phenotype_ids)

            if self.testMode:
                # get intersection of test disease ids
                # and these phenotype_ids
                intersect = \
                    list(
                        set([str(i) for i in self.disease_ids]) &
                        set(pheno_list))
                # keep the row if the gene, the variant or any disease
                # is one of the test ids
                if int(gene_num) not in self.gene_ids and\
                        int(variant_num) not in self.variant_ids and\
                        len(intersect) < 1:
                    continue

            # TODO may need to switch on assembly to create correct
            # assembly/build identifiers
            build_id = ':'.join(('NCBIGenome', assembly))

            # make the reference genome build
            geno.addReferenceGenome(build_id, assembly, tax_id)

            allele_type_id = self._map_type_of_allele(allele_type)
            bandinbuild_id = None
            if str(chr) == '':
                # check cytogenic location
                if str(cytogenetic_loc).strip() != '':
                    # use cytogenic location to get the apx location
                    # oddly, they still put an assembly number even when
                    # there's no numeric location
                    if not re.search(r'-', str(cytogenetic_loc)):
                        # NOTE(review): re.split returns a list, and this
                        # branch only runs when there is no '-' to split
                        # on, so makeChromID receives a one-element list
                        # -- confirm that is what it expects.
                        band_id = makeChromID(
                            re.split(r'-', str(cytogenetic_loc)),
                            tax_num, 'CHR')
                        geno.addChromosomeInstance(cytogenetic_loc, build_id,
                                                   assembly, band_id)
                        bandinbuild_id = makeChromID(
                            re.split(r'-', str(cytogenetic_loc)),
                            assembly, 'MONARCH')
                    else:
                        # can't deal with ranges yet
                        pass
            else:
                # add the human chromosome class to the graph,
                # and add the build-specific version of it
                chr_id = makeChromID(str(chr), tax_num, 'CHR')
                geno.addChromosomeClass(str(chr), tax_id, tax_label)
                geno.addChromosomeInstance(str(chr), build_id,
                                           assembly, chr_id)
                chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

            seqalt_id = ':'.join(('ClinVarVariant', variant_num))
            gene_id = None
            # they use -1 to indicate unknown gene
            if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                # a "Gene:"-prefixed value only rewrites gene_num;
                # gene_id stays None on that path
                if re.match(r'^Gene:', gene_num):
                    gene_num = "NCBI" + gene_num
                else:
                    gene_id = ':'.join(('NCBIGene', str(gene_num)))

            # FIXME there are some "variants" that are actually haplotypes
            # probably will get taken care of when we switch to processing
            # the xml for example, variant_num = 38562
            # but there's no way to tell if it's a haplotype
            # in the csv data so the dbsnp or dbvar
            # should probably be primary,
            # and the variant num be the vslc,
            # with each of the dbsnps being added to it

            # TODO clinical significance needs to be mapped to
            # a list of terms
            # first, make the variant:
            # NOTE(review): Feature is called with four arguments (graph
            # first) near the top of this function but with three here --
            # confirm whether the graph argument is missing.
            f = Feature(seqalt_id, allele_name, allele_type_id)

            # NOTE(review): chrinbuild_id is only assigned in the
            # non-empty-chromosome branch above; for a row with an empty
            # chromosome it is either unbound or left over from an
            # earlier row -- confirm intended handling.
            if start != '-' and start.strip() != '':
                f.addFeatureStartLocation(start, chrinbuild_id)
            if stop != '-' and stop.strip() != '':
                f.addFeatureEndLocation(stop, chrinbuild_id)

            f.addFeatureToGraph()
            f.addTaxonToFeature(tax_id)
            # make the ClinVarVariant the clique leader
            model.makeLeader(seqalt_id)

            if bandinbuild_id is not None:
                f.addSubsequenceOfFeature(bandinbuild_id)

            # CHECK - this makes the assumption that there is
            # only one affected chromosome per variant what happens with
            # chromosomal rearrangement variants?
            # shouldn't both chromosomes be here?

            # add the hgvs as synonyms
            if hgvs_c != '-' and hgvs_c.strip() != '':
                model.addSynonym(seqalt_id, hgvs_c)
            if hgvs_p != '-' and hgvs_p.strip() != '':
                model.addSynonym(seqalt_id, hgvs_p)

            # add the dbsnp and dbvar ids as equivalent
            if dbsnp_num != '-' and int(dbsnp_num) != -1:
                dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                model.addIndividualToGraph(dbsnp_id, None)
                model.addSameIndividual(seqalt_id, dbsnp_id)
            if dbvar_num != '-':
                dbvar_id = 'dbVar:' + dbvar_num
                model.addIndividualToGraph(dbvar_id, None)
                model.addSameIndividual(seqalt_id, dbvar_id)

            # TODO - not sure if this is right... add as xref?
            # the rcv is like the combo of the phenotype with the variant
            if rcv_nums != '-':
                for rcv_num in re.split(r';', rcv_nums):
                    rcv_id = 'ClinVar:' + rcv_num
                    model.addIndividualToGraph(rcv_id, None)
                    model.addXref(seqalt_id, rcv_id)

            if gene_id is not None:
                # add the gene
                model.addClassToGraph(gene_id, gene_symbol)
                # make a variant locus
                vl_id = '_' + gene_num + '-' + variant_num
                if self.nobnodes:
                    vl_id = ':' + vl_id
                vl_label = allele_name
                model.addIndividualToGraph(
                    vl_id, vl_label, geno.genoparts['variant_locus'])
                geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                geno.addAlleleOfGene(vl_id, gene_id)
            else:
                # some basic reporting
                # The pattern below escapes its parentheses and so has no
                # capture group: groups() is always empty and the first
                # branch is never taken.
                gmatch = re.search(r'\(\w+\)', allele_name)
                if gmatch is not None and len(gmatch.groups()) > 0:
                    logger.info(
                        "Gene found in allele label, but no id provided: %s",
                        gmatch.group(1))
                elif re.match(r'more than 10', gene_symbol):
                    logger.info(
                        "More than 10 genes found; "
                        "need to process XML to fetch (variant=%d)",
                        int(variant_num))
                else:
                    logger.info("No gene listed for variant %d",
                                int(variant_num))

            # parse the list of "phenotypes" which are diseases.
            # add them as an association
            # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
            # the list is both semicolon delimited and comma delimited,
            # but i don't know why! some are bad, like:
            # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
            if phenotype_ids != '-':
                for phenotype in pheno_list:
                    # normalise the known odd prefixes before use
                    m = re.match(
                        r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                    if m is not None and len(m.groups()) > 0:
                        phenotype = re.sub(
                            m.group(1), 'Orphanet:', phenotype.strip())
                    elif re.match(r'ORPHA:\d+', phenotype):
                        phenotype = re.sub(
                            r'^ORPHA', 'Orphanet', phenotype.strip())
                    elif re.match(r'Human Phenotype Ontology', phenotype):
                        phenotype = re.sub(
                            r'^Human Phenotype Ontology', '',
                            phenotype.strip())
                    elif re.match(r'SNOMED CT:\s?', phenotype):
                        phenotype = re.sub(
                            r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                    elif re.match(r'^Gene:', phenotype):
                        continue

                    assoc = G2PAssoc(
                        g, self.name, seqalt_id, phenotype.strip())
                    assoc.add_association_to_graph()

            if other_ids != '-':
                id_list = other_ids.split(',')
                # process the "other ids" ex:
                # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                # TODO make more xrefs
                for xrefid in id_list:
                    prefix = xrefid.split(':')[0].strip()
                    if prefix == 'OMIM Allelic Variant':
                        xrefid = 'OMIM:' + xrefid.split(':')[1]
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    elif prefix == 'HGMD':
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    elif prefix == 'dbVar' \
                            and dbvar_num == xrefid.split(':')[1].strip():
                        pass  # skip over this one
                    elif re.search(r'\s', prefix):
                        pass
                        # logger.debug(
                        #   'xref prefix has a space: %s', xrefid)
                    else:
                        # should be a good clean prefix
                        # note that HGMD variants are in here as Xrefs
                        # because we can't resolve URIs for them
                        # logger.info("Adding xref: %s", xrefid)
                        # gu.addXref(g, seqalt_id, xrefid)
                        # logger.info("xref prefix to add: %s", xrefid)
                        pass

            # rows skipped by an earlier ``continue`` never reach this check
            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    logger.info("Finished parsing variants")
    return
def _get_var_citations(self, limit):
    """
    Attach literature citations to ClinVar variants.

    Reads the tab-delimited ``variant_citations`` report (generated
    weekly), which connects citations to the AlleleID, the VariationID,
    and either the rs# from dbSNP or the nsv from dbVar.  Columns:

      AlleleID         int value (xpath //Measure/@ID )
      VariationID      ID ClinVar uses to anchor default display.
                       (xpath //MeasureSet/@ID)
      rs               rs identifier from dbSNP
      nsv              nsv identifier from dbVar
      citation_source  The source of the citation, either PubMed,
                       PubMedCentral, or the NCBI Bookshelf
      citation_id      The identifier used by that source

    PubMed and PubMedCentral citations are added as references that are
    ``is_about`` the variant; other sources are ignored.  In test mode
    only variants in ``self.variant_ids`` are processed.

    :param limit: outside test mode, stop once more than this many data
        rows have been read
    """
    logger.info("Processing Citations for variants")
    line_counter = 0
    myfile = '/'.join(
        (self.rawdir, self.files['variant_citations']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)

    with open(myfile, 'r', encoding="utf8") as f:
        filereader = csv.reader(f, delimiter='\t', quotechar='\"')

        for line in filereader:
            # skip blank rows (csv yields [] for them) and comments
            if not line or line[0].startswith('#'):
                continue
            (allele_num, variant_num, rs_num, nsv_num,
             citation_source, citation_id) = line

            line_counter += 1

            if self.testMode:
                if int(variant_num) not in self.variant_ids:
                    continue

            if citation_id.strip() == '':
                logger.info(
                    "Skipping blank citation for ClinVarVariant:%s",
                    str(variant_num))
                continue

            # the citation for a variant is made to some kind of
            # combination of the ids here.
            # but i'm not sure which, we don't know what the
            # citation is for exactly, other than the variant.
            # so use mentions
            var_id = 'ClinVarVariant:' + variant_num

            # citation source: PubMed | PubMedCentral | citation_source
            # format the citation id:
            ref_id = None
            if citation_source == 'PubMed':
                ref_id = 'PMID:' + str(citation_id.replace(" ", ""))
                model.makeLeader(ref_id)
            elif citation_source == 'PubMedCentral':
                ref_id = 'PMCID:' + str(citation_id)

            if ref_id is not None:
                # write the reference to the same graph as everything
                # else here, so test mode does not leak into self.graph
                r = Reference(
                    g, ref_id, Reference.ref_types['journal_article'])
                r.addRefToGraph()
                g.addTriple(ref_id, self.properties['is_about'], var_id)

            if not self.testMode \
                    and (limit is not None and line_counter > limit):
                break

    logger.info("Finished processing citations for variants")
def _process_phenotype_data(self, limit):
    """
    NOTE: If a Strain carries more than one mutation,
    then each Mutation description,
    i.e., the set: (
        Mutation Type - Chromosome - Gene Symbol -
        Gene Name - Allele Symbol - Allele Name)
    will require a separate line.

    Note that MMRRC curates phenotypes to alleles,
    even though they distribute only one file with the
    phenotypes appearing to be associated with a strain.

    So, here we process the allele-to-phenotype relationships separately
    from the strain-to-allele relationships.

    The catalog file is read row by row: strains, references and
    allele-to-phenotype associations are written immediately, while the
    alleles and genes seen for each strain are collected in
    ``self.strain_hash`` / ``self.id_label_hash``.  After the file is
    read, one genotype with unknown zygosity and an unspecified
    background is built per strain from what was collected.

    :param limit: when not in test mode, stop reading once the reader's
        line number exceeds ``limit``; ``None`` reads the whole file
    :return: None
    """
    src_key = 'catalog'
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    fname = '/'.join((self.rawdir, self.files[src_key]['file']))

    self.strain_hash = {}
    self.id_label_hash = {}
    genes_with_no_ids = set()
    stem_cell_class = self.globaltt['stem cell']
    mouse_taxon = self.globaltt['Mus musculus']
    geno = Genotype(graph)

    # distinct strains that list phenotypes but no usable allele
    strain_missing_allele = set()

    with open(fname, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        # First line is header not date/version info. This changed recently,
        # apparently as of Sep 2019. Also, 3rd line is no longer blank.
        row = [x.strip() for x in next(reader)]  # messy messy
        col = self.files['catalog']['columns']
        # a header mismatch is deliberately not fatal here:
        # the result of the check is ignored and processing continues
        self.check_fileheader(col, row)

        for row in reader:
            strain_id = row[col.index('STRAIN/STOCK_ID')].strip()
            strain_label = row[col.index('STRAIN/STOCK_DESIGNATION')]
            # strain_type_symbol = row[col.index('STRAIN_TYPE')]
            strain_state = row[col.index('STATE')]
            mgi_allele_id = row[
                col.index('MGI_ALLELE_ACCESSION_ID')].strip()
            mgi_allele_symbol = row[col.index('ALLELE_SYMBOL')]
            # mgi_allele_name = row[col.index('ALLELE_NAME')]
            # mutation_type = row[col.index('MUTATION_TYPE')]
            # chrom = row[col.index('CHROMOSOME')]
            mgi_gene_id = row[col.index('MGI_GENE_ACCESSION_ID')].strip()
            mgi_gene_symbol = row[col.index('GENE_SYMBOL')].strip()
            mgi_gene_name = row[col.index('GENE_NAME')]
            # sds_url = row[col.index('SDS_URL')]
            # accepted_date = row[col.index('ACCEPTED_DATE')]
            mpt_ids = row[col.index('MPT_IDS')].strip()
            pubmed_nums = row[col.index('PUBMED_IDS')].strip()
            research_areas = row[col.index('RESEARCH_AREAS')].strip()

            # skip strains outside the test set (test mode only)
            # and every withdrawn gene
            if (self.test_mode and strain_id not in self.test_ids) \
                    or mgi_gene_name == 'withdrawn':
                continue

            # strip off stuff after the dash -
            # is the holding center important?
            # MMRRC:00001-UNC --> MMRRC:00001
            strain_id = re.sub(r'-\w+$', '', strain_id)

            self.id_label_hash[strain_id] = strain_label

            # get the variant or gene to save for later building of
            # the genotype
            if strain_id not in self.strain_hash:
                self.strain_hash[strain_id] = {
                    'variants': set(), 'genes': set()}

            # flag bad ones
            if mgi_allele_id[:4] != 'MGI:' and mgi_allele_id != '':
                LOG.error("Erroneous MGI allele id: %s", mgi_allele_id)
                if mgi_allele_id[:3] == 'MG:':
                    mgi_allele_id = 'MGI:' + mgi_allele_id[3:]
                else:
                    mgi_allele_id = ''

            if mgi_allele_id != '':
                self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                # use the following if needing to add the sequence
                # alteration types
                # var_type = self.localtt[mutation_type]
                # make a sequence alteration for this variant locus,
                # and link the variation type to it
                # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                # if self.nobnodes:
                #     sa_id = ':'+sa_id
                # gu.addIndividualToGraph(g, sa_id, None, var_type)
                # geno.addSequenceAlterationToVariantLocus(sa_id,
                #                                          mgi_allele_id)

            # scrub out any spaces, fix known issues
            mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
            if mgi_gene_id == 'NULL':
                mgi_gene_id = ''
            elif mgi_gene_id[:7] == 'GeneID:':
                mgi_gene_id = 'NCBIGene:' + mgi_gene_id[7:]

            if mgi_gene_id != '':
                try:
                    curie, _localid = mgi_gene_id.split(':')
                except ValueError as verror:
                    LOG.warning(
                        "Problem parsing mgi_gene_id %s from file %s: %s",
                        mgi_gene_id, fname, verror)
                    # never fall through with an undefined prefix or one
                    # left over from a previous row
                    curie = None
                if curie not in ['MGI', 'NCBIGene']:
                    LOG.info("MGI Gene id not recognized: %s", mgi_gene_id)
                self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

            # catch some errors - too many. report summary at the end
            # some things have gene labels, but no identifiers - report
            if mgi_gene_symbol != '' and mgi_gene_id == '':
                # LOG.error(
                #    "Gene label with no MGI identifier for strain %s: %s",
                #    strain_id, mgi_gene_symbol)
                genes_with_no_ids.add(mgi_gene_symbol)
                # make a temp id for genes that aren't identified ... err wow.
                # tmp_gene_id = '_' + mgi_gene_symbol
                # self.id_label_hash[tmp_gene_id.strip()] = mgi_gene_symbol
                # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

            # split apart the mp ids
            # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
            # mpt_ids are a comma delimited list
            # labels with MP terms following in brackets
            phenotype_ids = []
            if mpt_ids != '':
                for lb_mp in mpt_ids.split(r','):
                    lb_mp = lb_mp.strip()
                    if lb_mp[-1:] == ']' and lb_mp[-12:-8] == '[MP:':
                        # everything between '[' and ']':
                        # 'ataxia [MP:0001393]' -> 'MP:0001393'
                        phenotype_ids.append(lb_mp[-11:-1])

            # pubmed ids are space delimited
            pubmed_ids = []
            if pubmed_nums != '':
                for pm_num in re.split(r'\s+', pubmed_nums):
                    pmid = 'PMID:' + pm_num.strip()
                    pubmed_ids.append(pmid)
                    ref = Reference(
                        graph, pmid, self.globaltt['journal article'])
                    ref.addRefToGraph()

            # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
            # is a good example of 4 genotype parts

            model.addClassToGraph(mouse_taxon, None)
            if research_areas == '':
                research_areas = None
            else:
                research_areas = 'Research Areas: ' + research_areas
            strain_type = mouse_taxon
            if strain_state == 'ES':
                strain_type = stem_cell_class
            model.addIndividualToGraph(   # an inst of mouse??
                strain_id, strain_label, strain_type, research_areas)
            model.makeLeader(strain_id)

            # phenotypes are associated with the alleles
            for pid in phenotype_ids:
                # assume the phenotype label is in some ontology
                model.addClassToGraph(pid, None)
                if mgi_allele_id is not None and mgi_allele_id != '':
                    assoc = G2PAssoc(
                        graph, self.name, mgi_allele_id, pid,
                        self.globaltt['has phenotype'])
                    for p in pubmed_ids:
                        assoc.add_source(p)
                    assoc.add_association_to_graph()
                else:
                    # too chatty here. report aggregate
                    # LOG.info("Phenotypes and no allele for %s", strain_id)
                    strain_missing_allele.add(strain_id)

            if not self.test_mode and (
                    limit is not None and reader.line_num > limit):
                break

    # report misses
    if strain_missing_allele:
        LOG.info(
            "Phenotypes and no allele for %i strains",
            len(strain_missing_allele))

    # now that we've collected all of the variant information, build it
    # we don't know their zygosities
    for s, h in self.strain_hash.items():
        variants = h['variants']
        genes = h['genes']
        vl_set = set()
        # make variant loci for each gene
        if variants:
            for var in variants:
                vl_id = var.strip()
                vl_symbol = self.id_label_hash[vl_id]
                geno.addAllele(
                    vl_id, vl_symbol, self.globaltt['variant_locus'])
                vl_set.add(vl_id)
                # an allele is only tied to a gene when the pairing is
                # unambiguous (exactly one of each for the strain)
                if len(variants) == 1 and len(genes) == 1:
                    for gene in genes:
                        geno.addAlleleOfGene(vl_id, gene)
                else:
                    geno.addAllele(vl_id, vl_symbol)
        else:  # len(vars) == 0
            # it's just anonymous variants in some gene
            for gene in genes:
                vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
                vl_symbol = self.id_label_hash[gene] + '<?>'
                self.id_label_hash[vl_id] = vl_symbol
                geno.addAllele(
                    vl_id, vl_symbol, self.globaltt['variant_locus'])
                geno.addGene(gene, self.id_label_hash[gene])
                geno.addAlleleOfGene(vl_id, gene)
                vl_set.add(vl_id)

        # make the vslcs
        vl_list = sorted(vl_set)
        vslc_list = []
        for vl in vl_list:
            # for unknown zygosity
            vslc_id = re.sub(r'^_', '', vl) + 'U'
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:' + vslc_id
            vslc_label = self.id_label_hash[vl] + '/?'
            self.id_label_hash[vslc_id] = vslc_label
            vslc_list.append(vslc_id)
            geno.addPartsToVSLC(
                vslc_id, vl, None, self.globaltt['indeterminate'],
                self.globaltt['has_variant_part'], None)
            model.addIndividualToGraph(
                vslc_id, vslc_label,
                self.globaltt['variant single locus complement'])

        if vslc_list:
            if len(vslc_list) > 1:
                gvc_id = '-'.join(vslc_list)
                gvc_id = re.sub(r'_|:', '', gvc_id)
                gvc_id = '_:' + gvc_id
                gvc_label = '; '.join(
                    self.id_label_hash[v] for v in vslc_list)
                model.addIndividualToGraph(
                    gvc_id, gvc_label,
                    self.globaltt['genomic_variation_complement'])
                for vslc_id in vslc_list:
                    geno.addVSLCtoParent(vslc_id, gvc_id)
            else:
                # the GVC == VSLC, so don't have to make an extra piece
                gvc_id = vslc_list.pop()
                gvc_label = self.id_label_hash[gvc_id]

            genotype_label = gvc_label + ' [n.s.]'
            bkgd_id = re.sub(
                r':', '', '-'.join((
                    self.globaltt['unspecified_genomic_background'], s)))
            genotype_id = '-'.join((gvc_id, bkgd_id))
            bkgd_id = '_:' + bkgd_id
            geno.addTaxon(mouse_taxon, bkgd_id)
            geno.addGenomicBackground(
                bkgd_id, 'unspecified (' + s + ')',
                self.globaltt['unspecified_genomic_background'],
                "A placeholder for the unspecified genetic background for "
                + s)
            geno.addGenomicBackgroundToGenotype(
                bkgd_id, genotype_id,
                self.globaltt['unspecified_genomic_background'])
            geno.addParts(
                gvc_id, genotype_id, self.globaltt['has_variant_part'])
            geno.addGenotype(genotype_id, genotype_label)
            graph.addTriple(
                s, self.globaltt['has_genotype'], genotype_id)
        # else:
        #     LOG.debug("Strain %s is not making a proper genotype.", s)

    LOG.warning(
        "The following gene symbols did not list identifiers: %s",
        str(sorted(list(genes_with_no_ids))))
    LOG.error(
        '%i symbols given are missing their gene identifiers',
        len(genes_with_no_ids))

    return
def _process_genes(self, limit=None):
    """
    Add the HGNC genes file to the graph.

    For each row the gene is added as a class (or marked deprecated when
    its locus type is ``withdrawn``), linked by equivalence to its
    NCBIGene / ENSEMBL / OMIM-gene identifiers, given the taxon
    ``self.hs_txid``, made the object of an ``is_about`` triple from each
    listed PubMed id, and made a subsequence of its chromosome band (or
    of the whole chromosome when no band can be parsed).

    The process exits with status -1 if the file header does not pass
    ``self.check_fileheader``.

    :param limit: when not in test mode, stop reading once the reader's
        line number exceeds ``limit``; ``None`` reads the whole file
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    geno = Genotype(graph)
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    col = self.files['genes']['columns']
    LOG.info("Processing HGNC genes")

    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        row = next(filereader)
        if not self.check_fileheader(col, row):
            exit(-1)

        for row in filereader:
            # To generate an assignment for every column:
            # head -1 hgnc_complete_set.txt.1 | tr '\t' '\n' |
            # sed "s/\(.*\)/\1 = row[col.index(\'\1\')]/g"
            # Only the columns used below are read; every other column
            # named in `col` is ignored.
            hgnc_id = row[col.index('hgnc_id')].strip()
            symbol = row[col.index('symbol')].strip()
            name = row[col.index('name')].strip()
            locus_type = row[col.index('locus_type')].strip()
            location = row[col.index('location')].strip()
            entrez_id = row[col.index('entrez_id')].strip()
            ensembl_gene_id = row[col.index('ensembl_gene_id')].strip()
            # pipe separated!
            pubmed_ids = row[col.index('pubmed_id')].strip()
            # pipe separated!
            omim_ids = row[col.index('omim_id')].strip()

            if self.test_mode and entrez_id != '' and \
                    entrez_id not in self.gene_ids:
                continue

            if name == '':
                name = None

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                # withdrawn -> None?
                gene_type_id = self.resolve(locus_type, False)
                # only type the gene when the locus type resolved to
                # something other than itself
                if gene_type_id != locus_type:
                    model.addClassToGraph(
                        hgnc_id, symbol, gene_type_id, name)
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            for omim_id in omim_ids.split('|'):
                if omim_id in self.omim_replaced:
                    repl = self.omim_replaced[omim_id]
                    LOG.warning('%s is replaced with %s', omim_id, repl)
                    for omim in repl:
                        # a replacement id with no recorded type is
                        # skipped instead of raising KeyError
                        if omim in self.omim_type and \
                                self.omim_type[omim] == self.globaltt['gene']:
                            omim_id = omim

                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            for pubmed_id in pubmed_ids.split('|'):
                pubmed_id = pubmed_id.strip()
                # ''.split('|') yields [''] -- never emit a bare 'PMID:'
                if pubmed_id == '':
                    continue
                graph.addTriple(
                    'PMID:' + pubmed_id, self.globaltt['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            band = chrom = None
            chr_match = chr_pattern.match(location)
            if chr_match is not None and len(chr_match.groups()) > 0:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None and \
                        len(band_match.groups()) > 0:
                    band = band_match.group(1)
                    band = chrom + band
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.test_mode and limit is not None and \
                    filereader.line_num > limit:
                break
def _process_phenotype_data(self, limit):
    """
    NOTE: If a Strain carries more than one mutation,
    then each Mutation description,
    i.e., the set: (
        Mutation Type - Chromosome - Gene Symbol -
        Gene Name - Allele Symbol - Allele Name)
    will require a separate line.

    Note that MMRRC curates phenotypes to alleles,
    even though they distribute only one file with the
    phenotypes appearing to be associated with a strain.

    So, here we process the allele-to-phenotype relationships separately
    from the strain-to-allele relationships.

    This variant expects three leading lines in the catalog file: a
    generation-date line, the column header, and a blank line.  Rows are
    then read one by one: strains, references and allele-to-phenotype
    associations are written immediately, while the alleles and genes
    seen for each strain are collected in ``self.strain_hash`` /
    ``self.id_label_hash``.  After the file is read, one genotype with
    unknown zygosity and an unspecified background is built per strain.

    :param limit: when not in test mode, stop reading once the reader's
        line number exceeds ``limit``; ``None`` reads the whole file
    :return: None
    """
    src_key = 'catalog'
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    fname = '/'.join((self.rawdir, self.files[src_key]['file']))

    self.strain_hash = {}
    self.id_label_hash = {}
    genes_with_no_ids = set()
    stem_cell_class = self.globaltt['stem cell']
    mouse_taxon = self.globaltt['Mus musculus']
    geno = Genotype(graph)
    with open(fname, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        # This MMRRC catalog data file was generated on YYYY-MM-DD
        # insert or check date w/dataset
        line = next(reader)
        # gen_date = line[-10:]
        line = next(reader)
        col = self.files['catalog']['columns']
        # a header mismatch is reported but is not fatal
        if col != line:
            LOG.error(
                '%s\nExpected Headers:\t%s\nRecived Headers:\t%s\n',
                src_key, col, line)
            LOG.info(set(col) - set(line))

        line = next(reader)
        if line != []:
            LOG.warning(
                'Expected third line to be blank. got "%s" instead', line)

        for row in reader:
            strain_id = row[col.index('STRAIN/STOCK_ID')].strip()
            strain_label = row[col.index('STRAIN/STOCK_DESIGNATION')]
            # strain_type_symbol = row[col.index('STRAIN_TYPE')]
            strain_state = row[col.index('STATE')]
            mgi_allele_id = row[
                col.index('MGI_ALLELE_ACCESSION_ID')].strip()
            mgi_allele_symbol = row[col.index('ALLELE_SYMBOL')]
            # mgi_allele_name = row[col.index('ALLELE_NAME')]
            # mutation_type = row[col.index('MUTATION_TYPE')]
            # chrom = row[col.index('CHROMOSOME')]
            mgi_gene_id = row[col.index('MGI_GENE_ACCESSION_ID')].strip()
            mgi_gene_symbol = row[col.index('GENE_SYMBOL')].strip()
            mgi_gene_name = row[col.index('GENE_NAME')]
            # sds_url = row[col.index('SDS_URL')]
            # accepted_date = row[col.index('ACCEPTED_DATE')]
            mpt_ids = row[col.index('MPT_IDS')].strip()
            pubmed_nums = row[col.index('PUBMED_IDS')].strip()
            research_areas = row[col.index('RESEARCH_AREAS')].strip()

            # skip strains outside the test set (test mode only)
            # and every withdrawn gene
            if (self.test_mode and strain_id not in self.test_ids) \
                    or mgi_gene_name == 'withdrawn':
                continue

            # strip off stuff after the dash -
            # is the holding center important?
            # MMRRC:00001-UNC --> MMRRC:00001
            strain_id = re.sub(r'-\w+$', '', strain_id)

            self.id_label_hash[strain_id] = strain_label

            # get the variant or gene to save for later building of
            # the genotype
            if strain_id not in self.strain_hash:
                self.strain_hash[strain_id] = {
                    'variants': set(), 'genes': set()}

            # flag bad ones
            if mgi_allele_id[:4] != 'MGI:' and mgi_allele_id != '':
                LOG.error("Erroneous MGI allele id: %s", mgi_allele_id)
                if mgi_allele_id[:3] == 'MG:':
                    mgi_allele_id = 'MGI:' + mgi_allele_id[3:]
                else:
                    mgi_allele_id = ''

            if mgi_allele_id != '':
                self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                # use the following if needing to add the sequence
                # alteration types
                # var_type = self.localtt[mutation_type]
                # make a sequence alteration for this variant locus,
                # and link the variation type to it
                # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                # if self.nobnodes:
                #     sa_id = ':'+sa_id
                # gu.addIndividualToGraph(g, sa_id, None, var_type)
                # geno.addSequenceAlterationToVariantLocus(sa_id,
                #                                          mgi_allele_id)

            # scrub out any spaces, fix known issues
            mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
            if mgi_gene_id == 'NULL':
                mgi_gene_id = ''
            elif mgi_gene_id[:7] == 'GeneID:':
                mgi_gene_id = 'NCBIGene:' + mgi_gene_id[7:]

            if mgi_gene_id != '':
                # one malformed id (no colon, or more than one) must not
                # abort the whole ingest: report it and carry on
                try:
                    curie, _localid = mgi_gene_id.split(':')
                except ValueError as verror:
                    LOG.warning(
                        "Problem parsing mgi_gene_id %s from file %s: %s",
                        mgi_gene_id, fname, verror)
                    curie = None
                if curie not in ['MGI', 'NCBIGene']:
                    LOG.info("MGI Gene id not recognized: %s", mgi_gene_id)
                self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

            # catch some errors - too many. report summary at the end
            # some things have gene labels, but no identifiers - report
            if mgi_gene_symbol != '' and mgi_gene_id == '':
                # LOG.error(
                #    "Gene label with no MGI identifier for strain %s: %s",
                #    strain_id, mgi_gene_symbol)
                genes_with_no_ids.add(mgi_gene_symbol)
                # make a temp id for genes that aren't identified ... err wow.
                # tmp_gene_id = '_' + mgi_gene_symbol
                # self.id_label_hash[tmp_gene_id.strip()] = mgi_gene_symbol
                # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

            # split apart the mp ids
            # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
            # mpt_ids are a comma delimited list
            # labels with MP terms following in brackets
            phenotype_ids = []
            if mpt_ids != '':
                for lb_mp in mpt_ids.split(r','):
                    lb_mp = lb_mp.strip()
                    if lb_mp[-1:] == ']' and lb_mp[-12:-8] == '[MP:':
                        # everything between '[' and ']':
                        # 'ataxia [MP:0001393]' -> 'MP:0001393'
                        phenotype_ids.append(lb_mp[-11:-1])

            # pubmed ids are space delimited
            pubmed_ids = []
            if pubmed_nums != '':
                for pm_num in re.split(r'\s+', pubmed_nums):
                    pmid = 'PMID:' + pm_num.strip()
                    pubmed_ids.append(pmid)
                    ref = Reference(
                        graph, pmid, self.globaltt['journal article'])
                    ref.addRefToGraph()

            # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
            # is a good example of 4 genotype parts

            model.addClassToGraph(mouse_taxon, None)
            if research_areas == '':
                research_areas = None
            else:
                research_areas = 'Research Areas: ' + research_areas
            strain_type = mouse_taxon
            if strain_state == 'ES':
                strain_type = stem_cell_class
            model.addIndividualToGraph(   # an inst of mouse??
                strain_id, strain_label, strain_type, research_areas)
            model.makeLeader(strain_id)

            # phenotypes are associated with the alleles
            for pid in phenotype_ids:
                # assume the phenotype label is in some ontology
                model.addClassToGraph(pid, None)
                if mgi_allele_id is not None and mgi_allele_id != '':
                    assoc = G2PAssoc(
                        graph, self.name, mgi_allele_id, pid,
                        self.globaltt['has phenotype'])
                    for p in pubmed_ids:
                        assoc.add_source(p)
                    assoc.add_association_to_graph()
                else:
                    LOG.info("Phenotypes and no allele for %s", strain_id)

            if not self.test_mode and (
                    limit is not None and reader.line_num > limit):
                break

    # now that we've collected all of the variant information, build it
    # we don't know their zygosities
    for s, h in self.strain_hash.items():
        variants = h['variants']
        genes = h['genes']
        vl_set = set()
        # make variant loci for each gene
        if variants:
            for var in variants:
                vl_id = var.strip()
                vl_symbol = self.id_label_hash[vl_id]
                geno.addAllele(
                    vl_id, vl_symbol, self.globaltt['variant_locus'])
                vl_set.add(vl_id)
                # an allele is only tied to a gene when the pairing is
                # unambiguous (exactly one of each for the strain)
                if len(variants) == 1 and len(genes) == 1:
                    for gene in genes:
                        geno.addAlleleOfGene(vl_id, gene)
                else:
                    geno.addAllele(vl_id, vl_symbol)
        else:  # len(vars) == 0
            # it's just anonymous variants in some gene
            for gene in genes:
                vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
                vl_symbol = self.id_label_hash[gene]+'<?>'
                self.id_label_hash[vl_id] = vl_symbol
                geno.addAllele(
                    vl_id, vl_symbol, self.globaltt['variant_locus'])
                geno.addGene(gene, self.id_label_hash[gene])
                geno.addAlleleOfGene(vl_id, gene)
                vl_set.add(vl_id)

        # make the vslcs
        vl_list = sorted(vl_set)
        vslc_list = []
        for vl in vl_list:
            # for unknown zygosity
            vslc_id = re.sub(r'^_', '', vl)+'U'
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:' + vslc_id
            vslc_label = self.id_label_hash[vl] + '/?'
            self.id_label_hash[vslc_id] = vslc_label
            vslc_list.append(vslc_id)
            geno.addPartsToVSLC(
                vslc_id, vl, None, self.globaltt['indeterminate'],
                self.globaltt['has_variant_part'], None)
            model.addIndividualToGraph(
                vslc_id, vslc_label,
                self.globaltt['variant single locus complement'])

        if vslc_list:
            if len(vslc_list) > 1:
                gvc_id = '-'.join(vslc_list)
                gvc_id = re.sub(r'_|:', '', gvc_id)
                gvc_id = '_:'+gvc_id
                gvc_label = '; '.join(
                    self.id_label_hash[v] for v in vslc_list)
                model.addIndividualToGraph(
                    gvc_id, gvc_label,
                    self.globaltt['genomic_variation_complement'])
                for vslc_id in vslc_list:
                    geno.addVSLCtoParent(vslc_id, gvc_id)
            else:
                # the GVC == VSLC, so don't have to make an extra piece
                gvc_id = vslc_list.pop()
                gvc_label = self.id_label_hash[gvc_id]

            genotype_label = gvc_label + ' [n.s.]'
            bkgd_id = re.sub(
                r':', '', '-'.join((
                    self.globaltt['unspecified_genomic_background'], s)))
            genotype_id = '-'.join((gvc_id, bkgd_id))
            bkgd_id = '_:' + bkgd_id
            geno.addTaxon(mouse_taxon, bkgd_id)
            geno.addGenomicBackground(
                bkgd_id, 'unspecified (' + s + ')',
                self.globaltt['unspecified_genomic_background'],
                "A placeholder for the unspecified genetic background for "
                + s)
            geno.addGenomicBackgroundToGenotype(
                bkgd_id, genotype_id,
                self.globaltt['unspecified_genomic_background'])
            geno.addParts(
                gvc_id, genotype_id, self.globaltt['has_variant_part'])
            geno.addGenotype(genotype_id, genotype_label)
            graph.addTriple(
                s, self.globaltt['has_genotype'], genotype_id)
        # else:
        #     LOG.debug("Strain %s is not making a proper genotype.", s)

    LOG.warning(
        "The following gene symbols did not list identifiers: %s",
        str(sorted(list(genes_with_no_ids))))
    LOG.error(
        '%i symbols given are missing their gene identifiers',
        len(genes_with_no_ids))

    return
def _add_gene_equivalencies(self, dbxrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships

    Uses external resource map located in
    /resources/clique_leader.yaml to determine
    if an NCBITaxon ID space is a clique leader

    :param dbxrefs: pipe-delimited cross references, e.g.
        MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
    :param gene_id: curie of the gene the cross references belong to
    :param taxon: taxon identifier, looked up in the clique-leader map
        and in ``self.informal_species``
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport', '', None]

    # deal with the dbxrefs
    # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
    for dbxref in dbxrefs.strip().split('|'):
        # de stutter dbxref: keep only the last prefix and the local id
        parts = dbxref.strip().split(':')[-2:]
        if len(parts) != 2:
            # no colon at all (an empty field or a bare placeholder):
            # not a curie, nothing to relate
            continue
        prefix = parts[0].strip()
        local_id = parts[1].strip()

        # skip some of these based on curie prefix or malformatting
        if prefix in filter_out or local_id == '':
            continue

        if prefix in self.localtt:
            prefix = self.localtt[prefix]

        if prefix == 'AnimalQTLdb' and taxon in self.informal_species:
            prefix = self.informal_species[taxon] + 'QTL'
        elif prefix == 'AnimalQTLdb':
            LOG.warning(
                'Unknown AnimalQTLdb species %s for %s:%s',
                taxon, prefix, local_id)
        # else: # taxon is not in informal species (not unexpected)

        dbxref_curie = ':'.join((prefix, local_id))

        if prefix == 'HPRD':  # proteins are not == genes.
            model.addTriple(
                gene_id, self.globaltt['has gene product'], dbxref_curie)
            continue

        if prefix == 'ENSEMBL':
            model.addXref(gene_id, dbxref_curie)
            # For Ensembl xrefs, don't proceed to equivalent class code
            # these are more loose xrefs than equivalent identifiers
            continue

        if prefix == 'OMIM':
            omim_num = dbxref_curie[5:]
            if omim_num in self.omim_replaced:
                repl = self.omim_replaced[omim_num]
                for omim in repl:
                    if omim in self.omim_type and \
                            self.omim_type[omim] == self.globaltt['gene']:
                        dbxref_curie = 'OMIM:' + omim
                        omim_num = omim  # last "gene" wins (is never > 2)
            if omim_num in self.omim_type and \
                    self.omim_type[omim_num] == self.globaltt['gene']:
                model.addXref(gene_id, dbxref_curie)
            else:
                # OMIM disease/phenotype is not considered a gene at all
                # no equivalence between ncbigene and omim-nongene
                # and ncbi is never a human clique leader in any case
                continue

        # designate clique leaders and equivalentClass/sameAs triples
        # (perhaps premature as this ingest can't know what else exists)
        try:
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addEquivalentClass(gene_id, dbxref_curie)
                if taxon in clique_map:
                    if clique_map[taxon] == prefix:
                        model.makeLeader(dbxref_curie)
                    elif clique_map[taxon] == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
            else:
                model.addSameIndividual(gene_id, dbxref_curie)
        except AssertionError as err:
            LOG.warning("Error parsing %s: %s", gene_id, err)
def _process_genes(self, limit=None):
    """
    Load the HGNC complete gene set into the graph.

    Every row (the first is the header and is skipped) must carry exactly
    the 49 columns unpacked below.  Each gene is added as a class typed
    via ``self._get_gene_type``, marked deprecated when withdrawn or made
    a clique leader otherwise, linked by equivalence to its NCBIGene,
    ENSEMBL and (single, non-disease) OMIM ids, assigned the human taxon,
    referenced by its PubMed ids, and placed on its chromosome band or
    chromosome.

    :param limit: when not in test mode, stop once more than ``limit``
        rows (header included) have been read; ``None`` reads everything
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph

    geno = Genotype(g)
    model = Model(g)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    rows_seen = 0
    logger.info("Processing HGNC genes")

    human_taxon = 'NCBITaxon:9606'
    # location parsers, compiled once for the whole file
    chrom_re = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_re = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n .
        for row in filereader:
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id,
             ensembl_gene_id, vega_id, ucsc_id, ena, refseq_accession,
             ccds_id, uniprot_ids, pubmed_id, mgd_id, rgd_id, lsdb,
             cosmic, omim_id, mirbase, homeodb, snornabase,
             bioparadigms_slc, orphanet, pseudogene_org, horde_id,
             merops, imgt, iuphar, kznf_gene_catalog, mamit_trnadb, cd,
             lncrnadb, enzyme_id, intermediate_filament_db,
             rna_central_ids) = row

            rows_seen += 1

            # the first row is the header
            if rows_seen <= 1:
                continue

            # in test mode keep only the genes of interest
            # (rows without an entrez id are always kept)
            if self.testMode and entrez_id != '' \
                    and int(entrez_id) not in self.gene_ids:
                continue

            description = None if name == '' else name
            gene_type_id = self._get_gene_type(locus_type)
            model.addClassToGraph(
                hgnc_id, symbol, gene_type_id, description)

            if locus_type != 'withdrawn':
                model.makeLeader(hgnc_id)
            else:
                model.addDeprecatedClass(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)
            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            # only a single, non-disease OMIM id is treated as equivalent
            if omim_id != '' and "|" not in omim_id:
                omim_curie = 'OMIM:' + omim_id
                if not DipperUtil.is_omim_disease(omim_curie):
                    model.addEquivalentClass(hgnc_id, omim_curie)

            geno.addTaxon(human_taxon, hgnc_id)

            # add pubs as "is about"
            for pub in pubmed_id.strip().split('|'):
                if pub == '':
                    continue
                g.addTriple(
                    'PMID:' + pub.strip(),
                    model.object_properties['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chrom_hit = chrom_re.match(location)
            if chrom_hit is not None and chrom_hit.groups():
                chrom = chrom_hit.group(1)
                chrom_id = makeChromID(chrom, human_taxon, 'CHR')
                band_hit = band_re.search(location)
                feature = Feature(g, hgnc_id, None, None)
                if band_hit is not None and band_hit.groups():
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    parent_id = makeChromID(
                        chrom + band_hit.group(1), human_taxon, 'CHR')
                else:
                    # no band could be parsed: fall back to the chromosome
                    parent_id = chrom_id
                model.addClassToGraph(parent_id, None)
                feature.addSubsequenceOfFeature(parent_id)

            if not self.testMode \
                    and limit is not None and rows_seen > limit:
                break

        # end loop through file

    return
def _process_phenotype_data(self, limit):
    """
    Parse the MMRRC catalog file into strains, alleles, genes,
    phenotype associations and (placeholder) genotypes.

    NOTE: If a Strain carries more than one mutation,
    then each Mutation description,
    i.e., the set: (
        Mutation Type - Chromosome - Gene Symbol -
        Gene Name - Allele Symbol - Allele Name)
    will require a separate line.

    Note that MMRRC curates phenotypes to alleles,
    even though they distribute only one file with the
    phenotypes appearing to be associated with a strain.

    So, here we process the allele-to-phenotype relationships separately
    from the strain-to-allele relationships.

    Side effects: resets and fills self.strain_hash and
    self.id_label_hash.

    :param limit: when not None (and not in test mode), stop reading
        once more than this many lines have been seen
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    line_counter = 0
    fname = '/'.join((self.rawdir, self.files['catalog']['file']))

    self.strain_hash = {}
    self.id_label_hash = {}
    genes_with_no_ids = set()
    stem_cell_class = 'CL:0000034'
    mouse_taxon = 'NCBITaxon:10090'
    geno = Genotype(g)

    with open(fname, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # skip the first 3 lines which are header, etc.
            if line_counter < 4:
                continue

            (strain_id, strain_label, strain_type_symbol, strain_state,
             mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
             mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
             mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
             research_areas) = row

            # parentheses only spell out the precedence the expression
            # already had: withdrawn genes are skipped in every mode
            if (self.testMode and strain_id not in self.test_ids) \
                    or mgi_gene_name == 'withdrawn':
                continue

            # strip off stuff after the dash -
            # is the holding center important?
            # MMRRC:00001-UNC --> MMRRC:00001
            strain_id = re.sub(r'-\w+$', '', strain_id)

            self.id_label_hash[strain_id] = strain_label

            # get the variant or gene to save for later building of
            # the genotype
            if strain_id not in self.strain_hash:
                self.strain_hash[strain_id] = {
                    'variants': set(), 'genes': set()}

            # clean up the bad one
            if mgi_allele_id == 'multiple mutation':
                # this column holds the allele id, so report it as such
                logger.error("Erroneous allele id: %s", mgi_allele_id)
                mgi_allele_id = ''

            if mgi_allele_id != '':
                self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                self.id_label_hash[mgi_allele_id] = mgi_allele_symbol
                # sequence alteration types (from mutation_type) are
                # deliberately not modelled here yet

            # scrub out any spaces
            mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
            if mgi_gene_id.strip() != '':
                if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                    # substitute with the same case-insensitivity used
                    # for the test, otherwise e.g. "geneid:" is detected
                    # but left unchanged
                    mgi_gene_id = re.sub(
                        r'Gene\s*ID:\s*', 'NCBIGene:', mgi_gene_id,
                        flags=re.I)
                elif not re.match(r'MGI', mgi_gene_id):
                    logger.info("Gene id not recognized: %s", mgi_gene_id)
                    if re.match(r'\d+$', mgi_gene_id):
                        # assume that if it's all numbers, then it's MGI
                        mgi_gene_id = 'MGI:' + str(mgi_gene_id)
                        logger.info("Assuming numerics are MGI.")
                self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

            # catch some errors -
            # some things have gene labels, but no identifiers - report
            if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                logger.error(
                    "Gene label with no identifier for strain %s: %s",
                    strain_id, mgi_gene_symbol)
                genes_with_no_ids.add(mgi_gene_symbol.strip())
                # no temporary id is minted for such genes

            # split apart the mp ids
            # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
            # mp_ids are now a comma delimited list
            # with MP terms in brackets
            phenotype_ids = []
            if mp_ids != '':
                for term in mp_ids.split(','):
                    mps = re.search(r'\[(.*)\]', term.strip())
                    if mps is not None:
                        phenotype_ids.append(mps.group(1).strip())

            # pubmed ids are space delimited; str.split() with no
            # argument drops leading/trailing whitespace so that no
            # empty 'PMID:' reference is created
            pubmed_ids = []
            for num in pubmed_nums.split():
                pmid = 'PMID:' + num
                pubmed_ids.append(pmid)
                ref = Reference(
                    g, pmid, Reference.ref_types['journal_article'])
                ref.addRefToGraph()

            # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
            # is a good example of 4 genotype parts

            model.addClassToGraph(mouse_taxon, None)
            if research_areas.strip() == '':
                research_areas = None
            else:
                research_areas = 'Research Areas: ' + research_areas
            strain_type = mouse_taxon
            if strain_state == 'ES':
                strain_type = stem_cell_class
            model.addIndividualToGraph(
                strain_id, strain_label, strain_type,
                research_areas)  # an inst of mouse??
            model.makeLeader(strain_id)

            # phenotypes are associated with the alleles
            for pid in phenotype_ids:
                # assume the phenotype label is in the ontology
                model.addClassToGraph(pid, None)
                if mgi_allele_id is not None and mgi_allele_id != '':
                    assoc = G2PAssoc(
                        g, self.name, mgi_allele_id, pid,
                        model.object_properties['has_phenotype'])
                    for pmid in pubmed_ids:
                        assoc.add_source(pmid)
                    assoc.add_association_to_graph()
                else:
                    logger.info(
                        "Phenotypes and no allele for %s", strain_id)

            if not self.testMode and (
                    limit is not None and line_counter > limit):
                break

    # now that we've collected all of the variant information, build it
    # we don't know their zygosities
    for s in self.strain_hash:
        h = self.strain_hash.get(s)
        variants = h['variants']
        genes = h['genes']
        vl_set = set()
        # make variant loci for each gene
        if len(variants) > 0:
            for v in variants:
                vl_id = v
                vl_symbol = self.id_label_hash[vl_id]
                geno.addAllele(
                    vl_id, vl_symbol, geno.genoparts['variant_locus'])
                vl_set.add(vl_id)
                if len(variants) == 1 and len(genes) == 1:
                    # unambiguous: the one allele belongs to the one gene
                    for gene in genes:
                        geno.addAlleleOfGene(vl_id, gene)
                else:
                    geno.addAllele(vl_id, vl_symbol)
        else:  # len(vars) == 0
            # it's just anonymous variants in some gene
            for gene in genes:
                vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
                vl_symbol = self.id_label_hash[gene] + '<?>'
                self.id_label_hash[vl_id] = vl_symbol
                geno.addAllele(
                    vl_id, vl_symbol, geno.genoparts['variant_locus'])
                geno.addGene(gene, self.id_label_hash[gene])
                geno.addAlleleOfGene(vl_id, gene)
                vl_set.add(vl_id)

        # make the vslcs
        vl_list = sorted(vl_set)
        vslc_list = []
        for vl in vl_list:
            # for unknown zygosity
            vslc_id = re.sub(r'^_', '', vl) + 'U'
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:' + vslc_id
            vslc_label = self.id_label_hash[vl] + '/?'
            self.id_label_hash[vslc_id] = vslc_label
            vslc_list.append(vslc_id)
            geno.addPartsToVSLC(
                vslc_id, vl, None, geno.zygosity['indeterminate'],
                geno.object_properties['has_alternate_part'], None)
            model.addIndividualToGraph(
                vslc_id, vslc_label,
                geno.genoparts['variant_single_locus_complement'])

        if len(vslc_list) > 0:
            if len(vslc_list) > 1:
                gvc_id = '-'.join(vslc_list)
                gvc_id = re.sub(r'_|:', '', gvc_id)
                gvc_id = '_:' + gvc_id
                gvc_label = \
                    '; '.join(self.id_label_hash[v] for v in vslc_list)
                model.addIndividualToGraph(
                    gvc_id, gvc_label,
                    geno.genoparts['genomic_variation_complement'])
                for vslc_id in vslc_list:
                    geno.addVSLCtoParent(vslc_id, gvc_id)
            else:
                # the GVC == VSLC, so don't have to make an extra piece
                gvc_id = vslc_list.pop()
                gvc_label = self.id_label_hash[gvc_id]

            genotype_label = gvc_label + ' [n.s.]'
            bkgd_id = \
                re.sub(r':', '', '-'.join(
                    (geno.genoparts['unspecified_genomic_background'], s)))
            genotype_id = '-'.join((gvc_id, bkgd_id))
            bkgd_id = '_:' + bkgd_id
            geno.addTaxon(mouse_taxon, bkgd_id)
            geno.addGenomicBackground(
                bkgd_id, 'unspecified (' + s + ')',
                geno.genoparts['unspecified_genomic_background'],
                "A placeholder for the " +
                "unspecified genetic background for " + s)
            geno.addGenomicBackgroundToGenotype(
                bkgd_id, genotype_id,
                geno.genoparts['unspecified_genomic_background'])
            geno.addParts(
                gvc_id, genotype_id,
                geno.object_properties['has_alternate_part'])
            geno.addGenotype(genotype_id, genotype_label)
            g.addTriple(
                s, geno.object_properties['has_genotype'], genotype_id)
        # else: the strain has neither variants nor genes, so no
        # genotype can be built for it

    # only warn when there is actually something to report
    if genes_with_no_ids:
        logger.warning(
            "The following gene symbols did not list identifiers: %s",
            str(sorted(genes_with_no_ids)))

    return
def _get_variants(self, limit):
    """
    Currently loops through the (gzipped) variant_summary file.

    Every non-comment line is turned into a ClinVarVariant feature with
    its location, synonyms (HGVS), same-as ids (dbSNP, dbVar, OMIM
    allelic variants, HGMD), RCV xrefs, an optional variant locus of the
    listed gene, and variant-to-phenotype associations.

    :param limit: when not None (and not in test mode), stop once more
        than this many records have been processed
    :return: None
    :raises ValueError: if a line does not have the expected number of
        columns (the file format changed)
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    geno = Genotype(g)

    # add the taxon and the genome
    tax_num = '9606'  # HARDCODE
    tax_id = 'NCBITaxon:' + tax_num
    tax_label = 'Human'
    model.addClassToGraph(tax_id, None)
    geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

    # not unzipping the file
    logger.info("Processing Variant records")
    line_counter = 0
    expected_numcols = 29
    myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))

    # Columns of variant_summary, in order:
    #   AlleleID, Type, Name, GeneID, GeneSymbol, ClinicalSignificance
    #   (see http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/),
    #   RS# (dbSNP), nsv (dbVar), RCVaccession, TestedInGTR,
    #   PhenotypeIDs, Origin, Assembly, Chromosome, Start, Stop
    #   (pter->qter orientation), Cytogenetic (ISCN band), ReviewStatus
    #   (see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation),
    #   HGVS(c.), HGVS(p.), NumberSubmitters, LastEvaluated,
    #   Guidelines (ACMG only right now; about the gene, not the allele),
    #   OtherIDs, VariantID (used to build the report URL, e.g.
    #   http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/),
    #   followed by reference allele, alternate allele, categories and
    #   chromosome accession.
    with gzip.open(myfile, 'rb') as infile:
        for raw_line in infile:
            # only drop the line terminator: a full strip() would also
            # eat trailing tabs and with them empty trailing columns
            line = raw_line.decode().rstrip('\r\n')
            # skip comments and blank lines
            if line.strip() == '' or line.startswith('#'):
                continue

            # a crude check that there's an expected number of cols.
            # if not, error out because something changed.
            fields = line.split('\t')
            if len(fields) != expected_numcols:
                logger.error(
                    "Unexpected number of columns in raw file " +
                    "(%d actual vs %d expected)",
                    len(fields), expected_numcols)
                raise ValueError(
                    "Unexpected number of columns in variant_summary")

            (allele_num, allele_type, allele_name, gene_num, gene_symbol,
             clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
             tested_in_gtr, phenotype_ids, origin, assembly, chrom,
             start, stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
             number_of_submitters, last_eval, guidelines, other_ids,
             variant_num, reference_allele, alternate_allele, categories,
             ChromosomeAccession) = fields

            line_counter += 1

            pheno_list = []
            if phenotype_ids != '-':
                # trim any leading/trailing semicolons/commas
                phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                pheno_list = re.split(r'[,;]', phenotype_ids)

            if self.testMode:
                # get intersection of test disease ids
                # and these phenotype_ids
                intersect = \
                    {str(i) for i in self.disease_ids} & set(pheno_list)
                if int(gene_num) not in self.gene_ids and \
                        int(variant_num) not in self.variant_ids and \
                        len(intersect) < 1:
                    continue

            # TODO may need to switch on assembly to create correct
            # assembly/build identifiers
            build_id = ':'.join(('NCBIGenome', assembly))

            # make the reference genome build
            geno.addReferenceGenome(build_id, assembly, tax_id)

            allele_type_id = self._map_type_of_allele(allele_type)

            # both are reset for every record so that a value from a
            # previous variant can never leak into this one
            bandinbuild_id = None
            chrinbuild_id = None
            if str(chrom) == '':
                # check cytogenic location
                cyto = str(cytogenetic_loc)
                if cyto.strip() != '':
                    # use cytogenic location to get the apx location
                    # oddly, they still put an assembly number even when
                    # there's no numeric location
                    if not re.search(r'-', cyto):
                        # pass the band itself; makeChromID is given a
                        # plain string by every other caller
                        band_id = makeChromID(cyto, tax_num, 'CHR')
                        geno.addChromosomeInstance(
                            cytogenetic_loc, build_id, assembly, band_id)
                        bandinbuild_id = makeChromID(
                            cyto, assembly, 'MONARCH')
                    # else: can't deal with ranges yet
            else:
                # add the human chromosome class to the graph,
                # and add the build-specific version of it
                chr_id = makeChromID(str(chrom), tax_num, 'CHR')
                geno.addChromosomeClass(str(chrom), tax_id, tax_label)
                geno.addChromosomeInstance(
                    str(chrom), build_id, assembly, chr_id)
                chrinbuild_id = makeChromID(
                    str(chrom), assembly, 'MONARCH')

            seqalt_id = ':'.join(('ClinVarVariant', variant_num))
            gene_id = None

            # they use -1 to indicate unknown gene
            if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                if re.match(r'^Gene:', gene_num):
                    # NOTE(review): gene_id stays None on this path, so
                    # no variant locus is made - confirm this is intended
                    gene_num = "NCBI" + gene_num
                else:
                    gene_id = ':'.join(('NCBIGene', str(gene_num)))

            # FIXME there are some "variants" that are actually haplotypes
            # probably will get taken care of when we switch to processing
            # the xml for example, variant_num = 38562
            # but there's no way to tell if it's a haplotype
            # in the csv data so the dbsnp or dbvar
            # should probably be primary,
            # and the variant num be the vslc,
            # with each of the dbsnps being added to it

            # TODO clinical significance needs to be mapped to
            # a list of terms
            # first, make the variant (Feature takes the graph first,
            # as in every other Feature(...) call)
            feature = Feature(g, seqalt_id, allele_name, allele_type_id)

            # coordinates only make sense relative to a known chromosome
            if chrinbuild_id is not None:
                if start != '-' and start.strip() != '':
                    feature.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    feature.addFeatureEndLocation(stop, chrinbuild_id)

            feature.addFeatureToGraph()
            feature.addTaxonToFeature(tax_id)
            # make the ClinVarVariant the clique leader
            model.makeLeader(seqalt_id)

            if bandinbuild_id is not None:
                feature.addSubsequenceOfFeature(bandinbuild_id)

            # CHECK - this makes the assumption that there is
            # only one affected chromosome per variant what happens with
            # chromosomal rearrangement variants?
            # shouldn't both chromosomes be here?

            # add the hgvs as synonyms
            if hgvs_c != '-' and hgvs_c.strip() != '':
                model.addSynonym(seqalt_id, hgvs_c)
            if hgvs_p != '-' and hgvs_p.strip() != '':
                model.addSynonym(seqalt_id, hgvs_p)

            # add the dbsnp and dbvar ids as equivalent
            if dbsnp_num != '-' and int(dbsnp_num) != -1:
                dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                model.addIndividualToGraph(dbsnp_id, None)
                model.addSameIndividual(seqalt_id, dbsnp_id)
            if dbvar_num != '-':
                dbvar_id = 'dbVar:' + dbvar_num
                model.addIndividualToGraph(dbvar_id, None)
                model.addSameIndividual(seqalt_id, dbvar_id)

            # TODO - not sure if this is right... add as xref?
            # the rcv is like the combo of the phenotype with the variant
            if rcv_nums != '-':
                for rcv_num in re.split(r';', rcv_nums):
                    rcv_id = 'ClinVar:' + rcv_num
                    model.addIndividualToGraph(rcv_id, None)
                    model.addXref(seqalt_id, rcv_id)

            if gene_id is not None:
                # add the gene
                model.addClassToGraph(gene_id, gene_symbol)
                # make a variant locus
                vl_id = '_' + gene_num + '-' + variant_num
                if self.nobnodes:
                    vl_id = ':' + vl_id
                vl_label = allele_name
                model.addIndividualToGraph(
                    vl_id, vl_label, geno.genoparts['variant_locus'])
                geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                geno.addAlleleOfGene(vl_id, gene_id)
            else:
                # some basic reporting; the pattern needs a capture
                # group for group(1) to hold the bracketed symbol
                gmatch = re.search(r'\((\w+)\)', allele_name)
                if gmatch is not None:
                    logger.info(
                        "Gene found in allele label, but no id provided: %s",
                        gmatch.group(1))
                elif re.match(r'more than 10', gene_symbol):
                    logger.info(
                        "More than 10 genes found; "
                        "need to process XML to fetch (variant=%d)",
                        int(variant_num))
                else:
                    logger.info(
                        "No gene listed for variant %d", int(variant_num))

            # parse the list of "phenotypes" which are diseases.
            # add them as an association
            # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
            # the list is both semicolon delimited and comma delimited,
            # but i don't know why! some are bad, like:
            # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
            if phenotype_ids != '-':
                for phenotype in pheno_list:
                    m = re.match(
                        r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                    if m is not None and len(m.groups()) > 0:
                        phenotype = re.sub(
                            m.group(1), 'Orphanet:', phenotype.strip())
                    elif re.match(r'ORPHA:\d+', phenotype):
                        phenotype = re.sub(
                            r'^ORPHA', 'Orphanet', phenotype.strip())
                    elif re.match(r'Human Phenotype Ontology', phenotype):
                        phenotype = re.sub(
                            r'^Human Phenotype Ontology', '',
                            phenotype.strip())
                    elif re.match(r'SNOMED CT:\s?', phenotype):
                        phenotype = re.sub(
                            r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                    elif re.match(r'^Gene:', phenotype):
                        continue
                    assoc = G2PAssoc(
                        g, self.name, seqalt_id, phenotype.strip())
                    assoc.add_association_to_graph()

            if other_ids != '-':
                # process the "other ids" ex:
                # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                # TODO make more xrefs
                for xrefid in other_ids.split(','):
                    prefix = xrefid.split(':')[0].strip()
                    if prefix == 'OMIM Allelic Variant':
                        xrefid = 'OMIM:' + xrefid.split(':')[1]
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    elif prefix == 'HGMD':
                        model.addIndividualToGraph(xrefid, None)
                        model.addSameIndividual(seqalt_id, xrefid)
                    # every other prefix (a repeated dbVar id, prefixes
                    # containing spaces, plain xrefs) is skipped for now

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    logger.info("Finished parsing variants")

    return
def _process_genes(self, limit=None):
    """
    Walk the tab-delimited HGNC gene file and add every gene to the graph.

    Columns are located by name through self.files['genes']['columns'].
    Withdrawn genes are marked deprecated; symbols ending in '@'
    (regions / clusters) are skipped; other genes are typed via
    self.resolve and made clique leaders.  Each retained gene is linked
    to its NCBIGene / ENSEMBL / OMIM-gene equivalents, the human taxon,
    its PubMed ids and its chromosome or band.

    :param limit: when not None (and not in test mode), stop once the
        reader has passed this line number
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)
    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    col = self.files['genes']['columns']
    LOG.info("Processing HGNC genes")

    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    # resolve the position of every column we read once, instead of
    # searching the column list again for every field of every row
    hgnc_id_at = col.index('hgnc_id')
    symbol_at = col.index('symbol')
    name_at = col.index('name')
    locus_type_at = col.index('locus_type')
    location_at = col.index('location')
    entrez_id_at = col.index('entrez_id')
    ensembl_gene_id_at = col.index('ensembl_gene_id')
    pubmed_id_at = col.index('pubmed_id')
    omim_id_at = col.index('omim_id')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        row = next(filereader)
        # a header mismatch is deliberately tolerated: the result of the
        # check is not acted on here
        if not self.check_fileheader(col, row):
            pass

        for row in filereader:
            # all other columns of the file are currently unused
            hgnc_id = row[hgnc_id_at].strip()
            symbol = row[symbol_at].strip()
            name = row[name_at].strip()
            locus_type = row[locus_type_at].strip()
            location = row[location_at].strip()
            entrez_id = row[entrez_id_at].strip()
            ensembl_gene_id = row[ensembl_gene_id_at].strip()
            pubmed_ids = row[pubmed_id_at].strip()  # pipe separated!
            omim_ids = row[omim_id_at].strip()  # pipe separated!

            if self.test_mode and entrez_id != '' and \
                    entrez_id not in self.gene_ids:
                continue

            if name == '':
                name = None

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            elif symbol.endswith('@'):
                # 10) region (HOX), RNA cluster, gene (PCDH)
                # endswith() is also safe when the symbol is empty
                continue
            else:
                gene_type_id = self.resolve(locus_type, mandatory=False)
                # presumably resolve() hands back its input when the
                # term is unknown; only typed genes are added as classes
                if gene_type_id != locus_type:
                    model.addClassToGraph(
                        hgnc_id, symbol, gene_type_id, name)
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            for omim_id in omim_ids.split('|'):
                if omim_id in self.omim_replaced:
                    repl = self.omim_replaced[omim_id]
                    LOG.warning('%s is replaced with %s', omim_id, repl)
                    for omim in repl:
                        # guard the lookup: a replacement id may have no
                        # recorded type
                        if omim in self.omim_type and \
                                self.omim_type[omim] == self.globaltt['gene']:
                            omim_id = omim

                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            for pubmed_id in pubmed_ids.split('|'):
                pubmed_id = pubmed_id.strip()
                # an empty column splits to [''], which must not
                # produce a bare 'PMID:' node
                if pubmed_id != '':
                    graph.addTriple(
                        'PMID:' + pubmed_id, self.globaltt['is_about'],
                        hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_pattern.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.test_mode and limit is not None and \
                    filereader.line_num > limit:
                break
def _add_gene_equivalencies(self, dbxrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships

    Uses external resource map located in
    /resources/clique_leader.yaml to determine
    if an NCBITaxon ID space is a clique leader

    :param dbxrefs: pipe-separated cross references, e.g.
        MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
    :param gene_id: curie of the gene the references belong to
    :param taxon: taxon key looked up in the clique-leader map and in
        self.informal_species
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    ignored_prefixes = ['Vega', 'IMGT/GENE-DB', 'Araport', '']

    for dbxref in dbxrefs.strip().split('|'):
        parts = dbxref.split(':')
        local_id = parts[-1]
        # everything before the last ':' is the prefix
        # (this restores any nonterminal ':')
        prefix = ':'.join(parts[:-1]).strip()
        if prefix in self.localtt:
            prefix = self.localtt[prefix]

        # skip some of these for now based on curie prefix
        if prefix in ignored_prefixes:
            continue

        if prefix == 'AnimalQTLdb' and taxon in self.informal_species:
            prefix = self.informal_species[taxon] + 'QTL'

        dbxref_curie = prefix + ':' + local_id

        if prefix == 'HPRD':
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.globaltt['has gene product'], dbxref_curie)
            continue
        elif prefix == 'ENSEMBL':
            model.addXref(gene_id, dbxref_curie)
        elif prefix == 'OMIM':
            omim_num = dbxref_curie[5:]
            if omim_num in self.omim_replaced:
                for omim in self.omim_replaced[omim_num]:
                    is_gene = omim in self.omim_type and \
                        self.omim_type[omim] == self.globaltt['gene']
                    if is_gene:
                        # last wins
                        dbxref_curie = 'OMIM:' + omim
                        model.addXref(gene_id, dbxref_curie)
            elif omim_num in self.omim_type and \
                    self.omim_type[omim_num] == self.globaltt['gene']:
                model.addXref(gene_id, dbxref_curie)
            else:
                # no equivalence between ncbigene and omim-nongene
                continue

        # designate clique leaders
        # (perhaps premature as this ingest can't know what else exists)
        try:
            if self.class_or_indiv.get(gene_id) != 'C':
                model.addSameIndividual(gene_id, dbxref_curie)
            else:
                model.addEquivalentClass(gene_id, dbxref_curie)
                if taxon in clique_map:
                    leader_space = clique_map[taxon]
                    if leader_space == prefix:
                        model.makeLeader(dbxref_curie)
                    elif leader_space == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
        except AssertionError as err:
            LOG.warning("Error parsing %s: %s", gene_id, err)
def _get_var_citations(self, limit):
    """
    Link ClinVar variants to the publications that cite them.

    The input is a tab-delimited report (generated weekly, the first of
    the week) of citations associated with data in ClinVar, connected
    to the AlleleID, the VariationID, and either rs# from dbSNP or nsv
    in dbVar.  Columns:

        AlleleID         int value (xpath //Measure/@ID )
        VariationID      ID ClinVar uses to anchor default display.
                         (xpath //MeasureSet/@ID)
        rs               rs identifier from dbSNP
        nsv              nsv identifier from dbVar
        citation_source  The source of the citation, either PubMed,
                         PubMedCentral, or the NCBI Bookshelf
        citation_id      The identifier used by that source

    :param limit: when not None (and not in test mode), stop once more
        than this many records have been read
    :return: None
    """
    logger.info("Processing Citations for variants")
    line_counter = 0
    myfile = \
        '/'.join((self.rawdir, self.files['variant_citations']['file']))
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)

    with open(myfile, 'r', encoding="utf8") as f:
        filereader = csv.reader(f, delimiter='\t', quotechar='\"')

        for line in filereader:
            # skip blank rows (csv yields []) and comments
            if not line or line[0].startswith('#'):
                continue
            (allele_num, variant_num, rs_num, nsv_num, citation_source,
             citation_id) = line

            line_counter += 1

            if self.testMode:
                if int(variant_num) not in self.variant_ids:
                    continue

            if citation_id.strip() == '':
                logger.info(
                    "Skipping blank citation for ClinVarVariant:%s",
                    str(variant_num))
                continue

            # the citation for a variant is made to some kind of
            # combination of the ids here.
            # but i'm not sure which, we don't know what the
            # citation is for exactly, other than the variant.
            # so use mentions
            var_id = 'ClinVarVariant:' + variant_num

            # citation source: PubMed | PubMedCentral | citation_source
            # format the citation id; other sources are not referenced
            ref_id = None
            if citation_source == 'PubMed':
                ref_id = 'PMID:' + str(citation_id.replace(" ", ""))
                model.makeLeader(ref_id)
            elif citation_source == 'PubMedCentral':
                ref_id = 'PMCID:' + str(citation_id)

            if ref_id is not None:
                # write to the graph selected above, so that test-mode
                # references end up in the test graph like the triple
                ref = Reference(
                    g, ref_id, Reference.ref_types['journal_article'])
                ref.addRefToGraph()
                g.addTriple(
                    ref_id, self.properties['is_about'], var_id)

            if not self.testMode \
                    and (limit is not None and line_counter > limit):
                break

    logger.info("Finished processing citations for variants")

    return
def _process_phenotype_data(self, limit):
    """
    Parse the MMRRC catalog file into strains, alleles, genes,
    phenotype associations and (placeholder) genotypes.

    NOTE: If a Strain carries more than one mutation,
    then each Mutation description,
    i.e., the set: (
        Mutation Type - Chromosome - Gene Symbol -
        Gene Name - Allele Symbol - Allele Name)
    will require a separate line.

    Note that MMRRC curates phenotypes to alleles,
    even though they distribute only one file with the
    phenotypes appearing to be associated with a strain.

    So, here we process the allele-to-phenotype relationships separately
    from the strain-to-allele relationships.

    The method works in two passes:

    1. Read the catalog rows, adding strains, references and
       allele-to-phenotype associations to the graph while collecting
       each strain's variants and genes in ``self.strain_hash`` and the
       labels of every id in ``self.id_label_hash`` (both are reset
       here).
    2. For every collected strain, build variant loci, VSLCs of
       indeterminate zygosity, a GVC, and a genotype on an unspecified
       genomic background, then link the strain to that genotype.

    :param limit: when not in test mode, stop reading once the line
        counter (which includes the three header lines) exceeds this
        value; ``None`` means no limit.
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    line_counter = 0
    fname = '/'.join((self.rawdir, self.files['catalog']['file']))

    self.strain_hash = {}
    self.id_label_hash = {}
    genes_with_no_ids = set()
    stem_cell_class = 'CL:0000034'
    mouse_taxon = 'NCBITaxon:10090'
    geno = Genotype(g)
    with open(fname, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # skip the first 3 lines which are header, etc.
            if line_counter < 4:
                continue

            (strain_id, strain_label, strain_type_symbol, strain_state,
             mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
             mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
             mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
             research_areas) = row

            # skip rows outside the test set (test mode only),
            # and withdrawn genes (always)
            if (self.testMode and strain_id not in self.test_ids) \
                    or mgi_gene_name == 'withdrawn':
                continue

            # strip off stuff after the dash -
            # is the holding center important?
            # MMRRC:00001-UNC --> MMRRC:00001
            strain_id = re.sub(r'-\w+$', '', strain_id)

            self.id_label_hash[strain_id] = strain_label

            # get the variant or gene to save for later building of
            # the genotype
            if strain_id not in self.strain_hash:
                self.strain_hash[strain_id] = {
                    'variants': set(), 'genes': set()}

            # Normalise the allele id once, up front.  The genotype
            # building pass looks labels up by the *stripped* id, so
            # storing an unstripped key here would raise KeyError later.
            mgi_allele_id = mgi_allele_id.strip()

            # clean up the bad one
            if mgi_allele_id == 'multiple mutation':
                logger.error("Erroneous gene id: %s", mgi_allele_id)
                mgi_allele_id = ''

            if mgi_allele_id != '':
                self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                # use the following if needing to add the
                # sequence alteration types
                # var_type =
                #   self._get_variant_type_from_abbrev(mutation_type)
                # make a sequence alteration for this variant locus,
                # and link the variation type to it
                # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                # if self.nobnodes:
                #     sa_id = ':'+sa_id
                # gu.addIndividualToGraph(g, sa_id, None, var_type)
                # geno.addSequenceAlterationToVariantLocus(sa_id,
                #                                          mgi_allele_id)

            # scrub out any spaces
            mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
            if mgi_gene_id.strip() != '':
                if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                    # the prefix is detected case-insensitively, so it
                    # must be rewritten case-insensitively as well
                    mgi_gene_id = re.sub(
                        r'Gene\s*ID:\s*', 'NCBIGene:', mgi_gene_id,
                        flags=re.I)
                elif not re.match(r'MGI', mgi_gene_id):
                    logger.info("Gene id not recognized: %s", mgi_gene_id)
                    if re.match(r'\d+$', mgi_gene_id):
                        # assume that if it's all numbers, then it's MGI
                        mgi_gene_id = 'MGI:'+str(mgi_gene_id)
                        logger.info("Assuming numerics are MGI.")
                self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

            # catch some errors -
            # some things have gene labels, but no identifiers - report
            if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                logger.error(
                    "Gene label with no identifier for strain %s: %s",
                    strain_id, mgi_gene_symbol)
                genes_with_no_ids.add(mgi_gene_symbol.strip())
                # make a temp id for genes that aren't identified
                # tmp_gene_id = '_'+mgi_gene_symbol
                # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol
                # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

            # split apart the mp ids
            # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
            # mp_ids are now a comma delimited list
            # with MP terms in brackets
            phenotype_ids = []
            if mp_ids != '':
                for term in mp_ids.split(','):
                    mps = re.search(r'\[(.*)\]', term.strip())
                    if mps is not None:
                        phenotype_ids.append(mps.group(1).strip())

            # pubmed ids are space delimited.
            # str.split() with no separator drops the empty tokens that
            # leading/trailing whitespace would otherwise produce,
            # so no bare 'PMID:' reference is ever created.
            pubmed_ids = []
            for num in pubmed_nums.split():
                pmid = 'PMID:'+num
                pubmed_ids.append(pmid)
                r = Reference(
                    g, pmid, Reference.ref_types['journal_article'])
                r.addRefToGraph()

            # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
            # is a good example of 4 genotype parts

            model.addClassToGraph(mouse_taxon, None)
            if research_areas.strip() == '':
                research_areas = None
            else:
                research_areas = 'Research Areas: '+research_areas
            strain_type = mouse_taxon
            if strain_state == 'ES':
                strain_type = stem_cell_class
            model.addIndividualToGraph(
                strain_id, strain_label, strain_type,
                research_areas)  # an inst of mouse??
            model.makeLeader(strain_id)

            # phenotypes are associated with the alleles
            for pid in phenotype_ids:
                # assume the phenotype label is in the ontology
                model.addClassToGraph(pid, None)
                if mgi_allele_id is not None and mgi_allele_id != '':
                    assoc = G2PAssoc(
                        g, self.name, mgi_allele_id, pid,
                        model.object_properties['has_phenotype'])
                    for p in pubmed_ids:
                        assoc.add_source(p)
                    assoc.add_association_to_graph()
                else:
                    logger.info(
                        "Phenotypes and no allele for %s", strain_id)

            if not self.testMode and (
                    limit is not None and line_counter > limit):
                break

    # now that we've collected all of the variant information, build it
    # we don't know their zygosities
    for s, h in self.strain_hash.items():
        variants = h['variants']
        genes = h['genes']
        vl_set = set()
        # make variant loci for each gene
        if len(variants) > 0:
            for v in variants:
                vl_id = v.strip()
                vl_symbol = self.id_label_hash[vl_id]
                geno.addAllele(
                    vl_id, vl_symbol, geno.genoparts['variant_locus'])
                vl_set.add(vl_id)
                # only when there is exactly one variant and one gene
                # can the allele be tied to the gene unambiguously
                if len(variants) == 1 and len(genes) == 1:
                    for gene in genes:
                        geno.addAlleleOfGene(vl_id, gene)
                else:
                    geno.addAllele(vl_id, vl_symbol)
        else:  # len(vars) == 0
            # it's just anonymous variants in some gene
            for gene in genes:
                vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
                vl_symbol = self.id_label_hash[gene]+'<?>'
                self.id_label_hash[vl_id] = vl_symbol
                geno.addAllele(
                    vl_id, vl_symbol, geno.genoparts['variant_locus'])
                geno.addGene(gene, self.id_label_hash[gene])
                geno.addAlleleOfGene(vl_id, gene)
                vl_set.add(vl_id)

        # make the vslcs
        # (sorted so the derived gvc id/label are deterministic)
        vl_list = sorted(vl_set)
        vslc_list = []
        for vl in vl_list:
            # for unknown zygosity
            vslc_id = re.sub(r'^_', '', vl)+'U'
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:' + vslc_id
            vslc_label = self.id_label_hash[vl] + '/?'
            self.id_label_hash[vslc_id] = vslc_label
            vslc_list.append(vslc_id)
            geno.addPartsToVSLC(
                vslc_id, vl, None, geno.zygosity['indeterminate'],
                geno.object_properties['has_alternate_part'], None)
            model.addIndividualToGraph(
                vslc_id, vslc_label,
                geno.genoparts['variant_single_locus_complement'])

        if len(vslc_list) > 0:
            if len(vslc_list) > 1:
                gvc_id = '-'.join(vslc_list)
                gvc_id = re.sub(r'_|:', '', gvc_id)
                gvc_id = '_:'+gvc_id
                gvc_label = \
                    '; '.join(self.id_label_hash[v] for v in vslc_list)
                model.addIndividualToGraph(
                    gvc_id, gvc_label,
                    geno.genoparts['genomic_variation_complement'])
                for vslc_id in vslc_list:
                    geno.addVSLCtoParent(vslc_id, gvc_id)
            else:
                # the GVC == VSLC, so don't have to make an extra piece
                gvc_id = vslc_list.pop()
                gvc_label = self.id_label_hash[gvc_id]

            genotype_label = gvc_label + ' [n.s.]'
            bkgd_id = \
                re.sub(r':', '', '-'.join(
                    (geno.genoparts['unspecified_genomic_background'],
                     s)))
            # the genotype id uses the background id *before* it is
            # given its '_:' prefix below
            genotype_id = '-'.join((gvc_id, bkgd_id))
            bkgd_id = '_:'+bkgd_id
            geno.addTaxon(mouse_taxon, bkgd_id)
            geno.addGenomicBackground(
                bkgd_id, 'unspecified ('+s+')',
                geno.genoparts['unspecified_genomic_background'],
                "A placeholder for the " +
                "unspecified genetic background for "+s)
            geno.addGenomicBackgroundToGenotype(
                bkgd_id, genotype_id,
                geno.genoparts['unspecified_genomic_background'])
            geno.addParts(
                gvc_id, genotype_id,
                geno.object_properties['has_alternate_part'])
            geno.addGenotype(genotype_id, genotype_label)
            g.addTriple(
                s, geno.object_properties['has_genotype'], genotype_id)
        else:
            # logger.debug(
            #   "Strain %s is not making a proper genotype.", s)
            pass

    logger.warning(
        "The following gene symbols did not list identifiers: %s",
        str(sorted(genes_with_no_ids)))

    return