def process_gene_desc(self, limit):
    """Process the WormBase gene-description flat file.

    The concise description (when present) becomes the gene's
    definition; the provisional/automated/detailed/gene-class texts are
    added as tagged descriptions unless they merely repeat the concise
    text, start with a 'none ...' placeholder, or are empty.

    :param limit: max number of data rows to process in non-test mode
        (None = no limit)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    logger.info("Processing Gene descriptions")
    line_counter = 0
    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
            quotechar='\"')
        for row in filereader:
            # comment rows do not count toward the header skip
            if re.match(r'\#', ''.join(row)):
                continue
            line_counter += 1
            if line_counter == 1:
                continue  # column-header row
            (gene_num, public_name, molecular_name, concise_description,
             provisional_description, detailed_description,
             automated_description, gene_class_description) = row
            if self.testMode and gene_num not in self.test_ids['gene']:
                continue
            gene_id = 'WormBase:' + gene_num
            # guard against blank values as well as the explicit
            # placeholder (consistent with the other gene_desc parsers,
            # which use the same tuple test; previously an empty string
            # would have been asserted as a definition)
            if concise_description not in ('none available', '', None):
                model.addDefinition(gene_id, concise_description)
            # remove the description if it's identical to the concise
            descs = {
                'provisional': provisional_description,
                'automated': automated_description,
                'detailed': detailed_description,
                'gene class': gene_class_description
            }
            for d in descs:
                text = descs.get(d)
                if text == concise_description \
                        or re.match(r'none', text) or text == '':
                    continue  # duplicate or placeholder; don't use it
                text = ' '.join((text, '[' + d + ']'))
                descs[d] = text
                model.addDescription(gene_id, text)
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
    return
def process_gene_desc(self, limit):
    """Load WormBase gene descriptions and attach them to gene classes.

    The concise description is added as the gene's definition (unless it
    is the 'none available' placeholder).  The remaining description
    flavors are added as tagged descriptions, skipping any that repeat
    the concise text, begin with 'none', or are empty.

    :param limit: max number of data rows in non-test mode (None = all)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    logger.info("Processing Gene descriptions")
    line_counter = 0
    with gzip.open(raw, 'rb') as zipped:
        reader = csv.reader(
            io.TextIOWrapper(zipped, newline=""),
            delimiter='\t', quotechar='\"')
        for row in reader:
            # comment lines are skipped before counting, so the
            # header-row skip below is unaffected by them
            if re.match(r'\#', ''.join(row)):
                continue
            line_counter += 1
            if line_counter == 1:
                continue  # header row
            (gene_num, public_name, molecular_name, concise_description,
             provisional_description, detailed_description,
             automated_description, gene_class_description) = row
            if self.testMode and gene_num not in self.test_ids['gene']:
                continue
            gene_id = 'WormBase:' + gene_num
            if concise_description != 'none available':
                model.addDefinition(gene_id, concise_description)
            # the other description flavors, keyed by their tag
            labelled = {
                'provisional': provisional_description,
                'automated': automated_description,
                'detailed': detailed_description,
                'gene class': gene_class_description}
            for tag, text in labelled.items():
                usable = (text != concise_description
                          and not re.match(r'none', text)
                          and text != '')
                if usable:
                    tagged = ' '.join((text, '[' + tag + ']'))
                    labelled[tag] = tagged
                    model.addDescription(gene_id, tagged)
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
    return
def process_gene_desc(self, limit):
    """Process the WormBase gene-description file (column-index variant).

    Reads a gzipped TSV whose column order is declared in
    ``self.files['gene_desc']['columns']``; adds the concise
    description as the gene's definition and the other description
    flavors as tagged descriptions.

    :param limit: stop after this many file lines (None = no limit)
    :return: None
    """
    # currently unsupported
    src_key = 'gene_desc'
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("Processing: %s", self.files[src_key]['file'])
    graph = self.graph
    model = Model(graph)
    # declared column order for this source; fields are looked up by name
    col = self.files[src_key]['columns']
    with gzip.open(raw, 'rb') as csvfile:
        reader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                            delimiter='\t', quotechar='\"')
        # consume (and discard) the header row
        row = next(reader)
        for row in reader:
            # skip comment rows
            if re.match(r'\#', ''.join(row)):
                continue
            gene_num = row[col.index('gene_num')]
            # public_name = row[col.index('public_name')]
            # molecular_name = row[col.index('molecular_name')]
            concise_description = row[col.index('concise_description')]
            provisional_description = row[col.index(
                'provisional_description')]
            detailed_description = row[col.index('detailed_description')]
            automated_description = row[col.index('automated_description')]
            gene_class_description = row[col.index(
                'gene_class_description')]

            gene_id = 'WormBase:' + gene_num

            # only assert a definition when there is a real one
            if concise_description not in ('none available', '', None):
                model.addDefinition(gene_id, concise_description)

            # remove the description if it's identical to the concise
            descs = {
                'provisional': provisional_description,
                'automated': automated_description,
                'detailed': detailed_description,
                'gene class': gene_class_description
            }
            for d in descs:
                text = descs.get(d)
                # skip duplicates of the concise text, 'none…'
                # placeholders, and blanks; otherwise tag and add
                if text != concise_description and \
                        text[:4] != 'none' and text != '':
                    text = ' '.join((text, '[' + d + ']'))
                    descs[d] = text
                    model.addDescription(gene_id, text)

            # line_num counts physical file lines read so far
            if limit is not None and reader.line_num > limit:
                break
def process_gene_desc(self, limit):
    """Fetch WormBase gene-description records and add them to the graph.

    Each data row supplies one gene plus several description flavors;
    the concise description becomes the definition, and the remaining
    flavors become tagged descriptions (unless they duplicate the
    concise text, start with 'none', or are blank).

    :param limit: stop after this many data rows (None = no limit)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))
    model = Model(self.graph)
    LOG.info("Processing: %s", self.files['gene_desc']['file'])
    row_num = 0
    with gzip.open(raw, 'rb') as gzfile:
        reader = csv.reader(
            io.TextIOWrapper(gzfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in reader:
            # comment rows are not counted toward the header skip
            if re.match(r'\#', ''.join(row)):
                continue
            row_num += 1
            if row_num == 1:
                continue  # column-header row
            (gene_num, public_name, molecular_name, concise_description,
             provisional_description, detailed_description,
             automated_description, gene_class_description) = row
            gene_id = 'WormBase:' + gene_num
            if concise_description not in ('none available', '', None):
                model.addDefinition(gene_id, concise_description)
            extras = {
                'provisional': provisional_description,
                'automated': automated_description,
                'detailed': detailed_description,
                'gene class': gene_class_description}
            for tag, desc in extras.items():
                # skip duplicates of the concise text, placeholders,
                # and blanks
                if desc == concise_description or desc[:4] == 'none' \
                        or desc == '':
                    continue
                tagged = ' '.join((desc, '[' + tag + ']'))
                extras[tag] = tagged
                model.addDescription(gene_id, tagged)
            if limit is not None and row_num > limit:
                break
def process_nbk_html(self, limit):
    """
    Here we process the gene reviews books to fetch
    the clinical descriptions to include in the ontology.
    We only use books that have been acquired manually,
    as NCBI Bookshelf does not permit automated downloads.
    This parser will only process the books that are found in
    the ```raw/genereviews/books``` directory,
    permitting partial completion.

    :param limit: maximum number of books to process (None = all)
    :return: None
    """
    model = Model(self.graph)
    cnt = 0
    books_not_found = set()
    clin_des_regx = re.compile(r".*Summary.sec0")
    lit_cite_regex = re.compile(r".*Literature_Cited")
    pubmed_regex = re.compile(r"pubmed")  # ??? for a static string?

    # the set of locally available books is loop-invariant;
    # list the directory once instead of on every iteration
    book_dir = '/'.join((self.rawdir, 'books'))
    book_files = os.listdir(book_dir)

    for nbk in self.book_ids:
        cnt += 1
        nbk_id = 'GeneReviews:' + nbk
        book_item = self.all_books.get(nbk)
        url = '/'.join((self.rawdir, book_item['file']))

        # figure out if the book is there; if so, process, otherwise skip
        if ''.join((nbk, '.html')) not in book_files:
            # LOG.warning("No book found locally for %s; skipping", nbk)
            books_not_found.add(nbk)
            continue
        LOG.info("Processing %s", nbk)

        # use a context manager so the handle is always closed
        # (previously one file handle leaked per book processed)
        with open(url) as page:
            # NOTE(review): no explicit parser is passed to
            # BeautifulSoup, so behavior depends on which parser bs4
            # autodetects — confirm before pinning one
            soup = BeautifulSoup(page.read())

        # sec0 == clinical description
        clin_summary = soup.find('div', id=clin_des_regx)
        if clin_summary is not None:
            ptext = clin_summary.find('p').text
            ptext = re.sub(r'\s+', ' ', ptext)
            unlst = clin_summary.find('ul')
            if unlst is not None:
                item_text = list()
                for lst_itm in unlst.find_all('li'):
                    item_text.append(re.sub(r'\s+', ' ', lst_itm.text))
                ptext += ' '.join(item_text)
            # add in the copyright and citation info to description
            ptext = ' '.join(
                (ptext, '[GeneReviews:NBK1116, GeneReviews:NBK138602, '
                 + nbk_id + ']'))
            model.addDefinition(nbk_id, ptext.strip())

        # get the pubs
        pmid_set = set()
        pub_div = soup.find('div', id=lit_cite_regex)
        if pub_div is not None:
            ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
            for ref in ref_list:
                for anchor in ref.find_all('a',
                                           attrs={'href': pubmed_regex}):
                    # the PMID is either in the anchor text or at the
                    # end of the pubmed href
                    if re.match(r'PubMed:', anchor.text):
                        pmnum = re.sub(r'PubMed:\s*', '', anchor.text)
                    else:
                        pmnum = re.search(
                            r'\/pubmed\/(\d+)$', anchor['href']).group(1)
                    if pmnum is not None:
                        pmid = 'PMID:' + str(pmnum)
                        self.graph.addTriple(
                            pmid, self.globaltt['is_about'], nbk_id)
                        pmid_set.add(pmnum)
                        reference = Reference(
                            self.graph, pmid,
                            self.globaltt['journal article'])
                        reference.addRefToGraph()
        # TODO add author history, copyright, license to dataset
        # TODO get PMID-NBKID equivalence (near foot of page),
        # and make it "is about" link
        # self.gu.addTriple(
        #   self.graph, pmid,
        #   self.globaltt['is_about'], nbk_id)
        # for example: NBK1191 PMID:20301370

        # add the book to the dataset
        self.dataset.setFileAccessUrl(book_item['url'])
        if limit is not None and cnt > limit:
            break
        # finish looping through books

    bknfd = len(books_not_found)
    if len(books_not_found) > 0:
        if bknfd > 100:
            LOG.warning("There were %d books not found.", bknfd)
        else:
            LOG.warning(
                "The following %d books were not found locally: %s",
                bknfd, str(books_not_found))
    LOG.info(
        "Finished processing %d books for clinical descriptions",
        cnt - bknfd)
    return
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    This table contains the Elements of Morphology data.
    Note that foaf:depiction is the inverse of the foaf:depicts
    relationship.

    Since it is bad form to have two definitions,
    we concatenate the two into one string.

    Turtle:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            oboInOwl:has_related_synonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    TEC_note: URLs are not literals.

    :param raw: path to the raw source file
    :param limit: stop after this many file lines (None = no limit)
    :return: None
    """
    src_key = 'tables'
    model = Model(self.graph)
    # declared column order for this source; fields looked up by name
    col = self.resources[src_key]['columns']
    with open(raw, 'r') as rawread:
        reader = csv.reader(rawread, delimiter='\t', quotechar='\"')
        # consume the header row and compare it with the declaration
        row = next(reader)
        if not self.check_fileheader(col, row):
            # NOTE(review): the header-check result is discarded, so a
            # mismatched file is processed anyway — confirm intent
            pass
        for row in reader:
            # head -1 dvp.pr_nlx_157874_1|tr '\t' '\n'|
            # sed "s|\(.*\)|# \1 = row[col.index('\1')]|g"
            morphology_term_id = row[col.index(
                'morphology_term_id')].strip()
            # morphology_term_num = row[col.index('morphology_term_num')]
            morphology_term_label = row[col.index(
                'morphology_term_label')].strip()
            morphology_term_url = row[col.index(
                'morphology_term_url')].strip()
            # terminology_category_label = row[
            #    col.index('terminology_category_label')]
            # terminology_category_url = row[col.index('terminology_category_url')]
            # subcategory = row[col.index('subcategory')]
            objective_definition = row[col.index(
                'objective_definition')].strip()
            subjective_definition = row[col.index(
                'subjective_definition')].strip()
            comments = row[col.index('comments')].strip()
            synonyms = row[col.index('synonyms')].strip()
            replaces = row[col.index('replaces')].strip()
            small_figure_url = row[col.index('small_figure_url')].strip()
            large_figure_url = row[col.index('large_figure_url')].strip()
            # e_uid = row[col.index('e_uid')]
            # v_uid = row[col.index('v_uid')]
            # v_uuid = row[col.index('v_uuid')]
            # v_lastmodified = row[col.index('v_lastmodified')]
            # v_status = row[col.index('v_status')]
            # v_lastmodified_epoch = row[col.index('v_lastmodified_epoch')]

            # Add morphology term to graph as a class
            # with label, type, and description.
            model.addClassToGraph(morphology_term_id, morphology_term_label)

            # Assemble the description text
            if subjective_definition != '' and not (re.match(
                    r'.+\.$', subjective_definition)):
                # add a trailing period.
                subjective_definition = subjective_definition + '.'
            if objective_definition != '' and not (re.match(
                    r'.+\.$', objective_definition)):
                # add a trailing period.
                objective_definition = objective_definition + '.'

            definition = ' '.join(
                (objective_definition, subjective_definition))
            model.addDefinition(morphology_term_id, definition)

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction

            # do we want both images?
            # morphology_term_id has depiction small_figure_url
            if small_figure_url != '':
                model.addDepiction(morphology_term_id, small_figure_url)
            # morphology_term_id has depiction large_figure_url
            if large_figure_url != '':
                model.addDepiction(morphology_term_id, large_figure_url)
            # morphology_term_id has comment comments
            if comments != '':
                model.addComment(morphology_term_id, comments)

            # semicolon-delimited list of exact synonyms
            for syn in synonyms.split(';'):
                model.addSynonym(morphology_term_id, syn.strip(),
                                 self.globaltt['has_exact_synonym'])

            # morphology_term_id has_related_synonym replaces (; delimited)
            if replaces not in ['', synonyms]:
                for syn in replaces.split(';'):
                    model.addSynonym(morphology_term_id, syn.strip(),
                                     self.globaltt['has_related_synonym'])

            # <morphology_term_id> <foaf:page> morphology_term_url
            if morphology_term_id is not None:
                reference = Reference(self.graph, morphology_term_id,
                                      self.globaltt['web page'])
                # TEC 201905:
                # Not so sure we need explicit <eom_uri> <webpage> <eom_url>.
                # since <eom_uri> IS the <eom_url>.
                reference.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and reader.line_num > limit:
                break
def _transform_entry(self, e, graph):
    """Transform one OMIM API entry into graph triples.

    Adds the OMIM entry as a class (typed per its OMIM entry type) with
    labels, synonyms, and definition; attaches chromosomal/cytogenetic
    feature info when a gene map exists; deprecates removed/moved
    entries; and delegates to helpers for phenotypic series, mapped
    ids, mapped genes, pubs, and allelic variants.

    :param e: one decoded OMIM API response ({'entry': {...}})
    :param graph: the graph to write into
    :return: None
    """
    g = graph
    model = Model(g)
    geno = Genotype(graph)
    tax_num = '9606'
    tax_id = 'NCBITaxon:9606'
    tax_label = 'Human'
    build_num = "GRCh38"
    build_id = "NCBIGenome:"+build_num

    # get the numbers, labels, and descriptions
    omimnum = e['entry']['mimNumber']
    titles = e['entry']['titles']
    label = titles['preferredTitle']

    other_labels = []
    if 'alternativeTitles' in titles:
        other_labels += self._get_alt_labels(titles['alternativeTitles'])
    if 'includedTitles' in titles:
        other_labels += self._get_alt_labels(titles['includedTitles'])

    # add synonyms of alternate labels
    # preferredTitle": "PFEIFFER SYNDROME",
    # "alternativeTitles":
    #   "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
    # "includedTitles":
    #   "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"

    # remove the abbreviation (comes after the ;) from the preferredTitle,
    # and add it as a synonym
    abbrev = None
    if len(re.split(r';', label)) > 1:
        abbrev = (re.split(r';', label)[1].strip())
    newlabel = self._cleanup_label(label)

    description = self._get_description(e['entry'])
    omimid = 'OMIM:'+str(omimnum)

    if e['entry']['status'] == 'removed':
        model.addDeprecatedClass(omimid)
    else:
        omimtype = self._get_omimtype(e['entry'])
        nodelabel = newlabel
        # this uses our cleaned-up label
        if omimtype == Genotype.genoparts['heritable_phenotypic_marker']:
            if abbrev is not None:
                nodelabel = abbrev
            # in this special case,
            # make it a disease by not declaring it as a gene/marker
            model.addClassToGraph(omimid, nodelabel, None, newlabel)
        elif omimtype == Genotype.genoparts['gene']:
            if abbrev is not None:
                nodelabel = abbrev
            model.addClassToGraph(omimid, nodelabel, omimtype, newlabel)
        else:
            model.addClassToGraph(omimid, newlabel, omimtype)

        # add the original screaming-caps OMIM label as a synonym
        model.addSynonym(omimid, label)

        # add the alternate labels and includes as synonyms
        for l in other_labels:
            model.addSynonym(omimid, l, 'OIO:hasRelatedSynonym')

        # for OMIM, we're adding the description as a definition
        model.addDefinition(omimid, description)
        if abbrev is not None:
            model.addSynonym(omimid, abbrev, 'OIO:hasRelatedSynonym')

        # if this is a genetic locus (but not sequenced)
        # then add the chrom loc info
        # but add it to the ncbi gene identifier,
        # not to the omim id (we reserve the omim id to be the phenotype)
        feature_id = None
        feature_label = None
        if 'geneMapExists' in e['entry'] and e['entry']['geneMapExists']:
            genemap = e['entry']['geneMap']
            is_gene = False
            if omimtype == \
                    Genotype.genoparts['heritable_phenotypic_marker']:
                # get the ncbigene ids
                ncbifeature = self._get_mapped_gene_ids(e['entry'], g)
                if len(ncbifeature) == 1:
                    feature_id = 'NCBIGene:'+str(ncbifeature[0])
                    # add this feature as a cause for the omim disease
                    # TODO SHOULD I EVEN DO THIS HERE?
                    assoc = G2PAssoc(g, self.name, feature_id, omimid)
                    assoc.add_association_to_graph()
                elif len(ncbifeature) > 1:
                    logger.info(
                        "Its ambiguous when %s maps to >1 gene id: %s",
                        omimid, str(ncbifeature))
                else:  # no ncbi feature, make an anonymous one
                    feature_id = self._make_anonymous_feature(str(omimnum))
                    feature_label = abbrev
            elif omimtype == Genotype.genoparts['gene']:
                feature_id = omimid
                is_gene = True
            else:
                # 158900 falls into this category
                feature_id = self._make_anonymous_feature(str(omimnum))
                if abbrev is not None:
                    feature_label = abbrev
                omimtype = \
                    Genotype.genoparts[
                        'heritable_phenotypic_marker']

            if feature_id is not None:
                if 'comments' in genemap:
                    # add a comment to this feature
                    comment = genemap['comments']
                    if comment.strip() != '':
                        model.addDescription(feature_id, comment)
                if 'cytoLocation' in genemap:
                    cytoloc = genemap['cytoLocation']
                    # parse the cytoloc.
                    # add this omim thing as
                    # a subsequence of the cytofeature
                    # 18p11.3-p11.2
                    # FIXME
                    # add the other end of the range,
                    # but not sure how to do that
                    # not sure if saying subsequence of feature
                    # is the right relationship
                    f = Feature(g, feature_id, feature_label, omimtype)
                    if 'chromosomeSymbol' in genemap:
                        chrom_num = str(genemap['chromosomeSymbol'])
                        chrom = makeChromID(chrom_num, tax_num, 'CHR')
                        geno.addChromosomeClass(
                            chrom_num, tax_id, tax_label)

                        # add the positional information, if available
                        fstart = fend = -1
                        if 'chromosomeLocationStart' in genemap:
                            fstart = genemap['chromosomeLocationStart']
                        if 'chromosomeLocationEnd' in genemap:
                            fend = genemap['chromosomeLocationEnd']
                        if fstart >= 0:
                            # make the build-specific chromosome
                            chrom_in_build = makeChromID(
                                chrom_num, build_num, 'MONARCH')
                            # then, add the chromosome instance
                            # (from the given build)
                            geno.addChromosomeInstance(
                                chrom_num, build_id, build_num, chrom)
                            if omimtype == \
                                    Genotype.genoparts[
                                        'heritable_phenotypic_marker']:
                                postypes = [Feature.types['FuzzyPosition']]
                            else:
                                postypes = None
                            # NOTE that no strand information
                            # is available in the API
                            f.addFeatureStartLocation(
                                fstart, chrom_in_build, None, postypes)
                            if fend >= 0:
                                f.addFeatureEndLocation(
                                    fend, chrom_in_build, None, postypes)
                            if fstart > fend:
                                logger.info(
                                    "start>end (%d>%d) for %s",
                                    fstart, fend, omimid)
                        # add the cytogenic location too
                        # for now, just take the first one
                        cytoloc = cytoloc.split('-')[0]
                        loc = makeChromID(cytoloc, tax_num, 'CHR')
                        model.addClassToGraph(loc, None)
                        f.addSubsequenceOfFeature(loc)
                        f.addFeatureToGraph(True, None, is_gene)

        # end adding causative genes/features

        # check if moved, if so,
        # make it deprecated and
        # replaced consider class to the other thing(s)
        # some entries have been moved to multiple other entries and
        # use the joining raw word "and"
        # 612479 is movedto:  "603075 and 603029"  OR
        # others use a comma-delimited list, like:
        # 610402 is movedto: "609122,300870"
        if e['entry']['status'] == 'moved':
            if re.search(r'and', str(e['entry']['movedTo'])):
                # split the movedTo entry on 'and'
                newids = re.split(r'and', str(e['entry']['movedTo']))
            elif len(str(e['entry']['movedTo']).split(',')) > 0:
                # NOTE(review): str.split always returns at least one
                # element, so this branch always fires and the final
                # else below is unreachable — confirm intent
                # split on the comma
                newids = str(e['entry']['movedTo']).split(',')
            else:
                # make a list of one
                newids = [str(e['entry']['movedTo'])]
            # cleanup whitespace and add OMIM prefix to numeric portion
            fixedids = []
            for i in newids:
                fixedids.append('OMIM:'+i.strip())

            model.addDeprecatedClass(omimid, fixedids)

        self._get_phenotypicseries_parents(e['entry'], g)
        self._get_mappedids(e['entry'], g)
        self._get_mapped_gene_ids(e['entry'], g)
        self._get_pubs(e['entry'], g)
        self._get_process_allelic_variants(e['entry'], g)  # temp gag

    return
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    This table contains the Elements of Morphology data that
    has been screen-scraped into DISCO.
    Note that foaf:depiction is the inverse of the foaf:depicts
    relationship.

    Since it is bad form to have two definitions,
    we concatenate the two into one string.

    Triples:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            OIO:hasRelatedSynonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    :param raw: path to the raw DISCO dump file
    :param limit: stop after this many data rows (None = no limit)
    :return: None
    """
    model = Model(self.graph)
    line_counter = 0
    with open(raw, 'r') as f1:
        f1.readline()  # read the header row; skip
        filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
        for line in filereader:
            line_counter += 1
            (morphology_term_id, morphology_term_num,
             morphology_term_label, morphology_term_url,
             terminology_category_label, terminology_category_url,
             subcategory, objective_definition, subjective_definition,
             comments, synonyms, replaces, small_figure_url,
             large_figure_url, e_uid, v_uid, v_uuid, v_last_modified,
             v_status, v_lastmodified_epoch) = line
            # note:
            # e_uid v_uuid v_last_modified terminology_category_url
            # subcategory v_uid morphology_term_num
            # terminology_category_label hp_label notes
            # are currently unused.

            # Add morphology term to graph as a class
            # with label, type, and description.
            model.addClassToGraph(morphology_term_id, morphology_term_label)

            # Assemble the description text
            if subjective_definition != '' and not (
                    re.match(r'.+\.$', subjective_definition)):
                # add a trailing period.
                subjective_definition = subjective_definition.strip() + '.'
            if objective_definition != '' and not (
                    re.match(r'.+\.$', objective_definition)):
                # add a trailing period.
                objective_definition = objective_definition.strip() + '.'

            definition = \
                ' '.join(
                    (objective_definition, subjective_definition)).strip()
            # BUG FIX: only assert a definition when one exists;
            # previously an empty-string definition was added whenever
            # both definition fields were blank
            if definition != '':
                model.addDefinition(morphology_term_id, definition)

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction

            # do we want both images?
            # morphology_term_id has depiction small_figure_url
            if small_figure_url != '':
                model.addDepiction(morphology_term_id, small_figure_url)

            # morphology_term_id has depiction large_figure_url
            if large_figure_url != '':
                model.addDepiction(morphology_term_id, large_figure_url)

            # morphology_term_id has comment comments
            if comments != '':
                model.addComment(morphology_term_id, comments.strip())

            # semicolon-delimited list of exact synonyms
            if synonyms != '':
                for s in synonyms.split(';'):
                    model.addSynonym(
                        morphology_term_id, s.strip(),
                        model.annotation_properties['hasExactSynonym'])

            # morphology_term_id hasRelatedSynonym replaces (; delimited)
            if replaces != '' and replaces != synonyms:
                for s in replaces.split(';'):
                    model.addSynonym(
                        morphology_term_id, s.strip(),
                        model.annotation_properties['hasRelatedSynonym'])

            # morphology_term_id has page morphology_term_url
            reference = Reference(self.graph)
            reference.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and line_counter > limit:
                break
    return
def _process_genes(self, limit=None):
    """
    This method processes the KEGG gene IDs.
    The label for the gene is pulled as
    the first symbol in the list of gene symbols;
    the rest are added as synonyms.
    The long-form of the gene name is added as a definition.
    This is hardcoded to just processes human genes.

    Triples created:
    <gene_id> is a SO:gene
    <gene_id> rdfs:label <gene_name>

    :param limit: stop after this many rows in non-test mode
        (None = no limit)
    :return: None
    """
    LOG.info("Processing genes")
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    line_counter = 0
    family = Family(graph)
    geno = Genotype(graph)
    raw = '/'.join((self.rawdir, self.files['hsa_genes']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (gene_id, gene_name) = row
            gene_id = 'KEGG-'+gene_id.strip()

            # the gene listing has a bunch of labels
            # that are delimited, as:
            # DST, BP240, BPA, BPAG1, CATX-15, CATX15, D6S1101, DMH, DT,
            # EBSB2, HSAN6, MACF2; dystonin; K10382 dystonin
            # the list is semicolon delimited
            # (symbol, name, gene_class)
            # where the symbol is a comma-delimited list

            # here, we split them up.
            # we will take the first abbreviation and make it the symbol
            # then take the rest as synonyms

            # BUG FIX: was re.split('r;', ...) — the raw-string prefix
            # was inside the quotes, so the split was on the literal
            # text "r;" and the name/KO fields were never separated
            gene_stuff = re.split(r';', gene_name)
            symbollist = re.split(r',', gene_stuff[0])
            first_symbol = symbollist[0].strip()

            if gene_id not in self.label_hash:
                self.label_hash[gene_id] = first_symbol

            if self.test_mode and gene_id not in self.test_ids['genes']:
                continue

            # Add the gene as a class.
            geno.addGene(gene_id, first_symbol)

            # add the long name as the description
            if len(gene_stuff) > 1:
                description = gene_stuff[1].strip()
                model.addDefinition(gene_id, description)

            # add the rest of the symbols as synonyms
            # (skip the first, which is already the label;
            # previously the label itself was re-added as a synonym)
            for symbol in symbollist[1:]:
                model.addSynonym(gene_id, symbol.strip())

            # add the KEGG orthology (KO) group membership, if present
            if len(gene_stuff) > 2:
                ko_part = gene_stuff[2]
                # BUG FIX: the pattern had no capture group, so
                # groups() was empty and this branch never fired
                # (and group(1) would have raised if it had)
                ko_match = re.search(r'(K\d+)', ko_part)
                if ko_match is not None and len(ko_match.groups()) == 1:
                    ko = 'KEGG-ko:'+ko_match.group(1)
                    family.addMemberOf(gene_id, ko)

            if not self.test_mode \
                    and limit is not None and line_counter > limit:
                break

    LOG.info("Done with genes")
    return
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    This table contains the Elements of Morphology data that
    has been screen-scraped into DISCO.
    Note that foaf:depiction is the inverse of the foaf:depicts
    relationship.

    Since it is bad form to have two definitions,
    we concatenate the two into one string.

    Triples:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            OIO:hasRelatedSynonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    :param raw: path to the raw DISCO dump file
    :param limit: stop after this many data rows (None = no limit)
    :return: None
    """
    model = Model(self.graph)
    line_counter = 0
    with open(raw, 'r') as f1:
        f1.readline()  # read the header row; skip
        filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
        for line in filereader:
            line_counter += 1
            (morphology_term_id, morphology_term_num,
             morphology_term_label, morphology_term_url,
             terminology_category_label, terminology_category_url,
             subcategory, objective_definition, subjective_definition,
             comments, synonyms, replaces, small_figure_url,
             large_figure_url, e_uid, v_uid, v_uuid, v_last_modified,
             v_status, v_lastmodified_epoch) = line
            # note:
            # e_uid v_uuid v_last_modified terminology_category_url
            # subcategory v_uid morphology_term_num
            # terminology_category_label hp_label notes
            # are currently unused.

            # Add morphology term to graph as a class
            # with label, type, and description.
            model.addClassToGraph(morphology_term_id, morphology_term_label)

            # Assemble the description text
            if subjective_definition != '' and not (re.match(
                    r'.+\.$', subjective_definition)):
                # add a trailing period.
                subjective_definition = subjective_definition.strip() + '.'
            if objective_definition != '' and not (re.match(
                    r'.+\.$', objective_definition)):
                # add a trailing period.
                objective_definition = objective_definition.strip() + '.'

            # NOTE(review): when both fields are blank this yields ''
            # and an empty definition is still asserted — confirm intent
            definition = \
                ' '.join(
                    (objective_definition, subjective_definition)).strip()
            model.addDefinition(morphology_term_id, definition)

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction

            # do we want both images?
            # morphology_term_id has depiction small_figure_url
            if small_figure_url != '':
                model.addDepiction(morphology_term_id, small_figure_url)

            # morphology_term_id has depiction large_figure_url
            if large_figure_url != '':
                model.addDepiction(morphology_term_id, large_figure_url)

            # morphology_term_id has comment comments
            if comments != '':
                model.addComment(morphology_term_id, comments.strip())

            # semicolon-delimited list of exact synonyms
            if synonyms != '':
                for s in synonyms.split(';'):
                    model.addSynonym(
                        morphology_term_id, s.strip(),
                        model.annotation_properties['hasExactSynonym'])

            # morphology_term_id hasRelatedSynonym replaces (; delimited)
            if replaces != '' and replaces != synonyms:
                for s in replaces.split(';'):
                    model.addSynonym(
                        morphology_term_id, s.strip(),
                        model.annotation_properties['hasRelatedSynonym'])

            # morphology_term_id has page morphology_term_url
            reference = Reference(self.graph)
            reference.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and line_counter > limit:
                break
    return
def process_nbk_html(self, limit):
    """
    Here we process the gene reviews books to fetch
    the clinical descriptions to include in the ontology.
    We only use books that have been acquired manually,
    as NCBI Bookshelf does not permit automated downloads.
    This parser will only process the books that are found in
    the ```raw/genereviews/books``` directory,
    permitting partial completion.

    :param limit: maximum number of books to process (None = all)
    :return: None
    """
    model = Model(self.graph)
    cnt = 0
    books_not_found = set()
    clin_des_regx = re.compile(r".*Summary.sec0")
    lit_cite_regex = re.compile(r".*Literature_Cited")
    pubmed_regex = re.compile(r"pubmed")  # ??? for a static string?

    # the set of locally available books is loop-invariant;
    # list the directory once instead of on every iteration
    book_dir = '/'.join((self.rawdir, 'books'))
    book_files = os.listdir(book_dir)

    for nbk in self.book_ids:
        cnt += 1
        nbk_id = 'GeneReviews:'+nbk
        book_item = self.all_books.get(nbk)
        url = '/'.join((self.rawdir, book_item['file']))

        # figure out if the book is there; if so, process, otherwise skip
        if ''.join((nbk, '.html')) not in book_files:
            # LOG.warning("No book found locally for %s; skipping", nbk)
            books_not_found.add(nbk)
            continue
        LOG.info("Processing %s", nbk)

        # use a context manager so the handle is always closed
        # (previously one file handle leaked per book processed)
        with open(url) as page:
            # NOTE(review): no explicit parser is passed to
            # BeautifulSoup, so behavior depends on which parser bs4
            # autodetects — confirm before pinning one
            soup = BeautifulSoup(page.read())

        # sec0 == clinical description
        clin_summary = soup.find('div', id=clin_des_regx)
        if clin_summary is not None:
            ptext = clin_summary.find('p').text
            ptext = re.sub(r'\s+', ' ', ptext)

            unlst = clin_summary.find('ul')
            if unlst is not None:
                item_text = list()
                for lst_itm in unlst.find_all('li'):
                    item_text.append(re.sub(r'\s+', ' ', lst_itm.text))
                ptext += ' '.join(item_text)

            # add in the copyright and citation info to description
            ptext = ' '.join((
                ptext, '[GeneReviews:NBK1116, GeneReviews:NBK138602, '
                + nbk_id + ']'))

            model.addDefinition(nbk_id, ptext.strip())

        # get the pubs
        pmid_set = set()
        pub_div = soup.find('div', id=lit_cite_regex)
        if pub_div is not None:
            ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
            for ref in ref_list:
                for anchor in ref.find_all(
                        'a', attrs={'href': pubmed_regex}):
                    # the PMID is either in the anchor text or at the
                    # end of the pubmed href
                    if re.match(r'PubMed:', anchor.text):
                        pmnum = re.sub(r'PubMed:\s*', '', anchor.text)
                    else:
                        pmnum = re.search(
                            r'\/pubmed\/(\d+)$', anchor['href']).group(1)
                    if pmnum is not None:
                        pmid = 'PMID:'+str(pmnum)
                        self.graph.addTriple(
                            pmid, self.globaltt['is_about'], nbk_id)
                        pmid_set.add(pmnum)
                        reference = Reference(
                            self.graph, pmid,
                            self.globaltt['journal article'])
                        reference.addRefToGraph()

        # TODO add author history, copyright, license to dataset
        # TODO get PMID-NBKID equivalence (near foot of page),
        # and make it "is about" link
        # self.gu.addTriple(
        #   self.graph, pmid,
        #   self.globaltt['is_about'], nbk_id)
        # for example: NBK1191 PMID:20301370

        # add the book to the dataset
        self.dataset.setFileAccessUrl(book_item['url'])

        if limit is not None and cnt > limit:
            break
        # finish looping through books

    bknfd = len(books_not_found)
    if len(books_not_found) > 0:
        if bknfd > 100:
            LOG.warning("There were %d books not found.", bknfd)
        else:
            LOG.warning(
                "The following %d books were not found locally: %s",
                bknfd, str(books_not_found))
    LOG.info(
        "Finished processing %d books for clinical descriptions",
        cnt - bknfd)
def _process_genes(self, limit=None):
    """
    This method processes the KEGG gene IDs.
    The label for the gene is pulled as
    the first symbol in the list of gene symbols;
    the rest are added as synonyms.
    The long-form of the gene name is added as a definition.
    This is hardcoded to just processes human genes.

    Triples created:
    <gene_id> is a SO:gene
    <gene_id> rdfs:label <gene_name>

    :param limit: stop after this many rows in non-test mode
        (None = no limit)
    :return: None
    """
    LOG.info("Processing genes")
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    family = Family(graph)
    geno = Genotype(graph)
    raw = '/'.join((self.rawdir, self.files['hsa_genes']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in reader:
            (gene_id, gene_name) = row
            gene_id = 'KEGG-'+gene_id.strip()

            # the gene listing has a bunch of labels
            # that are delimited, as:
            # DST, BP240, BPA, BPAG1, CATX-15, CATX15, D6S1101, DMH, DT,
            # EBSB2, HSAN6, MACF2; dystonin; K10382 dystonin
            # the list is semicolon delimited
            # (symbol, name, gene_class)
            # where the symbol is a comma-delimited list

            # here, we split them up.
            # we will take the first abbreviation and make it the symbol
            # then take the rest as synonyms

            # BUG FIX: was re.split('r;', ...) — the raw-string prefix
            # was inside the quotes, so the split was on the literal
            # text "r;" and the name/KO fields were never separated
            gene_stuff = re.split(r';', gene_name)
            symbollist = re.split(r',', gene_stuff[0])
            first_symbol = symbollist[0].strip()

            if gene_id not in self.label_hash:
                self.label_hash[gene_id] = first_symbol

            if self.test_mode and gene_id not in self.test_ids['genes']:
                continue

            # Add the gene as a class.
            geno.addGene(gene_id, first_symbol)

            # add the long name as the description
            if len(gene_stuff) > 1:
                description = gene_stuff[1].strip()
                model.addDefinition(gene_id, description)

            # add the rest of the symbols as synonyms
            # (skip the first, which is already the label;
            # previously the label itself was re-added as a synonym)
            for symbol in symbollist[1:]:
                model.addSynonym(gene_id, symbol.strip())

            # add the KEGG orthology (KO) group membership, if present
            if len(gene_stuff) > 2:
                ko_part = gene_stuff[2]
                # BUG FIX: the pattern had no capture group, so
                # groups() was empty and this branch never fired
                # (and group(1) would have raised if it had)
                ko_match = re.search(r'(K\d+)', ko_part)
                if ko_match is not None and len(ko_match.groups()) == 1:
                    ko = 'KEGG-ko:'+ko_match.group(1)
                    family.addMemberOf(gene_id, ko)

            if not self.test_mode \
                    and limit is not None and reader.line_num > limit:
                break

    LOG.info("Done with genes")