def map(attr, predicate):
    cell = getattr(self, attr)()
    value = cell.value
    if value:
        for iri, label in list(self.sheet.convert_cell(value, predicate=predicate)):
            if ',' not in value and label != value:
                log.warning(f'label mismatch {label!r} != {value!r}')

            if iri is None:
                if label == 'bed nucleus of stria terminalis juxtacapsular nucleus':
                    iri = OntTerm('UBERON:0011173',
                                  label='anterior division of bed nuclei of stria terminalis')
                else:
                    log.debug(f'nothing found for {label}')
                    continue

            if isinstance(iri, tuple):
                op, *rest = iri
                # TODO need combinators in future version for union/intersection of object
                out = (op, *(NegPhenotype(r, predicate)
                             if isinstance(r, LacksObject) else
                             Phenotype(r, predicate)
                             for r in rest if r is not None))
                yield out
                continue
            elif isinstance(iri, LacksObject):
                p = NegPhenotype(iri.asURIRef(), predicate)
            else:
                p = Phenotype(iri, predicate)

            yield p.asIndicator()
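# A minimal standalone sketch of the dispatch above, using stub classes in
# place of neurondm's Phenotype/NegPhenotype and the sheet's LacksObject
# wrapper (all names below are illustrative stand-ins, not the real API):
# a tuple result carries a logical operator plus member IRIs, a LacksObject
# becomes a negative phenotype, and anything else becomes a plain positive
# phenotype.

from collections import namedtuple

StubPhenotype = namedtuple('StubPhenotype', 'iri predicate negative')


class StubLacks:
    """ stand-in for the LacksObject wrapper used by the sheet conversion """
    def __init__(self, iri):
        self.iri = iri

    def asURIRef(self):
        return self.iri


def dispatch(result, predicate):
    """ mirror the tuple / LacksObject / plain-iri branches of map above """
    if isinstance(result, tuple):
        op, *rest = result
        return (op, *(StubPhenotype(r.asURIRef() if isinstance(r, StubLacks) else r,
                                    predicate,
                                    isinstance(r, StubLacks))
                      for r in rest if r is not None))
    elif isinstance(result, StubLacks):
        return StubPhenotype(result.asURIRef(), predicate, True)
    else:
        return StubPhenotype(result, predicate, False)


print(dispatch('CHEBI:18243', 'ilxtr:hasMolecularPhenotype'))                        # positive
print(dispatch(StubLacks('PR:000017299'), 'ilxtr:hasMolecularPhenotype'))            # negative
print(dispatch(('OR', 'CHEBI:18243', 'PR:000017299'), 'ilxtr:hasMolecularPhenotype'))  # logical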
def grow(r):
    log.debug(r)
    # TODO implement on the object to allow joining on an index?
    # man this would be easier with sql >_< probably pandas too
    # but so many dependencies ... also diffing issues etc
    if r.label is not None:
        return valuesC.searchIndex('label', r.label)
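# grow joins the rendered row back onto the full value sheet by label.
# A minimal sketch of that kind of keyed lookup, assuming (hypothetically)
# that searchIndex behaves like a dict built over one column of the sheet:

from collections import namedtuple

Row = namedtuple('Row', 'label curie definition')

def build_index(rows, field):
    """ index rows by a single column so repeated lookups are O(1) """
    return {getattr(r, field): r for r in rows}

example_rows = [Row('neocortex pyramidal cell', 'NIFEXT:50', '...'),
                Row('cerebellum Purkinje cell', 'NIFEXT:66', '...')]
by_label = build_index(example_rows, 'label')
assert by_label['cerebellum Purkinje cell'].curie == 'NIFEXT:66'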
def replace(r, *cols):
    """ replace and reorder """
    # FIXME _super_ inefficient
    vrow = grow(r)
    log.debug('\n'.join(r._fields))
    log.debug('\n'.join(str(_) for _ in r))
    for field, value in sorted(zip(r._fields, r), key=key):
        if field in cols:
            value = getattr(vrow, field)

        yield '' if value is None else value  # completely overwrite the sheet
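# A standalone sketch of the replace/reorder pattern above: overwrite the
# chosen columns from a looked-up row, emit every field in a fixed column
# order, and substitute '' for None so the sheet cells are fully rewritten.
# The column order, Row fields, and rows here are hypothetical examples.

from collections import namedtuple

Row = namedtuple('Row', 'curie label definition')
column_order = {'label': 0, 'curie': 1, 'definition': 2}

def replace_sketch(row, looked_up, *cols):
    for field, value in sorted(zip(row._fields, row),
                               key=lambda fv: column_order[fv[0]]):
        if field in cols:
            value = getattr(looked_up, field)

        yield '' if value is None else value

stale = Row(None, 'cerebellum Purkinje cell', None)
fresh = Row('NIFEXT:66', 'cerebellum Purkinje cell', 'a large GABAergic neuron ...')
print(list(replace_sketch(stale, fresh, 'curie', 'definition')))
# -> ['cerebellum Purkinje cell', 'NIFEXT:66', 'a large GABAergic neuron ...']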
def mapCell(cls, cell, syns=False):
    search_prefixes = ('UBERON', 'CHEBI', 'PR', 'NCBITaxon', 'NCBIGene',
                       'ilxtr', 'NIFEXT', 'SAO', 'NLXMOL', 'BIRNLEX',)

    if ':' in cell and ' ' not in cell:
        log.debug(cell)
        if 'http' in cell:
            if cell.startswith('http'):
                t = OntTerm(iri=cell)
            else:
                return None, None  # garbage with http inline
        else:
            t = OntTerm(cell, exclude_prefix=('FMA',))  # FIXME need better error message in ontquery

        return t.u, t.label

    result = [r for r in cls.sgv.findByTerm(cell, searchSynonyms=syns,
                                            prefix=search_prefixes)
              if not r['deprecated']]
    #printD(cell, result)
    if not result:
        log.debug(f'{cell}')
        maybe = list(cls.query(label=cell, exclude_prefix=('FMA',)))
        if maybe:
            t = maybe[0]
            return t.u, t.label
        elif not syns:
            return cls.mapCell(cell, syns=True)
        else:
            return None, None
    elif len(result) > 1:
        #printD('WARNING', result)
        result = select_by_curie_rank(result)
    else:
        result = result[0]

    return rdflib.URIRef(result['iri']), result['labels'][0]
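# The first branch of mapCell is a cheap classifier for what kind of string
# the cell holds: a full IRI, an inline-http garbage value, a curie, or a
# free-text label that needs a terminology search. A tiny standalone sketch
# of just that classification (the example cells are made up):

def classify_cell(cell):
    if ':' in cell and ' ' not in cell:
        if 'http' in cell:
            return 'iri' if cell.startswith('http') else 'garbage'
        return 'curie'
    return 'label'

assert classify_cell('http://purl.obolibrary.org/obo/UBERON_0000955') == 'iri'
assert classify_cell('UBERON:0000955') == 'curie'
assert classify_cell('foo=http://example.org') == 'garbage'
assert classify_cell('somatostatin') == 'label'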
def mapCell(cls, cell, syns=False, predicate=None):
    search_prefixes = ('UBERON', 'CHEBI', 'PR', 'NCBIGene', 'NCBITaxon',
                       'ilxtr', 'NIFEXT', 'SAO', 'NLXMOL', 'BIRNLEX',)
    if predicate and predicate in Phenotype._molecular_predicates:
        # uberon syns pollute molecular results so move it to one before birnlex
        ub, *rest, b = search_prefixes
        search_prefixes = (*rest, ub, b)

    if cell == 'contralateral':
        return ilxtr.Contralateral, cell  # XXX FIXME only BSPO has this right now
    elif cell.lower() == 'gaba receptor role':
        return ilxtr.GABAReceptor, cell

    if ':' in cell and ' ' not in cell:
        log.debug(cell)
        if 'http' in cell:
            if cell.startswith('http'):
                t = OntTerm(iri=cell)
            else:
                return None, None  # garbage with http inline
        else:
            t = OntTerm(cell, exclude_prefix=('FMA',))  # FIXME need better error message in ontquery

        return t.u, t.label

    if cell in ('Vertebrata',):  # search syns
        syns = True

    def rank_mask(r):
        """ Create a boolean array testing whether the current entry starts
            with each prefix in order; the nth element is true if the nth
            prefix matches, so sorting the masks ranks entries by the first
            prefix they match (an exact label hit breaks ties):

            1 0 0 0 0 0 0 1
            1 0 0 0 0 0 0 0
            0 1 0 0 0 0 0 0
            0 0 1 0 0 0 0 0
            0 0 0 1 0 0 0 0
            0 0 0 0 1 0 0 0
        """
        # why did it take so long to think of this?
        return (*(r['curie'].startswith(p) for p in search_prefixes),
                'labels' in r and cell in r['labels'],)

    result = sorted([r for r in cls.sgv.findByTerm(cell, searchSynonyms=syns,
                                                   prefix=search_prefixes)
                     if not r['deprecated']],
                    key=rank_mask, reverse=True)
    #printD(cell, result)
    if not result:
        log.debug(f'{cell}')
        maybe = list(cls.query(label=cell, exclude_prefix=('FMA',)))
        if maybe:
            t = maybe[0]
            return t.u, t.label
        elif not syns:
            return cls.mapCell(cell, syns=True, predicate=predicate)
        else:
            return None, None
    elif len(result) > 1:
        #printD('WARNING', result)
        result = result[0]  #select_by_curie_rank(result)
    else:
        result = result[0]

    return rdflib.URIRef(result['iri']), result['labels'][0]
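# rank_mask works because Python compares tuples of booleans element by
# element, so sorting the masks with reverse=True ranks results by the first
# prefix they match, with an exact label hit breaking ties. A standalone
# demonstration with made-up SciGraph-style records:

demo_prefixes = ('UBERON', 'CHEBI', 'PR')
demo_cell = 'brain'

def demo_rank_mask(r):
    return (*(r['curie'].startswith(p) for p in demo_prefixes),
            'labels' in r and demo_cell in r['labels'],)

demo_records = [
    {'curie': 'PR:000001',      'labels': ['some protein']},
    {'curie': 'UBERON:0000955', 'labels': ['brain']},
    {'curie': 'CHEBI:15377',    'labels': ['water']},
    {'curie': 'UBERON:0006618', 'labels': ['vein of brain']},
]
ranked = sorted(demo_records, key=demo_rank_mask, reverse=True)
print([r['curie'] for r in ranked])
# -> ['UBERON:0000955', 'UBERON:0006618', 'CHEBI:15377', 'PR:000001']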
def convert_row(self, i, neuron_row, headers, expect_pes):
    id = None
    label_neuron = None
    definition_neuron = None
    synonyms_neuron = None
    current_neuron = None
    phenotypes = []
    do_release = False
    predicate_notes = {}
    object_notes = {}
    other_notes = {}
    wat = {}

    def loop_internal(j, header, cell):
        nonlocal id
        nonlocal current_neuron
        nonlocal do_release
        # assigned below; must update the enclosing row state, not a local
        nonlocal label_neuron, definition_neuron, synonyms_neuron
        notes = list(process_note(get_note(i + 1, j, self.cells_index)))  # + 1 since headers is removed
        if notes and not header.startswith('has'):
            _predicate = self.convert_other(header)
            if cell:
                _object = rdflib.Literal(cell)  # FIXME curies etc.
            else:
                _object = rdf.nil

            other_notes[_predicate, _object] = notes

        if header == 'curie':
            id = OntId(cell).u if cell else None
            return
        elif header == 'label':
            if id == OntId('NIFEXT:66').u:
                breakpoint()

            label_neuron = cell
            if cell in self.existing:
                current_neuron = self.existing[cell]
            elif cell:  # TODO
                self.new.append(cell)
            else:
                raise ValueError(cell)  # wat

            return
        elif header == 'Status':  # TODO
            if cell == 'Yes':
                do_release = True
            elif cell == 'Maybe':
                pass
            elif cell == 'Not yet':
                pass
            elif cell == 'Delete':
                pass
            else:
                pass

            return
        elif header == 'PMID':  # TODO
            return
        elif header == 'Other reference':  # TODO
            return
        elif header == 'Other label':  # TODO
            return
        elif header == 'definition':
            return  # FIXME single space differences between the spreadsheet and the source
            if cell:
                definition_neuron = rdflib.Literal(cell)

        elif header == 'synonyms':
            if cell:
                synonyms_neuron = [rdflib.Literal(s.strip())  # FIXME bare comma is extremely dangerous
                                   for s in cell.split(',')]

            return
        elif header in self.skip:
            return

        objects = []
        if cell:
            predicate = self.convert_header(header)
            if predicate is None:
                log.debug(f'{(header, cell, notes)}')

            for object, label in self.convert_cell(cell):
                if predicate in NeuronCUT._molecular_predicates:
                    if isinstance(object, tuple):
                        op, *rest = object
                        rest = [OntTerm(o).asIndicator().URIRef for o in rest]
                        object = op, *rest
                    elif object:
                        log.debug(f'{object!r}')
                        object = OntTerm(object).asIndicator().URIRef

                if isinstance(label, tuple):  # LogicalPhenotype case
                    _err = []
                    for l in label:
                        if self.lower_check(l, cell):
                            _err.append((cell, label))

                    if _err:
                        self.errors.extend(_err)
                    else:
                        objects.append(object)
                elif self.lower_check(label, cell):
                    self.errors.append((cell, label))
                elif str(id) == object:
                    self.errors.append((header, cell, object, label))
                    object = None
                else:
                    objects.append(object)

                if notes:
                    # FIXME this is a hack to only attach to the last value
                    # since we can't distinguish at the moment
                    wat[predicate, object] = notes
                    if object is not None:
                        # object aka iri can be none if we don't find anything
                        object_notes[object] = notes
                    else:
                        predicate_notes[predicate] = notes
                        # FIXME it might also be simpler in some cases
                        # to have this be object_notes[object] = notes
                        # because we are much less likely to have the same
                        # phenotype appear attached to the different dimensions

                        # FIXME comma sep is weak here because the
                        # reference is technically ambiguous
                        # might be an argument for the denormalized form ...
                        # or perhaps having another sheet for cases like that

        else:
            return

        if predicate and objects:
            for object in objects:
                # FIXME has layer location phenotype
                if isinstance(object, tuple):
                    op, *rest = object
                    pes = (Phenotype(r, predicate) for r in rest)
                    # FIXME nonhomogenous phenotypes
                    phenotypes.append(LogicalPhenotype(op, *pes))
                elif object:
                    phenotypes.append(Phenotype(object, predicate))
                else:
                    self.errors.append((object, predicate, cell))

        elif objects:
            self.errors.append((header, objects))
        else:
            self.errors.append((header, cell))

    # translate header -> predicate
    # translate cell value to ontology id
    #########################################
    for j, (header, cell) in enumerate(zip(headers, neuron_row)):
        loop_internal(j, header, cell)

    if current_neuron and phenotypes:
        # TODO merge current with changes
        # or maybe we just replace since all the phenotypes should be there?
        log.debug(phenotypes)
        if id is not None:
            log.debug(f'{(id, bool(id))}')
        elif label_neuron:
            id = make_cut_id(label_neuron)

        if id not in expect_pes:
            if id is not None:
                log.error(f'{id!r} not in cuts!?')

            return

        phenotypes = sorted(set(phenotypes))
        _ep = expect_pes[id]
        if not allMembers(_ep, *phenotypes) and not neuron_row[self.tomqc_check_ind]:
            #if expect_pes[id] != len(phenotypes):
            # FIXME this is not a strict roundtrip, it may also include additions
            # (a standalone set-difference sketch of this diagnostic follows the function)
            lp = len(phenotypes)
            lep = len(_ep)
            if lp == lep:
                pprint(sorted(_ep))
                pprint(phenotypes)

            pprint(set(_ep) - set(phenotypes))
            pprint(set(phenotypes) - set(_ep))
            _AAAAAA = id  # hack for debugger
            print(_AAAAAA)
            log.error(f'{id!r} failed roundtrip {lp} != {lep}')
            self.failed[id] = phenotypes
            return

        neuron = NeuronCUT(*phenotypes, id_=id, label=label_neuron,
                           override=bool(id) or bool(label_neuron))
        neuron.adopt_meta(current_neuron)
        # FIXME occasionally this will error?!
        yield neuron
    else:
        return  # FIXME this pollutes everything ???
        """
        fn = fixname(label_neuron)
        if not phenotypes and i:  # i skips header
            self.errors.append((i, neuron_row))  # TODO special review for phenos but not current
            phenotypes = Phenotype('TEMP:phenotype/' + fn),

        neuron = NeuronCUT(*phenotypes, id_=make_cut_id(label_neuron),
                           label=label_neuron, override=True)
        """

    ###################################################
    # update the meta if there were any changes
    if definition_neuron is not None:
        neuron.definition = definition_neuron

    if synonyms_neuron is not None:
        neuron.synonyms = synonyms_neuron

    try:
        neuron.batchAnnotateByObject(object_notes)
        neuron.batchAnnotate(other_notes)
    except AttributeError as e:
        #breakpoint()
        log.exception(e)  #'something very strange has happened\n', e)
        pass  # FIXME FIXME FIXME

    #neuron.batchAnnotateByPredicate(predicate_notes)  # TODO
    # FIXME doesn't quite work in this context, but there are other
    # cases where annotations to the general modality are still desirable

    # FIXME there may be no predicate? if the object fails to match?
    if do_release:
        self.release.append(neuron)
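# The roundtrip check at the end of convert_row compares the phenotypes
# parsed back out of the sheet against the expected set for that id and, on
# a mismatch, prints what is missing and what was added. A minimal
# set-difference sketch of that diagnostic; the (predicate, object) pairs
# below are placeholders, not real neuron data:

def roundtrip_report(expected, parsed):
    """ return (ok, missing, added) for an expected vs parsed phenotype set """
    missing = set(expected) - set(parsed)
    added = set(parsed) - set(expected)
    return not missing and not added, missing, added

expected = {('ilxtr:hasSomaLocatedIn', 'UBERON:0000955'),
            ('ilxtr:hasMolecularPhenotype', 'CHEBI:18243')}
parsed = {('ilxtr:hasSomaLocatedIn', 'UBERON:0000955')}
ok, missing, added = roundtrip_report(expected, parsed)
print(ok, missing, added)  # False, the molecular phenotype is missing, nothing added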
def sheet_to_neurons(values, notes_index, expect_pes):
    # TODO import existing ids to register by label
    sgv = Vocabulary()
    e_config = Config('common-usage-types')
    e_config.load_existing()
    query = oq.OntQuery(oq.plugin.get('rdflib')(e_config.core_graph), instrumented=OntTerm)
    # FIXME clear use case for the remaining bound to whatever query produced it rather
    # than the other way around ... how to support this use case ...
    existing = {str(n.origLabel):n for n in e_config.neurons()}

    def convert_header(header):
        if header.startswith('has'):  # FIXME use a closed namespace
            return ilxtr[header]
        else:
            return None

    def convert_other(header):
        if header == 'label':
            return rdfs.label
        elif header == 'curie':
            return rdf.type
        elif header == 'definition':
            return definition
        else:
            header = header.replace(' ', '_')
            return TEMP[header]  # FIXME

    def mapCell(cell, syns=False):
        search_prefixes = ('UBERON', 'CHEBI', 'PR', 'NCBITaxon', 'NCBIGene',
                           'ilxtr', 'NIFEXT', 'SAO', 'NLXMOL', 'BIRNLEX',)

        if ':' in cell and ' ' not in cell:
            log.debug(cell)
            if 'http' in cell:
                if cell.startswith('http'):
                    t = OntTerm(iri=cell)
                else:
                    return None, None  # garbage with http inline
            else:
                t = OntTerm(cell, exclude_prefix=('FMA',))  # FIXME need better error message in ontquery

            return t.u, t.label

        result = [r for r in sgv.findByTerm(cell, searchSynonyms=syns,
                                            prefix=search_prefixes)
                  if not r['deprecated']]
        #printD(cell, result)
        if not result:
            log.debug(f'{cell}')
            maybe = list(query(label=cell, exclude_prefix=('FMA',)))
            if maybe:
                qr = maybe[0]
                return qr.OntTerm.u, qr.label
            elif not syns:
                return mapCell(cell, syns=True)
            else:
                return None, None
        elif len(result) > 1:
            #printD('WARNING', result)
            result = select_by_curie_rank(result)
        else:
            result = result[0]

        return rdflib.URIRef(result['iri']), result['labels'][0]

    def lower_check(label, cell):
        return label not in cell and label.lower() not in cell.lower()  # have to handle comma sep case

    lnlu = {v:k for k, v in LogicalPhenotype.local_names.items()}

    def convert_cell(cell_or_comma_sep):
        #printD('CONVERTING', cell_or_comma_sep)
        # the standalone sketch after this function shows how the
        # (OR "..." "...") cells below are decomposed
        for cell_w_junk in cell_or_comma_sep.split(','):  # XXX WARNING need a way to alert people to this
            cell = cell_w_junk.strip()
            if cell.startswith('(OR') or cell.startswith('(AND'):
                start, *middle, end = cell.split('" "')
                OPoperator, first = start.split(' "')
                operator = OPoperator[1:]
                operator = lnlu[operator]
                last, CP = end.rsplit('"')
                iris, labels = [], []
                for term in (first, *middle, last):
                    iri, label = mapCell(term)
                    if label is None:
                        label = cell_or_comma_sep

                    iris.append(iri)
                    labels.append(label)

                yield (operator, *iris), tuple(labels)

            else:
                iri, label = mapCell(cell)
                if label is None:
                    yield iri, cell_or_comma_sep  # FIXME need a way to handle this that doesn't break things?
                else:
                    yield iri, label

    config = Config('cut-roundtrip')
    skip = 'alignment label',
    headers, *rows = values
    errors = []
    new = []
    release = []
    for i, neuron_row in enumerate(rows):
        id = None
        label_neuron = None
        definition_neuron = None
        synonyms_neuron = None
        current_neuron = None
        phenotypes = []
        do_release = False
        predicate_notes = {}
        object_notes = {}
        other_notes = {}
        wat = {}
        for j, (header, cell) in enumerate(zip(headers, neuron_row)):
            notes = list(process_note(get_note(i + 1, j, notes_index)))  # + 1 since headers is removed
            if notes and not header.startswith('has'):
                _predicate = convert_other(header)
                if cell:
                    _object = rdflib.Literal(cell)  # FIXME curies etc.
                else:
                    _object = rdf.nil

                other_notes[_predicate, _object] = notes

            if header == 'curie':
                id = OntId(cell).u if cell else None
                continue
            elif header == 'label':
                label_neuron = cell
                if cell in existing:
                    current_neuron = existing[cell]
                elif cell:  # TODO
                    new.append(cell)
                else:
                    raise ValueError(cell)  # wat

                continue
            elif header == 'Status':  # TODO
                if cell == 'Yes':
                    do_release = True
                elif cell == 'Maybe':
                    pass
                elif cell == 'Not yet':
                    pass
                elif cell == 'Delete':
                    pass
                else:
                    pass

                continue
            elif header == 'PMID':  # TODO
                continue
            elif header == 'Other reference':  # TODO
                continue
            elif header == 'Other label':  # TODO
                continue
            elif header == 'definition':
                continue  # FIXME single space differences between the spreadsheet and the source
                if cell:
                    definition_neuron = rdflib.Literal(cell)

                continue
            elif header == 'synonyms':
                if cell:
                    synonyms_neuron = [rdflib.Literal(s.strip())  # FIXME bare comma is extremely dangerous
                                       for s in cell.split(',')]

                continue
            elif header in skip:
                continue

            objects = []
            if cell:
                predicate = convert_header(header)
                if predicate is None:
                    log.debug(f'{(header, cell, notes)}')

                for object, label in convert_cell(cell):
                    if isinstance(label, tuple):  # LogicalPhenotype case
                        _err = []
                        for l in label:
                            if lower_check(l, cell):
                                _err.append((cell, label))

                        if _err:
                            errors.extend(_err)
                        else:
                            objects.append(object)
                    elif lower_check(label, cell):
                        errors.append((cell, label))
                    elif str(id) == object:
                        errors.append((header, cell, object, label))
                        object = None
                    else:
                        objects.append(object)

                    if notes:
                        # FIXME this is a hack to only attach to the last value
                        # since we can't distinguish at the moment
                        wat[predicate, object] = notes
                        if object is not None:
                            # object aka iri can be none if we don't find anything
                            object_notes[object] = notes
                        else:
                            predicate_notes[predicate] = notes
                            # FIXME it might also be simpler in some cases
                            # to have this be object_notes[object] = notes
                            # because we are much less likely to have the same
                            # phenotype appear attached to the different dimensions

                            # FIXME comma sep is weak here because the
                            # reference is technically ambiguous
                            # might be an argument for the denormalized form ...
                            # or perhaps having another sheet for cases like that

            else:
                continue

            if predicate and objects:
                for object in objects:
                    # FIXME has layer location phenotype
                    if isinstance(object, tuple):
                        op, *rest = object
                        pes = (Phenotype(r, predicate) for r in rest)
                        # FIXME nonhomogenous phenotypes
                        phenotypes.append(LogicalPhenotype(op, *pes))
                    elif object:
                        phenotypes.append(Phenotype(object, predicate))
                    else:
                        errors.append((object, predicate, cell))

            elif objects:
                errors.append((header, objects))
            else:
                errors.append((header, cell))

        # translate header -> predicate
        # translate cell value to ontology id
        if current_neuron and phenotypes:
            # TODO merge current with changes
            # or maybe we just replace since all the phenotypes should be there?
            log.debug(phenotypes)
            if id is not None:
                log.debug(f'{(id, bool(id))}')
            elif label_neuron:
                id = make_cut_id(label_neuron)

            if id not in expect_pes:
                log.error(f'{id!r} not in cuts!?')
                continue

            if expect_pes[id] != len(phenotypes):
                log.error(f'{id!r} failed roundtrip {len(phenotypes)} != {expect_pes[id]}')
                continue

            neuron = NeuronCUT(*phenotypes, id_=id, label=label_neuron,
                               override=bool(id) or bool(label_neuron))
            neuron.adopt_meta(current_neuron)
            # FIXME occasionally this will error?!
        else:
            continue  # FIXME this pollutes everything ???
            fn = fixname(label_neuron)
            if not phenotypes and i:  # i skips header
                errors.append((i, neuron_row))  # TODO special review for phenos but not current
                phenotypes = Phenotype('TEMP:phenotype/' + fn),

            neuron = NeuronCUT(*phenotypes, id_=make_cut_id(label_neuron),
                               label=label_neuron, override=True)

        # update the meta if there were any changes
        if definition_neuron is not None:
            neuron.definition = definition_neuron

        if synonyms_neuron is not None:
            neuron.synonyms = synonyms_neuron

        try:
            neuron.batchAnnotateByObject(object_notes)
            neuron.batchAnnotate(other_notes)
        except AttributeError as e:
            #embed()
            log.exception(e)  #'something very strange has happened\n', e)
            pass  # FIXME FIXME FIXME

        #neuron.batchAnnotateByPredicate(predicate_notes)  # TODO
        # FIXME doesn't quite work in this context, but there are other
        # cases where annotations to the general modality are still desirable

        # FIXME there may be no predicate? if the object fails to match?
        if do_release:
            release.append(neuron)

    return config, errors, new, release
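# A standalone sketch of how convert_cell above decomposes a logical cell
# such as '(OR "label one" "label two")' using only string operations; the
# operator lookup is stubbed by a plain dict here, whereas the real code maps
# it through LogicalPhenotype.local_names:

lnlu_stub = {'OR': 'union', 'AND': 'intersection'}  # stand-in for the real lookup

def parse_logical_cell(cell):
    start, *middle, end = cell.split('" "')
    OPoperator, first = start.split(' "')
    operator = lnlu_stub[OPoperator[1:]]   # strip the leading '('
    last, _close_paren = end.rsplit('"')   # drop the trailing '")'
    return operator, (first, *middle, last)

print(parse_logical_cell('(OR "basket cell" "stellate cell")'))
# -> ('union', ('basket cell', 'stellate cell'))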