def key(field_value):
    """Rank a ``(field, value)`` pair by the position of its field name.

    Returns ``(0, index)`` where ``index`` is the position of ``field`` in
    ``valuesC.header._fields``.  If that lookup raises ``ValueError`` the pair
    is logged at error level and ``(1, 0)`` is returned, which compares
    greater than every ``(0, index)`` tuple.
    """
    field, value = field_value
    try:
        position = valuesC.header._fields.index(field)
    except ValueError:
        # TODO warn on field mismatch
        log.error(f'{field} {value}')
        return 1, 0

    return 0, position
def main():
    """Build ``NeuronApinatSimple`` neurons from the sheets and write them out.

    Steps, in order:

    1. Instantiate every class in ``sheet_classes`` and collect, for each row
       with ``row_index > 0``, a truthy ``neuron_id().value`` and no truthy
       ``exclude().value``, the values of ``neuron_id``, ``exact_location``
       and ``location_id`` (used below as subject, predicate and object).
    2. Reject values with leading or trailing whitespace, convert them with
       ``OntId`` / ``map_predicates`` and group the resulting ``Phenotype``
       objects by subject.
    3. Create a ``Config('apinat-simple-sheet')``, construct one
       ``NeuronApinatSimple`` per subject, write the config, strip label
       triples from ``config._written_graph``, write that graph again and
       finally write the python representation.

    Returns:
        A 1-tuple containing the config.

    Raises:
        ValueError: a subject, predicate or object value has leading or
            trailing whitespace.
        KeyError: ``map_predicates`` does not know the predicate value.
        TypeError: ``Phenotype`` construction (or grouping by subject)
            rejected the data.
    """
    # re.match takes (pattern, string); compile once since it is used per cell
    whitespace_at_edge = re.compile(r'(^[\s]+[^\s].*|.*[^\s][\s]+$)')

    cs = [c() for c in sheet_classes]
    trips = []
    for cl in cs:
        for r in cl.rows():
            if not r.row_index > 0 or not r.neuron_id().value:
                continue

            if hasattr(r, 'exclude') and r.exclude().value:
                continue

            cells = r.neuron_id(), r.exact_location(), r.location_id()
            trips.append([cl] + [cell.value for cell in cells])

    dd = defaultdict(list)
    for c, _s, _p, _o in trips:
        for x in (_s, _p, _o):
            if whitespace_at_edge.match(x):
                msg = f'leading/trailing whitespace in {c} {_s} {_p} {_o}'
                log.error(msg)
                raise ValueError(msg)

        s = OntId(_s)
        try:
            p = map_predicates(_p)
        except KeyError:
            log.error(f'sigh {s}')
            raise

        o = OntId(_o)
        try:
            dd[s].append(Phenotype(o, p))
        except TypeError:
            log.error(f'bad data for {c} {s} {p} {o}')
            raise

    # the config is created before any neuron is constructed, as in the
    # original statement order
    config = Config('apinat-simple-sheet')
    nrns = []
    for subject, phenos in dd.items():
        nrns.append(NeuronApinatSimple(*phenos, id_=subject))

    config.write()

    labels = (
        ilxtr.genLabel,
        ilxtr.localLabel,
        ilxtr.simpleLabel,
        ilxtr.simpleLocalLabel,
        rdfs.label,
        skos.prefLabel,
    )
    # collect first so the graph is not mutated while it is being iterated
    to_remove = [t for t in config._written_graph if t[1] in labels]
    for t in to_remove:
        config._written_graph.remove(t)

    config._written_graph.write()
    config.write_python()
    return config,
def convert_row(self, i, neuron_row, headers, expect_pes):
    """Convert one sheet row to a ``NeuronCUT`` and yield it if it round-trips.

    This is a generator that yields at most one neuron.  Nothing is yielded
    when the row's label is not in ``self.existing``, when no phenotypes were
    produced, when the row's id is not in ``expect_pes``, or when the
    phenotypes fail the ``allMembers`` check against ``expect_pes[id]`` while
    ``neuron_row[self.tomqc_check_ind]`` is falsy (the row is then recorded
    in ``self.failed``).

    NOTE(review): the synonym/annotation/release updates happen *after* the
    ``yield``, so they only run once the consumer resumes the generator
    (e.g. by iterating it to exhaustion) -- confirm callers do so.

    Args:
        i: index of the row with the header row removed; ``i + 1`` is used
            when looking up notes in ``self.cells_index``.
        neuron_row: cell values of the row, zipped against ``headers``.
        headers: column headers.
        expect_pes: mapping from neuron id to the phenotypes expected for it.

    Raises:
        ValueError: the ``label`` cell is empty.

    Side effects: appends to ``self.new``, ``self.errors`` and
    ``self.release`` and writes to ``self.failed``.
    """
    neuron_id = None
    label_neuron = None
    # stays None while the 'definition' column is ignored, see loop_internal
    definition_neuron = None
    synonyms_neuron = None
    current_neuron = None
    phenotypes = []
    do_release = False
    predicate_notes = {}  # collected but not consumed yet, see TODO at the end
    object_notes = {}
    other_notes = {}

    def loop_internal(j, header, cell):
        """Process the cell in column ``j``, updating the row level state."""
        # every name rebound in here has to be nonlocal, otherwise the
        # assignment creates a fresh local and the row level value is lost
        nonlocal neuron_id, label_neuron, synonyms_neuron
        nonlocal current_neuron, do_release

        # + 1 since headers is removed
        notes = list(process_note(get_note(i + 1, j, self.cells_index)))
        if notes and not header.startswith('has'):
            _predicate = self.convert_other(header)
            # FIXME curies etc.
            _object = rdflib.Literal(cell) if cell else rdf.nil
            other_notes[_predicate, _object] = notes

        if header == 'curie':
            neuron_id = OntId(cell).u if cell else None
            return
        elif header == 'label':
            label_neuron = cell
            if cell in self.existing:
                current_neuron = self.existing[cell]
            elif cell:  # TODO
                self.new.append(cell)
            else:
                raise ValueError(cell)  # wat
            return
        elif header == 'Status':
            # TODO 'Maybe', 'Not yet', 'Delete' and anything else are
            # currently ignored
            if cell == 'Yes':
                do_release = True
            return
        elif header in ('PMID', 'Other reference', 'Other label'):  # TODO
            return
        elif header == 'definition':
            # FIXME single space differences between the spreadsheet and the
            # source; until that is resolved the definition is not read
            # (it used to become rdflib.Literal(cell) for non-empty cells)
            return
        elif header == 'synonyms':
            if cell:
                # FIXME bare comma is extremely dangerous
                synonyms_neuron = [rdflib.Literal(s.strip())
                                   for s in cell.split(',')]
            return
        elif header in self.skip:
            return

        if not cell:
            return

        # translate header -> predicate
        predicate = self.convert_header(header)
        if predicate is None:
            log.debug(f'{(header, cell, notes)}')

        # translate cell value to ontology id
        objects = []
        obj = None
        for obj, label in self.convert_cell(cell):
            if predicate in NeuronCUT._molecular_predicates:
                if isinstance(obj, tuple):
                    op, *rest = obj
                    rest = [OntTerm(o).asIndicator().URIRef for o in rest]
                    obj = op, *rest
                elif obj:
                    log.debug(f'{obj!r}')
                    obj = OntTerm(obj).asIndicator().URIRef

            if isinstance(label, tuple):  # LogicalPhenotype case
                # one error entry per member label that fails the check
                _err = [(cell, label) for member in label
                        if self.lower_check(member, cell)]
                if _err:
                    self.errors.extend(_err)
                else:
                    objects.append(obj)
            elif self.lower_check(label, cell):
                self.errors.append((cell, label))
            elif str(neuron_id) == obj:
                self.errors.append((header, cell, obj, label))
                obj = None
            else:
                objects.append(obj)

        if notes:
            # FIXME this is a hack to only attach to the last value
            # since we can't distinguish at the moment
            if obj is not None:
                # object aka iri can be none if we don't find anything
                object_notes[obj] = notes
            else:
                predicate_notes[predicate] = notes
                # FIXME it might also be simpler in some cases
                # to have this be object_notes[object] = notes
                # because we are much less likely to have the same
                # phenotype appear attached to the different dimensions

            # FIXME comma sep is weak here because the
            # reference is technically ambiguous
            # might be an argument for the denormalized form ...
            # or perhaps having another sheet for cases like that

        if predicate and objects:
            for obj in objects:  # FIXME has layer location phenotype
                if isinstance(obj, tuple):
                    op, *rest = obj
                    # FIXME nonhomogenous phenotypes
                    pes = (Phenotype(r, predicate) for r in rest)
                    phenotypes.append(LogicalPhenotype(op, *pes))
                elif obj:
                    phenotypes.append(Phenotype(obj, predicate))
                else:
                    self.errors.append((obj, predicate, cell))
        elif objects:
            self.errors.append((header, objects))
        else:
            self.errors.append((header, cell))

    for j, (header, cell) in enumerate(zip(headers, neuron_row)):
        loop_internal(j, header, cell)

    if not (current_neuron and phenotypes):
        # FIXME this polutes everything ???
        # A placeholder neuron with a TEMP:phenotype/ phenotype used to be
        # created for such rows; that code was already disabled.
        return

    # TODO merge current with changes
    # or maybe we just replace since all the phenotypes should be there?
    log.debug(phenotypes)
    if neuron_id is not None:
        log.debug(f'{(neuron_id, bool(neuron_id))}')
    elif label_neuron:
        neuron_id = make_cut_id(label_neuron)

    if neuron_id not in expect_pes:
        if neuron_id is not None:
            log.error(f'{neuron_id!r} not in cuts!?')
        return

    phenotypes = sorted(set(phenotypes))
    _ep = expect_pes[neuron_id]
    if (not allMembers(_ep, *phenotypes)
            and not neuron_row[self.tomqc_check_ind]):
        # FIXME this is not a strict roundtrip, it may also include additions
        lp = len(phenotypes)
        lep = len(_ep)
        if lp == lep:
            # the counts in the error message below say nothing when they
            # are equal, so dump both sides and their differences
            pprint(sorted(_ep))
            pprint(phenotypes)
            pprint(set(_ep) - set(phenotypes))
            pprint(set(phenotypes) - set(_ep))
            print(neuron_id)

        log.error(f'{neuron_id!r} failed roundtrip {lp} != {lep}')
        self.failed[neuron_id] = phenotypes
        return

    neuron = NeuronCUT(*phenotypes, id_=neuron_id, label=label_neuron,
                       override=bool(neuron_id) or bool(label_neuron))
    neuron.adopt_meta(current_neuron)
    # FIXME occasionally this will error?!
    yield neuron

    # update the meta if there were any changes
    if definition_neuron is not None:
        neuron.definition = definition_neuron

    if synonyms_neuron is not None:
        neuron.synonyms = synonyms_neuron

    try:
        neuron.batchAnnotateByObject(object_notes)
        neuron.batchAnnotate(other_notes)
    except AttributeError as e:
        # FIXME FIXME FIXME something very strange has happened
        log.exception(e)

    #neuron.batchAnnotateByPredicate(predicate_notes)  # TODO
    # FIXME doesn't quite work in this context, but there are other
    # cases where annotations to the general modality are still desireable
    # FIXME there may be no predicate? if the object fails to match?

    if do_release:
        self.release.append(neuron)
def sheet_to_neurons(values, notes_index, expect_pes):
    """Convert sheet rows to ``NeuronCUT`` objects under ``Config('cut-roundtrip')``.

    The first element of ``values`` is the header row, the rest are neuron
    rows.  A row only produces a neuron when its label matches an existing
    neuron loaded from ``Config('common-usage-types')``, at least one
    phenotype was converted, its id is in ``expect_pes`` and the number of
    converted phenotypes equals ``expect_pes[id]``.

    Args:
        values: sequence of rows, header row first.
        notes_index: passed to ``get_note`` to look up the notes of a cell.
        expect_pes: mapping from neuron id to the expected phenotype count.

    Returns:
        ``(config, errors, new, release)`` where ``errors`` is a list of
        tuples describing cells that could not be converted, ``new`` the
        non-empty labels with no existing neuron, and ``release`` the neurons
        whose ``Status`` cell is ``'Yes'``.

    Raises:
        ValueError: a row has an empty ``label`` cell.
    """
    # TODO import existing ids to register by label
    sgv = Vocabulary()
    e_config = Config('common-usage-types')
    e_config.load_existing()
    query = oq.OntQuery(oq.plugin.get('rdflib')(e_config.core_graph),
                        instrumented=OntTerm)
    # FIXME clear use case for the remaining bound to whatever query produced it rather
    # than the other way around ... how to support this use case ...
    existing = {str(n.origLabel): n for n in e_config.neurons()}

    def convert_header(header):
        """Return the ilxtr predicate for a ``has*`` header, otherwise None."""
        if header.startswith('has'):  # FIXME use a closed namespace
            return ilxtr[header]

        return None

    def convert_other(header):
        """Return the predicate used to attach notes to a non-``has*`` column."""
        if header == 'label':
            return rdfs.label
        elif header == 'curie':
            return rdf.type
        elif header == 'definition':
            return definition

        return TEMP[header.replace(' ', '_')]  # FIXME

    def mapCell(cell, syns=False):
        """Map a cell value to ``(iri, label)``, or ``(None, None)`` if not found."""
        search_prefixes = ('UBERON', 'CHEBI', 'PR', 'NCBITaxon', 'NCBIGene',
                           'ilxtr', 'NIFEXT', 'SAO', 'NLXMOL', 'BIRNLEX',)

        if ':' in cell and ' ' not in cell:
            # treated as a curie or an iri
            log.debug(cell)
            if 'http' in cell:
                if not cell.startswith('http'):
                    return None, None  # garbage with http inline

                t = OntTerm(iri=cell)
            else:
                # FIXME need better error message in ontquery
                t = OntTerm(cell, exclude_prefix=('FMA',))

            return t.u, t.label

        result = [r for r in sgv.findByTerm(cell, searchSynonyms=syns,
                                            prefix=search_prefixes)
                  if not r['deprecated']]
        if not result:
            log.debug(f'{cell}')
            maybe = list(query(label=cell, exclude_prefix=('FMA',)))
            if maybe:
                qr = maybe[0]
                return qr.OntTerm.u, qr.label
            elif not syns:
                # nothing by label, retry once with synonyms enabled
                return mapCell(cell, syns=True)
            else:
                return None, None
        elif len(result) > 1:
            result = select_by_curie_rank(result)
        else:
            result = result[0]

        return rdflib.URIRef(result['iri']), result['labels'][0]

    def lower_check(label, cell):
        """True when ``label`` is not contained in ``cell``, also ignoring case."""
        return label not in cell and label.lower() not in cell.lower()

    # have to handle comma sep case
    lnlu = {v: k for k, v in LogicalPhenotype.local_names.items()}

    def convert_cell(cell_or_comma_sep):
        """Yield ``(object, label)`` for each comma separated part of a cell.

        Parts starting with ``(OR`` or ``(AND`` yield a tuple of the operator
        followed by the iris, paired with a tuple of labels.  When no label
        is found the whole cell text is used in its place.
        """
        # XXX WARNING need a way to alter people to this
        for cell_w_junk in cell_or_comma_sep.split(','):
            cell = cell_w_junk.strip()
            if cell.startswith('(OR') or cell.startswith('(AND'):
                start, *middle, end = cell.split('" "')
                OPoperator, first = start.split(' "')
                operator = lnlu[OPoperator[1:]]
                last, _close_paren = end.rsplit('"')
                iris, labels = [], []
                for term in (first, *middle, last):
                    iri, label = mapCell(term)
                    if label is None:
                        label = cell_or_comma_sep

                    iris.append(iri)
                    labels.append(label)

                yield (operator, *iris), tuple(labels)
            else:
                iri, label = mapCell(cell)
                if label is None:
                    # FIXME need a way to handle this that doesn't break things?
                    yield iri, cell_or_comma_sep
                else:
                    yield iri, label

    config = Config('cut-roundtrip')
    skip = 'alignment label',

    headers, *rows = values
    errors = []
    new = []
    release = []
    for i, neuron_row in enumerate(rows):
        neuron_id = None
        label_neuron = None
        # stays None while the 'definition' column is ignored, see below
        definition_neuron = None
        synonyms_neuron = None
        current_neuron = None
        phenotypes = []
        do_release = False
        predicate_notes = {}  # collected but not consumed yet, see TODO below
        object_notes = {}
        other_notes = {}

        for j, (header, cell) in enumerate(zip(headers, neuron_row)):
            # + 1 since headers is removed
            notes = list(process_note(get_note(i + 1, j, notes_index)))
            if notes and not header.startswith('has'):
                _predicate = convert_other(header)
                # FIXME curies etc.
                _object = rdflib.Literal(cell) if cell else rdf.nil
                other_notes[_predicate, _object] = notes

            if header == 'curie':
                neuron_id = OntId(cell).u if cell else None
                continue
            elif header == 'label':
                label_neuron = cell
                if cell in existing:
                    current_neuron = existing[cell]
                elif cell:  # TODO
                    new.append(cell)
                else:
                    raise ValueError(cell)  # wat
                continue
            elif header == 'Status':
                # TODO 'Maybe', 'Not yet', 'Delete' and anything else are
                # currently ignored
                if cell == 'Yes':
                    do_release = True
                continue
            elif header in ('PMID', 'Other reference', 'Other label'):  # TODO
                continue
            elif header == 'definition':
                # FIXME single space differences between the spreadsheet and
                # the source; until that is resolved the definition is not
                # read (it used to become rdflib.Literal(cell))
                continue
            elif header == 'synonyms':
                if cell:
                    # FIXME bare comma is extremely dangerous
                    synonyms_neuron = [rdflib.Literal(s.strip())
                                       for s in cell.split(',')]
                continue
            elif header in skip:
                continue

            if not cell:
                continue

            # translate header -> predicate
            predicate = convert_header(header)
            if predicate is None:
                log.debug(f'{(header, cell, notes)}')

            # translate cell value to ontology id
            objects = []
            obj = None
            for obj, label in convert_cell(cell):
                if isinstance(label, tuple):  # LogicalPhenotype case
                    # one error entry per member label that fails the check
                    _err = [(cell, label) for member in label
                            if lower_check(member, cell)]
                    if _err:
                        errors.extend(_err)
                    else:
                        objects.append(obj)
                elif lower_check(label, cell):
                    errors.append((cell, label))
                elif str(neuron_id) == obj:
                    errors.append((header, cell, obj, label))
                    obj = None
                else:
                    objects.append(obj)

            if notes:
                # FIXME this is a hack to only attach to the last value
                # since we can't distinguish at the moment
                if obj is not None:
                    # object aka iri can be none if we don't find anything
                    object_notes[obj] = notes
                else:
                    predicate_notes[predicate] = notes
                    # FIXME it might also be simpler in some cases
                    # to have this be object_notes[object] = notes
                    # because we are much less likely to have the same
                    # phenotype appear attached to the different dimensions

                # FIXME comma sep is weak here because the
                # reference is technically ambiguous
                # might be an argument for the denormalized form ...
                # or perhaps having another sheet for cases like that

            if predicate and objects:
                for obj in objects:  # FIXME has layer location phenotype
                    if isinstance(obj, tuple):
                        op, *rest = obj
                        # FIXME nonhomogenous phenotypes
                        pes = (Phenotype(r, predicate) for r in rest)
                        phenotypes.append(LogicalPhenotype(op, *pes))
                    elif obj:
                        phenotypes.append(Phenotype(obj, predicate))
                    else:
                        errors.append((obj, predicate, cell))
            elif objects:
                errors.append((header, objects))
            else:
                errors.append((header, cell))

        if not (current_neuron and phenotypes):
            # FIXME this polutes everything ???
            # A placeholder neuron with a TEMP:phenotype/ phenotype used to
            # be created for such rows; that code was unreachable.
            continue

        # TODO merge current with changes
        # or maybe we just replace since all the phenotypes should be there?
        log.debug(phenotypes)
        if neuron_id is not None:
            log.debug(f'{(neuron_id, bool(neuron_id))}')
        elif label_neuron:
            neuron_id = make_cut_id(label_neuron)

        if neuron_id not in expect_pes:
            log.error(f'{neuron_id!r} not in cuts!?')
            continue

        if expect_pes[neuron_id] != len(phenotypes):
            log.error(f'{neuron_id!r} failed roundtrip '
                      f'{len(phenotypes)} != {expect_pes[neuron_id]}')
            continue

        neuron = NeuronCUT(*phenotypes, id_=neuron_id, label=label_neuron,
                           override=bool(neuron_id) or bool(label_neuron))
        neuron.adopt_meta(current_neuron)
        # FIXME occasionally this will error?!

        # update the meta if there were any changes
        if definition_neuron is not None:
            neuron.definition = definition_neuron

        if synonyms_neuron is not None:
            neuron.synonyms = synonyms_neuron

        try:
            neuron.batchAnnotateByObject(object_notes)
            neuron.batchAnnotate(other_notes)
        except AttributeError as e:
            # FIXME FIXME FIXME something very strange has happened
            log.exception(e)

        #neuron.batchAnnotateByPredicate(predicate_notes)  # TODO
        # FIXME doesn't quite work in this context, but there are other
        # cases where annotations to the general modality are still desireable
        # FIXME there may be no predicate? if the object fails to match?

        if do_release:
            release.append(neuron)

    return config, errors, new, release