def __init__(self, path):
    """Load the tabular file at *path* and index its rows by column.

    Raises ``exc.NoDataError`` when the file yields no rows at all.
    When a column requested in ``self.to_index`` is missing from the
    header, ``self.fail`` is set and ``byCol`` is built without an index.
    """
    super().__init__()
    self.path = path
    tabular = Tabular(self.path)
    # keys listed under self.verticals are handled elsewhere; skip those rows here
    self.skip_rows = tuple(key for keys in self.verticals.values() for key in keys)
    self.t = tabular
    l = list(tabular)
    if not l:
        # FIXME bad design, this try block is a workaround for bad handling of empty lists
        raise exc.NoDataError(self.path)

    self.orig_header, *rest = l
    header = Header(self.orig_header).data
    self.fail = False
    if self.to_index:
        # verify every requested index column actually exists in the header
        for head in self.to_index:
            if head not in header:
                log.error(f'\'{self.t.path}\' malformed header!')
                self.fail = True

    if self.fail:
        # cannot index on a missing column; fall back to an unindexed byCol
        self.bc = byCol(rest, header)
    else:
        self.bc = byCol(rest, header, to_index=self.to_index)
def main():
    """Round-trip CUT neurons through the review sheet.

    Loads compiled common-usage-type neurons, converts the sheet to
    neurons, writes the raw and release configs, then rebuilds the
    review rows with selected columns replaced from the sheet values.
    Ends in an IPython embed for interactive inspection.
    """
    #from neurondm.models.cuts import main as cuts_main
    #cuts_config, *_ = cuts_main()
    from IPython import embed
    from neurondm.compiled.common_usage_types import config as cuts_config
    cuts_neurons = cuts_config.neurons()
    # expected phenotype counts per neuron id, used for sanity checking
    expect_pes = {n.id_:len(n.pes) for n in cuts_neurons}
    sheet = CutsV1()
    config, errors, new, release = sheet_to_neurons(sheet.values, sheet.notes_index, expect_pes)
    #sheet.show_notes()
    config.write_python()
    config.write()
    #config = Config(config.name)
    #config.load_existing()  # FIXME this is a hack to get get a load_graph
    from neurondm import Config, NeuronCUT
    release_config = Config('cut-release')
    [NeuronCUT(*n, id_=n.id_, label=n.origLabel, override=True).adopt_meta(n) for n in release]
    release_config.write_python()
    release_config.write()
    from neurondm.models.cuts import export_for_review
    review_rows = export_for_review(config, [], [], [], filename='cut-rt-test.csv', with_curies=True)
    from pyontutils.utils import byCol
    valuesC = byCol(sheet.values[1:],
                    header=[v.replace(' ', '_') for v in sheet.values[0]],
                    to_index=['label'])
    reviewC = byCol(review_rows[1:],
                    header=[v.replace(' ', '_') for v in review_rows[0]],
                    to_index=['label'])

    def grow(r):
        # look up the original sheet row matching this review row's label
        log.debug(r)
        # TODO implement on the object to allow joining on an index?
        # man this would be easier with sql >_< probably pandas too
        # but so many dependencies ...
        # also diffing issues etc
        return valuesC.searchIndex('label', r.label)

    def key(field_value):
        # sort key: position of the field in the values header
        field, value = field_value
        try:
            return 0, valuesC.header._fields.index(field)  # TODO warn on field mismatch
        except ValueError:
            # FIX: previously returned None, which made sorted() raise
            # TypeError on any field mismatch; sort unknown fields last
            # instead (matches the newer variant of this function)
            log.error(f'{field} {value}')
            return 1, 0

    def replace(r, *cols):
        """ replace and reorder """
        # FIXME _super_ inefficient
        vrow = grow(r)
        for field, value in sorted(zip(r._fields, r), key=key):
            if field in cols:
                # take the value from the original sheet row instead
                value = getattr(vrow, field)

            yield '' if value is None else value

    # completely overwrite the sheet
    rows = [list(replace(r, 'Status', 'definition', 'synonyms', 'PMID')) for r in reviewC]
    #resp = update_sheet_values('neurons-cut', 'Roundtrip', rows)
    embed()
def fetch(self, fetch_grid=None, filter_cell=None): """ update remote values (called automatically at __init__) """ #self.__class__.fetch_count += 1 #log.debug(f'fetch count: {self.__class__.fetch_count}') self._stash_uncommitted() if fetch_grid is None: fetch_grid = self.fetch_grid self.metadata() values, grid, cells_index = get_sheet_values( self.name, self.sheet_name, spreadsheet_service=self._spreadsheet_service, fetch_grid=fetch_grid, filter_cell=filter_cell, SPREADSHEET_ID=self._sheet_id()) self.raw_values = values self._values = [list(r) for r in zip(*itertools.zip_longest(*self.raw_values, fillvalue=''))] try: self.byCol = byCol(self.values, to_index=self.index_columns) except ValueError as e: log.error(e) log.warning('Sheet has malformed header, not setting byCol') except IndexError as e: log.error(e) log.warning('Sheet has no header, not setting byCol') self.grid = grid self.cells_index = cells_index self._reapply_uncommitted()
def __init__(self, path):
    """Load data at *path*, handling both JSON and tabular sources.

    JSON sources are parsed eagerly; dicts are normalized into
    ``self._data_cache``.  Tabular sources are read via ``Tabular`` and
    indexed with ``byCol``.  Raises ``exc.NoDataError`` for empty input
    and ``exc.BadDataError`` for malformed input.
    """
    super().__init__()
    self.path = path
    if self._is_json:
        with open(self.path, 'rt') as f:
            try:
                self._data_raw = json.load(f)
            except json.decoder.JSONDecodeError as e:
                # distinguish an empty file from malformed json by
                # checking how far the underlying buffer advanced
                if not f.buffer.tell():
                    raise exc.NoDataError(self.path)
                else:
                    raise exc.BadDataError(self.path) from e

        if isinstance(self._data_raw, dict):
            # FIXME this breaks downstream assumptions
            self._data_cache = {self.rename_key(k):tos(self.normalize(k, v))  # FIXME FIXME
                                for k, v in self._data_raw.items()}

        # json sources never go through the tabular path below
        return

    tabular = Tabular(self.path)
    # keys listed under self.verticals are handled elsewhere; skip those rows here
    self.skip_rows = tuple(key for keys in self.verticals.values() for key in keys)
    self.t = tabular
    l = list(tabular)
    if not l:
        # FIXME bad design, this try block is a workaround for bad handling of empty lists
        raise exc.NoDataError(self.path)

    self.orig_header, *rest = l
    header = Header(self.orig_header).data
    self.fail = False
    if self.to_index:
        # verify every requested index column actually exists in the header
        for head in self.to_index:
            if head not in header:
                log.error(f'\'{self.t.path}\' malformed header!')
                self.fail = True

    if self.fail:
        # cannot index on a missing column; fall back to an unindexed byCol
        try:
            self.bc = byCol(rest, header)
        except ValueError as e:
            raise exc.BadDataError(self.path) from e
    else:
        self.bc = byCol(rest, header, to_index=self.to_index)
def fetch(self, fetch_grid=None):
    """ update remote values (called automatically at __init__) """
    if fetch_grid is None:
        fetch_grid = self.fetch_grid

    values, grid, notes_index = get_sheet_values(
        self.name,
        self.sheet_name,
        spreadsheet_service=self._spreadsheet_service,
        fetch_grid=fetch_grid)

    self.raw_values = values
    # pad ragged rows with '' so every row has the same length
    self.values = [list(r) for r in
                   zip(*itertools.zip_longest(*self.raw_values, fillvalue=''))]
    try:
        self.byCol = byCol(self.values, to_index=self.index_columns)
    except ValueError as e:
        # FIX: the caught exception was bound but never reported, hiding
        # the actual cause; log it like the newer fetch variant does
        log.error(e)
        log.warning('Sheet has malformed header, not setting byCol')

    self.grid = grid
    self.notes_index = notes_index
def main():
    """Build the CUT (common usage types) neuron configs.

    Loads ontology, basic, and lifted neuron sources, matches their
    labels against the cut-development CSV, materializes matched
    neurons into the 'cut-development-raw' and 'cut-development'
    configs, and prints a progress/coverage report.

    Returns (config, unmapped, partial, nlx_missing).
    """
    branch=auth.get('neurons-branch')
    remote = OntId('NIFTTL:') if branch == 'master' else OntId(f'NIFRAW:{branch}/')
    ont_config = ontneurons(remote)
    ont_neurons = ont_config.neurons()
    bn_config = Config('basic-neurons',
                       # FIXME this should probably be pulled in automatically
                       # from the import statements, and it doesn't work even as is
                       # also a chicken and an egg problem here
                       imports=[remote.iri + 'ttl/generated/swanson.ttl'])
    #RDFL = oq.plugin.get('rdflib')  # FIXME ick
    #rdfl = RDFL(bn_config.core_graph, OntId)
    #OntTerm.query.ladd(rdfl)  # FIXME ick
    bn_config.load_existing()
    bn_neurons = bn_config.neurons()
    #OntTerm.query._services = OntTerm.query._services[:-1]  # FIXME ick
    ndl_config = Config('neuron_data_lifted')
    ndl_config.load_existing()  # FIXME this is extremely slow
    ndl_neurons = sorted(ndl_config.neurons())
    resources = auth.get_path('resources')
    cutcsv = resources / 'cut-development.csv'
    with open(cutcsv.as_posix(), 'rt') as f:
        rows = [l for l in csv.reader(f)]

    bc = byCol(rows)
    # first column (minus its header cell) holds the target labels
    (_, *labels), *_ = zip(*bc)
    labels_set0 = set(labels)
    ns = []
    skipped = []
    # BAMSC ids are excluded except for this explicit allow list
    bamscok = (NIFSTD.BAMSC1125,)
    for n in (ont_neurons + ndl_neurons):
        if n.id_ and 'BAMSC' in n.id_:
            if n.id_ not in bamscok:
                skipped.append(n)
                continue

        l = str(n.origLabel)
        if l is not None:
            for replace, match in rename_rules.items():  # HEH
                l = l.replace(match, replace)

            if l in labels:
                n._origLabel = l
                ns.append(n)

    ns = sorted(ns)
    sns = set(n.origLabel for n in ns)
    # labels still unmatched after the ontology/lifted pass
    labels_set1 = labels_set0 - sns
    agen = [c.label for c in bc if c.autogenerated]
    sagen = set(agen)
    added = [c.label for c in bc if c.added]
    sadded = set(added)
    ans = []
    sans = set()
    missed = set()
    _bl = []  # XXX NOTE THE CONTINUE BELOW
    # this loop is deliberately disabled: the continue skips the body,
    # everything below it is retained for reference
    for n in bn_neurons:
        continue
        # we actually get all of these with uberon, will map between them later
        # can't use capitalize here because there are proper names that stay uppercase
        l = n.label.replace('(swannt) ', '').replace('Intrinsic', 'intrinsic').replace('Projection', 'projection')
        for replace, match in rename_rules.items():  # HEH
            l = l.replace(match, replace)

        if l in agen:
            n._origLabel = l
            ans.append(n)
            sans.add(l)
        else:
            missed.add(l)

        _bl.append(l)

    agen_missing = sagen - sans
    labels_set2 = labels_set1 - sans
    nlx_labels = [c.label for c in bc if c.neurolex]
    snlx_labels = set(nlx_labels)

    class SourceCUT(resSource):
        # source provenance record for the generated ontology artifacts
        sourceFile = 'nifstd/resources/cut-development.csv'  # FIXME relative to git workingdir...
        source_original = True

    sources = SourceCUT(),
    swanr = rdflib.Namespace(interlex_namespace('swanson/uris/readable/'))
    SWAN = interlex_namespace('swanson/uris/neuroanatomical-terminology/terms/')
    SWAA = interlex_namespace('swanson/uris/neuroanatomical-terminology/appendix/')
    config = Config('cut-development-raw', sources=sources, source_file=relative_path(__file__),
                    prefixes={'swanr': swanr,
                              'SWAN': SWAN,
                              'SWAA': SWAA,})
    # TEMP ids are placeholders, pass None so a new id is minted
    ins = [None if OntId(n.id_).prefix == 'TEMP' else n.id_ for n in ns]
    ians = [None] * len(ans)
    with NeuronCUT(CUT.Mammalia):
        mamns = [NeuronCUT(*zap(n.pes), id_=i, label=n._origLabel, override=bool(i)).adopt_meta(n)
                 for i, n in zip(ins + ians, ns + ans)]

    smatch, rem = get_smatch(labels_set2)
    labels_set3 = labels_set2 - smatch
    added_unmapped = sadded & labels_set3
    # TODO preserve the names from neuronlex on import ...
    Neuron.write()
    Neuron.write_python()
    raw_neurons = config.neurons()
    # do this before creating the new config
    # even though we are in theory tripling number of neurons in the current config graph
    # it won't show up in the next config (and this is why we need to reengineer)
    raw_neurons_ind_undep = [n.asUndeprecated().asIndicator() for n in raw_neurons]
    config = Config('cut-development', sources=sources, source_file=relative_path(__file__),
                    prefixes={'swanr': swanr,
                              'SWAN': SWAN,
                              'SWAA': SWAA,})
    # FIXME the call to asUndprecated currenlty triggers addition
    # to the current config and output graph as a side effect (ick!)
    ids_updated_neurons = [n.asUndeprecated() for n in raw_neurons]
    assert len(ids_updated_neurons) == len(raw_neurons)
    Neuron.write()
    Neuron.write_python()
    progress = (len(labels_set0), len(sns), len(sans), len(smatch),
                len(labels_set1), len(labels_set2), len(labels_set3))
    prog_report = ('\nProgress:\n'
                   f'total: {progress[0]}\n'
                   f'from nlx: {progress[1]}\n'
                   f'from basic: {progress[2]}\n'
                   f'from match: {progress[3]}\n'
                   f'TODO after nlx: {progress[4]}\n'
                   f'TODO after basic: {progress[5]}\n'
                   f'TODO after match: {progress[6]}\n')
    print(prog_report)
    assert progress[0] == progress[1] + progress[4], 'neurolex does not add up'
    assert progress[4] == progress[2] + progress[5], 'basic does not add up'
    lnlx = set(n.lower() for n in snlx_labels)
    sos = set(n.origLabel.lower() if n.origLabel else None for n in ndl_neurons)  # FIXME load origLabel
    nlx_review = lnlx - sos
    nlx_missing = sorted(nlx_review)
    print(f'\nNeuroLex listed as source but no mapping (n = {len(nlx_review)}):')
    _ = [print(l) for l in nlx_missing]
    partial = {k:v for k, v in rem.items() if v and v not in terminals}
    print(f'\nPartially mapped (n = {len(partial)}):')
    if partial:
        mk = max((len(k) for k in partial.keys())) + 2
        for k, v in sorted(partial.items()):
            print(f'{k:<{mk}} {v!r}')
        #print(f'{k!r:<{mk}}{v!r}')
        #pprint(partial, width=200)

    unmapped = sorted(labels_set3)
    print(f'\nUnmapped (n = {len(labels_set3)}):')
    _ = [print(l) for l in unmapped]
    no_location = [n for n in Neuron.neurons()
                   if noneMembers((ilxtr.hasSomaLocatedIn, ilxtr.hasSomaLocatedInLayer), *n.unique_predicates)]
    if __name__ == '__main__':
        # interactive run: also produce the review export and drop into pdb
        review_rows = export_for_review(config, unmapped, partial, nlx_missing)
        breakpoint()

    return config, unmapped, partial, nlx_missing
def main():
    """Round-trip CUT neurons through the review sheet (v2).

    Builds neurons directly from the sheet (including a 'cut-failed'
    config of neurons that could not be constructed), writes the raw,
    failed, and release configs, then rebuilds the review rows with
    selected columns replaced from the sheet values.
    """
    #from neurondm.models.cuts import main as cuts_main
    #cuts_config, *_ = cuts_main()
    from neurondm.compiled.common_usage_types import config as cuts_config
    cuts_neurons = cuts_config.neurons()
    # expected phenotypes per neuron id, used when rebuilding from the sheet
    expect_pes = {n.id_:n.pes for n in cuts_neurons}
    sheet = CutsV1()
    _neurons = list(sheet.neurons(expect_pes))
    config = sheet.config
    errors = sheet.errors
    new = sheet.new
    release = sheet.release
    #sheet.show_notes()
    config.write_python()
    config.write()
    #config = Config(config.name)
    #config.load_existing()  # FIXME this is a hack to get get a load_graph
    # FIXME we need this because _bagExisting doesn't deal with unionOf right now
    def trything(f):
        # best-effort wrapper: failed constructions are simply dropped
        @wraps(f)
        def inner(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except Exception:
                # FIX: was a bare ``except:`` which also swallowed
                # SystemExit and KeyboardInterrupt; narrowed to Exception
                pass

        return inner

    from neurondm import Config, NeuronCUT
    failed_config = Config('cut-failed')
    [trything(NeuronCUT)(*pes, id_=id_) for id_, pes in sheet.failed.items()]
    failed_config.write_python()
    failed_config.write()
    release_config = Config('cut-release')
    [NeuronCUT(*n, id_=n.id_, label=n.origLabel, override=True).adopt_meta(n) for n in release]
    release_config.write_python()
    release_config.write()
    from neurondm.models.cuts import export_for_review
    review_rows = export_for_review(config, [], [], [], filename='cut-rt-test.csv', with_curies=True)
    from pyontutils.utils import byCol
    valuesC = byCol(sheet.values[1:],
                    header=[v.replace(' ', '_') for v in sheet.values[0]],
                    to_index=['label'])
    reviewC = byCol(review_rows[1:],
                    header=[v.replace(' ', '_') for v in review_rows[0]],
                    to_index=['label'])

    def grow(r):
        # look up the original sheet row matching this review row's label
        log.debug(r)
        # TODO implement on the object to allow joining on an index?
        # man this would be easier with sql >_< probably pandas too
        # but so many dependencies ...
        # also diffing issues etc
        if r.label is not None:
            return valuesC.searchIndex('label', r.label)

    def key(field_value):
        # sort key: position of the field in the values header,
        # unknown fields sort last
        field, value = field_value
        try:
            return 0, valuesC.header._fields.index(field)  # TODO warn on field mismatch
        except ValueError:
            log.error(f'{field} {value}')
            return 1, 0

    def replace(r, *cols):
        """ replace and reorder """
        # FIXME _super_ inefficient
        vrow = grow(r)
        log.debug('\n'.join(r._fields))
        log.debug('\n'.join(str(_) for _ in r))
        for field, value in sorted(zip(r._fields, r), key=key):
            if field in cols:
                # take the value from the original sheet row instead
                value = getattr(vrow, field)

            yield '' if value is None else value

    # completely overwrite the sheet
    breakpoint()
    rows = [list(replace(r, 'Status', 'definition', 'synonyms', 'PMID')) for r in reviewC]
    #resp = update_sheet_values('neurons-cut', 'Roundtrip', rows)
    if __name__ == '__main__':
        breakpoint()
def main():
    """Build the common-usage-types neuron configs.

    Matches lifted and basic neuron labels against the
    common-usage-types CSV, then attempts rule-based and
    term-query-based phenotype extraction for the remaining labels,
    materializing results into the 'common-usage-types-raw' and
    'common-usage-types' configs, and prints a progress report.

    Returns (config, unmapped, partial, nlx_missing).
    """
    ndl_config = Config('neuron_data_lifted')
    ndl_config.load_existing()
    ndl_neurons = ndl_config.neurons()
    bn_config = Config('basic-neurons')
    bn_config.load_existing()
    bn_neurons = bn_config.neurons()
    resources = Path(devconfig.resources)
    cutcsv = resources / 'common-usage-types.csv'
    with open(cutcsv.as_posix(), 'rt') as f:
        rows = [l for l in csv.reader(f)]

    bc = byCol(rows)
    # first column (minus its header cell) holds the target labels
    (_, *labels), *_ = zip(*bc)
    labels_set0 = set(labels)
    ns = []
    for n in ndl_neurons:
        l = str(n.origLabel)
        if l is not None:
            for replace, match in rename_rules.items():  # HEH
                l = l.replace(match, replace)

            if l in labels:
                n._origLabel = l
                ns.append(n)

    sns = set(n.origLabel for n in ns)
    # labels still unmatched after the lifted pass
    labels_set1 = labels_set0 - sns
    agen = [c.label for c in bc if c.autogenerated]
    sagen = set(agen)
    added = [c.label for c in bc if c.added]
    sadded = set(added)
    ans = []
    sans = set()
    missed = set()
    # this loop is deliberately disabled: the continue skips the body,
    # everything below it is retained for reference
    for n in bn_neurons:
        continue
        # we actually get all of these with uberon, will map between them later
        # can't use capitalize here because there are proper names that stay uppercase
        l = n.label.replace('(swannt) ', '').replace('Intrinsic', 'intrinsic').replace('Projection', 'projection')
        for replace, match in rename_rules.items():  # HEH
            l = l.replace(match, replace)

        if l in agen:
            n._origLabel = l
            ans.append(n)
            sans.add(l)
        else:
            missed.add(l)

    agen_missing = sagen - sans
    labels_set2 = labels_set1 - sans
    nlx_labels = [c.label for c in bc if c.neurolex]
    snlx_labels = set(nlx_labels)

    class SourceCUT(resSource):
        # source provenance record for the generated ontology artifacts
        sourceFile = 'nifstd/resources/common-usage-types.csv'  # FIXME relative to git workingdir...
        source_original = True

    sources = SourceCUT(),
    swanr = rdflib.Namespace(interlex_namespace('swanson/uris/readable/'))
    config = Config('common-usage-types-raw', sources=sources, source_file=relative_path(__file__),
                    prefixes={'swanr':swanr,
                              'SWAN':interlex_namespace('swanson/uris/neuroanatomical-terminology/terms/'),
                              'SWAA':interlex_namespace('swanson/uris/neuroanatomical-terminology/appendix/'),})
    # TEMP ids are placeholders, pass None so a new id is minted
    ins = [None if OntId(n.id_).prefix == 'TEMP' else n.id_ for n in ns]
    ians = [None] * len(ans)

    def zap(pes):
        # drop generic taxon-rank phenotypes that carry no information here
        for pe in pes:
            if pe not in (Phenotype('BIRNLEX:212', ilxtr.hasTaxonRank),
                          Phenotype('NCBITaxon:7742', ilxtr.hasTaxonRank),
                          Phenotype('BIRNLEX:252', ilxtr.hasTaxonRank),
                          Phenotype('BIRNLEX:516', ilxtr.hasTaxonRank),):
                yield pe

    with Neuron(CUT.Mammalia):
        mamns = [NeuronCUT(*zap(n.pes), id_=i, label=n._origLabel, override=bool(i)).adopt_meta(n)
                 for i, n in zip(ins + ians, ns + ans)]

    contains_rules = make_contains_rules()
    skip = set()
    smatch = set()
    rem = {}
    # rule-based phenotype extraction over the remaining labels:
    # strip matched substrings from the label, accumulating phenotypes
    for l in labels_set2:
        pes = tuple()
        l_rem = l
        for match, pheno in contains_rules.items():
            t = None
            # OntTerm sentinel means "resolve the match text via a term query"
            if match not in skip and pheno == OntTerm:
                try:
                    t = OntTerm(term=match)
                    print('WTF', match, t)
                    if t.validated:
                        pheno = Phenotype(t.u, ilxtr.hasSomaLocatedIn)
                    else:
                        pheno = None
                except oq.exceptions.NotFoundError:
                    # remember failures so we never query this text again
                    skip.add(match)
                    pheno = None

            if match in skip and pheno == OntTerm:
                pheno = None

            if match in l_rem and pheno:
                l_rem = l_rem.replace(match, '').strip()
                pes += (pheno,)

        if l_rem in exact_rules:
            pes += (exact_rules[l_rem],)
            l_rem = ''

        # strip generic cell-type suffixes from the remainder
        if l_rem == ' neuron':
            l_rem = ''
        elif l_rem.endswith(' cell'):
            l_rem = l_rem[:-len(' cell')]
            #print('l_rem no cell:', l_rem)
        elif l_rem.endswith(' neuron'):
            l_rem = l_rem[:-len(' neuron')]
            #print('l_rem no neuron:', l_rem)

        # hrm: do we already have a soma location phenotype?
        hrm = [pe for pe in pes if pe.e == ilxtr.hasSomaLocatedIn]
        if ' ' in l_rem:
            #print('l_rem:', l_rem)
            #embed()
            maybe_region, rest = l_rem.split(' ', 1)
        elif noneMembers(l_rem, *terminals) and not hrm:
            maybe_region, rest = l_rem, ''
            #print('MR:', maybe_region)
        else:
            #print(hrm)
            maybe_region = None

        if maybe_region:
            # rank query results by preferred prefix then exact label match
            prefix_rank = ('UBERON', 'SWAN', 'BIRNLEX', 'SAO', 'NLXANAT')

            def key(ot):
                ranked = ot.prefix in prefix_rank
                arg = ot._query_result._QueryResult__query_args['term'].lower()
                return (not ranked,
                        prefix_rank.index(ot.prefix) if ranked else 0,
                        not (arg == ot.label.lower()))

            #t = OntTerm(term=maybe_region)
            # using query avoids the NoExplicitIdError
            ots = sorted((qr.OntTerm for qr in OntTerm.query(term=maybe_region, exclude_prefix=('FMA',))), key=key)
            if not ots:
                log.error(f'No match for {maybe_region!r}')
            else:
                t = ots[0]
                if 'oboInOwl:id' in t.predicates:  # uberon replacement
                    t = OntTerm(t.predicates['oboInOwl:id'])

                t.set_next_repr('curie', 'label')
                log.info(f'Match for {maybe_region!r} was {t!r}')
                if t.validated:
                    l_rem = rest
                    pheno = Phenotype(t.u, ilxtr.hasSomaLocatedIn)  # FIXME
                    pes += (pheno,)

        if pes:
            smatch.add(l)
            rem[l] = l_rem
            with Neuron(CUT.Mammalia):
                NeuronCUT(*zap(pes), id_=make_cut_id(l), label=l, override=True)

    labels_set3 = labels_set2 - smatch
    added_unmapped = sadded & labels_set3
    # TODO preserve the names from neuronlex on import ...
    Neuron.write()
    Neuron.write_python()
    raw_neurons = config.neurons()
    config = Config('common-usage-types', sources=sources, source_file=relative_path(__file__),
                    prefixes={'swanr':swanr,
                              'SWAN':interlex_namespace('swanson/uris/neuroanatomical-terminology/terms/'),
                              'SWAA':interlex_namespace('swanson/uris/neuroanatomical-terminology/appendix/'),})
    ids_updated_neurons = [n.asUndeprecated() for n in raw_neurons]
    assert len(ids_updated_neurons) == len(raw_neurons)
    Neuron.write()
    Neuron.write_python()
    progress = len(labels_set0), len(sns), len(sans), len(smatch), len(labels_set1), len(labels_set2), len(labels_set3)
    print('\nProgress:\n'
          f'total: {progress[0]}\n'
          f'from nlx: {progress[1]}\n'
          f'from basic: {progress[2]}\n'
          f'from match: {progress[3]}\n'
          f'TODO after nlx: {progress[4]}\n'
          f'TODO after basic: {progress[5]}\n'
          f'TODO after match: {progress[6]}\n')
    assert progress[0] == progress[1] + progress[4], 'neurolex does not add up'
    assert progress[4] == progress[2] + progress[5], 'basic does not add up'
    lnlx = set(n.lower() for n in snlx_labels)
    sos = set(n.origLabel.lower() if n.origLabel else None for n in ndl_neurons)  # FIXME load origLabel
    nlx_review = lnlx - sos
    nlx_missing = sorted(nlx_review)
    print(f'\nNeuroLex listed as source but no mapping (n = {len(nlx_review)}):')
    _ = [print(l) for l in nlx_missing]
    partial = {k:v for k, v in rem.items() if v and v not in terminals}
    print(f'\nPartially mapped (n = {len(partial)}):')
    if partial:
        mk = max((len(k) for k in partial.keys())) + 2
        for k, v in sorted(partial.items()):
            print(f'{k:<{mk}} {v!r}')
        #print(f'{k!r:<{mk}}{v!r}')
        #pprint(partial, width=200)

    unmapped = sorted(labels_set3)
    print(f'\nUnmapped (n = {len(labels_set3)}):')
    _ = [print(l) for l in unmapped]
    if __name__ == '__main__':
        # interactive run: also produce the review export and embed
        rows = export_for_review(config, unmapped, partial, nlx_missing)
        embed()

    return config, unmapped, partial, nlx_missing