Exemple #1
0
 def key(field_value):
     field, value = field_value
     try:
         return 0, valuesC.header._fields.index(field)  # TODO warn on field mismatch
     except ValueError as e:
         log.error(f'{field} {value}')
         return 1, 0
Exemple #2
0
def main():
    cs = [c() for c in sheet_classes]
    trips = [[cl] + [
        c.value for c in (r.neuron_id(), r.exact_location(), r.location_id())
    ] for cl in cs for r in cl.rows()
             if r.row_index > 0 and r.neuron_id().value and (
                 not hasattr(r, 'exclude') or not r.exclude().value)]

    dd = defaultdict(list)
    for c, _s, _p, _o in trips:
        for x in (_s, _p, _o):
            if re.match(x, r'(^[\s]+[^\s].*|.*[^\s][\s]+$)'):
                msg = 'leading/trailing whitespace in {c} {s} {p} {o}'
                log.error(msg)
                raise ValueError(msg)

        s = OntId(_s)
        try:
            p = map_predicates(_p)
        except KeyError as e:
            log.error(f'sigh {s}')
            raise e
        o = OntId(_o)
        try:
            dd[s].append(Phenotype(o, p))
        except TypeError as e:
            log.error(f'bad data for {c} {s} {p} {o}')
            raise e

    config = Config('apinat-simple-sheet')
    nrns = []
    for id, phenos in dd.items():
        n = NeuronApinatSimple(*phenos, id_=id)
        nrns.append(n)

    config.write()
    labels = (ilxtr.genLabel, ilxtr.localLabel, ilxtr.simpleLabel,
              ilxtr.simpleLocalLabel, rdfs.label, skos.prefLabel)
    to_remove = [t for t in config._written_graph if t[1] in labels]
    [config._written_graph.remove(t) for t in to_remove]
    config._written_graph.write()
    config.write_python()
    return config,
Exemple #3
0
    def convert_row(self, i, neuron_row, headers, expect_pes):
        id = None
        label_neuron  = None
        definition_neuron = None
        synonyms_neuron = None
        current_neuron = None
        phenotypes = []
        do_release = False
        predicate_notes = {}
        object_notes = {}
        other_notes = {}
        wat = {}
        def loop_internal(j, header, cell):
            nonlocal id
            nonlocal current_neuron
            nonlocal do_release
            notes = list(process_note(get_note(i + 1, j, self.cells_index)))  # + 1 since headers is removed
            if notes and not header.startswith('has'):
                _predicate = self.convert_other(header)
                if cell:
                    _object = rdflib.Literal(cell)  # FIXME curies etc.
                else:
                    _object = rdf.nil
                other_notes[_predicate, _object] = notes

            if header == 'curie':
                id = OntId(cell).u if cell else None
                return
            elif header == 'label':
                if id == OntId('NIFEXT:66').u:
                    breakpoint()
                label_neuron = cell
                if cell in self.existing:
                    current_neuron = self.existing[cell]
                elif cell:
                    # TODO
                    self.new.append(cell)
                else:
                    raise ValueError(cell)  # wat
                return
            elif header == 'Status':
                # TODO
                if cell == 'Yes':
                    do_release = True
                elif cell == 'Maybe':
                    pass
                elif cell == 'Not yet':
                    pass
                elif cell == 'Delete':
                    pass
                else:
                    pass

                return
            elif header == 'PMID':
                # TODO
                return
            elif header == 'Other reference':
                # TODO
                return
            elif header == 'Other label':
                # TODO
                return
            elif header == 'definition':
                return  # FIXME single space differences between the spreadsheet and the source

                if cell:
                    definition_neuron = rdflib.Literal(cell)

            elif header == 'synonyms':
                if cell:
                    synonyms_neuron = [rdflib.Literal(s.strip())
                                    # FIXME bare comma is extremely dangerous
                                    for s in cell.split(',')]

                return
            elif header in self.skip:
                return

            objects = []
            if cell:
                predicate = self.convert_header(header)
                if predicate is None:
                    log.debug(f'{(header, cell, notes)}')

                for object, label in self.convert_cell(cell):
                    if predicate in NeuronCUT._molecular_predicates:
                        if isinstance(object, tuple):
                            op, *rest = object
                            rest = [OntTerm(o).asIndicator().URIRef for o in rest]
                            object = op, *rest
                        elif object:
                            log.debug(f'{object!r}')
                            object = OntTerm(object).asIndicator().URIRef

                    if isinstance(label, tuple):  # LogicalPhenotype case
                        _err = []
                        for l in label:
                            if self.lower_check(l, cell):
                                _err.append((cell, label))
                        if _err:
                            self.errors.extend(_err)
                        else:
                            objects.append(object)
                    elif self.lower_check(label, cell):
                        self.errors.append((cell, label))
                    elif str(id) == object:
                        self.errors.append((header, cell, object, label))
                        object = None
                    else:
                        objects.append(object)

                if notes:
                    # FIXME this is a hack to only attach to the last value
                    # since we can't distinguish at the moment
                    wat[predicate, object] = notes
                    if object is not None:
                        # object aka iri can be none if we don't find anything
                        object_notes[object] = notes
                    else:
                        predicate_notes[predicate] = notes
                        # FIXME it might also be simpler in some cases
                        # to have this be object_notes[object] = notes
                        # because we are much less likely to have the same
                        # phenotype appear attached to the different dimensions

                        # FIXME comma sep is weak here because the
                        # reference is technically ambiguous
                        # might be an argument for the denormalized form ...
                        # or perhaps having another sheet for cases like that

            else:
                return

            if predicate and objects:
                for object in objects:  # FIXME has layer location phenotype
                    if isinstance(object, tuple):
                        op, *rest = object
                        pes = (Phenotype(r, predicate) for r in rest)  # FIXME nonhomogenous phenotypes
                        phenotypes.append(LogicalPhenotype(op, *pes))
                    elif object:
                        phenotypes.append(Phenotype(object, predicate))
                    else:
                        self.errors.append((object, predicate, cell))
            elif objects:
                self.errors.append((header, objects))
            else:
                self.errors.append((header, cell))
            # translate header -> predicate
            # translate cell value to ontology id

        #########################################

        for j, (header, cell) in enumerate(zip(headers, neuron_row)):
            loop_internal(j, header, cell)

        if current_neuron and phenotypes:
            # TODO merge current with changes
            # or maybe we just replace since all the phenotypes should be there?
            log.debug(phenotypes)
            if id is not None:
                log.debug(f'{(id, bool(id))}')

            elif label_neuron:
                id = make_cut_id(label_neuron)

            if id not in expect_pes:
                if id is not None:
                    log.error(f'{id!r} not in cuts!?')

                return

            phenotypes = sorted(set(phenotypes))
            _ep = expect_pes[id]
            if not allMembers(_ep, *phenotypes) and not neuron_row[self.tomqc_check_ind]:
            #if expect_pes[id] != len(phenotypes):
                # FIXME this is not a strict roundtrip, it may also include additions
                lp = len(phenotypes)
                lep = len(_ep)
                if lp == lep:
                    (pprint(sorted(_ep)))
                    (pprint(phenotypes))
                    pprint(set(_ep) - set(phenotypes))
                    pprint(set(phenotypes) - set(_ep))
                    _AAAAAA = id  # hack for debugger
                    print(_AAAAAA)
                log.error(f'{id!r} failed roundtrip {lp} != {lep}')
                self.failed[id] = phenotypes
                return

            neuron = NeuronCUT(*phenotypes, id_=id, label=label_neuron,
                               override=bool(id) or bool(label_neuron))
            neuron.adopt_meta(current_neuron)
            # FIXME occasionally this will error?!
            yield neuron

        else:
            return  # FIXME this polutes everything ???
        """
            fn = fixname(label_neuron)
            if not phenotypes and i:  # i skips header
                self.errors.append((i, neuron_row))  # TODO special review for phenos but not current
                phenotypes = Phenotype('TEMP:phenotype/' + fn),

            neuron = NeuronCUT(*phenotypes,
                               id_=make_cut_id(label_neuron),
                               label=label_neuron, override=True)

        """
        ###################################################

        # update the meta if there were any changes
        if definition_neuron is not None:
            neuron.definition = definition_neuron

        if synonyms_neuron is not None:
            neuron.synonyms = synonyms_neuron

        try:
            neuron.batchAnnotateByObject(object_notes)
            neuron.batchAnnotate(other_notes)
        except AttributeError as e:
            #breakpoint()
            log.exception(e) #'something very strage has happened\n', e)
            pass  # FIXME FIXME FIXME

        #neuron.batchAnnotateByPredicate(predicate_notes)  # TODO
        # FIXME doesn't quite work in this context, but there are other
        # cases where annotations to the general modality are still desireable
        # FIXME there may be no predicate? if the object fails to match?

        if do_release:
            self.release.append(neuron)
Exemple #4
0
def sheet_to_neurons(values, notes_index, expect_pes):
    # TODO import existing ids to register by label
    sgv = Vocabulary()
    e_config = Config('common-usage-types')
    e_config.load_existing()
    query = oq.OntQuery(oq.plugin.get('rdflib')(e_config.core_graph), instrumented=OntTerm)
    # FIXME clear use case for the remaining bound to whatever query produced it rather
    # than the other way around ... how to support this use case ...
    existing = {str(n.origLabel):n for n in e_config.neurons()}
    def convert_header(header):
        if header.startswith('has'):  # FIXME use a closed namespace
            return ilxtr[header]
        else:
            return None

    def convert_other(header):
        if header == 'label':
            return rdfs.label
        elif header == 'curie':
            return rdf.type
        elif header == 'definition':
            return definition
        else:
            header = header.replace(' ', '_')
            return TEMP[header]  # FIXME

    def mapCell(cell, syns=False):
        search_prefixes = ('UBERON', 'CHEBI', 'PR', 'NCBITaxon', 'NCBIGene', 'ilxtr', 'NIFEXT', 'SAO', 'NLXMOL',
                           'BIRNLEX',)

        if ':' in cell and ' ' not in cell:
            log.debug(cell)
            if 'http' in cell:
                if cell.startswith('http'):
                    t = OntTerm(iri=cell)
                else:
                    return None, None  # garbage with http inline
            else:
                t = OntTerm(cell, exclude_prefix=('FMA',))  # FIXME need better error message in ontquery

            return t.u, t.label

        result = [r for r in sgv.findByTerm(cell, searchSynonyms=syns, prefix=search_prefixes)
                  if not r['deprecated']]
        #printD(cell, result)
        if not result:
            log.debug(f'{cell}')
            maybe = list(query(label=cell, exclude_prefix=('FMA',)))
            if maybe:
                qr = maybe[0]
                return qr.OntTerm.u, qr.label
            elif not syns:
                return mapCell(cell, syns=True)
            else:
                return None, None
        elif len(result) > 1:
            #printD('WARNING', result)
            result = select_by_curie_rank(result)
        else:
            result = result[0]

        return rdflib.URIRef(result['iri']), result['labels'][0]

    def lower_check(label, cell):
        return label not in cell and label.lower() not in cell.lower()  # have to handle comma sep case

    lnlu = {v:k for k, v in LogicalPhenotype.local_names.items()}
    def convert_cell(cell_or_comma_sep):
        #printD('CONVERTING', cell_or_comma_sep)
        for cell_w_junk in cell_or_comma_sep.split(','):  # XXX WARNING need a way to alter people to this
            cell = cell_w_junk.strip()
            if cell.startswith('(OR') or cell.startswith('(AND'):
                start, *middle, end = cell.split('" "')
                OPoperator, first = start.split(' "')
                operator = OPoperator[1:]
                operator = lnlu[operator]
                last, CP = end.rsplit('"')
                iris, labels = [], []
                for term in (first, *middle, last):
                    iri, label = mapCell(term)
                    if label is None:
                        label = cell_or_comma_sep
                    iris.append(iri)
                    labels.append(label)

                yield (operator, *iris), tuple(labels)

            else:
                iri, label = mapCell(cell)
                if label is None:
                    yield iri, cell_or_comma_sep  # FIXME need a way to handle this that doesn't break things?
                else:
                    yield iri, label

    config = Config('cut-roundtrip')
    skip = 'alignment label',
    headers, *rows = values
    errors = []
    new = []
    release = []
    for i, neuron_row in enumerate(rows):
        id = None
        label_neuron  = None
        definition_neuron = None
        synonyms_neuron = None
        current_neuron = None
        phenotypes = []
        do_release = False
        predicate_notes = {}
        object_notes = {}
        other_notes = {}
        wat = {}
        for j, (header, cell) in enumerate(zip(headers, neuron_row)):
            notes = list(process_note(get_note(i + 1, j, notes_index)))  # + 1 since headers is removed
            if notes and not header.startswith('has'):
                _predicate = convert_other(header)
                if cell:
                    _object = rdflib.Literal(cell)  # FIXME curies etc.
                else:
                    _object = rdf.nil
                other_notes[_predicate, _object] = notes

            if header == 'curie':
                id = OntId(cell).u if cell else None
                continue
            elif header == 'label':
                label_neuron = cell
                if cell in existing:
                    current_neuron = existing[cell]
                elif cell:
                    # TODO
                    new.append(cell)
                else:
                    raise ValueError(cell)  # wat
                continue
            elif header == 'Status':
                # TODO
                if cell == 'Yes':
                    do_release = True
                elif cell == 'Maybe':
                    pass
                elif cell == 'Not yet':
                    pass
                elif cell == 'Delete':
                    pass
                else:
                    pass

                continue
            elif header == 'PMID':
                # TODO
                continue
            elif header == 'Other reference':
                # TODO
                continue
            elif header == 'Other label':
                # TODO
                continue
            elif header == 'definition':
                continue  # FIXME single space differences between the spreadsheet and the source

                if cell:
                    definition_neuron = rdflib.Literal(cell)

                continue

            elif header == 'synonyms':
                if cell:
                    synonyms_neuron = [rdflib.Literal(s.strip())
                                    # FIXME bare comma is extremely dangerous
                                    for s in cell.split(',')]

                continue
            elif header in skip:
                continue

            objects = []
            if cell:
                predicate = convert_header(header)
                if predicate is None:
                    log.debug(f'{(header, cell, notes)}')

                for object, label in convert_cell(cell):
                    if isinstance(label, tuple):  # LogicalPhenotype case
                        _err = []
                        for l in label:
                            if lower_check(l, cell):
                                _err.append((cell, label))
                        if _err:
                            errors.extend(_err)
                        else:
                            objects.append(object)
                    elif lower_check(label, cell):
                        errors.append((cell, label))
                    elif str(id) == object:
                        errors.append((header, cell, object, label))
                        object = None
                    else:
                        objects.append(object)

                if notes:
                    # FIXME this is a hack to only attach to the last value
                    # since we can't distinguish at the moment
                    wat[predicate, object] = notes
                    if object is not None:
                        # object aka iri can be none if we don't find anything
                        object_notes[object] = notes
                    else:
                        predicate_notes[predicate] = notes
                        # FIXME it might also be simpler in some cases
                        # to have this be object_notes[object] = notes
                        # because we are much less likely to have the same
                        # phenotype appear attached to the different dimensions

                        # FIXME comma sep is weak here because the
                        # reference is technically ambiguous
                        # might be an argument for the denormalized form ...
                        # or perhaps having another sheet for cases like that

            else:
                continue

            if predicate and objects:
                for object in objects:  # FIXME has layer location phenotype
                    if isinstance(object, tuple):
                        op, *rest = object
                        pes = (Phenotype(r, predicate) for r in rest)  # FIXME nonhomogenous phenotypes
                        phenotypes.append(LogicalPhenotype(op, *pes))
                    elif object:
                        phenotypes.append(Phenotype(object, predicate))
                    else:
                        errors.append((object, predicate, cell))
            elif objects:
                errors.append((header, objects))
            else:
                errors.append((header, cell))
            # translate header -> predicate
            # translate cell value to ontology id

        if current_neuron and phenotypes:
            # TODO merge current with changes
            # or maybe we just replace since all the phenotypes should be there?
            log.debug(phenotypes)
            if id is not None:
                log.debug(f'{(id, bool(id))}')

            elif label_neuron:
                id = make_cut_id(label_neuron)

            if id not in expect_pes:
                log.error(f'{id!r} not in cuts!?')
                continue

            if expect_pes[id] != len(phenotypes):
                log.error(f'{id!r} failed roundtrip {len(phenotypes)} != {expect_pes[id]}')
                continue

            neuron = NeuronCUT(*phenotypes, id_=id, label=label_neuron,
                               override=bool(id) or bool(label_neuron))
            neuron.adopt_meta(current_neuron)
            # FIXME occasionally this will error?!
        else:
            continue  # FIXME this polutes everything ???
            fn = fixname(label_neuron)
            if not phenotypes and i:  # i skips header
                errors.append((i, neuron_row))  # TODO special review for phenos but not current
                phenotypes = Phenotype('TEMP:phenotype/' + fn),

            neuron = NeuronCUT(*phenotypes,
                               id_=make_cut_id(label_neuron),
                               label=label_neuron, override=True)

        # update the meta if there were any changes
        if definition_neuron is not None:
            neuron.definition = definition_neuron

        if synonyms_neuron is not None:
            neuron.synonyms = synonyms_neuron

        try:
            neuron.batchAnnotateByObject(object_notes)
            neuron.batchAnnotate(other_notes)
        except AttributeError as e:
            #embed()
            log.exception(e) #'something very strage has happened\n', e)
            pass  # FIXME FIXME FIXME

        #neuron.batchAnnotateByPredicate(predicate_notes)  # TODO
        # FIXME doesn't quite work in this context, but there are other
        # cases where annotations to the general modality are still desireable
        # FIXME there may be no predicate? if the object fails to match?

        if do_release:
            release.append(neuron)

    return config, errors, new, release