Code Example #1
    def normalize(cls, value):
        _ovalue = value
        value = super().normalize(value, preserve_case=True)
        if 'OT2' in value and 'OD' not in value:
            # one is missing the OD >_<
            log.warning(value)
            value = value.replace('-', '-OD')  # hack

        n = (
            value.strip().replace('-', '-')  # can you spot the difference?
            .replace('(', '').replace(')', '').replace('-01S1', '').replace(
                '-01', '').replace('-02S2', '').replace('-02', '').replace(
                    'SPARC',
                    '').replace('NIH-1', '').replace('NIH-', '').replace(
                        '-', '').replace('NIH ', '').replace(' ', ''))
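        # a leading 1, 3, or 5 appears to be the NIH application type code
        # (new, supplement, continuation) rather than part of the award number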
        if n[0] in ('1', '3', '5'):
            n = n[1:]

        if n.endswith('S2'):
            n = n[:-2]

        if n.endswith('D23864'):  # FIXME another trailing zero
            log.critical(_ovalue)
            n = n.replace('D23864', 'D023864')

        if n != _ovalue:
            log.debug(f'\n{_ovalue}\n{n}')
        return n
Code Example #2
 def organ(self, award_number):
     if award_number in self.manual and award_number not in self.sourced:
         log.warning(f'used manual organ mapping for {award_number}')
     try:
         return self.award_to_organ[award_number]
     except KeyError as e:
         logd.error(f'bad award_number {award_number}')
Code Example #3
    def _process(self, contributor):
        # get member if we can find them
        he = dat.HasErrors(pipeline_stage=self.__class__.__name__ + '.data')
        if 'name' in contributor and 'first_name' in contributor:
            name = contributor['name']
            if ';' in name:
                msg = f'Bad symbol in name {name!r}'
                he.addError(msg)
                logd.error(msg)

            fn = contributor['first_name']
            ln = contributor['last_name']
            if ' ' in fn:
                fn, mn = fn.split(' ', 1)
                contributor['middle_name'] = mn
                contributor['first_name'] = fn

            if ' ' in ln:
                msg = f'Malformed last_name {ln!r}'
                he.addError(msg)
                logd.error(msg)
                ln = ln.replace(' ', '-')

            failover = f'{fn}-{ln}'
            member = self.member(fn, ln)

            if member is not None:
                userid = OntId('https://api.blackfynn.io/users/' + member.id)
                contributor['blackfynn_user_id'] = userid

        else:
            member = None
            failover = 'no-orcid-no-name'
            log.warning('No name! ' + lj(contributor))

        orcid = None
        if 'contributor_orcid_id' in contributor:
            orcid = contributor['contributor_orcid_id']
            if isinstance(orcid, str) and 'orcid.org' in orcid:
                orcid = OrcidId(orcid)  # FIXME reloading from json

            if isinstance(orcid, OrcidId):
                s = orcid
            else:  # it's not an orcid or it's a bad orcid
                orcid = None

        if orcid is None:
            if member is not None:
                s = userid
            else:
                log.debug(lj(contributor))
                s = OntId(self.dsid + '/contributors/' + failover)

        contributor['id'] = s
        he.embedErrors(contributor)
Code Example #4
File: curation.py Project: osbornepb/sparc-curation
 def _submission_objects(self):
     for p in self.submission_paths:
         try:
             miss = dat.SubmissionFile(p)
             if miss.data:
                 yield miss
         except exc.NoDataError as e:
             self._errors.append(e)  # NOTE we treat empty file as no file
         except AttributeError as e:
             log.warning(f'unhandled metadata type {e!r}')
             self._errors.append(e)
Code Example #5
File: curation.py Project: osbornepb/sparc-curation
 def _samples_objects(self):
     """ really samples_file """
     for path in self.samples_paths:
         try:
             sf = dat.SamplesFile(path)
             if sf.data:
                 yield sf
         except exc.NoDataError as e:
             self._errors.append(e)  # NOTE we treat empty file as no file
         except AttributeError as e:
             log.warning(f'unhandled metadata type {e!r}')
             self._errors.append(e)
Code Example #6
File: curation.py Project: osbornepb/sparc-curation
 def _dataset_description_objects(self):
     for p in self.dataset_description_paths:
         #yield from DatasetDescription(t)
         # TODO export adapters for this ... how to recombine and reuse ...
         try:
             dd = dat.DatasetDescriptionFile(p)
             if dd.data:
                 yield dd
         except exc.NoDataError as e:
             self._errors.append(e)  # NOTE we treat empty file as no file
         except AttributeError as e:
             log.warning(f'unhandled metadata type {e!r}')
             self._errors.append(e)
Code Example #7
File: datasets.py Project: tmsincomb/sparc-curation
    def query(value, prefix):
        for query_type in ('term', 'search'):
            terms = [q.OntTerm for q in OntTerm.query(prefix=prefix, **{query_type: value})]
            if terms:
                #print('matching', terms[0], value)
                #print('extra terms for', value, terms[1:])
                return terms[0]

        log.warning(f'No ontology id found for {value}')
        return value
Code Example #8
File: curation.py Project: tmsincomb/sparc-curation
 def __protocol_uris(self):
     """ property needed for protocol helper to help us """
     #if not hasattr(self, '_puri_cache'):
     p = 'protocol_url_or_doi'
     for dd in self.dataset_description:
         dwe = dd.data_with_errors
         if p in dwe:
             for uri in dwe[p]:
                 if uri.startswith('http'):
                     # TODO normalize
                     yield uri
                 else:
                     log.warning(f"protocol not uri {uri} '{self.id}'")
Code Example #9
    def bf_size(self):
        size = self._meta.size
        if size:
            return size
        elif self.path.is_dir():
            size = 0
            for path in self.path.rglob('*'):
                if path.is_file():
                    try:
                        size += path.cache.meta.size
                    except OSError as e:
                        log.warning(f'No cached file size. Assuming it is not tracked. {path}')

            return size

        else:
            log.warning(f'unknown thing at path {self.path}')
Code Example #10
File: curation.py Project: osbornepb/sparc-curation
        def triples_gen(prefix_func, samples):
            for i, sample in enumerate(samples):
                converter = conv.SampleConverter(sample)
                if 'sample_id' in sample:
                    s_local = sample['sample_id']
                else:
                    s_local = f'local-{i + 1}'  # sigh

                s = prefix_func(s_local)
                yield s, a, owl.NamedIndividual
                yield s, a, sparc.Sample
                yield from converter.triples_gen(s)
                continue
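                # NOTE nothing below this continue executes; the per-field
                # conversion below appears to be superseded by
                # converter.triples_gen above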
                for field, value in sample.items():
                    convert = getattr(converter, field, None)
                    if convert is not None:
                        yield (s, *convert(value))
                    elif field not in converter.known_skipped:
                        log.warning(f'Unhandled sample field: {field}')
Code Example #11
File: curation.py Project: osbornepb/sparc-curation
    def triples(self):
        # FIXME ick
        data = self.data
        try:
            dsid = self.uri_api
        except BaseException as e:  # FIXME ...
            raise e
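            # the return below never executes while the raise above is in place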
            return

        if 'meta' in data:
            meta_converter = conv.MetaConverter(data['meta'], self)
            yield from meta_converter.triples_gen(dsid)
        else:
            log.warning(
                f'{self} has no meta!'
            )  # FIXME split logs into their problems, and our problems

        if 'status' not in data:
            breakpoint()

        yield from conv.StatusConverter(data['status'], self).triples_gen(dsid)

        #converter = conv.DatasetConverter(data)
        #yield from converter.triples_gen(dsid)

        def id_(v):
            s = rdflib.URIRef(dsid)
            yield s, a, owl.NamedIndividual
            yield s, a, sparc.Resource
            yield s, rdfs.label, rdflib.Literal(
                self.folder_name)  # not all datasets have titles

        yield from id_(self.id)

        #for subjects in self.subjects:
        #for s, p, o in subjects.triples_gen(subject_id):
        #if type(s) == str:
        #breakpoint()
        #yield s, p, o

        yield from self.ddt(data)
        yield from self.triples_subjects
        yield from self.triples_samples
Code Example #12
File: datasets.py Project: tmsincomb/sparc-curation
    def xlsx(self):
        kwargs = {
            'delimiter' : '\t',
            'skip_empty_lines' : True,
            'outputencoding': 'utf-8',
        }
        sheetid = 0
        xlsx2csv = Xlsx2csv(self.path.as_posix(), **kwargs)

        f = io.StringIO()
        try:
            xlsx2csv.convert(f, sheetid)
            f.seek(0)
            gen = csv.reader(f, delimiter='\t')
            # avoid first row sheet line
            next(gen)
            yield from gen
        except SheetNotFoundException as e:
            log.warning(f'Sheet weirdness in {self.path}')
            log.warning(str(e))
Code Example #13
File: datasets.py Project: tmsincomb/sparc-curation
            def check_fordd(paths, level=0, stop=3):
                if not paths:  # apparently the empty case recurses forever
                    return

                if len(paths) > self.max_childs:
                    log.warning(f'Not globbing in a folder with > {self.max_childs} children! '
                                f'{self.as_posix()!r}')
                    return
                dd_paths_all = []
                children = []
                for path in paths:
                    dd_paths = list(path.glob('[Dd]ataset_description*.*'))
                    if dd_paths:
                        dd_paths_all.extend(dd_paths)
                    elif not dd_paths_all:
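                        # only keep descending while nothing has been found at this level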
                        children.extend([p for p in path.children if p.is_dir()])

                if dd_paths_all:
                    return dd_paths_all
                else:
                    return check_fordd(children, level + 1)
Code Example #14
File: datasets.py Project: osbornepb/sparc-curation
    def xlsx(self):
        kwargs = {
            'delimiter' : '\t',
            'skip_empty_lines' : True,
            'outputencoding': 'utf-8',
        }
        sheetid = 1
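        # xlsx2csv sheet ids are 1-based; sheetid=1 converts only the first sheet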
        xlsx2csv = Xlsx2csv(self.path.as_posix(), **kwargs)
        ns = len(xlsx2csv.workbook.sheets)
        if ns > 1:
            message = f'too many sheets ({ns}) in {self.path.as_posix()!r}'
            self.addError(exc.EncodingError(message))
            logd.error(message)

        f = io.StringIO()
        try:
            xlsx2csv.convert(f, sheetid)
            f.seek(0)
            gen = csv.reader(f, delimiter='\t')
            yield from gen
        except SheetNotFoundException as e:
            log.warning(f'Sheet weirdness in {self.path}')
            log.warning(str(e))
Code Example #15
    def validate_structure(path, dir_structure, subjects, samples):

        he = HasErrors(pipeline_stage='Derives.validate_structure')

        # FIXME TODO handle pools as well and figure out cases where subjects/samples are metadata only

        # for dataset templates of the 1.* series
        # general approach: set of all specimen ids and set of all
        # folder names take the ones that match ignore the known ok
        # that do not, and warn on all the rest that do not match
        # and that are not inside a known specimen or subject folder

        valid_top_123 = (
            'source',
            'primary',
            'derivative',  # FIXME not here :/ schema somehow?
            'code',
            'docs',
            'protocol')

        def top_level(drp):
            return drp.parent.name == '' and drp.name in valid_top_123

        # absolute_paths = [path / pblob['dataset_relative_path'] for pblob in dir_structure]
        dd = defaultdict(list)
        for pblob in dir_structure:
            drp = pblob['dataset_relative_path']
            p = drp.parts
            dd[p[-1]].append((len(p), drp, p[::-1]))

        # dirs: directory name (last path component) -> list of
        # (depth, path, reversed parts) tuples, excluding the fixed top-level
        # folders and dropping names with nothing left after that filter
        dirs = {
            k: av
            for k, vs in dd.items()
            for av in ([v for v in vs if not top_level(v[1])], )
            # cull empty in a single step
            if av
        }

        # subject_id could be missing, but we filter failures on all of
        # those so in theory we shouldn't need to handle it as this stage
        subs = {s['subject_id']: s for s in subjects}
        dd = defaultdict(list)
        for s in samples:
            dd[s['sample_id']].append(s)
        samps = dict(dd)

        union_sub = set(dirs) | set(subs)
        inter_sub = set(dirs) & set(subs)

        # record which folders and specimen ids get matched so any leftovers
        # can be reported as errors at the end
        records = []
        done_dirs = set()
        done_specs = set()
        if inter_sub == set(subs):
            for subject_id, blob in subs.items():
                done_dirs.add(subject_id)
                done_specs.add(subject_id)
                records.append({
                    'type': 'SubjectDirs',
                    # have to split the type because we can't recover
                    # the type using just the specimen id (sigh)
                    # and we need it to set the correct prefix (sigh)
                    'specimen_id': subject_id,
                    'dirs': [d[1] for d in dirs[subject_id]]
                })
        else:
            # FIXME not all subjects have folders; there may be samples
            # that have folders but not subjects ??? don't want to force
            # metadata structure onto folder structure but it complicates
            # the implementation again ... probably worth it in this case
            logd.warning('miscount subject dirs, TODO')
            pass

        union_sam = set(dirs) | set(samps)
        inter_sam = set(dirs) & set(samps)

        template_version_less_than_2 = True  # FIXME TODO
        # FIXME this is where non-uniqueness of sample ids becomes a giant pita
        if inter_sam == set(samps):
            for sample_id, blob in samps.items():
                if len(blob) > 1:
                    # FIXME TODO this means that we need to fail over to the primary keys
                    msg = f'sample_id is not unique! {sample_id}\n{blob}'
                    if he.addError(msg, blame='submission', path=path):
                        logd.error(msg)
                    continue

                if template_version_less_than_2:  # FIXME this is sure to cause an error at some point
                    done_dirs.add((blob[0]['subject_id'], sample_id))
                    done_specs.add(blob[0]['primary_key'])
                else:
                    done_dirs.add(sample_id)
                    done_specs.add(sample_id)
                records.append({
                    'type': 'SampleDirs',
                    # have to split the type because we can't recover
                    # the type using just the specimen id (sigh)
                    # and we need it to set the correct prefix (sigh)
                    'specimen_id': sample_id,
                    'dirs': [d[1] for d in dirs[sample_id]]
                })
        else:
            logd.warning('miscount sample dirs, TODO')
            bad_dirs = []
            if template_version_less_than_2:
                # handle old awful nonsense
                # 1. construct subject sample lookups using tuple
                # 2. try to construct subject sample id pairs
                for sample_id, blobs in samps.items():
                    for blob in blobs:
                        if sample_id in dirs:
                            candidates = dirs[sample_id]
                            # TODO zero candidates error
                            actual = []
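                            # folders whose parent matches this sample's subject_id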
                            for level, drp, rparts in candidates:
                                if level < 2:
                                    msg = (
                                        f'Bad location for specimen folder! {drp}'
                                    )
                                    if he.addError(msg,
                                                   blame='submission',
                                                   path=path):
                                        logd.error(msg)
                                    bad_dirs.append(dirs.pop(sample_id))
                                    continue
                                p_sample_id, p_subject_id, *p_rest = rparts
                                if level < 3:
                                    # p_subject_id will be primary, derivative, or source
                                    log.warning(f'TODO new structure {drp}')

                                assert sample_id == p_sample_id  # this should always be true
                                subject_id = blob['subject_id']
                                if subject_id == p_subject_id:
                                    id = blob['primary_key']
                                    done_dirs.add((subject_id, p_sample_id))
                                    done_specs.add(id)
                                    actual.append(drp)

                            if actual:
                                records.append({
                                    'type': 'SampleDirs',
                                    # have to split the type because we can't recover
                                    # the type using just the specimen id (sigh)
                                    # and we need it to set the correct prefix (sigh)
                                    'specimen_id': id,
                                    'dirs': actual,
                                })
                        else:
                            msg = f'No folder for sample {sample_id}'
                            if he.addError(msg, blame='submission', path=path):
                                logd.error(msg)
            else:
                pass  # TODO that's an error!

        usamps = set(v['primary_key'] for vs in samps.values() for v in vs)
        udirs = set(
            nv for path_name, subpaths in dirs.items()
            for nv in (((subpath[-1][1], path_name) for subpath in subpaths
                        )  # -1 rpaths 1 parent  # XXX FIXME clearly wrong ???
                       if path_name in samps else (path_name, )))
        not_done_specs = (set(subs) | usamps) - set(done_specs)
        not_done_dirs = set(udirs) - set(done_dirs)

        obj = {}

        if records:
            obj['records'] = records
        else:
            pass  # TODO embed an error

        if not_done_specs:
            msg = ('There are specimens that have no corresponding '
                   f'directory!\n{not_done_specs}')
            if he.addError(msg, blame='submission', path=path):
                logd.error(msg)

        if not_done_dirs:
            msg = ('There are directories that have no corresponding '
                   f'specimen!\n{not_done_dirs}')
            if he.addError(msg, blame='submission', path=path):
                logd.error(msg)

        he.embedErrors(obj)
        return obj,
Code Example #16
    def _process(self, contributor):
        # get member if we can find them
        he = dat.HasErrors(pipeline_stage=self.__class__.__name__ + '.data')
        if 'contributor_name' in contributor and 'first_name' in contributor:
            name = contributor['contributor_name']
            if ';' in name:
                msg = f'Bad symbol in name {name!r}'
                he.addError(msg)
                logd.error(msg)

            fn = contributor['first_name']
            ln = contributor['last_name']
            if ' ' in fn:
                fn, mn = fn.split(' ', 1)
                mn, _mn = mn.rstrip('.'), mn
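                # strip a trailing period from the middle initial, keeping the
                # original so the change can be flagged below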
                if mn != _mn:
                    he.addError(f'Middle initials don\'t need periods :) {name!r}',
                                logfunc=logd.error)
                contributor['middle_name'] = mn
                contributor['first_name'] = fn

            if ' ' in ln:
                msg = f'Malformed last_name {ln!r}'
                he.addError(msg)
                logd.error(msg)
                ln = ln.replace(' ', '-')

            failover = f'{fn}-{ln}'
            member = self.member(fn, ln)

            if member is not None:
                userid = OntId('https://api.blackfynn.io/users/' + member.id)
                contributor['blackfynn_user_id'] = userid

        else:
            member = None
            failover = 'no-orcid-no-name'
            log.warning('No name! ' + lj(contributor))

        orcid = None
        if 'contributor_orcid_id' in contributor:
            orcid = contributor['contributor_orcid_id']
            if isinstance(orcid, str) and 'orcid.org' in orcid:
                orcid = idlib.Orcid(orcid)  # FIXME reloading from json

            if isinstance(orcid, idlib.Orcid):
                s = orcid
            else:  # it's not an orcid or it's a bad orcid
                orcid = None

        if orcid is None:
            if member is not None:
                s = userid
            else:
                log.debug(lj(contributor))
                s = OntId(self.dsid + '/contributors/' + failover)

        contributor['id'] = s
        he.embedErrors(contributor)

        # lifting + adding
        if 'contributor_affiliation' in contributor:
            ca = contributor['contributor_affiliation']
            maybe_ror = self.lifters.affiliations(ca)
            if maybe_ror is not None:
                contributor['affiliation'] = maybe_ror