def organ(self, award_number):
    """Return the organ mapped to *award_number*, or None if unmapped.

    Uses ``self.award_to_organ`` for the lookup.  When the award is only
    present in the manual mapping (``self.manual``) and not in the sourced
    mapping (``self.sourced``), a warning is emitted so manual entries can
    be audited.  A missing award is logged as an error and results in an
    implicit ``None`` return rather than a raised exception.
    """
    if award_number in self.manual and award_number not in self.sourced:
        log.warning(f'used manual organ mapping for {award_number}')

    try:
        return self.award_to_organ[award_number]
    except KeyError:  # fixed: dropped unused `as e` binding
        logd.error(f'bad award_number {award_number}')
def get(self, uri):
    """GET *uri* with the protocols.io auth header and return parsed json.

    Returns the decoded json dict on HTTP success.  On failure the error
    details from the response body (when available) are recorded via
    ``self.addError`` and logged, and ``None`` is returned implicitly so
    the (cached) failure can still be memoized by callers.

    Raises whatever ``resp.json()`` raises if a 2xx response body is not
    valid json (unexpected; logged before re-raising).
    """
    logd.info(uri)
    log.debug('going to network for protocols')
    resp = requests.get(uri, headers=self._pio_header)
    #log.info(str(resp.request.headers))
    if resp.ok:
        try:
            j = resp.json()  # the api is reasonably consistent
        except BaseException as e:
            # fixed: removed leftover breakpoint() debugging call that
            # would halt the process on a malformed success response
            log.exception(e)
            raise e
        return j
    else:
        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except BaseException as e:
            # body was not json or lacked the expected keys
            log.exception(e)
            logd.error(f'protocol no access {uri} {self.id!r}')
def path_metadata(cls, path_dataset, manifests, xmls):
    """Collect lifted path metadata for every manifest record plus xml entries.

    Returns a tuple ``(path_metadata, scaffolds)`` where the first element
    is the list of lifted record blobs (followed by the entries from
    ``xmls['xml']``) and the second accumulates scaffold information for
    error-free records.
    """
    collected = []
    scaffolds = []
    # FIXME need a better abstraction for additional known types e.g. the mbf segmentations
    for manifest in manifests:
        # guard clauses: skip manifests without records to lift
        if 'contents' not in manifest:
            continue

        contents = manifest['contents']
        if 'manifest_records' not in contents:
            continue

        drp = manifest['dataset_relative_path']
        _should_log = True
        for record in contents['manifest_records']:
            try:
                lifted, _should_log = cls._lift_mr(
                    path_dataset, drp, record, _should_log)
            except FileNotFoundError as e:
                logd.error(e)
                continue  # FIXME need this in the errors record
            except exc.BadDataError as e:
                logd.error(e)
                continue  # FIXME need this in the errors record

            collected.append(lifted)
            if 'errors' not in lifted:
                cls._scaffolds(lifted, scaffolds)

    collected.extend(xmls['xml'])
    # TODO try to construct/resolve the referent paths in these datasets as well
    # getting the path json metadata will embed errors about missing files for us
    return collected, scaffolds
def _process(self, contributor):
    """Normalize a contributor record in place.

    Splits first/middle names, flags malformed names, attaches the
    blackfynn user id when a member match is found, and sets
    ``contributor['id']`` from (in priority order) a valid ORCiD, the
    matched member userid, or a name-derived failover identifier.
    Detected problems are embedded into the record via HasErrors.
    """
    errs = dat.HasErrors(pipeline_stage=self.__class__.__name__ + '.data')
    if 'name' in contributor and 'first_name' in contributor:
        name = contributor['name']
        if ';' in name:
            msg = f'Bad symbol in name {name!r}'
            errs.addError(msg)
            logd.error(msg)

        first = contributor['first_name']
        last = contributor['last_name']
        if ' ' in first:
            # everything after the first space is treated as a middle name
            first, middle = first.split(' ', 1)
            contributor['middle_name'] = middle
            contributor['first_name'] = first

        if ' ' in last:
            msg = f'Malformed last_name {last!r}'
            errs.addError(msg)
            logd.error(msg)
            last = last.replace(' ', '-')

        failover = f'{first}-{last}'
        member = self.member(first, last)
        if member is not None:
            userid = OntId('https://api.blackfynn.io/users/' + member.id)
            contributor['blackfynn_user_id'] = userid

    else:
        member = None
        failover = 'no-orcid-no-name'
        log.warning('No name!' + lj(contributor))

    orcid = None
    if 'contributor_orcid_id' in contributor:
        orcid = contributor['contributor_orcid_id']
        if type(orcid) is str and 'orcid.org' in orcid:
            orcid = OrcidId(orcid)  # FIXME reloading from json

        if isinstance(orcid, OrcidId):
            s = orcid
        else:
            # not an orcid, or a bad orcid
            orcid = None

    if orcid is None:
        if member is not None:
            s = userid
        else:
            log.debug(lj(contributor))
            s = OntId(self.dsid + '/contributors/' + failover)

    contributor['id'] = s
    errs.embedErrors(contributor)
def _lift_mr(path_dataset, dataset_relative_path, record, should_log):
    """Lift a single manifest record into a path-metadata blob.

    Resolves ``record['filename']`` relative to the manifest's parent
    directory, pulls json metadata from the cached path (or the raw path
    when no cache exists, which produces the missing-file error), rewrites
    any existing error into a manifest-specific message, and attaches
    provenance, the dataset-relative path, and the (filetype-stripped)
    manifest record itself.

    Returns ``(lifted, should_log)``; *should_log* flips to False after the
    first logged missing-path error so later records in the same manifest
    do not spam the log.

    Raises exc.BadDataError when the record has no ``filename``.
    """
    parent = dataset_relative_path.parent
    if 'filename' not in record:
        msg = f'filename missing from record in {dataset_relative_path}'
        raise exc.BadDataError(msg)

    record_drp = parent / record['filename']

    # FIXME this is validate move elsewhere ??? but this is also
    # where we want to embed data about the path ...
    # but I supposed we could inject errors later ? this is an
    # extremely obscure place to inject these ...
    # and to do that we need access to the path
    # and it is clear given that we passed in THIS_PATH

    # FIXME TODO how to deal with sparse datasets
    _record_path = path_dataset / record_drp  # do not include in export

    _cache = _record_path.cache
    if _cache is None:
        # no cache means the file is not known to exist; _jsonMetadata
        # will produce the error for us
        lifted = _record_path._jsonMetadata()
    else:
        lifted = _cache._jsonMetadata()

    if 'errors' in lifted:
        he = HasErrors(pipeline_stage='Derives._lift_mr')
        # replace the generic path error with a manifest-specific one
        _message = lifted['errors'].pop()['message']
        # FIXME pretty sure that path in addError is used for both
        # the json path and the file system path
        message = ('Non-existent path listed in manifest '
                   f'{dataset_relative_path}\n{_message}')
        if he.addError(message, blame='submission', path=_record_path):
            if should_log:
                should_log = False
                logd.error(message)

        he.embedErrors(lifted)

    lifted['prov:wasDerivedFrom'] = (
        # have to reattach path_dataset because Path.cwd() may not be
        # the dataset root (usually the organizaiton root)
        path_dataset / dataset_relative_path).cache_identifier
    lifted['dataset_relative_path'] = record_drp
    lifted['manifest_record'] = {
        k: v for k, v in record.items() if k != 'filetype'
    }
    if 'additional_types' in record:
        # FIXME TODO make sure that the mimetypes match
        # FIXME currently a string, either change name or make a list?
        lifted['mimetype'] = record['additional_types']

    return lifted, should_log
def subpipeline_errors(self, errors):
    """Partition subpipeline errors, forwarding non-metadata-file errors up.

    *errors* is an iterable of ``(path, error, subpipeline_class)`` tuples.
    Errors whose json path is the samples_file or subjects_file subpipeline
    are withheld from the superclass handler; everything else is forwarded.
    When neither metadata-file subpipeline contributed an error, that
    (unexpected) situation is logged.
    """
    paths = []
    saf = ['samples_file']   # json path of the samples_file subpipeline
    suf = ['subjects_file']  # json path of the subjects_file subpipeline
    for_super = []
    for path, error, subpipeline_class in errors:
        paths.append(path)
        if path not in (saf, suf):
            for_super.append((path, error, subpipeline_class))

    if saf not in paths and suf not in paths:
        # fixed: was a placeholderless f-string with garbled wording
        logd.error('neither samples_file nor subjects_file errors present')

    super().subpipeline_errors(for_super)
def _abstracted_paths(self, name_prefix, glob_type=None):
    """ A bottom up search for the closest file in the parent directory.
        For datasets, if the bids root and path do not match, use the bids root.
        In the future this needs to be normalized because the extra code required
        for dealing with the intervening node is quite annoying to maintain.
    """
    # Generator: yields matching paths, flagging (via addError + log) paths
    # with duplicated suffixes or upper-cased names along the way.
    if glob_type is None:
        glob_type = self.default_glob

    path = self
    # for dataset roots, search from the bids root when it differs from self
    if (self.cache and
        self.cache.is_dataset and
        self.bids_root is not None and
        self.bids_root != self):
        path = self.bids_root

    # match both capitalizations of the first letter, e.g. [Ss]ubjects
    first = name_prefix[0]
    cased_np = '[' + first.upper() + first + ']' + name_prefix[1:]  # FIXME warn and normalize
    glob = getattr(path, glob_type)
    gen = glob(cased_np + '*.*')

    try:
        # next() here lets StopIteration signal "no matches" so the
        # except clause below can fall back to searching the parent
        path = next(gen)
        for path in chain((path,), gen):
            if path.is_broken_symlink():
                # NOTE(review): fetches remote metadata on the fly;
                # size_limit_mb padding of +1 presumably avoids an
                # off-by-one at the limit — confirm
                log.info(f'fetching unretrieved metadata path {path.as_posix()!r}'
                         '\nFIXME batch these using async in cli export ...')
                path.cache.fetch(size_limit_mb=path.cache.meta.size.mb + 1)

            if path.suffix in path.stem:
                msg = f'path has duplicate suffix {path.as_posix()!r}'
                self.addError(msg)
                logd.error(msg)

            if path.name[0].isupper():
                msg = f'path has bad casing {path.as_posix()!r}'
                self.addError(msg)
                logd.error(msg)

            yield path

    except StopIteration:
        # nothing found here; recurse upward unless we hit the anchor
        if (self.cache.parent.meta is not None and
            self.parent.cache != self.cache.anchor and
            self.parent != self):
            yield from getattr(self.parent, name_prefix + '_paths')
def contributor_orcid_id(self, value):  # FIXME use schema
    """Normalize and validate a raw ORCiD cell value.

    Generator contract: yields an ``OrcidId`` when *value* normalizes to a
    checksum-valid ORCiD; yields the original *value* unchanged when it is
    malformed beyond repair; yields nothing (early return) for empty,
    sentinel ('0'), wrong-length, or checksum-failing inputs.  All
    failures are recorded via ``self.addError`` and logged.
    """
    v = value.replace(' ', '')  # strip all spaces, including internal ones
    if not v:
        return

    if v.startswith('http:'):
        # normalize scheme so the https branch below handles it
        v = v.replace('http:', 'https:', 1)

    if not (v.startswith('ORCID:') or v.startswith('https:')):
        # bare numeric form, expected like 0000-0002-1825-0097
        v = v.strip()
        if not len(v):
            return
        elif v == '0':
            # FIXME ? someone using strange conventions ...
            return
        elif len(v) != 19:  # 16 digits + 3 dashes
            msg = f'orcid wrong length {value!r} {self.t.path.as_posix()!r}'
            self.addError(OrcidId.OrcidLengthError(msg))
            logd.error(msg)
            return

        v = 'ORCID:' + v

    else:
        # extract the numeric part from the uri or curie form
        if v.startswith('https:'):
            _, numeric = v.rsplit('/', 1)
        elif v.startswith('ORCID:'):
            _, numeric = v.rsplit(':', 1)

        if not len(numeric):
            return
        elif len(numeric) != 19:
            msg = f'orcid wrong length {value!r} {self.t.path.as_posix()!r}'
            self.addError(OrcidId.OrcidLengthError(msg))
            logd.error(msg)
            return

    try:
        #log.debug(f"{v} '{self.t.path}'")
        orcid = OrcidId(v)
        if not orcid.checksumValid:
            # FIXME json schema can't do this ...
            msg = f'orcid failed checksum {value!r} {self.t.path.as_posix()!r}'
            self.addError(OrcidId.OrcidChecksumError(msg))
            logd.error(msg)
            return

        yield orcid

    except (OntId.BadCurieError, OrcidId.OrcidMalformedError) as e:
        msg = f'orcid malformed {value!r} {self.t.path.as_posix()!r}'
        self.addError(OrcidId.OrcidMalformedError(msg))
        logd.error(msg)
        # pass the raw value through so downstream can still see it
        yield value
def csv(self, delimiter=','):
    """Yield rows of ``self.path`` parsed as csv, with encoding fallback.

    Tries utf-8 first, then latin-1; a successful latin-1 read records an
    EncodingError so the bad encoding is surfaced to the curation report.
    Empty rows are skipped but recorded as errors.

    NOTE(review): because rows are yielded lazily, a UnicodeDecodeError
    that fires after some rows were already yielded causes those rows to
    be yielded AGAIN on the latin-1 pass — confirm callers tolerate this.
    """
    for encoding in ('utf-8', 'latin-1'):
        try:
            with open(self.path, 'rt', encoding=encoding) as f:
                for row in csv.reader(f, delimiter=delimiter):
                    if row:
                        yield row
                    else:
                        message = f'empty row in {self.path.as_posix()!r}'
                        self.addError(message)
                        logd.error(message)

            if encoding != 'utf-8':
                # we only got here via fallback, so flag the bad encoding
                message = f'encoding bad {encoding!r} {self.path.as_posix()!r}'
                self.addError(exc.EncodingError(message))
                logd.error(message)

            return
        except UnicodeDecodeError:
            # retry with the next candidate encoding
            continue
def xlsx(self):
    """Yield rows of the first sheet of ``self.path`` converted via xlsx2csv.

    The workbook is converted to tab-separated text in memory and parsed
    with ``csv.reader``.  Workbooks with more than one sheet are flagged
    (only sheet 1 is read).  A missing/odd sheet is logged and results in
    no rows being yielded.
    """
    kwargs = {
        'delimiter': '\t',
        'skip_empty_lines': True,
        'outputencoding': 'utf-8',
    }
    sheetid = 1  # only the first sheet is ever converted
    xlsx2csv = Xlsx2csv(self.path.as_posix(), **kwargs)
    ns = len(xlsx2csv.workbook.sheets)
    if ns > 1:
        message = f'too many sheets ({ns}) in {self.path.as_posix()!r}'
        self.addError(exc.EncodingError(message))
        logd.error(message)

    f = io.StringIO()
    try:
        xlsx2csv.convert(f, sheetid)
        f.seek(0)
        gen = csv.reader(f, delimiter='\t')
        yield from gen
    except SheetNotFoundException as e:
        # fixed: message was missing the space before the path
        log.warning(f'Sheet weirdness in {self.path}')
        log.warning(str(e))
def _get_protocol_json(self, uri):
    """Fetch protocol metadata json from the protocols.io v3 api.

    *uri* may be a plain string or an ``idlib.Stream``.  Returns the
    parsed json dict on success; returns ``None`` implicitly (after
    recording the problem via ``self.addError``) when the uri is not a
    protocols.io identifier or the request fails.
    """
    #juri = uri + '.json'
    logd.info(uri.identifier if isinstance(uri, idlib.Stream) else uri)  # FIXME
    pi = idlib.get_right_id(uri)
    if 'protocols.io' in pi:
        pioid = pi.slug  # FIXME normalize before we ever get here ...
        log.info(pioid)
    else:
        msg = f'protocol uri is not from protocols.io {pi} {self.id}'
        logd.error(msg)
        self.addError(msg)
        return

    #uri_path = uri.rsplit('/', 1)[-1]
    apiuri = 'https://www.protocols.io/api/v3/protocols/' + pioid
    #'https://www.protocols.io/api/v3/groups/sparc/protocols'
    #apiuri = 'https://www.protocols.io/api/v3/filemanager/folders?top'
    log.debug('going to network for protocols')
    resp = requests.get(apiuri, headers=self._pio_header)
    #log.info(str(resp.request.headers))
    if resp.ok:
        try:
            j = resp.json()  # the api is reasonably consistent
        except BaseException as e:
            # fixed: removed leftover breakpoint() debugging call that
            # would halt the process on a malformed success response
            log.exception(e)
            raise e
        return j
    else:
        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except BaseException as e:
            # body was not json or lacked the expected keys
            log.exception(e)
            logd.error(f'protocol no access {uri} {self.id!r}')
def data(self):  # TODO candidate for memory.cache
    """Return the normalized mapping extracted from this tabular file.

    Rows indexed by the first ``to_index`` column become keys; values are
    normalized, deduplicated tuples (collapsed to a scalar for keys in
    ``self.max_one``).  Column-oriented ``self.verticals`` groups are
    merged into per-column dicts.  The result is memoized on
    ``self._data_cache`` and errors are embedded into the returned dict.
    """
    if hasattr(self, '_data_cache'):
        return self._data_cache

    index_col, *_ = self.to_index
    out = {}
    if not hasattr(self.bc, index_col):
        # header does not contain the expected index column; bail early
        msg = f'{self.path.as_posix()!r} maformed header!'
        self.addError(msg)
        logd.error(msg)
        self.embedErrors(out)
        self._data_cache = out
        return out

    ic = list(getattr(self.bc, index_col))
    nme = Header(ic).data
    # map raw index values back to their normalized key names
    nmed = {v: normk for normk, v in zip(nme, ic)}
    for v, nt in self.bc._byCol__indexes[index_col].items():
        if v != index_col:
            normk = nmed[v]
            if normk not in self.skip_rows:
                _value = tuple(normv for key, value in zip(nt._fields, nt)
                               if key not in self.skip_cols and value
                               for normv in self.normalize(normk, value)
                               if normv)
                value = tuple(set(_value))
                if len(value) != len(_value):
                    # TODO counter to show the duplicate values
                    msg = f'duplicate values in {normk} TODO {self.path.as_posix()!r}'
                    self.addError(msg)
                    logd.error(msg)

                if normk in self.max_one:  # schema will handle this ..
                    if not value:
                        #log.warning(f"missing value for {normk} '{self.t.path}'")
                        pass
                    elif len(value) > 1:
                        msg = f'too many values for {normk} {value} {self.path.as_posix()!r}'
                        self.addError(msg)
                        logd.error(msg)
                        # FIXME not selecting the zeroth element here breaks the schema assumptions
                        #value = 'AAAAAAAAAAA' + '\n|AAAAAAAAAAA|\n'.join(value)
                        #value = 'ERROR>>>' + ','.join(value)
                        # just leave it
                    else:
                        value = value[0]  # FIXME error handling etc.

                if value:
                    out[normk] = value

    def merge(tup):
        # fold (key, value) pairs into a dict, accumulating repeated
        # keys into tuples
        out = {}
        for a, b in tup:
            if a not in out:
                out[a] = b
            elif a and not isinstance(b, tuple):
                out[a] = out[a], b
            else:
                out[a] += b,

        # NOTE(review): assigning self._data_cache here (to merge's LOCAL
        # out) looks accidental — it is overwritten by the final cache
        # assignment below; confirm and consider removing
        self._data_cache = out
        return out

    for key, keys in self.verticals.items():
        gen = (merge([(self.rename_key(k, key), normv)
                      for k, value in zip(nme, values)
                      if k in keys and value
                      for normv in self.normalize(k, value)
                      if normv])
               for head, *values in self.bc.cols
               if head not in self.skip_cols)
        value = tuple(_ for _ in gen if _)
        if value:
            out[key] = value

    self.embedErrors(out)
    self._data_cache = out
    return out
def validate_structure(path, dir_structure, subjects, samples):
    """Cross-check specimen metadata against the dataset folder structure.

    Matches subject_ids and sample_ids from the metadata sheets against
    directory names in *dir_structure*, producing SubjectDirs/SampleDirs
    records for matches and embedding errors for specimens without
    folders and folders without specimens.

    Returns a 1-tuple ``(obj,)`` where obj may contain 'records' and
    embedded errors.
    """
    he = HasErrors(pipeline_stage='Derives.validate_structure')

    # FIXME TODO handle pools as well and figure out cases where
    # subjects/samples are metadata only
    # for dataset templates of the 1.* series

    # general approach: set of all specimen ids and set of all
    # folder names take the ones that match ignore the known ok
    # that do not, and warn on all the rest that do not match
    # and that are not inside a known specimen or subject folder

    valid_top_123 = ('source', 'primary', 'derivative',  # FIXME not here :/ schema somehow?
                     'code', 'docs', 'protocol')

    def top_level(drp):
        # a dataset-relative path directly under the dataset root with a
        # known top-level name is not a specimen folder
        return drp.parent.name == '' and drp.name in valid_top_123

    # absolute_paths = [path / pblob['dataset_relative_path'] for pblob in dir_structure]
    # group directories by their leaf name; keep depth and reversed parts
    # for later subject/sample nesting checks
    dd = defaultdict(list)
    for pblob in dir_structure:
        drp = pblob['dataset_relative_path']
        p = drp.parts
        dd[p[-1]].append((len(p), drp, p[::-1]))

    dirs = {k: av for k, vs in dd.items()
            for av in ([v for v in vs if not top_level(v[1])],)
            # cull empty in a single step
            if av}

    # subject_id could be missing, but we filter failures on all of
    # those so in theory we shouldn't need to handle it as this stage
    subs = {s['subject_id']: s for s in subjects}
    dd = defaultdict(list)
    for s in samples:
        dd[s['sample_id']].append(s)
    samps = dict(dd)
    # NOTE(review): union_sub/union_sam appear unused — confirm and remove
    union_sub = set(dirs) | set(subs)
    inter_sub = set(dirs) & set(subs)

    records = []
    done_dirs = set()   # directory-side ids already matched
    done_specs = set()  # specimen-side ids already matched
    if inter_sub == set(subs):
        # every subject has at least one matching directory
        for subject_id, blob in subs.items():
            done_dirs.add(subject_id)
            done_specs.add(subject_id)
            records.append({
                'type': 'SubjectDirs',
                # have to split the type because we can't recover
                # the type using just the specimen id (sigh)
                # and we need it to set the correct prefix (sigh)
                'specimen_id': subject_id,
                'dirs': [d[1] for d in dirs[subject_id]]})
    else:
        # FIXME not all subjects have folders there may be samples
        # that have folders but not subjects ??? don't want to force
        # metadata structure onto folder structure but it complicates
        # the implementation again ... probably worth it in this case
        logd.warning('miscount subject dirs, TODO')
        pass

    union_sam = set(dirs) | set(samps)
    inter_sam = set(dirs) & set(samps)
    template_version_less_than_2 = True  # FIXME TODO
    # FIXME this is where non-uniqueness of sample ids becomes a giant pita
    if inter_sam == set(samps):
        # every sample id has a matching directory name
        for sample_id, blob in samps.items():
            if len(blob) > 1:
                # FIXME TODO this means that we need to fail over to the
                # primary keys
                msg = f'sample_id is not unique! {sample_id}\n{blob}'
                if he.addError(msg, blame='submission', path=path):
                    logd.error(msg)
                continue

            if template_version_less_than_2:
                # FIXME this is sure the cause an error at some point
                done_dirs.add((blob[0]['subject_id'], sample_id))
                done_specs.add(blob[0]['primary_key'])
            else:
                done_dirs.add(sample_id)
                done_specs.add(sample_id)

            records.append({
                'type': 'SampleDirs',
                # have to split the type because we can't recover
                # the type using just the specimen id (sigh)
                # and we need it to set the correct prefix (sigh)
                'specimen_id': sample_id,
                'dirs': [d[1] for d in dirs[sample_id]]})
    else:
        logd.warning('miscount sample dirs, TODO')
        bad_dirs = []
        if template_version_less_than_2:
            # handle old awful nonsense
            # 1. construct subject sample lookups using tuple
            # 2. try to construct subject sample id pairs
            for sample_id, blobs in samps.items():
                for blob in blobs:
                    if sample_id in dirs:
                        candidates = dirs[sample_id]
                        # TODO zero candidates error
                        actual = []
                        for level, drp, rparts in candidates:
                            if level < 2:
                                # sample folder directly under dataset root
                                msg = (f'Bad location for specimen folder! {drp}')
                                if he.addError(msg,
                                               blame='submission',
                                               path=path):
                                    logd.error(msg)
                                bad_dirs.append(dirs.pop(sample_id))
                                continue
                            p_sample_id, p_subject_id, *p_rest = rparts
                            if level < 3:
                                # p_subject_id will be primary derivatie or source
                                log.warning(f'TODO new structure {drp}')

                            assert sample_id == p_sample_id  # this should always be true
                            subject_id = blob['subject_id']
                            if subject_id == p_subject_id:
                                # folder nesting matches metadata; use the
                                # composite primary key as the specimen id
                                id = blob['primary_key']
                                done_dirs.add((subject_id, p_sample_id))
                                done_specs.add(id)
                                actual.append(drp)

                        if actual:
                            records.append({
                                'type': 'SampleDirs',
                                # have to split the type because we can't recover
                                # the type using just the specimen id (sigh)
                                # and we need it to set the correct prefix (sigh)
                                'specimen_id': id,
                                'dirs': actual,
                            })
                    else:
                        msg = f'No folder for sample {sample_id}'
                        if he.addError(msg, blame='submission', path=path):
                            logd.error(msg)
        else:
            pass  # TODO that's an error!

    # primary keys for all sample rows (sample ids may repeat)
    usamps = set(v['primary_key'] for vs in samps.values() for v in vs)
    # directory-side identifiers comparable to done_dirs entries
    udirs = set(nv for path_name, subpaths in dirs.items()
                for nv in (((subpath[-1][1], path_name)  # -1 rpaths 1 parent
                            for subpath in subpaths)
                           # XXX FIXME clearly wrong ???
                           if path_name in samps else
                           (path_name,)))
    not_done_specs = (set(subs) | usamps) - set(done_specs)
    not_done_dirs = set(udirs) - set(done_dirs)

    obj = {}
    if records:
        obj['records'] = records
    else:
        pass  # TODO embed an error

    if not_done_specs:
        msg = ('There are specimens that have no corresponding '
               f'directory!\n{not_done_specs}')
        if he.addError(msg, blame='submission', path=path):
            logd.error(msg)

    if not_done_dirs:
        msg = ('There are directories that have no corresponding '
               f'specimen!\n{not_done_dirs}')
        if he.addError(msg, blame='submission', path=path):
            logd.error(msg)

    he.embedErrors(obj)
    return obj,
def _process(self, contributor):
    """Normalize a contributor record in place (idlib.Orcid variant).

    Splits first/middle names (stripping trailing periods from middle
    initials), flags malformed names, attaches the blackfynn user id for
    matched members, sets ``contributor['id']`` from (in priority order)
    a valid ORCiD, the member userid, or a name-derived failover id, and
    lifts the affiliation to a ROR id when one is known.
    """
    # get member if we can find them
    he = dat.HasErrors(pipeline_stage=self.__class__.__name__ + '.data')
    if 'contributor_name' in contributor and 'first_name' in contributor:
        name = contributor['contributor_name']
        if ';' in name:
            msg = f'Bad symbol in name {name!r}'
            he.addError(msg)
            logd.error(msg)

        fn = contributor['first_name']
        ln = contributor['last_name']
        if ' ' in fn:
            # everything after the first space becomes the middle name
            fn, mn = fn.split(' ', 1)
            mn, _mn = mn.rstrip('.'), mn
            if mn != _mn:
                he.addError(f'Middle initials don\'t need periods :) {name!r}',
                            logfunc=logd.error)
            contributor['middle_name'] = mn
            contributor['first_name'] = fn

        if ' ' in ln:
            msg = f'Malformed last_name {ln!r}'
            he.addError(msg)
            logd.error(msg)
            ln = ln.replace(' ', '-')

        failover = f'{fn}-{ln}'
        member = self.member(fn, ln)

        if member is not None:
            userid = OntId('https://api.blackfynn.io/users/' + member.id)
            contributor['blackfynn_user_id'] = userid

    else:
        member = None
        failover = 'no-orcid-no-name'
        log.warning(f'No name!' + lj(contributor))

    orcid = None
    if 'contributor_orcid_id' in contributor:
        orcid = contributor['contributor_orcid_id']
        if type(orcid) == str and 'orcid.org' in orcid:
            orcid = idlib.Orcid(orcid)  # FIXME reloading from json

        if isinstance(orcid, idlib.Orcid):
            s = orcid
        else:  # it's not an orcid or its a bad orcid
            orcid = None

    if orcid is None:
        if member is not None:
            s = userid
        else:
            log.debug(lj(contributor))
            s = OntId(self.dsid + '/contributors/' + failover)

    contributor['id'] = s
    he.embedErrors(contributor)

    # lifting + adding
    if 'contributor_affiliation' in contributor:
        ca = contributor['contributor_affiliation']
        maybe_ror = self.lifters.affiliations(ca)
        if maybe_ror is not None:
            contributor['affiliation'] = maybe_ror