def normalize(cls, value):
    _ovalue = value
    value = super().normalize(value, preserve_case=True)
    if 'OT2' in value and 'OD' not in value:
        # one is missing the OD >_<
        log.warning(value)
        value = value.replace('-', '-OD')  # hack

    n = (value
         .strip()
         # NOTE: assuming the lookalike character below is an en dash (U+2013)
         .replace('\u2013', '-')  # can you spot the difference?
         .replace('(', '')
         .replace(')', '')
         .replace('-01S1', '')
         .replace('-01', '')
         .replace('-02S2', '')
         .replace('-02', '')
         .replace('SPARC', '')
         .replace('NIH-1', '')
         .replace('NIH-', '')
         .replace('-', '')
         .replace('NIH ', '')
         .replace(' ', ''))

    if n[0] in ('1', '3', '5'):
        n = n[1:]

    if n.endswith('S2'):
        n = n[:-2]

    if n.endswith('D23864'):  # FIXME another trailing zero
        log.critical(_ovalue)
        n = n.replace('D23864', 'D023864')

    if n != _ovalue:
        log.debug(f'\n{_ovalue}\n{n}')

    return n
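# Hypothetical worked example (not from the original source), assuming the
# super().normalize call returns this particular value unchanged:
#   cls.normalize('3 OT2-OD023864-01S1')
#   # suffix/dash/space stripping -> '3OT2OD023864'
#   # leading application-type digit dropped -> 'OT2OD023864'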
def organ(self, award_number):
    if award_number in self.manual and award_number not in self.sourced:
        log.warning(f'used manual organ mapping for {award_number}')

    try:
        return self.award_to_organ[award_number]
    except KeyError as e:
        logd.error(f'bad award_number {award_number}')
def _process(self, contributor):
    # get member if we can find them
    he = dat.HasErrors(pipeline_stage=self.__class__.__name__ + '.data')
    if 'name' in contributor and 'first_name' in contributor:
        name = contributor['name']
        if ';' in name:
            msg = f'Bad symbol in name {name!r}'
            he.addError(msg)
            logd.error(msg)

        fn = contributor['first_name']
        ln = contributor['last_name']
        if ' ' in fn:
            fn, mn = fn.split(' ', 1)
            contributor['middle_name'] = mn
            contributor['first_name'] = fn

        if ' ' in ln:
            msg = f'Malformed last_name {ln!r}'
            he.addError(msg)
            logd.error(msg)
            ln = ln.replace(' ', '-')

        failover = f'{fn}-{ln}'
        member = self.member(fn, ln)

        if member is not None:
            userid = OntId('https://api.blackfynn.io/users/' + member.id)
            contributor['blackfynn_user_id'] = userid

    else:
        member = None
        failover = 'no-orcid-no-name'
        log.warning('No name! ' + lj(contributor))

    orcid = None
    if 'contributor_orcid_id' in contributor:
        orcid = contributor['contributor_orcid_id']
        if type(orcid) == str and 'orcid.org' in orcid:
            orcid = OrcidId(orcid)  # FIXME reloading from json

        if isinstance(orcid, OrcidId):
            s = orcid
        else:  # it's not an orcid or it's a bad orcid
            orcid = None

    if orcid is None:
        if member is not None:
            s = userid
        else:
            log.debug(lj(contributor))
            s = OntId(self.dsid + '/contributors/' + failover)

    contributor['id'] = s
    he.embedErrors(contributor)
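# Resulting id precedence (illustrative values, not from the source): a
# parseable ORCID wins, then a matched Blackfynn member, then a
# dataset-local failover built from the name.
#   OrcidId('https://orcid.org/0000-0002-1825-0097')      # valid orcid
#   OntId('https://api.blackfynn.io/users/<member-id>')   # member match, no orcid
#   OntId(self.dsid + '/contributors/Jane-Doe')           # neither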
def _submission_objects(self):
    for p in self.submission_paths:
        try:
            miss = dat.SubmissionFile(p)
            if miss.data:
                yield miss

        except exc.NoDataError as e:
            self._errors.append(e)  # NOTE we treat empty file as no file
        except AttributeError as e:
            log.warning(f'unhandled metadata type {e!r}')
            self._errors.append(e)
def _samples_objects(self):
    """ really samples_file """
    for path in self.samples_paths:
        try:
            sf = dat.SamplesFile(path)
            if sf.data:
                yield sf

        except exc.NoDataError as e:
            self._errors.append(e)  # NOTE we treat empty file as no file
        except AttributeError as e:
            log.warning(f'unhandled metadata type {e!r}')
            self._errors.append(e)
def _dataset_description_objects(self):
    for p in self.dataset_description_paths:
        #yield from DatasetDescription(t)
        # TODO export adapters for this ... how to recombine and reuse ...
        try:
            dd = dat.DatasetDescriptionFile(p)
            if dd.data:
                yield dd

        except exc.NoDataError as e:
            self._errors.append(e)  # NOTE we treat empty file as no file
        except AttributeError as e:
            log.warning(f'unhandled metadata type {e!r}')
            self._errors.append(e)
def query(value, prefix):
    for query_type in ('term', 'search'):
        terms = [q.OntTerm for q in OntTerm.query(prefix=prefix, **{query_type: value})]
        if terms:
            #print('matching', terms[0], value)
            #print('extra terms for', value, terms[1:])
            return terms[0]
        else:
            continue

    else:
        log.warning(f'No ontology id found for {value}')
        return value
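# Hypothetical usage (values are illustrative): the first 'term' match wins,
# then the first 'search' match, otherwise the input comes back unchanged.
#   query('heart', 'UBERON')       # -> an OntTerm for the best match
#   query('asdfqwerty', 'UBERON')  # -> 'asdfqwerty', with a warning logged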
def __protocol_uris(self):
    """ property needed for protocol helper to help us """
    #if not hasattr(self, '_puri_cache'):
    p = 'protocol_url_or_doi'
    for dd in self.dataset_description:
        dwe = dd.data_with_errors
        if p in dwe:
            for uri in dwe[p]:
                if uri.startswith('http'):
                    # TODO normalize
                    yield uri
                else:
                    log.warning(f"protocol not uri {uri} '{self.id}'")
def bf_size(self):
    size = self._meta.size
    if size:
        return size

    elif self.path.is_dir():
        size = 0
        for path in self.path.rglob('*'):
            if path.is_file():
                try:
                    size += path.cache.meta.size
                except OSError as e:
                    log.warning(f'No cached file size. Assuming it is not tracked. {path}')

        return size

    else:
        log.warning(f'unknown thing at path {self.path}')
def triples_gen(prefix_func, samples):
    for i, sample in enumerate(samples):
        converter = conv.SampleConverter(sample)
        if 'sample_id' in sample:
            s_local = sample['sample_id']
        else:
            s_local = f'local-{i + 1}'  # sigh

        s = prefix_func(s_local)
        yield s, a, owl.NamedIndividual
        yield s, a, sparc.Sample
        yield from converter.triples_gen(s)
        continue
        for field, value in sample.items():
            convert = getattr(converter, field, None)
            if convert is not None:
                yield (s, *convert(value))
            elif field not in converter.known_skipped:
                log.warning(f'Unhandled sample field: {field}')
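# Illustrative only (prefix_func and the URI scheme are hypothetical): for a
# sample {'sample_id': 'sam-1'} the generator starts with roughly
#   (<dataset>/samples/sam-1, rdf:type, owl:NamedIndividual)
#   (<dataset>/samples/sam-1, rdf:type, sparc:Sample)
# followed by whatever converter.triples_gen(s) yields; note that the
# per-field loop after the bare `continue` above is never reached.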
def triples(self):
    # FIXME ick
    data = self.data
    try:
        dsid = self.uri_api
    except BaseException as e:  # FIXME ...
        raise e
        return

    if 'meta' in data:
        meta_converter = conv.MetaConverter(data['meta'], self)
        yield from meta_converter.triples_gen(dsid)
    else:
        log.warning(f'{self} has no meta!')  # FIXME split logs into their problems, and our problems

    if 'status' not in data:
        breakpoint()

    yield from conv.StatusConverter(data['status'], self).triples_gen(dsid)

    #converter = conv.DatasetConverter(data)
    #yield from converter.triples_gen(dsid)

    def id_(v):
        s = rdflib.URIRef(dsid)
        yield s, a, owl.NamedIndividual
        yield s, a, sparc.Resource
        yield s, rdfs.label, rdflib.Literal(self.folder_name)  # not all datasets have titles

    yield from id_(self.id)

    #for subjects in self.subjects:
        #for s, p, o in subjects.triples_gen(subject_id):
            #if type(s) == str:
                #breakpoint()
            #yield s, p, o

    yield from self.ddt(data)
    yield from self.triples_subjects
    yield from self.triples_samples
def xlsx(self):
    kwargs = {
        'delimiter': '\t',
        'skip_empty_lines': True,
        'outputencoding': 'utf-8',
    }
    sheetid = 0
    xlsx2csv = Xlsx2csv(self.path.as_posix(), **kwargs)

    f = io.StringIO()
    try:
        xlsx2csv.convert(f, sheetid)
        f.seek(0)
        gen = csv.reader(f, delimiter='\t')
        # skip the first row, which is the sheet line
        next(gen)
        yield from gen
    except SheetNotFoundException as e:
        log.warning(f'Sheet weirdness in {self.path}')
        log.warning(str(e))
def check_fordd(paths, level=0, stop=3):
    if not paths:  # apparently the empty case recurses forever
        return

    if len(paths) > self.max_childs:
        log.warning(f'Not globbing in a folder with > {self.max_childs} children! '
                    f'{self.as_posix()!r}')
        return

    dd_paths_all = []
    children = []
    for path in paths:
        dd_paths = list(path.glob('[Dd]ataset_description*.*'))
        if dd_paths:
            dd_paths_all.extend(dd_paths)
        elif not dd_paths_all:
            children.extend([p for p in path.children if p.is_dir()])

    if dd_paths_all:
        return dd_paths_all
    else:
        return check_fordd(children, level + 1)
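# Illustrative example of the breadth-first search (paths are hypothetical):
# given
#   ds/                                  <- no dataset_description here
#       rev-1/dataset_description.xlsx
#       rev-2/dataset_description.csv
# check_fordd([ds]) finds nothing at the top level, recurses into the child
# directories, and returns both matches; recursion stops at the first level
# where any [Dd]ataset_description*.* file turns up.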
def xlsx(self):
    kwargs = {
        'delimiter': '\t',
        'skip_empty_lines': True,
        'outputencoding': 'utf-8',
    }
    sheetid = 1
    xlsx2csv = Xlsx2csv(self.path.as_posix(), **kwargs)
    ns = len(xlsx2csv.workbook.sheets)
    if ns > 1:
        message = f'too many sheets ({ns}) in {self.path.as_posix()!r}'
        self.addError(exc.EncodingError(message))
        logd.error(message)

    f = io.StringIO()
    try:
        xlsx2csv.convert(f, sheetid)
        f.seek(0)
        gen = csv.reader(f, delimiter='\t')
        yield from gen
    except SheetNotFoundException as e:
        log.warning(f'Sheet weirdness in {self.path}')
        log.warning(str(e))
def validate_structure(path, dir_structure, subjects, samples):
    he = HasErrors(pipeline_stage='Derives.validate_structure')

    # FIXME TODO handle pools as well and figure out cases where
    # subjects/samples are metadata only

    # for dataset templates of the 1.* series
    # general approach: take the set of all specimen ids and the set of all
    # folder names, keep the ones that match, ignore the known-ok folders
    # that do not, and warn on all the rest that do not match and are not
    # inside a known specimen or subject folder

    valid_top_123 = ('source', 'primary', 'derivative',  # FIXME not here :/ schema somehow?
                     'code', 'docs', 'protocol')

    def top_level(drp):
        return drp.parent.name == '' and drp.name in valid_top_123

    # absolute_paths = [path / pblob['dataset_relative_path'] for pblob in dir_structure]
    dd = defaultdict(list)
    for pblob in dir_structure:
        drp = pblob['dataset_relative_path']
        p = drp.parts
        dd[p[-1]].append((len(p), drp, p[::-1]))

    dirs = {k: av for k, vs in dd.items()
            for av in ([v for v in vs if not top_level(v[1])],)
            # cull empty in a single step
            if av}

    # subject_id could be missing, but we filter failures on all of
    # those so in theory we shouldn't need to handle it at this stage
    subs = {s['subject_id']: s for s in subjects}

    dd = defaultdict(list)
    for s in samples:
        dd[s['sample_id']].append(s)

    samps = dict(dd)

    union_sub = set(dirs) | set(subs)
    inter_sub = set(dirs) & set(subs)

    records = []
    done_dirs = set()
    done_specs = set()

    if inter_sub == set(subs):
        for subject_id, blob in subs.items():
            done_dirs.add(subject_id)
            done_specs.add(subject_id)
            records.append(
                {'type': 'SubjectDirs',
                 # have to split the type because we can't recover
                 # the type using just the specimen id (sigh)
                 # and we need it to set the correct prefix (sigh)
                 'specimen_id': subject_id,
                 'dirs': [d[1] for d in dirs[subject_id]]})
    else:
        # FIXME not all subjects have folders; there may be samples
        # that have folders but not subjects ??? don't want to force
        # metadata structure onto folder structure but it complicates
        # the implementation again ... probably worth it in this case
        logd.warning('miscount subject dirs, TODO')

    union_sam = set(dirs) | set(samps)
    inter_sam = set(dirs) & set(samps)

    template_version_less_than_2 = True  # FIXME TODO

    # FIXME this is where non-uniqueness of sample ids becomes a giant pita
    if inter_sam == set(samps):
        for sample_id, blob in samps.items():
            if len(blob) > 1:
                # FIXME TODO this means that we need to fail over to the primary keys
                msg = f'sample_id is not unique! {sample_id}\n{blob}'
                if he.addError(msg, blame='submission', path=path):
                    logd.error(msg)

                continue

            if template_version_less_than_2:
                # FIXME this is sure to cause an error at some point
                done_dirs.add((blob[0]['subject_id'], sample_id))
                done_specs.add(blob[0]['primary_key'])
            else:
                done_dirs.add(sample_id)
                done_specs.add(sample_id)

            records.append(
                {'type': 'SampleDirs',
                 # have to split the type because we can't recover
                 # the type using just the specimen id (sigh)
                 # and we need it to set the correct prefix (sigh)
                 'specimen_id': sample_id,
                 'dirs': [d[1] for d in dirs[sample_id]]})
    else:
        logd.warning('miscount sample dirs, TODO')

    bad_dirs = []
    if template_version_less_than_2:
        # handle old awful nonsense
        # 1. construct subject sample lookups using tuple
        # 2. try to construct subject sample id pairs
        for sample_id, blobs in samps.items():
            for blob in blobs:
                if sample_id in dirs:
                    candidates = dirs[sample_id]
                    # TODO zero candidates error
                    actual = []
                    for level, drp, rparts in candidates:
                        if level < 2:
                            msg = f'Bad location for specimen folder!\n{drp}'
                            if he.addError(msg, blame='submission', path=path):
                                logd.error(msg)

                            bad_dirs.append(dirs.pop(sample_id))
                            continue

                        p_sample_id, p_subject_id, *p_rest = rparts
                        if level < 3:
                            # p_subject_id will be primary derivative or source
                            log.warning(f'TODO new structure {drp}')

                        assert sample_id == p_sample_id  # this should always be true
                        subject_id = blob['subject_id']
                        if subject_id == p_subject_id:
                            id = blob['primary_key']
                            done_dirs.add((subject_id, p_sample_id))
                            done_specs.add(id)
                            actual.append(drp)

                    if actual:
                        records.append(
                            {'type': 'SampleDirs',
                             # have to split the type because we can't recover
                             # the type using just the specimen id (sigh)
                             # and we need it to set the correct prefix (sigh)
                             'specimen_id': id,
                             'dirs': actual,
                             })
                else:
                    msg = f'No folder for sample {sample_id}'
                    if he.addError(msg, blame='submission', path=path):
                        logd.error(msg)
    else:
        pass  # TODO that's an error!

    usamps = set(v['primary_key'] for vs in samps.values() for v in vs)
    udirs = set(nv for path_name, subpaths in dirs.items()
                for nv in (((subpath[-1][1], path_name)  # -1 rpaths 1 parent  # XXX FIXME clearly wrong ???
                            for subpath in subpaths)
                           if path_name in samps else
                           (path_name,)))
    not_done_specs = (set(subs) | usamps) - set(done_specs)
    not_done_dirs = set(udirs) - set(done_dirs)

    obj = {}

    if records:
        obj['records'] = records
    else:
        pass  # TODO embed an error

    if not_done_specs:
        msg = ('There are specimens that have no corresponding '
               f'directory!\n{not_done_specs}')
        if he.addError(msg, blame='submission', path=path):
            logd.error(msg)

    if not_done_dirs:
        msg = ('There are directories that have no corresponding '
               f'specimen!\n{not_done_dirs}')
        if he.addError(msg, blame='submission', path=path):
            logd.error(msg)

    he.embedErrors(obj)
    return obj,
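# Illustrative sketch of the return shape (values are hypothetical); note the
# trailing comma: validate_structure returns a one-element tuple.
#   ({'records': [
#         {'type': 'SubjectDirs', 'specimen_id': 'sub-1',
#          'dirs': [<drp primary/sub-1>]},
#         {'type': 'SampleDirs', 'specimen_id': 'sub-1_sam-2',  # primary_key pre-2.0
#          'dirs': [<drp primary/sub-1/sam-2>]}],
#     # plus whatever he.embedErrors attaches when errors were recorded
#     },)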
def _process(self, contributor):
    # get member if we can find them
    he = dat.HasErrors(pipeline_stage=self.__class__.__name__ + '.data')
    if 'contributor_name' in contributor and 'first_name' in contributor:
        name = contributor['contributor_name']
        if ';' in name:
            msg = f'Bad symbol in name {name!r}'
            he.addError(msg)
            logd.error(msg)

        fn = contributor['first_name']
        ln = contributor['last_name']
        if ' ' in fn:
            fn, mn = fn.split(' ', 1)
            mn, _mn = mn.rstrip('.'), mn
            if mn != _mn:
                he.addError(f'Middle initials don\'t need periods :) {name!r}',
                            logfunc=logd.error)

            contributor['middle_name'] = mn
            contributor['first_name'] = fn

        if ' ' in ln:
            msg = f'Malformed last_name {ln!r}'
            he.addError(msg)
            logd.error(msg)
            ln = ln.replace(' ', '-')

        failover = f'{fn}-{ln}'
        member = self.member(fn, ln)

        if member is not None:
            userid = OntId('https://api.blackfynn.io/users/' + member.id)
            contributor['blackfynn_user_id'] = userid

    else:
        member = None
        failover = 'no-orcid-no-name'
        log.warning('No name! ' + lj(contributor))

    orcid = None
    if 'contributor_orcid_id' in contributor:
        orcid = contributor['contributor_orcid_id']
        if type(orcid) == str and 'orcid.org' in orcid:
            orcid = idlib.Orcid(orcid)  # FIXME reloading from json

        if isinstance(orcid, idlib.Orcid):
            s = orcid
        else:  # it's not an orcid or it's a bad orcid
            orcid = None

    if orcid is None:
        if member is not None:
            s = userid
        else:
            log.debug(lj(contributor))
            s = OntId(self.dsid + '/contributors/' + failover)

    contributor['id'] = s
    he.embedErrors(contributor)

    # lifting + adding
    if 'contributor_affiliation' in contributor:
        ca = contributor['contributor_affiliation']
        maybe_ror = self.lifters.affiliations(ca)
        if maybe_ror is not None:
            contributor['affiliation'] = maybe_ror