def __init__(self, path):
    super().__init__()
    self.path = path
    tabular = Tabular(self.path)
    self.skip_rows = tuple(key for keys in self.verticals.values() for key in keys)
    self.t = tabular
    l = list(tabular)
    if not l:
        # FIXME bad design, this try block is a workaround for bad handling of empty lists
        raise exc.NoDataError(self.path)

    self.orig_header, *rest = l
    header = Header(self.orig_header).data

    self.fail = False
    if self.to_index:
        for head in self.to_index:
            if head not in header:
                log.error(f'\'{self.t.path}\' malformed header!')
                self.fail = True

    if self.fail:
        self.bc = byCol(rest, header)
    else:
        self.bc = byCol(rest, header, to_index=self.to_index)

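# Usage sketch (hedged): `Tabular`, `Header`, and `byCol` come from the
# surrounding codebase; the subclass and path below are hypothetical.
#
#   mf = SubjectsFile(Path('dataset/subjects.csv'))  # hypothetical subclass
#   mf.bc.header  # normalized column names
#   # rows are then addressable by column via the byCol accessor, indexed
#   # by `to_index` unless the header failed validation
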
def normalize(self, key, value):
    v = value.replace('\ufeff', '')  # FIXME utf-16 issue
    if v != value:  # TODO can we decouple encoding from value normalization?
        message = f"encoding feff error in '{self.path}'"
        log.error(message)
        self.addError(exc.EncodingError(message))

    if v.lower().strip() not in ('n/a', 'na', 'no'):  # FIXME explicit null vs remove from structure
        yield from getattr(self, key, self.default)(v)

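# Usage sketch (hedged, hypothetical values): per-key handlers such as
# `self.award` (falling back to `self.default`) are generators on the subclass.
#
#   list(mf.normalize('award', '\ufeffOT2OD025340'))
#   # logs an EncodingError for the stray BOM, then yields the handler's
#   # output for the cleaned value; 'N/A', 'na', and 'no' yield nothing
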
def setup(cls, *, local_only=False):  # FIXME this is a mess
    """ make sure we have all datasources
        calling this again will refresh helpers """
    if hasattr(Integrator, '__setup') and Integrator.__setup:
        return  # already setup

    Integrator.__setup = True

    for _cls in cls.mro():
        if _cls != cls:
            if hasattr(_cls, 'setup'):
                _cls.setup()

    dat.DatasetStructure.rate = cls.rate

    # unanchored helpers
    if cls.no_google or local_only:
        log.critical('no google no organ data')

        class FakeOrganSheet:
            modality = lambda v: None
            organ_term = lambda v: None
            award_manual = lambda v: None
            byCol = _byCol([['award', 'award_manual', 'organ_term'], []])
            techniques = lambda v: []
            protocol_uris = lambda v: []

        class FakeAffilSheet:
            def __call__(self, *args, **kwargs):
                return

        cls.organs_sheet = FakeOrganSheet
        cls.affiliations = FakeAffilSheet()
    else:
        cls.organs_sheet = sheets.Organs()  # ipv6 resolution issues :/
        cls.affiliations = sheets.Affiliations()

    if cls.no_google:
        cls.organ = lambda award: None

    if local_only:
        cls.organ = lambda award: None
        cls.member = lambda first, last: None
    else:
        cls.organ = OrganData()
        if hasattr(State, 'member'):
            cls.member = State.member
        else:
            log.error('State missing member, using State seems '
                      'like a good idea until you go to multiprocessing')
            cls.member = lambda first, last: None

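# Usage sketch (hedged, assuming this is bound as a classmethod on Integrator):
# run once per process before building pipelines; `local_only=True` swaps in
# the Fake* helpers so no network (google sheets, organ data) is touched.
#
#   Integrator.setup(local_only=True)
#   Integrator.organs_sheet.modality('anything')  # -> None via FakeOrganSheet
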
def __init__(self, previous_pipeline, lifters, runtime_context):
    if hasattr(State, 'member'):
        self.member = State.member
    else:
        log.error('State missing member, using State seems '
                  'like a good idea until you go to multiprocessing')
        self.member = lambda first, last: None

    self.contributors = previous_pipeline.data
    self.runtime_context = runtime_context
    self.dataset_id = runtime_context.id
    self.dsid = runtime_context.uri_api  # FIXME need a BlackfynnId class
    self.lifters = lifters

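# Construction sketch (hedged, all names illustrative): the framework passes a
# finished upstream pipeline plus a runtime context carrying dataset ids.
#
#   stage = ContributorsPipeline(prev_pipeline, lifters, runtime_context)  # hypothetical class
#   stage.dsid  # dataset uri_api, later the object of contributorTo triples
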
def counts(self):
    if not hasattr(self, '_counts'):
        size = 0
        dirs = 0
        files = 0
        need_meta = []
        if not self.is_dir():
            gen = self,  # single-element tuple so the loop below also works for a bare file
        else:
            gen = self.rchildren

        for c in gen:
            if c.is_dir():
                dirs += 1
            else:
                files += 1  # testing for broken symlinks is hard
                try:
                    maybe_size = c.cache.meta.size
                except AttributeError as e:
                    log.error(f'no cache or no meta for {c}\n{e}')
                    continue

                if maybe_size is None:
                    need_meta.append(c)
                else:
                    size += maybe_size

        if need_meta and self._refresh_on_missing:
            nl = '\n'
            log.info(f'refreshing {len(need_meta)} files with missing metadata in {self}'
                     f'\n{nl.join(_.as_posix() for _ in need_meta)}')
            new_caches = Async(rate=self.rate)(deferred(c.cache.refresh)()
                                               for c in need_meta)
            for c in new_caches:  # FIXME first time around meta doesn't get updated ??
                if c is None:
                    continue  # file was deleted (logged previously)

                if c.meta is None:
                    log.critical(f'missing metadata! {c}')
                    continue

                size += c.meta.size

        self._counts = dict(size=FileSize(size), dirs=dirs, files=files)

    return self._counts

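# Usage sketch (hedged): on a directory this walks `rchildren`; on a plain
# file it counts only itself. The result is memoized in `_counts`.
#
#   path.counts()  # -> {'size': FileSize(...), 'dirs': 3, 'files': 12}  (illustrative)
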
def _param(self, value):
    try:
        pv = pyru.UnitsParser(value).asPython()
    except pyru.UnitsParser.ParseFailure as e:
        caller_name = e.__traceback__.tb_frame.f_back.f_code.co_name
        msg = f'Unexpected and unhandled value {value} for {caller_name}'
        log.error(msg)
        self.addError(msg, pipeline_stage=self.__class__.__name__ + '.curation-error')
        # NB: this is a generator, so `return value` ends iteration without
        # yielding; the value is only recoverable from StopIteration.value
        return value

    #if not pv[0] == 'param:parse-failure':
    if pv is not None:  # parser failure  # FIXME check on this ...
        yield pv  # this one needs to be a string since it is combined below
    else:  # TODO warn
        yield value

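# Usage sketch (hedged): `pyru.UnitsParser` is the pysercomb units parser; a
# parseable quantity yields its python representation, anything else falls
# back to the raw string (or to nothing at all on the ParseFailure path).
#
#   list(self._param('4 weeks'))   # -> [<parsed quantity>]
#   list(self._param('unknown'))   # -> ['unknown'] when asPython() returns None
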
def _protcur(self, protocol_uri, filter=lambda p: True):
    self.lazy_setup()
    protocol_uri = get_right_id(protocol_uri)
    gen = (p for p in protc if p.uri.startswith(protocol_uri) and filter(p))

    try:
        p = next(gen)
        yield p
        yield from gen
    except StopIteration:
        log.error(f'could not find annotations for {protocol_uri}')
        return

    if p.document.otherVersionUri:  # FIXME also maybe check /abstract?
        other_uri = p.document.otherVersionUri
        yield from (p for p in protc if p.uri.startswith(other_uri) and filter(p))

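# Usage sketch (hedged): `protc` is the global pool of protc annotations;
# the uri and `filter` predicate below are illustrative.
#
#   anns = list(self._protcur('https://www.protocols.io/view/example',
#                             filter=lambda p: 'protc:parameter*' in p.tags))
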
def __init__(self, path):
    super().__init__()
    self.path = path
    if self._is_json:
        with open(self.path, 'rt') as f:
            try:
                self._data_raw = json.load(f)
            except json.decoder.JSONDecodeError as e:
                if not f.buffer.tell():
                    raise exc.NoDataError(self.path)
                else:
                    raise exc.BadDataError(self.path) from e

        if isinstance(self._data_raw, dict):
            # FIXME this breaks downstream assumptions
            self._data_cache = {self.rename_key(k): tos(self.normalize(k, v))  # FIXME FIXME
                                for k, v in self._data_raw.items()}

        return

    tabular = Tabular(self.path)
    self.skip_rows = tuple(key for keys in self.verticals.values() for key in keys)
    self.t = tabular
    l = list(tabular)
    if not l:
        # FIXME bad design, this try block is a workaround for bad handling of empty lists
        raise exc.NoDataError(self.path)

    self.orig_header, *rest = l
    header = Header(self.orig_header).data

    self.fail = False
    if self.to_index:
        for head in self.to_index:
            if head not in header:
                log.error(f'\'{self.t.path}\' malformed header!')
                self.fail = True

    if self.fail:
        try:
            self.bc = byCol(rest, header)
        except ValueError as e:
            raise exc.BadDataError(self.path) from e
    else:
        self.bc = byCol(rest, header, to_index=self.to_index)

def triples_contributors(self, contributor, creator=False):
    try:
        dsid = self.dsid  # FIXME json reload needs to deal with this
    except BaseException as e:  # FIXME ...
        log.error(e)
        return

    s = rdflib.URIRef(contributor['id'])  # FIXME json reload needs to deal with this

    if 'blackfynn_user_id' in contributor:
        userid = rdflib.URIRef(contributor['blackfynn_user_id'])
        yield s, TEMP.hasBlackfynnUserId, userid

    yield s, a, owl.NamedIndividual
    yield s, a, sparc.Researcher
    yield s, TEMP.contributorTo, dsid
    converter = conv.ContributorConverter(contributor)
    yield from converter.triples_gen(s)
    if creator:
        yield s, TEMP.creatorOf, dsid

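# Usage sketch (hedged, minimal contributor record; ids are examples):
#
#   contributor = {'id': 'https://orcid.org/0000-0002-1825-0097',
#                  'blackfynn_user_id': 'https://api.blackfynn.io/users/N:user:example'}
#   for t in self.triples_contributors(contributor, creator=True):
#       graph.add(t)
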
def subpipeline_errors(self, errors):
    """ override this for pipeline specific error handling rules """
    for path, error, subpipeline_class in errors:
        log.error(f'{path}\n{error}\n{subpipeline_class}\n{self!r}')

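# Override sketch (hedged, hypothetical subclass): downgrade or reroute
# specific subpipeline failures instead of logging everything as an error.
#
#   class MyPipeline(Pipeline):
#       def subpipeline_errors(self, errors):
#           for path, error, subpipeline_class in errors:
#               if isinstance(error, exc.NoDataError):
#                   log.warning(f'no data at {path}')
#               else:
#                   super().subpipeline_errors([(path, error, subpipeline_class)])
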
def setup(cls, *, local_only=False):  # FIXME this is a mess
    """ make sure we have all datasources
        calling this again will refresh helpers """
    if hasattr(Integrator, '__setup') and Integrator.__setup:
        return  # already setup

    Integrator.__setup = True

    for _cls in cls.mro():
        if _cls != cls:
            if hasattr(_cls, 'setup'):
                _cls.setup()

    dat.DatasetStructure.rate = cls.rate

    class FakeOrganSheet:
        modality = lambda v: None
        organ_term = lambda v: None
        award_manual = lambda v: None
        byCol = _byCol([['award', 'award_manual', 'organ_term'], []])
        techniques = lambda v: []
        protocol_uris = lambda v: []

    class FakeAffilSheet:
        def __call__(self, *args, **kwargs):
            return

    class FakeOverviewSheet:
        def __call__(self, *args, **kwargs):
            return

    # unanchored helpers
    if cls.no_google or local_only:
        log.critical('no google no organ data')
        cls.organs_sheet = FakeOrganSheet
        cls.affiliations = FakeAffilSheet()
        cls.overview_sheet = FakeOverviewSheet()
    else:
        # ipv6 resolution issues :/ also issues with pickling
        #cls.organs_sheet = sheets.Organs(fetch_grid=True)  # this kills parallelism
        cls.organs_sheet = sheets.Organs()  # if fetch_grid = False @ class level ok
        cls.affiliations = sheets.Affiliations()
        cls.overview_sheet = sheets.Overview()

        # zap all the services (apparently doesn't help)
        # yep, its just the organ sheet, these go in and out just fine
        #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service')
        #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro')
        #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
            #if hasattr(s, '_spreadsheet_service'):
                #delattr(s, '_spreadsheet_service')

        # YOU THOUGHT IT WAS GOOGLE IT WAS ME ORGANS ALL ALONG!
        #cls.organs_sheet = FakeOrganSheet  # organs is BAD
        #cls.affiliations = FakeAffilSheet()  # affiliations is OK
        #cls.overview_sheet = FakeOverviewSheet()  # overview is OK
        #breakpoint()

        # remove byCol which is unpickleable (super duper sigh)
        #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
            #if hasattr(s, 'byCol'):
                #delattr(s, 'byCol')

    if cls.no_google:
        cls.organ = lambda award: None

    if local_only:
        cls.organ = lambda award: None
        cls.member = lambda first, last: None
    else:
        cls.organ = OrganData()
        if hasattr(State, 'member'):
            cls.member = State.member
        else:
            log.error('State missing member, using State seems '
                      'like a good idea until you go to multiprocessing')
            cls.member = lambda first, last: None

def __iter__(self):
    try:
        yield from self.normalize(getattr(self, self.file_extension)())
    except UnicodeDecodeError as e:
        log.error(f'{self.path.as_posix()!r} {e}')
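
# Usage sketch (hedged): `self.file_extension` names a method (e.g. `csv`)
# returning raw rows that `normalize` cleans; a UnicodeDecodeError is logged
# and the iterator simply ends early.
#
#   rows = list(Tabular(Path('samples.csv')))  # hypothetical file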