def setup(cls, blackfynn_local_instance):
    """ make sure we have all datasources
        calling this again will refresh helpers
    """
    # let every ancestor class run its own setup first
    for ancestor in cls.mro():
        if ancestor == cls:
            continue

        if hasattr(ancestor, 'setup'):
            ancestor.setup()

    dat.DatasetStructure.rate = cls.rate

    # unanchored helpers
    if cls.no_google:
        log.critical('no google no organ data')

        class asdf:
            # stand-in for the organs sheet when google sheets are unreachable;
            # every lookup answers None and byCol is an empty two-row table
            modality = lambda v: None
            organ_term = lambda v: None
            award_manual = lambda v: None
            byCol = _byCol([['award', 'award_manual', 'organ_term'], []])

        cls.organs_sheet = asdf
    else:
        cls.organs_sheet = sheets.Organs()  # ipv6 resolution issues :/

    cls.organ = OrganData()
    cls.member = State.member
def normalize(cls, value):
    """ Reduce an award number to its bare serial form.

        Strips prefixes, suffix segments, parens, and whitespace; logs when
        the normalized form differs from the input.
    """
    raw = value
    value = super().normalize(value, preserve_case=True)

    if 'OT2' in value and 'OD' not in value:
        # one is missing the OD >_<
        log.warning(value)
        value = value.replace('-', '-OD')  # hack

    n = (value
         .strip()
         .replace('-', '-')  # can you spot the difference?
         .replace('(', '')
         .replace(')', '')
         .replace('-01S1', '')
         .replace('-01', '')
         .replace('-02S2', '')
         .replace('-02', '')
         .replace('SPARC', '')
         .replace('NIH-1', '')
         .replace('NIH-', '')
         .replace('-', '')
         .replace('NIH ', '')
         .replace(' ', ''))

    # leading application-type digit (1 = new, 3 = supplement, 5 = renewal)
    # is dropped; NOTE(review): assumes n is non-empty here -- confirm
    if n[0] in ('1', '3', '5'):
        n = n[1:]

    if n.endswith('S2'):
        n = n[:-2]

    if n.endswith('D23864'):  # FIXME another trailing zero
        log.critical(raw)
        n = n.replace('D23864', 'D023864')

    if n != raw:
        log.debug(f'\n{raw}\n{n}')

    return n
def warn(triple):
    """ Log any element of *triple* that is not a plain rdflib node.

        An element is flagged when it is not a URIRef, BNode, or Literal,
        or when it carries a ``_value`` attribute that is a dict (which
        indicates a malformed Literal).

        Returns the triple unchanged so this can be used as a pass-through
        filter in a triple-processing pipeline.
    """
    # idiom fix: single isinstance with a tuple of types instead of a
    # chain of or-ed isinstance calls
    node_types = (rdflib.URIRef, rdflib.BNode, rdflib.Literal)
    for element in triple:
        not_a_node = not isinstance(element, node_types)
        bad_value = (hasattr(element, '_value')
                     and isinstance(element._value, dict))
        if not_a_node or bad_value:
            log.critical(element)

    return triple
def setup(cls, *, local_only=False):  # FIXME this is a mess
    """ make sure we have all datasources
        calling this again will refresh helpers
    """
    # NOTE(review): if this def sits inside class Integrator, '__setup'
    # name-mangles to '_Integrator__setup' on both sides, so the guard and
    # the assignment stay consistent -- confirm against the class definition
    if hasattr(Integrator, '__setup') and Integrator.__setup:
        return  # already setup

    Integrator.__setup = True

    # give every ancestor class a chance to run its own setup first
    for _cls in cls.mro():
        if _cls != cls:
            if hasattr(_cls, 'setup'):
                _cls.setup()

    dat.DatasetStructure.rate = cls.rate

    # unanchored helpers
    if cls.no_google or local_only:
        log.critical('no google no organ data')

        # stand-in for the organs sheet: every lookup answers None/empty
        class FakeOrganSheet:
            modality = lambda v: None
            organ_term = lambda v: None
            award_manual = lambda v: None
            byCol = _byCol([['award', 'award_manual', 'organ_term'], []])
            techniques = lambda v: []
            protocol_uris = lambda v: []

        # stand-in for the affiliations sheet: calling it yields None
        class FakeAffilSheet:
            def __call__(self, *args, **kwargs):
                return

        cls.organs_sheet = FakeOrganSheet
        cls.affiliations = FakeAffilSheet()
    else:
        cls.organs_sheet = sheets.Organs()  # ipv6 resolution issues :/
        cls.affiliations = sheets.Affiliations()

    if cls.no_google:
        cls.organ = lambda award: None

    # NOTE(review): when no_google is True but local_only is False, the
    # else branch below overwrites the stub just assigned to cls.organ
    # with a live OrganData() -- confirm that is intended
    if local_only:
        cls.organ = lambda award: None
        cls.member = lambda first, last: None
    else:
        cls.organ = OrganData()
        if hasattr(State, 'member'):
            cls.member = State.member
        else:
            log.error('State missing member, using State seems '
                      'like a good idea until you go to multiprocessing')
            cls.member = lambda first, last: None
def lookup(d, one, two):
    """ Return the first member filed under d[one][two], or None.

        When more than one member matches, warn loudly and still return
        the first entry.
    """
    # guard clauses instead of nested ifs; falling off the end returns None
    if one not in d:
        return

    inner = d[one]
    if two not in inner:
        return

    members = inner[two]
    if not members:
        return

    if len(members) > 1:
        log.critical(f'WE NEED ORCIDS! {one} {two} -> {members}')
        # organization maybe?
        # or better, check by dataset?

    return members[0]
def counts(self):
    """ Tally size, directory count, and file count for this path.

        Computed lazily on first access and memoized on ``self._counts``;
        call again to get the cached dict of
        ``{'size': FileSize, 'dirs': int, 'files': int}``.
    """
    if not hasattr(self, '_counts'):
        size = 0
        dirs = 0
        files = 0
        need_meta = []  # files whose cached meta has no size yet
        if not self.is_dir():
            # a single file: iterate over a one-element tuple
            gen = self,
        else:
            gen = self.rchildren

        for c in gen:
            if c.is_dir():
                dirs += 1
            else:
                files += 1
                # testing for broken symlinks is hard
                try:
                    maybe_size = c.cache.meta.size
                except AttributeError as e:
                    # no cache or cache has no meta: count the file but skip its size
                    log.error(f'no cache or no meta for {c}\n{e}')
                    continue

                if maybe_size is None:
                    need_meta.append(c)
                else:
                    size += maybe_size

        # optionally refresh caches for files missing a size, in parallel
        if need_meta and self._refresh_on_missing:
            nl = '\n'
            log.info(
                f'refreshing {len(need_meta)} files with missing metadata in {self}'
                f'\n{nl.join(_.as_posix() for _ in need_meta)}')
            new_caches = Async(rate=self.rate)(deferred(c.cache.refresh)()
                                               for c in need_meta)
            for c in new_caches:  # FIXME first time around meta doesn't get updated ??
                if c is None:
                    continue  # file was deleted (logged previously)

                if c.meta is None:
                    log.critical(f'missing metdata! {c}')
                    continue

                size += c.meta.size

        self._counts = dict(size=FileSize(size), dirs=dirs, files=files)

    return self._counts
def _protocol_uris_resolved(self):  # FIXME quite slow ...
    """ Yield the dereferenced (final) form of each protocol uri.

        Each entry of ``self.protocol_uris`` is wrapped in an
        ``idlib.StreamUri`` when needed, dereferenced, and yielded;
        resolution failures are recorded via ``self.addError`` rather
        than raised.
    """
    for start_uri in self.protocol_uris:
        log.debug(start_uri)
        try:
            if not hasattr(start_uri, 'dereference'):
                start_uri = idlib.StreamUri(start_uri)

            end_uri = start_uri.dereference()
            yield end_uri
            sc = end_uri.progenitor.status_code
            # NOTE(review): strictly greater-than skips status 400 itself;
            # confirm whether >= 400 was intended
            if sc > 400:
                msg = f'error accessing {end_uri} {sc}'
                self.addError(msg, blame='submission', logfunc=logd.error)

        except idlib.exceptions.ResolutionError as e:
            pass  # FIXME I think we already log this error?
        except requests.exceptions.MissingSchema as e:
            self.addError(e, blame='submission', logfunc=logd.error)
        except OntId.BadCurieError as e:
            self.addError(e, blame='submission', logfunc=logd.error)
        except BaseException as e:
            # NOTE(review): BaseException is very broad (catches
            # KeyboardInterrupt/SystemExit); logged and swallowed deliberately?
            #breakpoint()
            log.exception(e)
            log.critical('see exception above')
def setup(cls, *, local_only=False):  # FIXME this is a mess
    """ make sure we have all datasources
        calling this again will refresh helpers
    """
    # NOTE(review): if this def sits inside class Integrator, '__setup'
    # name-mangles consistently on both the guard and the assignment
    if hasattr(Integrator, '__setup') and Integrator.__setup:
        return  # already setup

    Integrator.__setup = True

    # give every ancestor class a chance to run its own setup first
    for _cls in cls.mro():
        if _cls != cls:
            if hasattr(_cls, 'setup'):
                _cls.setup()

    dat.DatasetStructure.rate = cls.rate

    # stand-in for the organs sheet: every lookup answers None/empty
    class FakeOrganSheet:
        modality = lambda v: None
        organ_term = lambda v: None
        award_manual = lambda v: None
        byCol = _byCol([['award', 'award_manual', 'organ_term'], []])
        techniques = lambda v: []
        protocol_uris = lambda v: []

    # stand-in for the affiliations sheet: calling it yields None
    class FakeAffilSheet:
        def __call__(self, *args, **kwargs):
            return

    # stand-in for the overview sheet: calling it yields None
    class FakeOverviewSheet:
        def __call__(self, *args, **kwargs):
            return

    # unanchored helpers
    if cls.no_google or local_only:
        log.critical('no google no organ data')
        cls.organs_sheet = FakeOrganSheet
        cls.affiliations = FakeAffilSheet()
        cls.overview_sheet = FakeOverviewSheet()
    else:
        # ipv6 resolution issues :/ also issues with pickling
        #cls.organs_sheet = sheets.Organs(fetch_grid=True)  # this kills parallelism
        cls.organs_sheet = sheets.Organs()  # if fetch_grid = False @ class level ok
        cls.affiliations = sheets.Affiliations()
        cls.overview_sheet = sheets.Overview()

        # zap all the services (apparently doesn't help)
        # yep, its just the organ sheet, these go in and out just fine
        #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service')
        #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro')
        #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
            #if hasattr(s, '_spreadsheet_service'):
                #delattr(s, '_spreadsheet_service')

        # YOU THOUGHT IT WAS GOOGLE IT WAS ME ORGANS ALL ALONG!
        #cls.organs_sheet = FakeOrganSheet  # organs is BAD
        #cls.affiliations = FakeAffilSheet()  # affiliations is OK
        #cls.overview_sheet = FakeOverviewSheet()  # overview is OK
        #breakpoint()

    # remove byCol which is unpickleable (super duper sigh)
    #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
        #if hasattr(s, 'byCol'):
            #delattr(s, 'byCol')

    if cls.no_google:
        cls.organ = lambda award: None

    # NOTE(review): when no_google is True but local_only is False, the
    # else branch below overwrites the stub just assigned to cls.organ
    # with a live OrganData() -- confirm that is intended
    if local_only:
        cls.organ = lambda award: None
        cls.member = lambda first, last: None
    else:
        cls.organ = OrganData()
        if hasattr(State, 'member'):
            cls.member = State.member
        else:
            log.error('State missing member, using State seems '
                      'like a good idea until you go to multiprocessing')
            cls.member = lambda first, last: None