def normalize(cls, value):
    """Normalize an NIH/SPARC award number down to its bare serial form.

    Strips support-year/supplement suffixes (-01, -01S1, -02, -02S2),
    agency prefixes (NIH-, SPARC), punctuation, and whitespace, then
    drops a leading application-type digit.  Returns the cleaned string.
    """
    _ovalue = value  # keep the pre-normalization value for logging
    value = super().normalize(value, preserve_case=True)
    if 'OT2' in value and 'OD' not in value:
        # one is missing the OD >_<
        log.warning(value)
        value = value.replace('-', '-OD')  # hack

    # NOTE(review): the first replace below looks like a no-op; the original
    # comment implies the two dashes are different characters (e.g. a unicode
    # hyphen vs ascii '-') — confirm against the canonical source before
    # "simplifying" this chain.
    n = (value
         .strip()
         .replace('-', '-')  # can you spot the difference?
         .replace('(', '')
         .replace(')', '')
         .replace('-01S1', '')
         .replace('-01', '')
         .replace('-02S2', '')
         .replace('-02', '')
         .replace('SPARC', '')
         .replace('NIH-1', '')
         .replace('NIH-', '')
         .replace('-', '')
         .replace('NIH ', '')
         .replace(' ', ''))
    # leading 1/3/5 is the NIH application type digit, not part of the serial
    # NOTE(review): n[0] raises IndexError if the replaces empty the string
    if n[0] in ('1', '3', '5'):
        n = n[1:]

    if n.endswith('S2'):
        n = n[:-2]

    if n.endswith('D23864'):  # FIXME another trailing zero
        log.critical(_ovalue)
        n = n.replace('D23864', 'D023864')

    if n != _ovalue:
        log.debug(f'\n{_ovalue}\n{n}')

    return n
def get(self, uri):
    """Fetch ``uri`` from the protocols.io api and return the parsed json.

    Returns the decoded json dict on success.  On an http error response
    the api's own error payload is logged and recorded via
    ``self.addError`` and ``None`` is returned (the negative result is
    deliberately not short-circuited so it can still be cached).
    """
    #juri = uri + '.json'
    logd.info(uri)
    log.debug('going to network for protocols')
    resp = requests.get(uri, headers=self._pio_header)
    #log.info(str(resp.request.headers))
    if resp.ok:
        try:
            j = resp.json()  # the api is reasonably consistent
        except BaseException as e:
            # BUG FIX: a leftover breakpoint() call was here — a debugger
            # hook must not ship in library code; log and re-raise instead
            log.exception(e)
            raise

        return j
    else:
        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except BaseException as e:
            # error body was not json / missing keys — record the denial
            log.exception(e)
            logd.error(f'protocol no access {uri} {self.id!r}')
def pipeline_end(self):
    """ this part adds the meta bits we need after _with_errors
        and rearranges anything that needs to be moved about """
    try:
        data = super().pipeline_end
    except self.SkipPipelineError as e:
        # a skipped pipeline must still yield a well-formed record,
        # so stuff in sentinel indexes and empty error lists
        data = e.data
        data.setdefault('meta', {})
        data.setdefault('status', {})
        submission_index = 5555
        curation_index = 4444
        status = data['status']
        status['submission_index'] = submission_index
        status['curation_index'] = curation_index
        status['error_index'] = submission_index + curation_index
        status['submission_errors'] = []
        status['curation_errors'] = []
        #data['protocol_index'] = 9999  # I think this one has to go in reverse?
        log.debug('pipeline skipped to end due to errors')

    return data
def cleaned(self):
    """ the moved data with every key listed in self.cleans popped out """
    data = self.moved
    removed = [*DictTransformer.pop(data, self.cleans, source_key_optional=True)]
    #log.debug(f'cleaned the following values from {self}' + lj(removed))
    log.debug(f'cleaned {len(removed)} values from {self}')
    return data
def __init__(self, blackfynn_local_instance):
    """ stash the local instance on the class so later lookups are cached """
    log.debug('going to network for members')
    # there are other ways to get here but this one caches
    # e.g. self.organization.path.remote.bfobject
    # self.path.remote.oranization.bfobject
    # self.path.remote.bfl.organization.members
    type(self)._bfli = blackfynn_local_instance
def _indexes(cls, data):
    """ compute submission and curation error indexes """
    submission_errors = []
    curation_errors = []
    # walk newest-first and drop duplicate error records
    for error in reversed(get_all_errors(data)):
        if error in submission_errors or error in curation_errors:
            log.debug('error detected multiple times not counting '
                      'subsequent occurances' + lj(error))
            continue

        stage = error['pipeline_stage']
        message = error['message']
        if stage in cls._submission:
            submission_errors.append(error)
        elif stage in cls._curation:
            curation_errors.append(error)
        else:
            raise ValueError(f'Unhandled stage {stage} {message}')

    si = len(submission_errors)
    ci = len(curation_errors)
    status = data['status'] = {}
    status['submission_index'] = si
    status['curation_index'] = ci
    status['error_index'] = si + ci
    status['submission_errors'] = submission_errors
    status['curation_errors'] = curation_errors
    return si + ci
def _protocol_uris_resolved(self):
    """Yield the final uri of the resolution chain for each protocol uri.

    Walks ``resolution_chain`` to exhaustion and yields only the last
    element, i.e. the fully dereferenced endpoint.
    """
    # FIXME quite slow ...
    _unresolved = object()  # sentinel: distinguishes "chain yielded nothing"
    for start_uri in self.protocol_uris:
        log.debug(start_uri)
        # BUG FIX: end_uri previously leaked across iterations of the outer
        # loop (the for-else always ran since there is no break), so an
        # empty resolution chain re-yielded the previous uri's endpoint —
        # or raised UnboundLocalError on the first iteration.
        end_uri = _unresolved
        for end_uri in resolution_chain(start_uri):
            pass  # only the last element matters

        if end_uri is not _unresolved:
            yield end_uri
def __members(self):
    """ organization members, fetched from the network once per class """
    cls = self.__class__
    if not hasattr(cls, '_members'):
        log.debug('going to network for members')
        # there are other ways to get here but this one caches
        # e.g. self.organization.path.remote.bfobject
        # self.path.remote.oranization.bfobject
        # self.path.remote.bfl.organization.members
        cls._members = self.path.remote.bfl.organization.members

    return self._members
def _process(self, contributor):
    """Resolve a contributor record to a stable identifier in place.

    Splits first/middle names, repairs malformed last names, tries to
    match a platform member, prefers an orcid id when present, and
    finally falls back to a name-derived dataset-local id.  Mutates
    ``contributor`` (adds 'id', possibly 'middle_name',
    'blackfynn_user_id') and embeds any accumulated errors.
    """
    # get member if we can find them
    he = dat.HasErrors(pipeline_stage=self.__class__.__name__ + '.data')
    if 'name' in contributor and 'first_name' in contributor:
        name = contributor['name']
        if ';' in name:
            msg = f'Bad symbol in name {name!r}'
            he.addError(msg)
            logd.error(msg)

        fn = contributor['first_name']
        ln = contributor['last_name']
        # a space in first_name is treated as "first middle"
        if ' ' in fn:
            fn, mn = fn.split(' ', 1)
            contributor['middle_name'] = mn
            contributor['first_name'] = fn

        # spaces in last_name are flagged then hyphenated
        if ' ' in ln:
            msg = f'Malformed last_name {ln!r}'
            he.addError(msg)
            logd.error(msg)
            ln = ln.replace(' ', '-')

        failover = f'{fn}-{ln}'  # used only if no member/orcid id is found
        member = self.member(fn, ln)

        if member is not None:
            userid = OntId('https://api.blackfynn.io/users/' + member.id)
            contributor['blackfynn_user_id'] = userid

    else:
        member = None
        failover = 'no-orcid-no-name'
        log.warning(f'No name!' + lj(contributor))

    orcid = None
    if 'contributor_orcid_id' in contributor:
        orcid = contributor['contributor_orcid_id']
        if type(orcid) == str and 'orcid.org' in orcid:
            orcid = OrcidId(orcid)  # FIXME reloading from json

        if isinstance(orcid, OrcidId):
            s = orcid
        else:  # it's not an orcid or its a bad orcid
            orcid = None

    # identifier preference: orcid > platform member id > failover name id
    if orcid is None:
        if member is not None:
            s = userid
        else:
            log.debug(lj(contributor))
            s = OntId(self.dsid + '/contributors/' + failover)

    contributor['id'] = s
    he.embedErrors(contributor)
def pipeline_start(self):
    """ skip the whole pipeline when errors left us nothing to extract """
    data = super().pipeline_start
    if 'errors' in data:
        get_paths = {tuple(gp)
                     for gas, _, _ in self.subpipelines
                     for gp, _ in gas}
        sections = {(s,) for s in data}
        # FIXME probably should just use adops.get to check for the path
        both = get_paths & sections
        if not both:
            log.debug(f'{get_paths}\n{sections}\n{both}')
            raise self.SkipPipelineError(data)

    return data
def _errors(self):
    """ render the raw error set into plain dict records """
    for err, stage in self._errors_set:
        record = {'pipeline_stage': stage}  # FIXME
        if isinstance(err, str):
            record['message'] = err
        elif isinstance(err, BaseException):
            record.update(message=str(err), type=str(type(err)))
        else:
            raise TypeError(repr(err))

        log.debug(record)
        yield record
def overview(self):
    """Scrape the awards-by-organ overview page into lookup tables.

    Populates ``self.raw`` (organ -> award number list),
    ``self.former_to_current`` (superseded award -> current award), and
    the normalized views ``self._normalized`` / ``self.normalized``
    (organ iri -> normalized award list where a lookup exists).
    """
    # prefer the cached copy on disk, otherwise hit the network
    if self.path.exists():
        with open(self.path, 'rb') as f:
            soup = self._BeautifulSoup(f.read(), 'lxml')
    else:
        resp = requests.get(self.url)
        soup = self._BeautifulSoup(resp.content, 'lxml')

    self.raw = {}
    self.former_to_current = {}
    # each organ gets a div with an id like '<organ>-bubble'
    for bsoup in soup.find_all(
            'div', {'id': lambda v: v and v.endswith('-bubble')}):
        organ, *_rest = bsoup['id'].split('-')
        logd.debug(_rest)
        award_list = self.raw[organ] = []
        for asoup in bsoup.find_all('a'):
            href = asoup['href']
            log.debug(href)
            parts = urlparse(href)
            query = parse_qs(parts.query)
            if 'projectnumber' in query:
                award_list.extend(query['projectnumber'])
            elif 'aid' in query:
                # reporter links carry an aid instead of a project number
                #aid = [int(a) for a in query['aid']]
                #json = self.reporter(aid)
                award, former = self.reporter(href)
                award_list.append(award)
                if former is not None:
                    award_list.append(
                        former)  # for this usecase this is ok
                    self.former_to_current[former] = award
            elif query:
                log.debug(lj(query))

    # NOTE(review): NormAward is applied twice in both key and value —
    # presumably normalization is not idempotent in one pass; confirm
    self.former_to_current = {
        nml.NormAward(nml.NormAward(k)): nml.NormAward(nml.NormAward(v))
        for k, v in self.former_to_current.items()
    }
    self._normalized = {}
    self.normalized = {}
    # two passes: raw -> _normalized, then _normalized -> normalized
    for frm, to in ((self.raw, self._normalized),
                    (self._normalized, self.normalized)):
        for organ, awards in frm.items():
            if organ in self.organ_lookup:
                organ = self.organ_lookup[organ].iri

            to[organ] = [nml.NormAward(a) for a in awards]
def _get_protocol_json(self, uri):
    """Fetch the protocols.io v3 api record for ``uri`` and return the json.

    Returns ``None`` (after recording an error via ``self.addError``)
    when the uri is not a protocols.io identifier or the api denies
    access; the negative result is not short-circuited in the error
    branch so it can still be cached.
    """
    #juri = uri + '.json'
    logd.info(uri.identifier if isinstance(uri, idlib.Stream) else uri)  # FIXME
    pi = idlib.get_right_id(uri)
    if 'protocols.io' in pi:
        pioid = pi.slug  # FIXME normalize before we ever get here ...
        log.info(pioid)
    else:
        msg = f'protocol uri is not from protocols.io {pi} {self.id}'
        logd.error(msg)
        self.addError(msg)
        return

    #uri_path = uri.rsplit('/', 1)[-1]
    apiuri = 'https://www.protocols.io/api/v3/protocols/' + pioid
    #'https://www.protocols.io/api/v3/groups/sparc/protocols'
    #apiuri = 'https://www.protocols.io/api/v3/filemanager/folders?top'
    #print(apiuri, header)
    log.debug('going to network for protocols')
    resp = requests.get(apiuri, headers=self._pio_header)
    #log.info(str(resp.request.headers))
    if resp.ok:
        try:
            j = resp.json()  # the api is reasonably consistent
        except BaseException as e:
            # BUG FIX: a leftover breakpoint() call was here — a debugger
            # hook must not ship in library code; log and re-raise instead
            log.exception(e)
            raise

        return j
    else:
        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except BaseException as e:
            # error body was not json / missing keys — record the denial
            log.exception(e)
            logd.error(f'protocol no access {uri} {self.id!r}')
def _protocol_uris_resolved(self):
    """Yield dereferenced protocol uris, recording access errors.

    Each uri is wrapped in an idlib stream if needed, dereferenced, and
    yielded; resolution/schema/curie failures are recorded as
    submission-blamed errors rather than raised.
    """
    # FIXME quite slow ...
    for start_uri in self.protocol_uris:
        log.debug(start_uri)
        try:
            # plain strings get wrapped so everything has .dereference
            if not hasattr(start_uri, 'dereference'):
                start_uri = idlib.StreamUri(start_uri)

            end_uri = start_uri.dereference()
            yield end_uri
            sc = end_uri.progenitor.status_code
            # NOTE(review): `> 400` skips status 400 itself — confirm
            # whether `>= 400` was intended
            if sc > 400:
                msg = f'error accessing {end_uri} {sc}'
                self.addError(msg, blame='submission', logfunc=logd.error)

        except idlib.exceptions.ResolutionError as e:
            pass  # FIXME I think we already log this error?
        except requests.exceptions.MissingSchema as e:
            self.addError(e, blame='submission', logfunc=logd.error)
        except OntId.BadCurieError as e:
            self.addError(e, blame='submission', logfunc=logd.error)
        except BaseException as e:
            #breakpoint()
            log.exception(e)
            log.critical('see exception above')
def _indexes(cls, data):
    """ compute submission and curation error indexes

    Deduplicates errors, routes each one to the submission and/or
    curation buckets first by its 'blame' then by its pipeline stage,
    writes the counts and lists into data['status'], and returns the
    total error count.
    """
    errors = get_all_errors(data)
    submission_errors = []
    curation_errors = []
    # walk newest-first and drop duplicate error records
    for error in reversed(errors):
        if error in submission_errors or error in curation_errors:
            log.debug('error detected multiple times not counting '
                      'subsequent occurances' + lj(error))
            continue

        # BUG FIX: a leftover breakpoint() guarded a missing 'blame' key
        # here — a debugger hook must not ship; a malformed record now
        # simply raises KeyError at this lookup
        blame = error['blame']
        stage = error['pipeline_stage']
        message = error['message']

        blamed = False
        if blame is not None:
            if blame in cls._blame:
                blame_target = cls._blame[blame]
                if blame_target == cls._blame_stage:
                    pass  # fall through to the stage-based routing below
                elif blame_target == cls._blame_everyone:
                    submission_errors.append(error)
                    curation_errors.append(error)
                    blamed = True
                elif blame_target == cls._blame_submission:
                    submission_errors.append(error)
                    blamed = True
                elif blame_target == cls._blame_curation:
                    curation_errors.append(error)
                    blamed = True
                else:
                    raise ValueError(f'Unhandled blame target {blame_target}\n{message}')

            else:
                raise ValueError(f'Unhandled blame type {blame}\n{message}')

        # stage-based routing for anything blame did not already place
        if stage in cls._submission:
            if not blamed:
                submission_errors.append(error)
        elif stage in cls._curation:
            if not blamed:
                curation_errors.append(error)
        else:
            if blame not in ('pipeline', 'submission', 'debug'):
                raise ValueError(f'Unhandled stage {stage}\n{message}')

    si = len(submission_errors)
    ci = len(curation_errors)
    if 'status' not in data:
        data['status'] = {}

    data['status']['submission_index'] = si
    data['status']['curation_index'] = ci
    data['status']['error_index'] = si + ci
    data['status']['submission_errors'] = submission_errors
    data['status']['curation_errors'] = curation_errors
    return si + ci
def _process(self, contributor):
    """Resolve a contributor record to a stable identifier in place.

    Splits first/middle names, repairs malformed last names, tries to
    match a platform member, prefers an orcid id when present, falls
    back to a name-derived dataset-local id, and lifts a ror
    affiliation if the lifter knows one.  Mutates ``contributor``.
    """
    # get member if we can find them
    he = dat.HasErrors(pipeline_stage=self.__class__.__name__ + '.data')
    if 'contributor_name' in contributor and 'first_name' in contributor:
        name = contributor['contributor_name']
        if ';' in name:
            msg = f'Bad symbol in name {name!r}'
            he.addError(msg)
            logd.error(msg)

        fn = contributor['first_name']
        ln = contributor['last_name']
        # a space in first_name is treated as "first middle"
        if ' ' in fn:
            fn, mn = fn.split(' ', 1)
            # strip a trailing period from the middle initial, flagging it
            mn, _mn = mn.rstrip('.'), mn
            if mn != _mn:
                he.addError(f'Middle initials don\'t need periods :) {name!r}',
                            logfunc=logd.error)

            contributor['middle_name'] = mn
            contributor['first_name'] = fn

        # spaces in last_name are flagged then hyphenated
        if ' ' in ln:
            msg = f'Malformed last_name {ln!r}'
            he.addError(msg)
            logd.error(msg)
            ln = ln.replace(' ', '-')

        failover = f'{fn}-{ln}'  # used only if no member/orcid id is found
        member = self.member(fn, ln)

        if member is not None:
            userid = OntId('https://api.blackfynn.io/users/' + member.id)
            contributor['blackfynn_user_id'] = userid

    else:
        member = None
        failover = 'no-orcid-no-name'
        log.warning(f'No name!' + lj(contributor))

    orcid = None
    if 'contributor_orcid_id' in contributor:
        orcid = contributor['contributor_orcid_id']
        if type(orcid) == str and 'orcid.org' in orcid:
            orcid = idlib.Orcid(orcid)  # FIXME reloading from json

        if isinstance(orcid, idlib.Orcid):
            s = orcid
        else:  # it's not an orcid or its a bad orcid
            orcid = None

    # identifier preference: orcid > platform member id > failover name id
    if orcid is None:
        if member is not None:
            s = userid
        else:
            log.debug(lj(contributor))
            s = OntId(self.dsid + '/contributors/' + failover)

    contributor['id'] = s
    he.embedErrors(contributor)

    # lifting + adding
    if 'contributor_affiliation' in contributor:
        ca = contributor['contributor_affiliation']
        maybe_ror = self.lifters.affiliations(ca)
        if maybe_ror is not None:
            contributor['affiliation'] = maybe_ror
def fix(mcd):
    """ debug passthrough: log the value, return it unchanged """
    log.debug(mcd)
    return mcd