Example #1
0
    def normalize(cls, value):
        _ovalue = value
        value = super().normalize(value, preserve_case=True)
        if 'OT2' in value and 'OD' not in value:
            # one is missing the OD >_<
            log.warning(value)
            value = value.replace('-', '-OD')  # hack

        n = (
            value.strip().replace('-', '-')  # can you spot the difference?
            .replace('(', '').replace(')', '').replace('-01S1', '').replace(
                '-01', '').replace('-02S2', '').replace('-02', '').replace(
                    'SPARC',
                    '').replace('NIH-1', '').replace('NIH-', '').replace(
                        '-', '').replace('NIH ', '').replace(' ', ''))
        if n[0] in ('1', '3', '5'):
            n = n[1:]

        if n.endswith('S2'):
            n = n[:-2]

        if n.endswith('D23864'):  # FIXME another trailing zero
            log.critical(_ovalue)
            n = n.replace('D23864', 'D023864')

        if n != _ovalue:
            log.debug(f'\n{_ovalue}\n{n}')
        return n
Example #2
0
    def get(self, uri):
        #juri = uri + '.json'
        logd.info(uri)
        log.debug('going to network for protocols')
        resp = requests.get(uri, headers=self._pio_header)
        #log.info(str(resp.request.headers))
        if resp.ok:
            try:
                j = resp.json()  # the api is reasonably consistent
            except BaseException as e:
                log.exception(e)
                breakpoint()
                raise e
            return j
        else:
            try:
                j = resp.json()
                sc = j['status_code']
                em = j['error_message']
                msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
                logd.error(msg)
                self.addError(msg)
                # can't return here because of the cache
            except BaseException as e:
                log.exception(e)

            logd.error(f'protocol no access {uri} {self.id!r}')
Example #3
0
    def pipeline_end(self):
        """ this part adds the meta bits we need after _with_errors
            and rearranges anything that needs to be moved about """

        try:
            data = super().pipeline_end
        except self.SkipPipelineError as e:
            data = e.data
            if 'meta' not in data:
                data['meta'] = {}

            if 'status' not in data:
                data['status'] = {}

            si = 5555
            ci = 4444
            data['status']['submission_index'] = si
            data['status']['curation_index'] = ci
            data['status']['error_index'] = si + ci
            data['status']['submission_errors'] = []
            data['status']['curation_errors'] = []
            #data['protocol_index'] = 9999  # I think this one has to go in reverse?
            log.debug('pipeline skipped to end due to errors')

        return data
Example #4
0
 def cleaned(self):
     data = self.moved
     removed = list(
         DictTransformer.pop(data, self.cleans, source_key_optional=True))
     #log.debug(f'cleaned the following values from {self}' + lj(removed))
     log.debug(f'cleaned {len(removed)} values from {self}')
     return data
Example #5
0
 def __init__(self, blackfynn_local_instance):
     log.debug('going to network for members')
     # there are other ways to get here but this one caches
     # e.g. self.organization.path.remote.bfobject
     # self.path.remote.oranization.bfobject
     # self.path.remote.bfl.organization.members
     self.__class__._bfli = blackfynn_local_instance
Example #6
0
    def _indexes(cls, data):
        """ compute submission and curation error indexes """
        errors = get_all_errors(data)
        submission_errors = []
        curation_errors = []
        for error in reversed(errors):
            if error in submission_errors or error in curation_errors:
                log.debug('error detected multiple times not counting '
                          'subsequent occurances' + lj(error))
                continue

            stage = error['pipeline_stage']
            message = error['message']
            if stage in cls._submission:
                submission_errors.append(error)
            elif stage in cls._curation:
                curation_errors.append(error)
            else:
                raise ValueError(f'Unhandled stage {stage} {message}')

        si = len(submission_errors)
        ci = len(curation_errors)
        data['status'] = {}
        data['status']['submission_index'] = si
        data['status']['curation_index'] = ci
        data['status']['error_index'] = si + ci
        data['status']['submission_errors'] = submission_errors
        data['status']['curation_errors'] = curation_errors

        return si + ci
Example #7
0
 def _protocol_uris_resolved(self):
     # FIXME quite slow ...
     for start_uri in self.protocol_uris:
         log.debug(start_uri)
         for end_uri in resolution_chain(start_uri):
             pass
         else:
             yield end_uri
Example #8
0
    def __members(self):
        if not hasattr(self.__class__, '_members'):
            log.debug('going to network for members')
            # there are other ways to get here but this one caches
            # e.g. self.organization.path.remote.bfobject
            # self.path.remote.oranization.bfobject
            # self.path.remote.bfl.organization.members
            self.__class__._members = self.path.remote.bfl.organization.members

        return self._members
Example #9
0
    def _process(self, contributor):
        # get member if we can find them
        he = dat.HasErrors(pipeline_stage=self.__class__.__name__ + '.data')
        if 'name' in contributor and 'first_name' in contributor:
            name = contributor['name']
            if ';' in name:
                msg = f'Bad symbol in name {name!r}'
                he.addError(msg)
                logd.error(msg)

            fn = contributor['first_name']
            ln = contributor['last_name']
            if ' ' in fn:
                fn, mn = fn.split(' ', 1)
                contributor['middle_name'] = mn
                contributor['first_name'] = fn

            if ' ' in ln:
                msg = f'Malformed last_name {ln!r}'
                he.addError(msg)
                logd.error(msg)
                ln = ln.replace(' ', '-')

            failover = f'{fn}-{ln}'
            member = self.member(fn, ln)

            if member is not None:
                userid = OntId('https://api.blackfynn.io/users/' + member.id)
                contributor['blackfynn_user_id'] = userid

        else:
            member = None
            failover = 'no-orcid-no-name'
            log.warning(f'No name!' + lj(contributor))

        orcid = None
        if 'contributor_orcid_id' in contributor:
            orcid = contributor['contributor_orcid_id']
            if type(orcid) == str and 'orcid.org' in orcid:
                orcid = OrcidId(orcid)  # FIXME reloading from json

            if isinstance(orcid, OrcidId):
                s = orcid
            else:  # it's not an orcid or its a bad orcid
                orcid = None

        if orcid is None:
            if member is not None:
                s = userid
            else:
                log.debug(lj(contributor))
                s = OntId(self.dsid + '/contributors/' + failover)

        contributor['id'] = s
        he.embedErrors(contributor)
Example #10
0
    def pipeline_start(self):
        data = super().pipeline_start
        if 'errors' in data:
            get_paths = set(tuple(gp) for gas, _, _ in self.subpipelines for gp, _ in gas)
            sections = set((s,) for s in data)
            # FIXME probably should just use adops.get to check for the path
            both = get_paths & sections
            if not both:
                log.debug(f'{get_paths}\n{sections}\n{both}')
                raise self.SkipPipelineError(data)

        return data
Example #11
0
    def _errors(self):
        for e, stage in self._errors_set:
            o = {'pipeline_stage': stage}  # FIXME

            if isinstance(e, str):
                o['message'] = e

            elif isinstance(e, BaseException):
                o['message'] = str(e)
                o['type'] = str(type(e))

            else:
                raise TypeError(repr(e))

            log.debug(o)
            yield o
Example #12
0
    def overview(self):
        if self.path.exists():
            with open(self.path, 'rb') as f:
                soup = self._BeautifulSoup(f.read(), 'lxml')
        else:
            resp = requests.get(self.url)
            soup = self._BeautifulSoup(resp.content, 'lxml')

        self.raw = {}
        self.former_to_current = {}
        for bsoup in soup.find_all(
                'div', {'id': lambda v: v and v.endswith('-bubble')}):
            organ, *_rest = bsoup['id'].split('-')
            logd.debug(_rest)
            award_list = self.raw[organ] = []
            for asoup in bsoup.find_all('a'):
                href = asoup['href']
                log.debug(href)
                parts = urlparse(href)
                query = parse_qs(parts.query)
                if 'projectnumber' in query:
                    award_list.extend(query['projectnumber'])
                elif 'aid' in query:
                    #aid = [int(a) for a in query['aid']]
                    #json = self.reporter(aid)
                    award, former = self.reporter(href)
                    award_list.append(award)
                    if former is not None:
                        award_list.append(
                            former)  # for this usecase this is ok
                        self.former_to_current[former] = award
                elif query:
                    log.debug(lj(query))

        self.former_to_current = {
            nml.NormAward(nml.NormAward(k)): nml.NormAward(nml.NormAward(v))
            for k, v in self.former_to_current.items()
        }
        self._normalized = {}
        self.normalized = {}
        for frm, to in ((self.raw, self._normalized), (self._normalized,
                                                       self.normalized)):
            for organ, awards in frm.items():
                if organ in self.organ_lookup:
                    organ = self.organ_lookup[organ].iri

                to[organ] = [nml.NormAward(a) for a in awards]
Example #13
0
    def _get_protocol_json(self, uri):
        #juri = uri + '.json'
        logd.info(
            uri.identifier if isinstance(uri, idlib.Stream) else uri)  # FIXME
        pi = idlib.get_right_id(uri)
        if 'protocols.io' in pi:
            pioid = pi.slug  # FIXME normalize before we ever get here ...
            log.info(pioid)
        else:
            msg = f'protocol uri is not from protocols.io {pi} {self.id}'
            logd.error(msg)
            self.addError(msg)
            return

        #uri_path = uri.rsplit('/', 1)[-1]
        apiuri = 'https://www.protocols.io/api/v3/protocols/' + pioid
        #'https://www.protocols.io/api/v3/groups/sparc/protocols'
        #apiuri = 'https://www.protocols.io/api/v3/filemanager/folders?top'
        #print(apiuri, header)
        log.debug('going to network for protocols')
        resp = requests.get(apiuri, headers=self._pio_header)
        #log.info(str(resp.request.headers))
        if resp.ok:
            try:
                j = resp.json()  # the api is reasonably consistent
            except BaseException as e:
                log.exception(e)
                breakpoint()
                raise e
            return j
        else:
            try:
                j = resp.json()
                sc = j['status_code']
                em = j['error_message']
                msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
                logd.error(msg)
                self.addError(msg)
                # can't return here because of the cache
            except BaseException as e:
                log.exception(e)

            logd.error(f'protocol no access {uri} {self.id!r}')
Example #14
0
    def _protocol_uris_resolved(self):
        # FIXME quite slow ...
        for start_uri in self.protocol_uris:
            log.debug(start_uri)
            try:
                if not hasattr(start_uri, 'dereference'):
                    start_uri = idlib.StreamUri(start_uri)

                end_uri = start_uri.dereference()
                yield end_uri
                sc = end_uri.progenitor.status_code
                if sc > 400:
                    msg = f'error accessing {end_uri} {sc}'
                    self.addError(msg, blame='submission', logfunc=logd.error)
            except idlib.exceptions.ResolutionError as e:
                pass  # FIXME I think we already log this error?
            except requests.exceptions.MissingSchema as e:
                self.addError(e, blame='submission', logfunc=logd.error)
            except OntId.BadCurieError as e:
                self.addError(e, blame='submission', logfunc=logd.error)
            except BaseException as e:
                #breakpoint()
                log.exception(e)
                log.critical('see exception above')
Example #15
0
    def _indexes(cls, data):
        """ compute submission and curation error indexes """
        errors = get_all_errors(data)
        submission_errors = []
        curation_errors = []
        for error in reversed(errors):
            if error in submission_errors or error in curation_errors:
                log.debug('error detected multiple times not counting '
                          'subsequent occurances' + lj(error))
                continue

            if 'blame' not in error:
                breakpoint()

            blame = error['blame']
            stage = error['pipeline_stage']
            message = error['message']

            blamed = False
            if blame is not None:
                if blame in cls._blame:
                    blame_target = cls._blame[blame]
                    if blame_target == cls._blame_stage:
                        pass
                    elif blame_target == cls._blame_everyone:
                        submission_errors.append(error)
                        curation_errors.append(error)
                        blamed = True
                    elif blame_target == cls._blame_submission:
                        submission_errors.append(error)
                        blamed = True
                    elif blame_target == cls._blame_curation:
                        curation_errors.append(error)
                        blamed = True
                    else:
                        raise ValueError(f'Unhandled blame target {blame_target}\n{message}')

                else:
                    raise ValueError(f'Unhandled blame type {blame}\n{message}')

            if stage in cls._submission:
                if not blamed:
                    submission_errors.append(error)
            elif stage in cls._curation:
                if not blamed:
                    curation_errors.append(error)
            else:
                if blame not in ('pipeline', 'submission', 'debug'):
                    raise ValueError(f'Unhandled stage {stage}\n{message}')

        si = len(submission_errors)
        ci = len(curation_errors)
        if 'status' not in data:
            data['status'] = {}

        data['status']['submission_index'] = si
        data['status']['curation_index'] = ci
        data['status']['error_index'] = si + ci
        data['status']['submission_errors'] = submission_errors
        data['status']['curation_errors'] = curation_errors

        return si + ci
Example #16
0
    def _process(self, contributor):
        # get member if we can find them
        he = dat.HasErrors(pipeline_stage=self.__class__.__name__ + '.data')
        if 'contributor_name' in contributor and 'first_name' in contributor:
            name = contributor['contributor_name']
            if ';' in name:
                msg = f'Bad symbol in name {name!r}'
                he.addError(msg)
                logd.error(msg)

            fn = contributor['first_name']
            ln = contributor['last_name']
            if ' ' in fn:
                fn, mn = fn.split(' ', 1)
                mn, _mn = mn.rstrip('.'), mn
                if mn != _mn:
                    he.addError(f'Middle initials don\'t need periods :) {name!r}',
                                logfunc=logd.error)
                contributor['middle_name'] = mn
                contributor['first_name'] = fn

            if ' ' in ln:
                msg = f'Malformed last_name {ln!r}'
                he.addError(msg)
                logd.error(msg)
                ln = ln.replace(' ', '-')

            failover = f'{fn}-{ln}'
            member = self.member(fn, ln)

            if member is not None:
                userid = OntId('https://api.blackfynn.io/users/' + member.id)
                contributor['blackfynn_user_id'] = userid

        else:
            member = None
            failover = 'no-orcid-no-name'
            log.warning(f'No name!' + lj(contributor))

        orcid = None
        if 'contributor_orcid_id' in contributor:
            orcid = contributor['contributor_orcid_id']
            if type(orcid) == str and 'orcid.org' in orcid:
                orcid = idlib.Orcid(orcid)  # FIXME reloading from json

            if isinstance(orcid, idlib.Orcid):
                s = orcid
            else:  # it's not an orcid or its a bad orcid
                orcid = None

        if orcid is None:
            if member is not None:
                s = userid
            else:
                log.debug(lj(contributor))
                s = OntId(self.dsid + '/contributors/' + failover)

        contributor['id'] = s
        he.embedErrors(contributor)

        # lifting + adding
        if 'contributor_affiliation' in contributor:
            ca = contributor['contributor_affiliation']
            maybe_ror = self.lifters.affiliations(ca)
            if maybe_ror is not None:
                contributor['affiliation'] = maybe_ror
Example #17
0
 def fix(mcd):
     log.debug(mcd)
     return mcd