Example #1
    def refresh(self,
                update_cache=False,
                update_data=False,
                update_data_on_cache=False,
                size_limit_mb=2,
                force=False):
        """ use force if you have a file from packages """
        try:
            old_meta = self.meta
        except exc.NoMetadataRetrievedError as e:
            log.error(
                f'{e}\nYou will need to individually refresh {self.local}')
            return
        except exc.NoRemoteFileWithThatIdError as e:
            log.exception(e)
            return

        if self.is_file() and not force:  # this will trigger a fetch
            pass
        else:
            self._bfobject = self._api.get(self.id)

        if update_cache or update_data:
            file_is_different = self.update_cache()
            update_existing = file_is_different and self.cache.exists()
            udoc = update_data_on_cache and file_is_different
            if update_existing or udoc:
                size_limit_mb = None

            update_data = update_data or update_existing or udoc

        if update_data and self.is_file():
            self.cache.fetch(size_limit_mb=size_limit_mb)

        return self.cache  # when a cache calls refresh it needs to know if it no longer exists
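
The branching around size_limit_mb is the subtle part of refresh(): the size cap is lifted only when the remote file differs from something already cached. A minimal sketch of that decision in isolation, assuming the same boolean inputs (decide_fetch is a hypothetical name, not part of sparcur):

def decide_fetch(update_data, update_data_on_cache,
                 file_is_different, cache_exists, size_limit_mb=2):
    """Mirror refresh(): lift the size cap when a cached file changed."""
    update_existing = file_is_different and cache_exists
    udoc = update_data_on_cache and file_is_different
    if update_existing or udoc:
        size_limit_mb = None  # changed files are fetched regardless of size

    return update_data or update_existing or udoc, size_limit_mb


# a changed file that is already cached is refetched with no size cap
assert decide_fetch(False, False, True, True) == (True, None)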
Example #2
    def condense(self):
        marked_as_done = '-'
        mapping = defaultdict(list)

        def make_key(row):
            return tuple(
                c.value for c in
                [row.tag(), row.value(),
                 row.text(), row.exact()])

        create = []
        for mt_cell in self.row_object(0).map_to().column.cells[1:]:
            if (mt_cell.value and mt_cell.value != marked_as_done
                    and mt_cell.value != mt_cell.row.value().value):
                try:
                    row, iv = self._row_from_index(value=mt_cell.value)
                    key = make_key(row)
                except AttributeError as e:  # value not in index
                    key = ('protc:executor-verb', mt_cell.value, '', '')
                    if key not in create:
                        create.append(key)
                        log.exception(e)

                mapping[key].append(
                    mt_cell.row.value().value)  # cells don't move so we're ok

        mapping = dict(mapping)
        value_to_map_to = {
            value: k
            for k, values in mapping.items() for value in values
        }

        #breakpoint()
        return value_to_map_to, create  # old -> new, original -> correct
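
The closing comprehension inverts the defaultdict(list) accumulator into a flat old-value-to-new-key lookup. A standalone sketch of just that inversion, with made-up cell values:

from collections import defaultdict

mapping = defaultdict(list)
key = ('protc:executor-verb', 'incubate', '', '')
mapping[key].append('incubate at 37C')   # example old values
mapping[key].append('incubation')

value_to_map_to = {
    value: k
    for k, values in dict(mapping).items() for value in values
}
# {'incubate at 37C': (...), 'incubation': (...)} -> each old value maps to its key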
Example #3
    def _protocol_uris_resolved(self):
        # FIXME quite slow ...
        for start_uri in self.protocol_uris:
            log.debug(start_uri)
            try:
                if not hasattr(start_uri, 'dereference'):
                    start_uri = idlib.StreamUri(start_uri)

                end_uri = start_uri.dereference()
                yield end_uri
                sc = end_uri.progenitor.status_code
                if sc > 400:
                    msg = f'error accessing {end_uri} {sc}'
                    if self.addError(msg, blame='submission'):
                        logd.error(msg)

            except idlib.exceptions.ResolutionError as e:
                pass  # FIXME I think we already log this error?
            except self._MissingSchema as e:
                if self.addError(e, blame='submission'):
                    logd.error(e)
            except OntId.BadCurieError as e:
                if self.addError(e, blame='submission'):
                    logd.error(e)
            except BaseException as e:
                #breakpoint()
                log.exception(e)
                log.critical('see exception above')
Example #4
    def get(self, uri):
        #juri = uri + '.json'
        logd.info(uri)
        log.debug('going to network for protocols')
        resp = requests.get(uri, headers=self._pio_header)
        #log.info(str(resp.request.headers))
        if resp.ok:
            try:
                j = resp.json()  # the api is reasonably consistent
            except BaseException as e:
                log.exception(e)
                breakpoint()
                raise e
            return j
        else:
            try:
                j = resp.json()
                sc = j['status_code']
                em = j['error_message']
                msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
                logd.error(msg)
                self.addError(msg)
                # can't return here because of the cache
            except BaseException as e:
                log.exception(e)

            logd.error(f'protocol no access {uri} {self.id!r}')
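
The failure branch assumes the API error body is itself JSON carrying status_code and error_message fields, which holds for protocols.io but may not elsewhere. A reduced sketch of the same ok/error split (fetch_json is a hypothetical helper, not the method above):

import requests


def fetch_json(uri, headers=None):
    """Return parsed JSON on success, None on failure."""
    resp = requests.get(uri, headers=headers)
    if resp.ok:
        return resp.json()

    try:  # protocols.io style error body
        j = resp.json()
        print(f'issue {uri} {resp.status_code} '
              f'{j.get("status_code")} {j.get("error_message")}')
    except ValueError:  # error body was not JSON
        print(f'no access {uri}')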
Example #5
def ret_val_exp(dataset_id, updated, time_now):
    log.info(f'START {dataset_id}')
    did = PennsieveId(dataset_id)
    uid = 'updated-' + dataset_id
    fid = 'failed-' + dataset_id

    # FIXME detect cases where we have already pulled the latest and don't pull again
    # FIXME TODO smart retrieve so we don't pull if we failed during
    # export instead of pull, should be able to get it from the
    # cached metadata on the dataset

    # FIXME getting file exists errors for pull in here
    # in upstream.mkdir()

    # FIXME we need to track/check the state here too in the event
    # that retrieve succeeds but validate or export fails
    # FIXME getting no paths to fetch errors

    # FIXME detect cases where it appears that a new dataset is in the process of being
    # uploaded and don't run for a while if it is being continually modified
    try:
        try:
            p1 = subprocess.Popen(argv_simple_retrieve(dataset_id))
            out1 = p1.communicate()
            if p1.returncode != 0:
                raise Exception(f'oops return code was {p1.returncode}')
        except KeyboardInterrupt as e:
            p1.send_signal(signal.SIGINT)
            raise e

        dataset_path = (path_source_dir / did.uuid / 'dataset').resolve()
        try:
            p2 = subprocess.Popen(argv_spc_find_meta, cwd=dataset_path)
            out2 = p2.communicate()
            if p2.returncode != 0:
                raise Exception(f'oops return code was {p2.returncode}')
        except KeyboardInterrupt as e:
            p2.send_signal(signal.SIGINT)
            raise e

        try:
            p3 = subprocess.Popen(argv_spc_export, cwd=dataset_path)
            out3 = p3.communicate()
            if p3.returncode != 0:
                raise Exception(f'oops return code was {p3.returncode}')
        except KeyboardInterrupt as e:
            p3.send_signal(signal.SIGINT)
            raise e

        conn.set(uid, updated)
        conn.delete(fid)
        log.info(f'DONE: u: {uid} {updated}')
    except Exception as e:
        log.critical(f'FAIL: {fid} {updated}')
        conn.set(fid, updated)
        log.exception(e)
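
Each retrieve/find/export step repeats the same Popen pattern: communicate, check the return code, and forward Ctrl-C to the child. A standalone version of that pattern (run_step and the sample argv are illustrative only):

import signal
import subprocess


def run_step(argv, cwd=None):
    """Run argv, forward SIGINT to the child on Ctrl-C, fail on nonzero exit."""
    proc = subprocess.Popen(argv, cwd=cwd)
    try:
        proc.communicate()
    except KeyboardInterrupt:
        proc.send_signal(signal.SIGINT)
        raise

    if proc.returncode != 0:
        raise Exception(f'oops return code was {proc.returncode}')


run_step(['ls', '-la'])  # example invocation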
Example #6
def dereference_all_identifiers(obj, stage, *args, path=None, addError=None, **kwargs):
    try:
        dict_literal = _json_identifier_expansion(obj)
    except idlib.exc.RemoteError as e:
        if hasattr(obj, '_cooldown'):
            return obj._cooldown()  # trigger cooldown to simplify issues down the line

        error = dict(error=e,
                     pipeline_stage=stage.__class__.__name__,
                     blame='submission',
                     path=tuple(path))

        if addError:
            if addError(**error):
                log.exception(e)
                #logd.error(msg)
        else:
            return {'errors': [error]}

    except idlib.exc.ResolutionError as e:
        if hasattr(obj, '_cooldown'):
            return obj._cooldown()  # trigger cooldown to simplify issues down the line

        oops = json_export_type_converter(obj)
        msg = (f'{stage.lifters.id} could not resolve '  # FIXME lifters sigh
               f'{type(obj)}: {oops} {obj.asUri()}')
        error = dict(error=msg,
                     pipeline_stage=stage.__class__.__name__,
                     blame='submission',
                     path=tuple(path))

        if addError:
            if addError(**error):
                logd.error(msg)
        else:
            return {'errors': [error]}
    except Exception as e:
        log.critical(f'Unhandled exception {e} in {path}')
        error = dict(error=e,
                     pipeline_stage=stage.__class__.__name__,
                     blame='stage',
                     path=tuple(path))

        if addError:
            if addError(**error):
                log.exception(e)
                #logd.error(msg)
        else:
            return {'errors': [error]}
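
All three handlers build the same error record and then either push it through the optional addError callback or return it inline for the caller to merge. A reduced sketch of that dual path (report_error and the toy callback are hypothetical):

def report_error(err, stage_name, path, blame, addError=None):
    """Send the error to a callback if given, otherwise return it inline."""
    error = dict(error=err,
                 pipeline_stage=stage_name,
                 blame=blame,
                 path=tuple(path))
    if addError:
        addError(**error)
        return None
    return {'errors': [error]}


collected = []
report_error(ValueError('bad id'), 'ExampleStage', ['contributors', 0],
             'submission', addError=lambda **e: collected.append(e))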
Example #7
    def decode(self, field, value):
        if field in ('created',
                     'updated'):  # FIXME human readable vs integer :/
            try:
                # needed for legacy cases
                value, = struct.unpack('d', value)
                return datetime.fromtimestamp(value)
            except struct.error:
                pass

            return parser.parse(
                value.decode())  # FIXME with timezone vs without ...

        elif field == 'checksum':
            return value

        elif field == 'etag':
            # struct pack this sucker so the count can fit as well?
            value = value.decode()  # FIXME
            checksum, strcount = value.rsplit('-', 1)
            count = int(strcount)
            return bytes.fromhex(checksum), count

        elif field == 'errors':
            value = value.decode()
            return tuple(_ for _ in value.split(';') if _)

        elif field == 'user_id':
            try:
                return int(value)
            except ValueError:  # FIXME :/ uid vs owner_id etc ...
                return value.decode()

        elif field in ('id', 'mode', 'old_id'):
            return value.decode()

        elif field not in self.fields:
            log.warning(f'Unhandled field {field}')
            return value

        else:
            try:
                return int(value)
            except ValueError as e:
                log.exception(f'{field} {value}')
                raise e
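
The created/updated branch first tries the legacy layout, a timestamp packed as one 8-byte double, before falling back to parsing ISO text, and the etag branch splits a '<hex digest>-<part count>' string. A round-trip sketch of both (the sample values are made up):

import struct
from datetime import datetime

packed = struct.pack('d', 1577836800.0)   # legacy 8-byte double timestamp
value, = struct.unpack('d', packed)
print(datetime.fromtimestamp(value))      # 2020-01-01 in the local timezone

etag = 'd41d8cd98f00b204e9800998ecf8427e-3'
checksum, strcount = etag.rsplit('-', 1)
print(bytes.fromhex(checksum), int(strcount))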
Example #8
    def _get_protocol_json(self, uri):
        #juri = uri + '.json'
        logd.info(uri)
        pi = get_right_id(uri)
        if 'protocols.io' in pi:
            pioid = pi.slug  # FIXME normalize before we ever get here ...
            log.info(pioid)
        else:
            msg = f'protocol uri is not from protocols.io {pi} {self.id}'
            logd.error(msg)
            self.addError(msg)
            return

        #uri_path = uri.rsplit('/', 1)[-1]
        apiuri = 'https://protocols.io/api/v3/protocols/' + pioid
        #'https://www.protocols.io/api/v3/groups/sparc/protocols'
        #apiuri = 'https://www.protocols.io/api/v3/filemanager/folders?top'
        #print(apiuri, header)
        log.debug('going to network for protocols')
        resp = requests.get(apiuri, headers=self._pio_header)
        #log.info(str(resp.request.headers))
        if resp.ok:
            try:
                j = resp.json()  # the api is reasonably consistent
            except BaseException as e:
                log.exception(e)
                breakpoint()
                raise e
            return j
        else:
            try:
                j = resp.json()
                sc = j['status_code']
                em = j['error_message']
                msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
                logd.error(msg)
                self.addError(msg)
                # can't return here because of the cache
            except BaseException as e:
                log.exception(e)

            logd.error(f'protocol no access {uri} {self.id!r}')
Example #9
    def __init__(self, *args, **kwargs):
        self._cache_path = auth.get_path('cache-path') / 'google_sheets'
        if not self._only_cache:
            try:
                if 'readonly' not in kwargs or kwargs['readonly']:
                    # readonly=True is default so we take this branch if not set
                    self._saf = auth.get_path(
                        'google-api-service-account-file-readonly')
                else:
                    self._saf = auth.get_path(
                        'google-api-service-account-file-rw')
            except KeyError as e:
                log.warning(e)
            except Exception as e:
                log.exception(e)

        try:
            super().__init__(*args, **kwargs)
        finally:
            self._saf = None
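
The readonly test leans on readonly=True being the default, so an absent key and an explicit True both pick the read-only service account file. A tiny sketch of that condition (picks_readonly is a hypothetical name):

def picks_readonly(kwargs):
    # readonly=True is the default, so a missing key means read-only
    return 'readonly' not in kwargs or kwargs['readonly']


assert picks_readonly({}) is True
assert picks_readonly({'readonly': True}) is True
assert picks_readonly({'readonly': False}) is False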
Example #10
    def triples_objects_multi(self):
        for key in self.objects_multi:
            if key in self.blob:
                values = self.blob[key]
                assert not isinstance(values, str), f'{values} in {key}'
                for value in values:
                    if key == 'external':
                        try:
                            o = OntId(value).URIRef
                            yield o, readable.annotates, self.s
                        except OntId.UnknownPrefixError as e:
                            log.exception(e)
                            continue
                    elif key == 'inheritedExternal':
                        try:
                            o = OntId(value).URIRef
                        except OntId.UnknownPrefixError as e:
                            log.exception(e)
                            continue
                    else:
                        value = value.replace(' ', '-')  # FIXME require no spaces in internal ids
                        o = self.context[value]

                    yield self.s, readable[key], o
Example #11
def datame(d,
           ca,
           timestamp,
           helpers=None,
           log_level=logging.INFO,
           dp=_p,
           evil=[False],
           dumb=False):
    """ sigh, pickles """
    log_names = ('sparcur', 'idlib', 'protcur', 'orthauth', 'ontquery',
                 'augpathlib', 'pyontutils')
    for log_name in log_names:
        log = logging.getLogger(log_name)
        if not log.handlers:
            log = makeSimpleLogger(log_name)
            log.setLevel(log_level)
            log.info(f'{log_name} had no handler')
        else:
            if log.level != log_level:
                log.setLevel(log_level)

    rc = d.path._cache_class._remote_class
    if not hasattr(rc, '_cache_anchor'):
        rc._setup()
        rc.anchorTo(ca)

    if not hasattr(BlackfynnCache, '_anchor'):
        # the fact that we only needed this much later in time
        # tells me that we had actually done an excellent job
        # of firewalling the validation pipeline from anything
        # related to the cache beyond the xattr data

        # can't use ca.__class__ because it is the posix variant of _cache_class
        BlackfynnCache._anchor = ca

    prp = d.path.project_relative_path
    if helpers is not None:
        d.add_helpers(helpers)

    out_path = (dp / d.id).with_suffix('.json')
    if out_path.exists() and dumb:
        if not evil[0]:  # FIXME this is SO DUMB to do in here, but ...
            from pysercomb.pyr import units as pyru
            [register_type(c, c.tag) for c in (pyru._Quant, pyru.Range)]
            pyru.Term._OntTerm = OntTerm  # the tangled web grows ever deeper :x
            evil[0] = True

        log.warning(f'loading from path {out_path}')
        # FIXME this is _idiotically_ slow with joblib
        # multiple orders of magnitude faster just using listcomp
        with open(out_path, 'rt') as f:
            return fromJson(json.load(f))

    blob_dataset = d.data_for_export(timestamp)
    with open(out_path.with_suffix('.raw.json'),
              'wt') as f:  # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
        json.dump(blob_dataset, f, sort_keys=True, indent=2, cls=JEncode)

    try:
        pipe = pipes.IrToExportJsonPipeline(
            blob_dataset)  # FIXME network sandbox violation
        blob_export = pipe.data
        with open(out_path, 'wt') as f:  # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
            json.dump(blob_export, f, sort_keys=True, indent=2, cls=JEncode)
    except Exception as e:
        log.exception(e)
        log.critical('error during fancy json export, see previous log entry')

    return blob_dataset
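
Because datame runs in worker processes ("sigh, pickles"), it re-checks every named logger and only attaches a handler when one is missing. A stdlib-only sketch of that loop, with a plain StreamHandler standing in for pyontutils' makeSimpleLogger:

import logging


def ensure_loggers(log_names, log_level=logging.INFO):
    """Give each named logger a handler and the requested level, exactly once."""
    for log_name in log_names:
        log = logging.getLogger(log_name)
        if not log.handlers:
            log.addHandler(logging.StreamHandler())  # stand-in for makeSimpleLogger
            log.setLevel(log_level)
            log.info(f'{log_name} had no handler')
        elif log.level != log_level:
            log.setLevel(log_level)


ensure_loggers(('sparcur', 'idlib', 'protcur'))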