Example #1
def tobn(gen, published):
    """convert hypothesis ids to blank nodes so that values serialize locally"""
    for s, p, o in gen:
        ns = fix(s)
        no = fix(o)
        if p == TEMP.protcurChildren:
            yield ns, p, no
        elif s != ns:
            yield ns, p, o
            yield ns, ilxtr.hasId, s
            yield ns, TEMP.hasUriHumanContext, rdflib.URIRef(s.replace(ph_prefix, 'https://hyp.is/'))
        else:
            yield s, p, o

        if o == sparc.Protocol:
            try:
                pid = idlib.Pio(s)
                os = pio_onts[pid.identifier.suffix]
                yield os, rdf.type, owl.Ontology
                yield os, TEMP.hasUriApi, s
                for _s in (s, os):
                    yield _s, TEMP.hasUriHuman, pid.uri_human.asType(rdflib.URIRef)
                    doi = pid.doi
                    if doi is not None:
                        yield _s, TEMP.hasDoi, pid.doi.asType(rdflib.URIRef)
                    if s in published:
                        yield _s, TEMP.datasetPublishedDoi, published[s]
            except (idlib.exc.NotAuthorizedError) as e:
                tn = GetTimeNow()
                yield s, errorns.NotAuthorized, rdflib.Literal(tn._start_time_local)
            except (idlib.exc.IdDoesNotExistError) as e:
                tn = GetTimeNow()
                yield s, errorns.IdDoesNotExist, rdflib.Literal(tn._start_time_local)
            except (idlib.exc.MalformedIdentifierError) as e:
                pass
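
A minimal sketch of how a triple-rewriting generator like tobn is typically consumed: iterate an existing rdflib graph (which yields (s, p, o) triples) and add every rewritten triple to a fresh graph. The file names and the shape of published here are assumptions for illustration, not part of the example above.

import rdflib

old_graph = rdflib.Graph().parse('protcur.ttl', format='turtle')  # assumed input file
published = {}  # assumed mapping of protocol uri -> published dataset doi

new_graph = rdflib.Graph()
for triple in tobn(old_graph, published):
    new_graph.add(triple)
new_graph.serialize(destination='protcur-local.ttl', format='turtle')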
Example #2
    def test_1_mkdir_remote_will_be_collection(self):
        now = GetTimeNow()
        local = self.project_path / f'test-dataset-{now.START_TIMESTAMP_LOCAL_FRIENDLY}' / 'some-folder'
        remote = local.mkdir_remote(parents=True)
        parent = remote.parent
        try:
            parent.rmdir()  # should fail here
            try:
                remote.rmdir()  # insurance
            except BaseException as e:
                log.exception(e)
            finally:
                raise AssertionError(
                    f'remote parent should NOT have rmdired {parent}')
        except exc.PathNotEmptyError:
            pass

        try:
            remote.rmdir()
            remote.cache.refresh()
            assert not local.exists(), f'should have been deleted {remote}'
        finally:
            lparent = parent.local
            parent.cache.refresh()  # we just removed the child so the parent is stale
            parent.rmdir()
            parent.cache.refresh()  # and THIS is the error we have been trying to handle all night!
            assert not lparent.exists(), f'should have been deleted {parent}'
Example #3
    def test_0_mkdir_remote_will_be_dataset(self):
        now = GetTimeNow()
        local = self.project_path / f'test-dataset-{now.START_TIMESTAMP_LOCAL_FRIENDLY}'
        remote = local.mkdir_remote()
        remote.rmdir()
        remote.cache.refresh()  # reminder that remotes are a snapshot in time, NOT dynamic
        assert not local.exists(), f'should have been deleted {remote}'
Example #4
    def test_mkdir_remote_parents_false(self):
        now = GetTimeNow()
        local = self.project_path / f'test-dataset-{now.START_TIMESTAMP_LOCAL_FRIENDLY}' / 'some-folder'
        try:
            remote = local.mkdir_remote()
            raise AssertionError('Should have failed since parents=False')
        except FileNotFoundError:
            pass
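
The try/raise-AssertionError/except pattern in the tests above can also be written with pytest.raises, which keeps the expected failure explicit without the sentinel AssertionError. A sketch assuming pytest is available; test_mkdir_remote_parents_false_alt and the tmp_project_path fixture are hypothetical stand-ins for the class-based setup used above.

import pytest

def test_mkdir_remote_parents_false_alt(tmp_project_path):
    # hypothetical pytest-style rewrite; tmp_project_path stands in for self.project_path
    now = GetTimeNow()
    local = tmp_project_path / f'test-dataset-{now.START_TIMESTAMP_LOCAL_FRIENDLY}' / 'some-folder'
    with pytest.raises(FileNotFoundError):
        local.mkdir_remote()  # parents defaults to False, so the missing parent should raise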
Example #5
    def pull(
        self,
        *args,
        paths=None,
        time_now=None,
        debug=False,
        n_jobs=12,
        cache_anchor=None,
        log_name=None,
        log_level='INFO',
        # pass Parallel in at run time if needed
        Parallel=None,
        delayed=None,
        _in_parallel=False,
        exclude_uploaded=True,
    ):
        # TODO usage errors

        if time_now is None:
            time_now = GetTimeNow()
            log.debug('No time provided to pull so using '
                      f'{time_now.START_TIMESTAMP}')

        if _in_parallel:
            _log = logging.getLogger(log_name)
            _log.setLevel(log_level)
            rc = self._remote_class
            if not hasattr(rc, '_cache_anchor'):
                rc.anchorTo(cache_anchor)

        else:
            _log = log

        cache = self.cache

        if cache.is_organization():
            if debug or Parallel is None or n_jobs <= 1:
                for child in self.children:
                    if paths is None or child in paths:
                        child.pull()
            else:
                Parallel(n_jobs=n_jobs)(
                    delayed(child.pull)(
                        _in_parallel=True,
                        time_now=time_now,
                        cache_anchor=cache.anchor,
                        log_name=_log.name,
                        log_level=log_level,
                        exclude_uploaded=exclude_uploaded,
                    )
                    for child in self.children
                    if paths is None or child in paths)

        elif cache.is_dataset():
            self._pull_dataset(time_now, exclude_uploaded)  # XXX actual pull happens in here

        else:
            raise NotImplementedError(self)
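
pull accepts Parallel and delayed as parameters instead of importing a parallel backend itself, so that backend is only loaded when a caller asks for it. A sketch of how a caller might supply them, assuming joblib is the intended backend and project_path is an already-anchored path object (both assumptions, not shown in the example above).

from joblib import Parallel, delayed

# serial pull, no joblib needed (also what debug=True or n_jobs <= 1 falls back to)
project_path.pull(n_jobs=1)

# parallel pull across child datasets: hand the joblib entry points in at call time
project_path.pull(Parallel=Parallel, delayed=delayed, n_jobs=8)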
Example #6
    def __init__(self, id, name, cache=None):
        super().__init__(id, cache)
        self.name = name
        now = GetTimeNow()
        self.created = now._start_time
        self.updated = now._start_time
        # static placeholder metadata for a fake remote used in tests
        self.checksum = 'lolnone'
        self.chunksize = 4096
        self.file_id = 'asdfasdfasdf'
Example #7
def export_single_dataset(dataset_id, updated):  # FIXME this is being called way too often, that message queue is super hefty
    sid = 'state-' + dataset_id
    time_now = GetTimeNow()
    conn.incr(sid)
    ret_val_exp(dataset_id, updated, time_now)
    state = conn.get(sid)
    # FIXME I'm seeing -1 and -2 in here somehow
    conn.decr(sid)  # we always go back 2 either to none or queued
    conn.decr(sid)
    if state == _qed_run:
        export_single_dataset.delay(dataset_id)

    status_report()
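
A standalone sketch of the per-dataset counter convention implied by the comments above: the worker increments on entry, reads the value back after the export, then always decrements twice so the key lands on "none" or "queued", and a queued-while-running value triggers a re-queue. The key name and sentinel values below are made up for illustration; the real conn and _none/_qed/_run/_qed_run constants are defined elsewhere in the module.

import redis

conn = redis.Redis()
_none, _qed, _run, _qed_run = 0, 1, 2, 3  # hypothetical values

sid = 'state-N:dataset:example'
conn.set(sid, _qed)          # enqueue marks the dataset as queued
conn.incr(sid)               # worker bumps the counter before running the export
state = int(conn.get(sid))   # if another enqueue happened meanwhile this reads _qed_run
conn.decr(sid)
conn.decr(sid)               # always go back 2, to none or queued
if state == _qed_run:
    print('re-queue the export for the newer update')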
Example #8
def test():
    check_sheet_updates()
    return  # early return: only the sheet update check runs; everything below is manual scratch
    status_report()
    #breakpoint()
    return
    reset_redis_keys(conn)
    populate_existing_redis(conn)

    datasets = datasets_remote_from_project_id(project_id)
    datasets = sorted(datasets, key=lambda r: r.id)[:3]
    dataset = datasets[0] # 0, 1, 2 # ok, unfetched xml children, ok
    dataset_id = dataset.id
    updated = dataset.updated
    time_now = GetTimeNow()
    ret_val_exp(dataset_id, updated, time_now)
Example #9
def heartbeat():  # FIXME this has to run in a separate priority queue with its own worker
    #print(shared_state)
    #print(dataset_status)
    keys = conn.keys()
    vals = [int(conn.get(k)) for k in keys if b'state-N:dataset:' in k]
    fails = [k for k in keys if b'failed-' in k and conn.get(k)]
    #vals = shared_state.values()
    ln = len([1 for n in vals if not n or n == _none])
    lf = len(fails)
    lq = len([1 for n in vals if n == _qed])
    lr = len([1 for n in vals if n == _run])
    lqr = len([1 for n in vals if n == _qed_run])
    time_now = GetTimeNow()
    log.info(f'HEARTBEAT :n {ln} :f {lf} :q {lq} :r {lr} :qr {lqr}')
    with open('/tmp/cron-log', 'at') as f:
        f.write(f':t {time_now.START_TIMESTAMP_LOCAL} :n {ln} :f {lf} :q {lq} :r {lr} :qr {lqr}\n')
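
Every example above leans on GetTimeNow exposing one fixed start instant under several names (START_TIMESTAMP, START_TIMESTAMP_LOCAL, START_TIMESTAMP_LOCAL_FRIENDLY, plus the private _start_time / _start_time_local). A rough stand-in with that shape, useful for reading the examples; the exact formats are assumptions, not the real implementation.

from datetime import datetime, timezone

class FakeGetTimeNow:
    """Capture one instant at construction and expose it under several names."""
    def __init__(self):
        self._start_time = datetime.now(timezone.utc)
        self._start_time_local = self._start_time.astimezone()
        # the formats below are illustrative only
        self.START_TIMESTAMP = self._start_time.isoformat()
        self.START_TIMESTAMP_LOCAL = self._start_time_local.isoformat()
        self.START_TIMESTAMP_LOCAL_FRIENDLY = self._start_time_local.strftime('%Y-%m-%dT%H%M%S')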