async def test_node(self):
    """Run the node with persistent dependencies and check its output stream."""
    with temporary_directory() as tmpdir:
        node_store = Store(tmpdir, PERSIST)
        ds = make_dataset(self.BAGS)
        node_store.add_dataset(ds)
        outputs = await run_nodes(ds, [node], node_store, PERSIST)
        self.assertNodeOutput(outputs[0], node)
def make_dataset(self, files, name, time_added=None):
    """Create a Dataset for *files*, add it to the store, and render its detail.

    Args:
        files: Paths of the files making up the dataset.
        name: Name of the new dataset.
        time_added: Addition time in milliseconds since epoch; defaults to now.

    Returns:
        The newly created Dataset.
    """
    setid = SetID.random()
    file_models = []
    for idx, path in enumerate(files):
        stat = os.stat(path)
        file_models.append(File(idx=idx, mtime=int(stat.st_mtime * 1000),
                                path=path, size=stat.st_size))
    if time_added is None:
        time_added = int(time.time() * 1000)
    dataset = Dataset(collection=self.name,
                      files=file_models,
                      name=name,
                      # status=8,  # pending state see marv/model
                      time_added=time_added,
                      timestamp=max(x.mtime for x in file_models),
                      setid=setid)
    store = Store(self.config.marv.storedir, self.nodes)
    store.add_dataset(dataset)
    self.render_detail(dataset)
    return dataset
async def test_node(self):
    """A node without persistable output yields a single None stream."""
    with temporary_directory() as tmpdir:
        node_store = Store(tmpdir, {})
        ds = make_dataset(self.BAGS)
        node_store.add_dataset(ds)
        outputs = await run_nodes(ds, [node], node_store)
        assert outputs == [None]
async def test_node(self):
    """Fulltext and collect nodes both see the expected word in the bags."""
    with temporary_directory() as tmpdir:
        node_store = Store(tmpdir, {})
        ds = make_dataset(self.BAGS)
        node_store.add_dataset(ds)
        outputs = await run_nodes(ds, [fulltext, collect], node_store)
        fulltext_stream, collect_stream = outputs[0], outputs[1]
        assert 'hello' in fulltext_stream[0].words
        assert any('hello' in x for x in collect_stream)
def test_node(self):
    """Run the node via a sink with persistent dependencies and verify the stream."""
    with temporary_directory() as tmpdir:
        node_store = Store(tmpdir, PERSIST)
        ds = make_dataset(self.BAGS)
        node_store.add_dataset(ds)
        sink = make_sink(node)
        run_nodes(ds, [sink], node_store, PERSIST)
        self.assertNodeOutput(sink.stream, node)
def test_node(self):
    """The collect node runs to completion and reports success."""
    with temporary_directory() as storedir:
        store = Store(storedir, {})
        dataset = make_dataset(self.BAGS)
        store.add_dataset(dataset)
        sink = make_sink(collect)
        run_nodes(dataset, [sink], store)
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual, consistent with the other tests in this file.
        self.assertEqual(sink.stream, ['Success'])
def test_node(self):
    """Without persistent dependencies the node produces an empty stream."""
    with temporary_directory() as tmpdir:
        node_store = Store(tmpdir, {})
        ds = make_dataset(self.BAGS)
        node_store.add_dataset(ds)
        sink = make_sink(node)
        run_nodes(ds, [sink], node_store)
        self.assertEqual(len(sink.stream), 0)
def run(self, setid, selected_nodes=None, deps=None, force=None, keep=None,
        update_detail=None, update_listing=None, excluded_nodes=None,
        cachesize=None):
    """Run nodes for the dataset identified by *setid* and refresh its views.

    Args:
        setid: Set id of the dataset to process.
        selected_nodes: Names of nodes to run; defaults to all nodes the
            listing and detail views depend on.
        deps: Dependency handling, passed through to run_nodes.
        force: Force rerun of nodes, passed through to run_nodes.
        keep: If true, keep temporary directories of pending streams.
        update_detail: Force re-rendering of the detail view.
        update_listing: Force update of the listing.
        excluded_nodes: Names of nodes to skip.
        cachesize: Cache size, passed through to run_nodes.

    Raises:
        UnknownNode: If a selected node is not configured for the collection.
    """
    excluded_nodes = set(excluded_nodes or ())
    dataset = Dataset.query.filter(Dataset.setid == str(setid))\
                           .options(db.joinedload(Dataset.files))\
                           .one()
    collection = self.collections[dataset.collection]
    selected_nodes = set(selected_nodes or [])
    # Without an explicit selection, run everything the views depend on.
    if not (selected_nodes or update_listing or update_detail):
        selected_nodes.update(collection.listing_deps)
        selected_nodes.update(collection.detail_deps)
    persistent = collection.nodes
    try:
        # Names containing ':' refer to importable objects; all other names
        # must be configured persistent nodes of the collection.
        nodes = [
            persistent[name] if ':' not in name else find_obj(name)
            for name in selected_nodes
            if name not in excluded_nodes
            if name != 'dataset'
        ]
    except KeyError as e:
        raise UnknownNode(dataset.collection, e.args[0]) from e
    nodes.sort()
    storedir = app.site.config.marv.storedir
    store = Store(storedir, persistent)
    changed = False
    try:
        if nodes:
            changed = run_nodes(dataset, nodes, store, force=force,
                                persistent=persistent, deps=deps,
                                cachesize=cachesize)
        # Only reached if run_nodes did not raise (replaces the former
        # pointless bare 'except: raise' plus 'else' construct): render
        # views when something changed or an update was requested.
        if changed or update_detail:
            collection.render_detail(dataset)
            log.verbose('%s detail rendered', setid)
        if changed or update_listing:
            collection.update_listings([dataset])
            log.verbose('%s listing rendered', setid)
    finally:
        # Unless asked to keep them, remove temporary directories of
        # streams that never finished, no matter how we exit.
        if not keep:
            for tmpdir in store.pending.values():
                store.logdebug('Cleaning up %r', tmpdir)
                shutil.rmtree(tmpdir)
            store.pending.clear()
def test_node(self):
    """Fulltext and collect sinks both observe the expected word."""
    with temporary_directory() as tmpdir:
        node_store = Store(tmpdir, {})
        ds = make_dataset(self.BAGS)
        node_store.add_dataset(ds)
        fulltext_sink = make_sink(fulltext)
        collect_sink = make_sink(collect)
        run_nodes(ds, [fulltext_sink, collect_sink], node_store)
        assert 'hello' in fulltext_sink.stream[0].words
        assert any('hello' in x for x in collect_sink.stream)
async def make_dataset(self, connection, files, name, time_added=None, discarded=False,
                       setid=None, status=0, timestamp=None, _restore=None):
    """Create a dataset with its files in the database and render its detail.

    Args:
        connection: Database connection to use for all queries.
        files: File paths, or (with _restore) dicts of File fields.
        name: Name of the new dataset.
        time_added: Addition time in milliseconds; defaults to now.
        discarded: Whether the dataset starts out discarded.
        setid: Set id; a random one is generated if not given.
        status: Initial status value.
        timestamp: Dataset timestamp; defaults to the newest file mtime.
        _restore: If set, *files* are previously dumped field dicts.

    Returns:
        The newly created Dataset.
    """
    # pylint: disable=too-many-arguments
    if time_added is None:
        time_added = int(utils.now() * 1000)
    collection = await CollectionModel.filter(
        name=self.name).using_db(connection).first()
    dataset = await Dataset.create(collection=collection, name=name,
                                   discarded=discarded, status=status,
                                   time_added=time_added, timestamp=0,
                                   setid=setid or SetID.random(),
                                   acn_id=collection.acn_id,
                                   using_db=connection)
    if _restore:
        file_models = [File(dataset=dataset, idx=idx, **fields)
                       for idx, fields in enumerate(files)]
    else:
        file_models = []
        for idx, path in enumerate(files):
            stat = utils.stat(path)
            file_models.append(File(dataset=dataset, idx=idx,
                                    mtime=int(utils.mtime(path) * 1000),
                                    path=path, size=stat.st_size))
    dataset.timestamp = timestamp or max(x.mtime for x in file_models)
    await dataset.save(using_db=connection)
    await File.bulk_create(file_models, using_db=connection)
    await dataset.fetch_related('files', using_db=connection)
    store = Store(self.config.marv.storedir, self.nodes)
    store.add_dataset(dataset, exists_okay=_restore)
    self.render_detail(dataset)
    return dataset
def render_listing(self, dataset):
    """Compute the listing row and filter field values for *dataset*.

    Returns:
        Tuple (row, fields, relfields): the listing row dict, plain filter
        fields, and filter fields that are relations of the listing model.
    """
    storedir = self.config.marv.storedir
    setdir = os.path.join(storedir, str(dataset.setid))
    store = Store(storedir, self.nodes)
    funcs = make_funcs(dataset, setdir, store)
    values = []
    for col, functree in self.listing_functions:
        raw = calltree(functree, funcs)
        # List-valued columns use the '[]'-suffixed formatter variant.
        fmt = FORMATTER_MAP[col.formatter + ('[]' if col.islist else '')]
        values.append(fmt(raw))
    row = {
        'id': dataset.id,
        'setid': str(dataset.setid),
        'tags': ['#TAGS#'],
        'values': values,
    }
    fields = {}
    relfields = {}
    relations = self.model.relations
    for filter_spec, functree in self.filter_functions:
        raw = calltree(functree, funcs)
        value = FILTER_MAP[filter_spec.value_type](raw)
        if filter_spec.name in relations:
            relfields[filter_spec.name] = value
        else:
            fields[filter_spec.name] = value
    return row, fields, relfields
def render_detail(self, dataset):
    """Render and atomically write the detail view of *dataset* to its set directory."""
    storedir = self.config.marv.storedir
    setdir = os.path.join(storedir, str(dataset.setid))
    try:
        os.mkdir(setdir)
    except OSError:
        pass  # best effort; existence is verified by the assert below
    assert os.path.isdir(setdir), setdir
    store = Store(storedir, self.nodes)
    funcs = make_funcs(dataset, setdir, store)
    # Collect readers for all summary widget / section nodes that have
    # persisted output; nodes without output are skipped.
    summary_widgets = [
        x[0]._reader for x in  # pylint: disable=protected-access
        [
            store.load(setdir, node, default=None)
            for node in self.detail_summary_widgets
        ]
        if x
    ]
    sections = [
        x[0]._reader for x in  # pylint: disable=protected-access
        [
            store.load(setdir, node, default=None)
            for node in self.detail_sections
        ]
        if x
    ]
    dct = {
        'title': calltree(self.detail_title, funcs),
        'sections': sections,
        'summary': {'widgets': summary_widgets},
    }
    detail = Detail.new_message(**dct).as_reader()
    dct = detail_to_dict(detail)
    # Write to a hidden tempfile and rename for an atomic update; O_EXCL
    # guards against two renderers writing the same tempfile concurrently.
    fd = os.open(os.path.join(setdir, '.detail.json'),
                 os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o666)
    # Context manager closes the file even if json.dump raises, fixing a
    # descriptor leak in the previous explicit close() sequence.
    with os.fdopen(fd, 'w') as jsonfile:
        json.dump(dct, jsonfile, sort_keys=True)
    os.rename(os.path.join(setdir, '.detail.json'),
              os.path.join(setdir, 'detail.json'))
    self._check_outdated(dataset)
def make_dataset(self, files, name, time_added=None, discarded=None, setid=None,
                 status=None, timestamp=None, _restore=None):
    """Create a Dataset for *files*, add it to the store, and render its detail.

    Args:
        files: File paths, or (with _restore) dicts of File fields.
        name: Name of the new dataset.
        time_added: Addition time in milliseconds; defaults to now.
        discarded: Whether the dataset starts out discarded.
        setid: Set id; a random one is generated if not given.
        status: Initial status value.
        timestamp: Dataset timestamp; defaults to the newest file mtime.
        _restore: If set, *files* are previously dumped field dicts.

    Returns:
        The newly created Dataset.
    """
    setid = setid or SetID.random()
    if _restore:
        file_models = [File(idx=idx, **fields) for idx, fields in enumerate(files)]
    else:
        file_models = []
        for idx, path in enumerate(files):
            stat = os.stat(path)
            file_models.append(File(idx=idx, mtime=int(utils.mtime(path) * 1000),
                                    path=path, size=stat.st_size))
    if time_added is None:
        time_added = int(utils.now() * 1000)
    dataset = Dataset(collection=self.name,
                      files=file_models,
                      name=name,
                      discarded=discarded,
                      status=status,
                      time_added=time_added,
                      timestamp=timestamp or max(x.mtime for x in file_models),
                      setid=setid)
    store = Store(self.config.marv.storedir, self.nodes)
    store.add_dataset(dataset, exists_okay=_restore)
    self.render_detail(dataset)
    return dataset
def load(self, node):
    """Return the persisted output of *node* for this dataset, or None if absent."""
    from marv_store import Store  # pylint: disable=import-outside-toplevel
    # A throwaway store knowing only this one node is enough for loading.
    node_store = Store(str(self._storedir), {node.name: node})
    return node_store.load(str(self._setdir), node, default=None)
async def run(self, setid, selected_nodes=None, deps=None, force=None, keep=None,
              force_dependent=None, update_detail=None, update_listing=None,
              excluded_nodes=None, cachesize=None):
    """Run nodes for the dataset identified by *setid* and refresh its views.

    Args:
        setid: Set id of the dataset to process.
        selected_nodes: Names of nodes to run; defaults to all nodes the
            listing and detail views depend on.
        deps: Dependency handling, passed through to run_nodes.
        force: Force rerun of nodes, passed through to run_nodes.
        keep: If true, keep temporary directories of pending streams.
        force_dependent: Also run nodes depending on the selected ones;
            requires an explicit node selection.
        update_detail: Force re-rendering of the detail view.
        update_listing: Force update of the listing.
        excluded_nodes: Names of nodes to skip.
        cachesize: Cache size, passed through to run_nodes.

    Returns:
        Whatever run_nodes reported as changed, or False if nothing ran.

    Raises:
        ConfigError: If a selected node is not configured for the collection.
    """
    # pylint: disable=too-many-arguments,too-many-locals,too-many-branches
    assert not force_dependent or selected_nodes
    excluded_nodes = set(excluded_nodes or [])
    async with scoped_session(self.db) as txn:
        dataset = await Dataset.get(setid=setid)\
                               .prefetch_related('collection', 'files')\
                               .using_db(txn)
    collection = self.collections[dataset.collection.name]
    selected_nodes = set(selected_nodes or [])
    # Without an explicit selection, run everything the views depend on.
    if not (selected_nodes or update_listing or update_detail):
        selected_nodes.update(collection.listing_deps)
        selected_nodes.update(collection.detail_deps)
    persistent = collection.nodes
    try:
        # Names containing ':' refer to importable objects; all other names
        # must be configured persistent nodes of the collection.
        nodes = {
            persistent[name] if ':' not in name else Node.from_dag_node(find_obj(name))
            for name in selected_nodes
            if name not in excluded_nodes
            if name != 'dataset'
        }
    except KeyError as exc:
        raise ConfigError(f'Collection {collection.name!r} has no node {exc}')
    if force_dependent:
        # Pull in all nodes depending on the explicitly selected ones.
        nodes.update(x for name in selected_nodes for x in persistent[name].dependent)
    nodes = sorted(nodes)
    storedir = self.config.marv.storedir
    store = Store(storedir, persistent)
    changed = False
    try:
        if nodes:
            changed = await run_nodes(dataset, nodes, store, force=force,
                                      persistent=persistent, deps=deps,
                                      cachesize=cachesize, site=self)
    finally:
        # Unless asked to keep them, tear down unfinished streams: close
        # open stream files first, then remove the temporary directories,
        # release their locks, and close the directory descriptors.
        if not keep:
            # NOTE(review): iterating store.pending yields its keys here,
            # while values() below yields (tmpdir, fd) pairs — presumably
            # the keys are stream objects; confirm against Store.
            for stream in store.pending:
                if stream.streamfile:
                    stream.streamfile.close()
            for stream in store.readstreams:
                if stream.streamfile:
                    stream.streamfile.close()
            for tmpdir, tmpdir_fd in store.pending.values():
                store.logdebug('Cleaning up %r', tmpdir)
                shutil.rmtree(tmpdir)
                # Unlock only after removal so no other process grabs the
                # directory while it still exists.
                fcntl.flock(tmpdir_fd, fcntl.LOCK_UN)
                os.close(tmpdir_fd)
            store.pending.clear()
    # Re-render views when something changed or an update was requested.
    if changed or update_detail:
        collection.render_detail(dataset)
        log.verbose('%s detail rendered', setid)
    if changed or update_listing:
        await collection.update_listings([dataset])
        log.verbose('%s listing rendered', setid)
    return changed
def load(self, node):
    """Return the persisted output of *node* for this dataset, or None if absent."""
    from marv_store import Store
    # A throwaway store knowing only this one node is enough for loading.
    site_storedir = current_app.site.config.marv.storedir
    node_store = Store(site_storedir, {node.name: node})
    return node_store.load(self._setdir, node, default=None)