def get_session_id():
    role_id = stringify(request.authz.id) or 'anonymous'
    session_id = None
    if hasattr(request, '_session_id'):
        session_id = stringify(request._session_id)
    session_id = session_id or Job.random_id()
    return '%s:%s' % (role_id, session_id)
def reingest_collection(collection, job_id=None, index=False, flush=True):
    """Trigger a re-ingest for all documents in the collection."""
    job_id = job_id or Job.random_id()
    if flush:
        ingest_flush(collection)
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
def reingest_collection(collection, job_id=None, index=False):
    """Trigger a re-ingest for all documents in the collection."""
    job_id = job_id or Job.random_id()
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=OP_ANALYZE)
    aggregator.delete(origin=OP_INGEST)
    aggregator.close()
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
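A minimal usage sketch for the function above, assuming it is called from an aleph shell with an existing collection; the foreign id is a made-up placeholder, not taken from the snippets here.

# Hypothetical usage sketch (assumed aleph shell context; the foreign id is a placeholder):
from aleph.model import Collection

collection = Collection.by_foreign_id("example_dataset")
reingest_collection(collection, index=True)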
def crawl_directory(collection, path, parent=None, job_id=None):
    """Crawl the contents of the given path."""
    try:
        content_hash = None
        if not path.is_dir():
            content_hash = archive.archive_file(path)
        foreign_id = path.name
        if parent is not None:
            foreign_id = os.path.join(parent.foreign_id, foreign_id)
        # if the job_id is not set yet and path.is_dir(), we know it is the
        # first iteration and we don't create an initial root folder as parent
        # to be consistent with the behaviour of alephclient
        if path.is_dir() and job_id is None:
            document = None
            job_id = Job.random_id()
        else:
            meta = {"file_name": path.name}
            document = Document.save(
                collection,
                parent=parent,
                foreign_id=foreign_id,
                content_hash=content_hash,
                meta=meta,
            )
            db.session.commit()
            job_id = job_id or Job.random_id()
            proxy = document.to_proxy()
            ingest_flush(collection, entity_id=proxy.id)
            ingest_entity(collection, proxy, job_id=job_id)
            log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)
        if path.is_dir():
            for child in path.iterdir():
                crawl_directory(collection, child, document, job_id)
    except OSError:
        log.exception("Cannot crawl directory: %s", path)
def run(self, incremental=None, run_id=None):
    """Queue the execution of a particular crawler."""
    state = {
        "crawler": self.name,
        "run_id": run_id or Job.random_id(),
        "incremental": settings.INCREMENTAL,
        "continue_on_error": settings.CONTINUE_ON_ERROR,
    }
    if incremental is not None:
        state["incremental"] = incremental
    # Cancel previous runs:
    self.cancel()
    init_stage = self.get(self.init_stage)
    Queue.queue(init_stage, state, {})
def run(self, incremental=None, run_id=None):
    """Queue the execution of a particular crawler."""
    state = {
        'crawler': self.name,
        'run_id': run_id or Job.random_id(),
        'incremental': settings.INCREMENTAL
    }
    if incremental is not None:
        state['incremental'] = incremental
    # Cancel previous runs:
    self.cancel()
    # Flush out previous events data but keep the counts:
    Event.delete_data(self)
    Queue.queue(self.init_stage, state, {})
def run(self, incremental=None, run_id=None):
    """Queue the execution of a particular crawler."""
    state = {
        "crawler": self.name,
        "run_id": run_id or Job.random_id(),
        "incremental": settings.INCREMENTAL,
    }
    if incremental is not None:
        state["incremental"] = incremental
    # Cancel previous runs:
    self.cancel()
    # Flush out previous events data but keep the counts:
    Event.delete_data(self)
    init_stage = self.get(self.init_stage)
    Queue.queue(init_stage, state, {})
def load_entities(foreign_id, infile, unsafe=False):
    """Load FtM entities from the specified iJSON file."""
    collection = ensure_collection(foreign_id, foreign_id)

    def read_entities():
        for idx in count(1):
            line = infile.readline()
            if not line:
                return
            if idx % 1000 == 0:
                log.info("[%s] Loaded %s entities from: %s",
                         foreign_id, idx, infile.name)
            yield json.loads(line)

    job_id = Job.random_id()
    log.info("Loading [%s]: %s", job_id, foreign_id)
    bulk_write(collection, read_entities(), job_id=job_id, unsafe=unsafe)
    update_collection(collection)
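For context, a hedged sketch of how load_entities might be driven with a newline-delimited FtM JSON file; the file name and foreign id are assumptions for illustration.

# Hypothetical usage sketch: stream newline-delimited FtM JSON into a collection.
# The file name and foreign id below are made-up examples.
with open("entities.ijson", "r") as infile:
    load_entities("example_dataset", infile, unsafe=False)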
def crawl_directory(collection, path, parent=None, job_id=None):
    """Crawl the contents of the given path."""
    content_hash = None
    if not path.is_dir():
        content_hash = archive.archive_file(path)
    foreign_id = path.name
    if parent is not None:
        foreign_id = os.path.join(parent.foreign_id, foreign_id)
    meta = {'file_name': path.name}
    document = Document.save(collection,
                             parent=parent,
                             foreign_id=foreign_id,
                             content_hash=content_hash,
                             meta=meta)
    db.session.commit()
    job_id = job_id or Job.random_id()
    ingest_entity(collection, document.to_proxy(), job_id=job_id)
    log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)
    if path.is_dir():
        for child in path.iterdir():
            crawl_directory(collection, child, document, job_id)
def get_stage(collection, stage, job_id=None):
    job_id = job_id or Job.random_id()
    job = Job(kv, collection.foreign_id, job_id)
    return job.get_stage(stage)
# execute in the aleph shell:
# exec(open("reingest_partial.py").read())
from servicelayer.jobs import Job

from aleph.queues import ingest_entity
from aleph.model import Collection, Document

job_id = Job.random_id()
collection = Collection.by_id(125)
with open('collection_ftm_failures.csv', 'r') as f:
    for document_id in f:
        # Strip the trailing newline and skip blank lines before looking up the id.
        document_id = document_id.strip()
        if not document_id:
            continue
        print("reingest " + document_id)
        document = Document.by_id(document_id)
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=True)
def get_session_id():
    role_id = stringify(request.authz.id) or "anonymous"
    session_id = stringify(request._session_id)
    session_id = session_id or Job.random_id()
    return "%s:%s" % (role_id, session_id)
def get_stage(collection, stage, job_id=None):
    dataset = dataset_from_collection(collection)
    job_id = job_id or Job.random_id()
    job = Job(kv, dataset, job_id)
    return job.get_stage(stage)
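A hedged sketch of reusing one job id across several get_stage calls so related tasks group under the same job; the stage name is an assumption, not taken from the snippets above.

# Hypothetical usage sketch: share one job id across stages of the same run.
# The stage name "ingest" is an assumed example value.
job_id = Job.random_id()
ingest_stage = get_stage(collection, "ingest", job_id=job_id)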