def db_insert_(name, offset, count, chunk):
    log('Inserting source')
    source = Source.find(name)
    path = source.get()
    records = source.load(path)
    records = log_progress(records, total=count)
    # Lazily skip `offset` records, then take at most `count`
    records = head(skip(records, offset), count)
    db = get_db(host=WORKER_HOST)
    docs = (_.as_bson for _ in records)
    # Insert into the source collection in batches of `chunk` docs
    chunk_insert(db[SOURCE], docs, chunk)
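
# `skip`, `head`, `group_chunks` and `chunk_insert` are used above but not
# defined in this section. A minimal sketch of what they could look like,
# inferred from usage (the real implementations may differ):

from itertools import islice


def skip(items, offset):
    # Lazily drop the first `offset` items; no-op when offset is falsy
    return islice(items, offset, None) if offset else items


def head(items, count):
    # Lazily take at most `count` items; `count=None` means no limit
    return islice(items, count) if count else items


def group_chunks(items, size):
    # Yield lists of at most `size` items
    items = iter(items)
    while True:
        chunk = list(islice(items, size))
        if not chunk:
            return
        yield chunk


def chunk_insert(collection, docs, size):
    # Insert BSON docs into a pymongo collection in batches of `size`
    for chunk in group_chunks(docs, size):
        collection.insert_many(chunk)
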
def queue_retry_(chunk):
    log('Retrying')
    connection = get_connection(host=WORKER_HOST)
    queue = get_queue(FAILED, connection=connection)
    # Group ids of failed jobs by annotator, then drop the failed queue
    ids = annotators_ids(queue.jobs)
    queue.empty()
    for annotator in ids:
        annotator_ids = ids[annotator]
        annotator_ids = log_progress(annotator_ids, prefix=annotator)
        # Re-enqueue the failed ids in chunks on the annotator's own queue
        chunks = group_chunks(annotator_ids, size=chunk)
        queue = get_queue(annotator, connection=connection)
        for chunk_ in chunks:
            enqueue(queue, task, chunk_)
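
# `annotators_ids` is assumed to map failed jobs back to per-annotator id
# lists. A sketch, assuming each job's rq `origin` is the annotator queue
# name and its single argument is a chunk of record ids (as enqueued by
# `queue_insert_` below):

from collections import defaultdict


def annotators_ids(jobs):
    ids = defaultdict(list)
    for job in jobs:
        chunk, = job.args
        ids[job.origin].extend(chunk)
    return dict(ids)
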
def queue_insert_(annotators, offset, count, chunk):
    log(
        'Annotators: %s; offset: %d, count: %d, chunk: %d',
        ', '.join(annotators), offset, count or -1, chunk
    )
    db = get_db(host=WORKER_HOST)
    ids = read_index(db[SOURCE], offset)
    ids = log_progress(ids, total=count)
    ids = head(ids, count)
    chunks = group_chunks(ids, size=chunk)
    connection = get_connection(host=WORKER_HOST)
    queues = dict(get_queues(annotators, connection))
    # Fan every chunk of ids out to each annotator queue
    # (`chunk_` avoids shadowing the `chunk` size parameter)
    for chunk_ in chunks:
        for annotator in annotators:
            queue = queues[annotator]
            enqueue(queue, task, chunk_)
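
# `read_index` and `get_queues` are assumed helpers; sketches under the
# assumption that ids live in the `_id` field and queues are plain rq
# queues named after the annotators:

from rq import Queue


def read_index(collection, offset=0):
    # Stream record ids from the collection, skipping the first `offset`
    for doc in collection.find({}, {'_id': 1}).skip(offset):
        yield doc['_id']


def get_queues(annotators, connection):
    # Pair each annotator name with its queue
    for annotator in annotators:
        yield annotator, Queue(annotator, connection=connection)
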
def dump_norm_(source, target):
    records = load_raw(source)
    records = norm_raw(records)
    records = log_progress(records)
    dump_norm__(records, target)
def dump_raw_(path, annotators, count, chunk):
    log('Dumping %s', ', '.join(annotators))
    db = get_db(host=WORKER_HOST)
    # Stream raw annotations for the listed annotators in chunks
    records = read_raw(db, annotators, count, chunk)
    records = log_progress(records, total=count)
    dump_raw__(records, path)
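
# `dump_raw__` (and `dump_norm__` above) are not defined in this section.
# A minimal sketch, assuming records expose an `as_json` property
# (mirroring `as_bson` in `db_insert_`) and are written out as gzipped
# JSON lines; the project's actual on-disk format may differ:

import gzip
import json


def dump_raw__(records, path):
    with gzip.open(path, 'wt', encoding='utf8') as file:
        for record in records:
            file.write(json.dumps(record.as_json, ensure_ascii=False))
            file.write('\n')
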