def db_insert_(name, offset, count, chunk):
    log('Inserting source')
    source = Source.find(name)
    path = source.get()
    records = source.load(path)
    records = log_progress(records, total=count)
    # Apply the requested window: drop `offset` records, keep at most `count`
    records = head(skip(records, offset), count)
    db = get_db(host=WORKER_HOST)
    docs = (_.as_bson for _ in records)
    chunk_insert(db[SOURCE], docs, chunk)
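# `skip` and `head` above are assumed to be lazy windowing helpers; a minimal
# sketch of what they presumably do (hypothetical reference implementations,
# the real ones live elsewhere in the repo), built on itertools.islice:

from itertools import islice


def skip(items, count):
    # Drop the first `count` items; None means "skip nothing"
    if count is None:
        return items
    return islice(items, count, None)


def head(items, count):
    # Keep at most the first `count` items; None means "no limit"
    if count is None:
        return items
    return islice(items, count)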
def queue_insert_(annotators, offset, count, chunk):
    log(
        'Annotators: %s; offset: %d, count: %d, chunk: %d',
        ', '.join(annotators), offset, count or -1, chunk
    )
    db = get_db(host=WORKER_HOST)
    ids = read_index(db[SOURCE], offset)
    ids = log_progress(ids, total=count)
    ids = head(ids, count)
    chunks = group_chunks(ids, size=chunk)
    connection = get_connection(host=WORKER_HOST)
    queues = dict(get_queues(annotators, connection))
    # Fan every chunk of ids out to each annotator's queue
    for chunk_ids in chunks:
        for annotator in annotators:
            queue = queues[annotator]
            enqueue(queue, task, chunk_ids)
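# `group_chunks` is assumed to batch a stream of ids into fixed-size lists;
# a minimal sketch under that assumption (the repo's version may differ):


def group_chunks(items, size):
    # Yield consecutive lists of at most `size` items each
    buffer = []
    for item in items:
        buffer.append(item)
        if len(buffer) >= size:
            yield buffer
            buffer = []
    if buffer:
        yield buffer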
def db_show_():
    log('Counting docs')
    db = get_db(host=WORKER_HOST)
    for name in [SOURCE] + ANNOTATORS:
        count = db[name].estimated_document_count()
        print('{count:>10} {name}'.format(name=name, count=count))
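# `get_db` presumably wraps a pymongo client; an illustrative sketch under
# that assumption (the database name here is hypothetical):

from pymongo import MongoClient


def get_db(host):
    client = MongoClient(host)
    return client['db']  # the actual database name is defined elsewhere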
def dump_raw_(path, annotators, count, chunk):
    log('Dumping %s', ', '.join(annotators))
    db = get_db(host=WORKER_HOST)
    records = read_raw(db, annotators, count, chunk)
    records = log_progress(records, total=count)
    dump_raw__(records, path)
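# `log_progress` looks like a thin progress-bar wrapper around an iterator;
# a sketch assuming tqdm (the actual helper may log differently):

from tqdm import tqdm


def log_progress(items, total=None):
    return tqdm(items, total=total)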
def db_clear_():
    log('Removing docs')
    db = get_db(host=WORKER_HOST)
    for name in [SOURCE] + ANNOTATORS:
        # Drop every document in the collection
        db[name].delete_many({})
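# `chunk_insert` (used in db_insert_) is assumed to batch `insert_many`
# calls; a minimal sketch under that assumption, reusing the `group_chunks`
# sketch above:


def chunk_insert(collection, docs, size):
    # Insert docs in batches of `size` to bound memory use and round-trips
    for chunk in group_chunks(docs, size):
        collection.insert_many(chunk)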