Example 1
def db_insert_(name, offset, count, chunk):
    log('Inserting source')
    source = Source.find(name)
    path = source.get()    # locate (or download) the source file
    records = source.load(path)    # lazily parse records from the file
    records = log_progress(records, total=count)
    records = head(skip(records, offset), count)    # apply offset/limit to the stream

    db = get_db(host=WORKER_HOST)
    docs = (_.as_bson for _ in records)    # convert each record to a BSON document
    chunk_insert(db[SOURCE], docs, chunk)    # insert in batches of `chunk` docs
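
The example above leans on a few streaming helpers (skip, head, chunk_insert) whose definitions are not shown here. A minimal sketch, assuming head/skip are thin itertools.islice wrappers and chunk_insert batches documents via pymongo's insert_many:

from itertools import islice

def skip(items, count):
    # Drop the first `count` items of an iterable stream
    return islice(items, count, None)

def head(items, count):
    # Take at most `count` items; count=None means no limit
    return islice(items, count)

def chunk_insert(collection, docs, size):
    # Insert documents in batches of `size`
    # (group_chunks is sketched after Example 2)
    for batch in group_chunks(docs, size):
        collection.insert_many(batch)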
Example 2
def queue_insert_(annotators, offset, count, chunk):
    log('Annotators: %s; offset: %d, count: %d, chunk: %d',
        ', '.join(annotators), offset, count or -1, chunk)    # -1 stands for "no limit"

    db = get_db(host=WORKER_HOST)
    ids = read_index(db[SOURCE], offset)    # stream doc ids, skipping `offset`
    ids = log_progress(ids, total=count)

    ids = head(ids, count)    # cap the stream at `count` ids
    chunks = group_chunks(ids, size=chunk)    # batch ids into lists of `chunk`

    connection = get_connection(host=WORKER_HOST)
    queues = dict(get_queues(annotators, connection))
    for ids_chunk in chunks:    # renamed from `chunk` to avoid shadowing the parameter
        for annotator in annotators:
            queue = queues[annotator]
            enqueue(queue, task, ids_chunk)    # `task` is the module-level worker callable
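
group_chunks and the queue helpers are likewise external to the snippet. One plausible sketch, assuming the queue layer is backed by rq (Redis Queue); the actual backend is not shown in the original:

from itertools import islice
from redis import Redis
from rq import Queue

def group_chunks(items, size):
    # Batch an iterable into lists of at most `size` items
    iterator = iter(items)
    while True:
        batch = list(islice(iterator, size))
        if not batch:
            return
        yield batch

def get_connection(host):
    return Redis(host=host)

def get_queues(names, connection):
    # One rq queue per annotator name
    for name in names:
        yield name, Queue(name, connection=connection)

def enqueue(queue, function, payload):
    queue.enqueue(function, payload)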
Example 3
def db_show_():
    log('Counting docs')
    db = get_db(host=WORKER_HOST)
    for name in [SOURCE] + ANNOTATORS:
        # estimated_document_count() reads collection metadata,
        # so it is fast but approximate
        count = db[name].estimated_document_count()
        print('{count:>10} {name}'.format(name=name, count=count))
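
get_db appears in every example but is never defined. A minimal sketch, assuming a plain pymongo client; the database name DB is hypothetical and not shown in the original:

from pymongo import MongoClient

DB = 'db'    # hypothetical database name

def get_db(host):
    # Connect to MongoDB on `host` and return the working database
    return MongoClient(host)[DB]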
Example 4
def dump_raw_(path, annotators, count, chunk):
    log('Dumping %s', ', '.join(annotators))
    db = get_db(host=WORKER_HOST)
    records = read_raw(db, annotators, count, chunk)    # stream raw docs per annotator
    records = log_progress(records, total=count)
    dump_raw__(records, path)    # serialize the stream to `path`
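
read_raw and dump_raw__ are also left undefined. A sketch under assumptions: raw docs are streamed per annotator collection and dumped as gzipped JSON lines; the real serialization format is not shown in the original:

import gzip
from itertools import islice
from bson import json_util

def read_raw(db, annotators, count, chunk):
    # Stream up to `count` raw docs from each annotator collection,
    # fetching `chunk` docs per network round trip
    for name in annotators:
        cursor = db[name].find(batch_size=chunk)
        yield from islice(cursor, count)

def dump_raw__(records, path):
    # Write one JSON document per line, gzip-compressed;
    # json_util handles BSON types such as ObjectId
    with gzip.open(path, 'wt', encoding='utf8') as file:
        for record in records:
            file.write(json_util.dumps(record) + '\n')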
Example 5
def db_clear_():
    log('Removing docs')
    db = get_db(host=WORKER_HOST)
    for name in [SOURCE] + ANNOTATORS:
        # Collection.remove() is deprecated (removed in pymongo 4);
        # delete_many({}) drops every document in the collection
        db[name].delete_many({})
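
Finally, log and log_progress recur throughout the examples. A minimal sketch, assuming log is a thin wrapper over the standard logging module and log_progress delegates to tqdm:

import logging
from tqdm import tqdm

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(message)s'
)

def log(format, *args):
    # %-style formatting is deferred to the logging module
    logging.info(format, *args)

def log_progress(items, total=None):
    # Wrap an iterable with a progress bar; `total` sizes the bar
    return tqdm(items, total=total)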