def section_texts(texts, size):
    # Split every text into sections of at most `size` whitespace tokens,
    # keeping character offsets into the source text.
    for source, text in enumerate(texts):
        tokens = space_tokenize(text)
        chunks = group_chunks(tokens, size)
        for chunk in chunks:
            start, stop = chunk[0].start, chunk[-1].stop
            yield Section(source, start, stop, text[start:stop])
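# Minimal sketches of the helpers section_texts relies on, for reference.
# These are hypothetical reconstructions, not this repo's definitions:
# space_tokenize is assumed to yield tokens carrying .start/.stop character
# offsets, group_chunks to batch any iterable into lists of at most `size`
# items, and Section to be a plain record.

import re
from collections import namedtuple

Token = namedtuple('Token', 'start stop text')
Section = namedtuple('Section', 'source start stop text')


def space_tokenize(text):
    # Whitespace tokenizer that keeps character offsets into `text`.
    for match in re.finditer(r'\S+', text):
        yield Token(match.start(), match.end(), match.group())


def group_chunks(items, size):
    # Buffer an iterable into lists of at most `size` items.
    buffer = []
    for item in items:
        buffer.append(item)
        if len(buffer) >= size:
            yield buffer
            buffer = []
    if buffer:
        yield buffer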
def map(cls, texts, host=DEEPPAVLOV_HOST, port=DEEPPAVLOV_PORT,
        section_size=DEEPPAVLOV_SECTION, batch_size=DEEPPAVLOV_BATCH):
    # Slice texts into token sections, query the DeepPavlov service in
    # batches, then group the per-section markups back by source text.
    texts = patch_texts(texts)
    sections = section_texts(texts, section_size)
    batches = group_chunks(sections, batch_size)
    sections = map_(cls, batches, host, port)
    groups = group_sections(sections)
    for group in groups:
        yield merge_markups(cls, group)
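# A hypothetical usage sketch: `map` reads as a classmethod of a markup
# class (call it DeeppavlovMarkup; the name is not from this file). Assuming
# a DeepPavlov NER container answers on DEEPPAVLOV_HOST:DEEPPAVLOV_PORT, it
# yields one merged markup per input text:
#
#     for markup in DeeppavlovMarkup.map(texts):
#         print(markup)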
def queue_retry_(chunk):
    # Drain the failed queue and re-enqueue its jobs on their original
    # annotator queues, regrouped into chunks of `chunk` ids.
    log('Retrying')
    connection = get_connection(host=WORKER_HOST)
    queue = get_queue(FAILED, connection=connection)
    ids = annotators_ids(queue.jobs)
    queue.empty()
    for annotator in ids:
        annotator_ids = ids[annotator]
        annotator_ids = log_progress(annotator_ids, prefix=annotator)
        chunks = group_chunks(annotator_ids, size=chunk)
        queue = get_queue(annotator, connection=connection)
        for chunk_ in chunks:
            enqueue(queue, task, chunk_)
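# annotators_ids is not defined here; a sketch of the assumed behaviour,
# built on standard rq job attributes (job.origin is the queue a job was
# enqueued on): group the failed jobs' ids by their original annotator queue.

from collections import defaultdict


def annotators_ids(jobs):
    ids = defaultdict(list)
    for job in jobs:
        chunk, = job.args  # assumption: each task was enqueued with one chunk of ids
        ids[job.origin].extend(chunk)
    return ids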
def queue_insert_(annotators, offset, count, chunk):
    # Stream document ids from the source collection, skip `offset`, keep at
    # most `count`, and enqueue them in chunks on every annotator queue.
    log(
        'Annotators: %s; offset: %d, count: %d, chunk: %d',
        ', '.join(annotators), offset, count or -1, chunk
    )
    db = get_db(host=WORKER_HOST)
    ids = read_index(db[SOURCE], offset)
    ids = log_progress(ids, total=count)
    ids = head(ids, count)
    chunks = group_chunks(ids, size=chunk)
    connection = get_connection(host=WORKER_HOST)
    queues = dict(get_queues(annotators, connection))
    for chunk_ in chunks:  # renamed from `chunk` to avoid shadowing the size argument
        for annotator in annotators:
            queue = queues[annotator]
            enqueue(queue, task, chunk_)
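# Hypothetical smoke run, assuming a Redis instance on WORKER_HOST and a
# 'deeppavlov' annotator queue (the queue name is an assumption): enqueue
# the first 1000 source ids in chunks of 100, then requeue any failures.
if __name__ == '__main__':
    queue_insert_(['deeppavlov'], offset=0, count=1000, chunk=100)
    queue_retry_(chunk=100)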