Exemple #1
0
def prepare_worker_(document_path):
    """Convert the documents of one TREC file into windowed instances.

    Every document read from ``document_path`` whose identifier occurs in
    ``prepare_worker.document_assocs`` is tokenized, cut into windows and
    pushed onto ``prepare_worker.result_queue`` as a
    ``(doc_id, instances_and_labels, label)`` tuple, where each element of
    ``instances_and_labels`` is ``(doc_id, instance, label)``. Documents
    without associations are skipped with a debug message.

    Configuration (``encoding``, ``args``, ``words``, ``document_assocs``
    and ``result_queue``) is read from attributes of ``prepare_worker``;
    presumably these are installed before the worker runs -- confirm
    against the code that sets them up.

    Args:
        document_path: path of the TREC file to read.

    Returns:
        The number of documents for which a result was queued.
    """
    documents = trec_utils.TRECTextReader(
        [document_path], encoding=prepare_worker.encoding)
    processed = 0

    for doc_id, doc_text in documents.iter_documents(
            replace_digits=True, strip_html=True):
        if doc_id not in prepare_worker.document_assocs:
            logging.debug('Document "%s" does not exist in associations.',
                          doc_id)
            continue

        def _report_empty(num_yielded_windows, remaining_tokens):
            # Only documents that produced no window at all are reported.
            if num_yielded_windows:
                return

            logging.error('Document "%s" (%s) yielded zero instances; '
                          'remaining tokens: %s.',
                          doc_id, doc_text, remaining_tokens)

        padding_token = None if prepare_worker.args.no_padding else '</s>'

        # Build the pipeline stage by stage, innermost filter first.
        stream = io_utils.filter_non_alphanumeric_stream(iter(doc_text))
        stream = io_utils.filter_non_latin_stream(stream)
        stream = io_utils.lowercased_stream(stream)
        # Ignore end-of-sentence.
        stream = io_utils.token_stream(stream, eos_chars=[])
        stream = io_utils.replace_numeric_tokens_stream(stream)
        windows = io_utils.windowed_translated_token_stream(
            stream,
            prepare_worker.args.window_size,
            prepare_worker.words,
            stride=prepare_worker.args.stride,
            padding_token=padding_token,
            callback=_report_empty)

        # Entities associated with the document; they determine the label.
        entity_ids = list(prepare_worker.document_assocs[doc_id])

        label = _candidate_centric_label(entity_ids)

        # Rescale the label in place so that its values sum to one.
        # NOTE(review): a label whose values sum to zero would divide by
        # zero here -- confirm _candidate_centric_label cannot return one.
        normalizer = float(sum(label.values()))
        for key in label:
            label[key] /= normalizer

        # Every instance of the document shares the same label object.
        instances_and_labels = [
            (doc_id, instance, label) for instance in windows]

        prepare_worker.result_queue.put(
            (doc_id, instances_and_labels, label))

        processed += 1

    return processed
def parse_query(unsplitted_terms):
    """Split a raw query string into a list of lowercased tokens.

    The stripped query first has every match of ``remove_parentheses_re``
    replaced by the match's first capture group, then ``/`` and ``-`` are
    turned into spaces. The resulting characters are passed through the
    same ``io_utils`` filters used for documents (non-alphanumeric filter,
    non-latin filter, lowercasing) and tokenized without end-of-sentence
    characters.

    Args:
        unsplitted_terms: the query as a single ``str``.

    Returns:
        A list with the tokens produced by ``io_utils.token_stream``.
    """
    assert isinstance(unsplitted_terms, str)

    query = remove_parentheses_re.sub(r'\1', unsplitted_terms.strip())

    # Slashes and hyphens separate terms, so treat them as whitespace.
    for separator in ('/', '-'):
        query = query.replace(separator, ' ')

    stream = io_utils.filter_non_alphanumeric_stream(iter(query))
    stream = io_utils.filter_non_latin_stream(stream)
    stream = io_utils.lowercased_stream(stream)

    return list(io_utils.token_stream(stream, eos_chars=[]))