Example No. 1
def write_prediction_results(labels, tokens, output_file):
    """Write `token label` pairs in CoNLL format, one blank line between
    sentences. Relies on the module-level constants NO_ENTITY_MARK,
    DEFAULT_TAG_BODY and PREDICTED_TAG_BODIES_MAPPING."""
    lines = []
    for sentence_tokens, sentence_labels in zip(tokens, labels):
        for token, label in zip(sentence_tokens, sentence_labels):
            if len(label.split("-")) < 2:
                # Label has no BIO prefix (e.g. plain "O"): not an entity.
                lines.append(f'{token} {NO_ENTITY_MARK}')
            else:
                # Keep the BIO prefix and map the tag body (e.g. "PER" in
                # "B-PER") onto the target tag set. maxsplit=1 guards
                # against tag bodies that themselves contain a dash.
                tag_prefix, tag_body = label.split("-", 1)
                tag_body = PREDICTED_TAG_BODIES_MAPPING.get(
                    tag_body, DEFAULT_TAG_BODY)
                lines.append(f'{token} {tag_prefix}-{tag_body}')
        lines.append('')  # sentence separator
    write_lines(output_file, lines)
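To illustrate the expected inputs, here is a minimal, self-contained call. The constant values and the write_lines stand-in below are hypothetical; the project's real definitions live elsewhere in the module:

# Hypothetical values for the module-level constants used above.
NO_ENTITY_MARK = 'O'
DEFAULT_TAG_BODY = 'MISC'
PREDICTED_TAG_BODIES_MAPPING = {'PER': 'PERSON', 'LOC': 'LOCATION'}

def write_lines(path, lines):
    # Minimal stand-in for the project's write_lines helper.
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))

tokens = [['John', 'lives', 'in', 'Kyiv']]
labels = [['B-PER', 'O', 'O', 'B-LOC']]
write_prediction_results(labels, tokens, 'predictions.txt')
# predictions.txt now contains:
# John B-PERSON
# lives O
# in O
# Kyiv B-LOCATION
# (followed by a blank sentence-separator line)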
Example No. 2
import argparse
import functools
import json
from pathlib import Path

import tensorflow as tf

# PARAMS, MODELDIR, model_fn, predict_input_fn, read, pretty_print and
# write_lines are defined elsewhere in the project.

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--training_data',
                        type=str,
                        default='../../data/conll2003ru')
    parser.add_argument('--text', type=str, default='../../raw.txt')
    parser.add_argument('--output',
                        type=str,
                        default='../../raw.lstm-crf.predictions.txt')

    args = parser.parse_args()

    # Load the base hyperparameters and point them at the dataset's
    # vocabularies and pretrained embeddings.
    with Path(PARAMS).open() as f:
        params = json.load(f)

    params['words'] = str(Path(args.training_data, 'vocab.words.txt'))
    params['chars'] = str(Path(args.training_data, 'vocab.chars.txt'))
    params['tags'] = str(Path(args.training_data, 'vocab.tags.txt'))
    params['glove'] = str(Path(args.training_data, 'glove.npz'))

    text = read(args.text)

    # Run the LSTM-CRF estimator over the raw text and write the predicted
    # tags. Only the first prediction is needed here, hence the break.
    estimator = tf.estimator.Estimator(model_fn, MODELDIR, params=params)
    predict_inpf = functools.partial(predict_input_fn, text)
    for pred in estimator.predict(predict_inpf):
        lines = pretty_print(text, pred['tags'])
        write_lines(args.output, lines)
        break
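Assuming the snippet is saved as predict.py (a hypothetical file name), a typical invocation mirrors the argument defaults:

python predict.py --training_data ../../data/conll2003ru --text ../../raw.txt --output ../../raw.lstm-crf.predictions.txt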
Example No. 3
import os

def convert(input_folder, output_folder):
    """Convert a CoNLL-2003 style corpus into the words/tags file layout
    expected by the training pipeline, then build vocabs and embeddings."""
    if not os.path.isdir(output_folder):
        os.mkdir(output_folder)

    test_a_filename = f'{input_folder}/{TEST_A_CONLL2003_FILE_NAME}'
    words, tags = reformat_file(test_a_filename)
    write_lines(f'{output_folder}/testa.words.txt', words)
    write_lines(f'{output_folder}/testa.tags.txt', tags)

    # Fall back to the testa split when the corpus ships no testb file.
    test_b_filename = f'{input_folder}/{TEST_B_CONLL2003_FILE_NAME}'
    if not os.path.isfile(test_b_filename):
        test_b_filename = test_a_filename
    words, tags = reformat_file(test_b_filename)
    write_lines(f'{output_folder}/testb.words.txt', words)
    write_lines(f'{output_folder}/testb.tags.txt', tags)

    words, tags = reformat_file(f'{input_folder}/{TRAIN_CONLL2003_FILE_NAME}')
    write_lines(f'{output_folder}/train.words.txt', words)
    write_lines(f'{output_folder}/train.tags.txt', tags)

    # Build vocabs
    make_vocabs(output_folder)

    # Build embeddings from a pretrained word2vec text file
    make_embeddings(output_folder, '/home/dima/models/ArModel100w2v.txt')
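reformat_file is not shown in this example. A minimal sketch of what it might look like, assuming the standard CoNLL-2003 layout (token in the first column, tag in the last, blank lines between sentences) and one space-joined sentence per output line; the project's real implementation may differ:

def reformat_file(filename):
    # Collect parallel lists of sentence strings: one line of words and
    # one line of tags per sentence.
    words, tags, w_sent, t_sent = [], [], [], []
    with open(filename, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line ends the current sentence.
                if w_sent:
                    words.append(' '.join(w_sent))
                    tags.append(' '.join(t_sent))
                    w_sent, t_sent = [], []
                continue
            parts = line.split()
            w_sent.append(parts[0])
            t_sent.append(parts[-1])
    if w_sent:  # flush a trailing sentence with no final blank line
        words.append(' '.join(w_sent))
        tags.append(' '.join(t_sent))
    return words, tags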
Example No. 4
def write_prediction_results(labels, tokens, output_file):
    """Variant of the writer in Example No. 1: drops the BIO prefix and
    capitalizes the tag body, with no blank lines between sentences."""
    lines = []
    for sentence_tokens, sentence_labels in zip(tokens, labels):
        for token, label in zip(sentence_tokens, sentence_labels):
            # 'B-PER'.split('-')[-1] is 'PER'; capitalize() yields 'Per'.
            lines.append(f'{token} {label.split("-")[-1].capitalize()}')
    write_lines(output_file, lines)
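For comparison with Example No. 1, the same hypothetical inputs (and the same write_lines stand-in from that sketch) produce flattened labels:

tokens = [['John', 'lives', 'in', 'Kyiv']]
labels = [['B-PER', 'O', 'O', 'B-LOC']]
write_prediction_results(labels, tokens, 'predictions.flat.txt')
# predictions.flat.txt:
# John Per
# lives O
# in O
# Kyiv Loc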