Exemple #1
0
def tag():
    """Tag the text of a POSTed JSON request and return the result as JSON.

    The request body is read with ``request.get_json()`` and must provide
    ``text`` together with the ``split_sentences`` and ``tokenize`` flags.
    Every non-empty tokenised sentence is run through the tagger and
    contributes one list of ``(word, tag)`` pairs to the JSON response.

    Requests using any other method fall through and return ``None``.
    """
    if request.method == 'POST':
        data = request.get_json()
        text = data['text']
        # NOTE(review): when ``split_sentences`` is false, ``text`` is iterated
        # as is, so it is presumably already a list of sentences (or of token
        # lists when ``tokenize`` is false as well) -- confirm with the client.
        if data['split_sentences']:
            sentences = split_sentences(text)
        else:
            sentences = text

        if data['tokenize'] or data['split_sentences']:
            tokenized_sentences = [tokenize(s) for s in sentences]
        else:
            tokenized_sentences = text

        count = 0
        output = []
        for words in tokenized_sentences:
            if len(words) == 0:
                continue
            # Normalise a copy of the tokens for the model. The previous code
            # normalised an undefined ``line`` variable (NameError); the
            # response keeps reporting the original tokens.
            # Assumes every token is a string -- TODO confirm.
            model_words = words
            # Lowercase sentence
            if model.parameters['lower']:
                model_words = [w.lower() for w in model_words]
            # Replace all digits with zeros
            if model.parameters['zeros']:
                model_words = [zero_digits(w) for w in model_words]
            # Prepare input
            sentence = prepare_sentence(model_words,
                                        word_to_id,
                                        char_to_id,
                                        lower=model.parameters['lower'])
            input = create_input(sentence, model.parameters, False)
            # Decoding
            if model.parameters['crf']:
                # The first and last predictions are dropped (presumably the
                # CRF start/end padding -- confirm against the model).
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if model.parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
            assert len(y_preds) == len(
                words
            ), "Predictions have different length than sentence. Something went wrong."
            output.append(list(zip(words, y_preds)))
            count += 1
            if count % 100 == 0:
                logging.info(count)

        return jsonify(output)
Exemple #2
0
def extract_predictions_from_raw_text(model_path, tokens, pos):
    """Tag one pre-tokenised sentence with the model stored at ``model_path``.

    The tokens are joined with single spaces, lowercased and/or digit-zeroed
    according to the model parameters, and split on single spaces again before
    the orthographic features are computed.

    Returns a ``(tokens, tags)`` pair: the normalised tokens and the predicted
    tags after ``resolve_inconsistencies``.

    Raises ``AssertionError`` when tokens, ``pos`` and the orthographic
    features do not all have the same length.
    """
    def _inverse(mapping):
        # Turn an id -> item mapping into item -> id.
        return {value: key for key, value in mapping.items()}

    model = Model(model_path=model_path)
    parameters = model.parameters
    parameters.setdefault('language_model', False)

    # Reverse mappings (item -> id) required by prepare_sentence.
    word_to_id, char_to_id, _tag_to_id = map(
        _inverse, (model.id_to_word, model.id_to_char, model.id_to_tag))
    pos_to_id, ortho_to_id, segment_to_id = map(
        _inverse, (model.id_to_pos, model.id_to_ortho, model.id_to_segment))
    word_to_id_1 = _inverse(model.id_to_word_1)

    # Build the evaluation function, then load the trained weights.
    _, f_eval = model.build(training=False, **parameters)
    model.reload()
    id_to_tag = model.id_to_tag

    # Normalise the sentence as one string, then split it back into tokens.
    normalised = ' '.join(tokens)
    if parameters['lower']:
        normalised = normalised.lower()
    if parameters['zeros']:
        normalised = zero_digits(normalised)
    tokens = normalised.split(' ')

    ortho = [get_ortho_feature(token) for token in tokens]
    assert len(tokens) == len(pos) == len(ortho)

    features = {'words': tokens, 'pos': pos, 'ortho': ortho}
    sentence = prepare_sentence(features,
                                word_to_id,
                                char_to_id,
                                pos_to_id,
                                ortho_to_id,
                                segment_to_id,
                                word_to_id_1,
                                lower=parameters['lower'])
    model_input = create_input(sentence, parameters, add_label=False)

    # Decode: the CRF output loses its first and last entries, otherwise the
    # arg-max over axis 1 is taken.
    if parameters['crf']:
        tag_ids = np.array(f_eval(*model_input))[1:-1]
    else:
        tag_ids = f_eval(*model_input).argmax(axis=1)
    y_preds = [id_to_tag[tag_id] for tag_id in tag_ids]

    # Report tags in the IOB2 format.
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)
    return tokens, resolve_inconsistencies(y_preds)
Exemple #3
0
def tag(model, line):
    """Load the model stored at path ``model`` and tag the sentence ``line``.

    ``line`` is split on whitespace. Digits are replaced with zeros for the
    model input when the model's ``zeros`` parameter is set, while the returned
    string keeps the original tokens.

    Returns the tokens joined by spaces, each one formatted as
    ``token__TAG``. Progress and timing messages are printed to stdout.
    """
    # Load existing model
    print("Loading model...")
    tagger = Model(model_path=model)
    parameters = tagger.parameters

    # Reverse mappings (item -> id)
    word_to_id, char_to_id, _tag_to_id = [
        {item: index for index, item in mapping.items()}
        for mapping in (tagger.id_to_word, tagger.id_to_char,
                        tagger.id_to_tag)
    ]

    # Build the evaluation function, then load the trained weights
    _, f_eval = tagger.build(training=False, **parameters)
    tagger.reload()

    started_at = time.time()

    print('Tagging...')
    original_words = line.rstrip().split()

    # Replace all digits with zeros
    normalised = zero_digits(line) if parameters['zeros'] else line
    words = normalised.rstrip().split()

    # Prepare input
    sentence = prepare_sentence(words,
                                word_to_id,
                                char_to_id,
                                lower=parameters['lower'])
    model_input = create_input(sentence, parameters, False)

    # Decoding
    if parameters['crf']:
        tag_ids = np.array(f_eval(*model_input))[1:-1]
    else:
        tag_ids = f_eval(*model_input).argmax(axis=1)
    labels = [tagger.id_to_tag[tag_id] for tag_id in tag_ids]

    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        labels = iobes_iob(labels)

    # One label per token is expected
    assert len(labels) == len(words)

    print('---- sentence tagged in %.4fs ----' % (time.time() - started_at))

    tagged = [
        word + '__' + str(label)
        for word, label in zip(original_words, labels)
    ]
    return ' '.join(tagged)
Exemple #4
0
def tag_document(doc, parameters, model, f_eval, word_to_id, char_to_id):
    """Tag every sentence of ``doc`` and collect the tags and tokens.

    Args:
        doc: object whose ``sentences`` are iterated; each sentence exposes
            ``tokens`` and each token an ``orth_`` attribute.
        parameters: model parameters; ``lower``, ``zeros``, ``crf`` and
            ``tag_scheme`` are read.
        model: provides ``id_to_tag`` to turn predicted ids into tags.
        f_eval: evaluation function called with the prepared input.
        word_to_id: word -> id mapping passed to ``prepare_sentence``.
        char_to_id: character -> id mapping passed to ``prepare_sentence``.

    Returns:
        A tuple ``(all_ypreds, all_tokens)`` with one entry per non-empty
        sentence: the predicted tags with everything up to the last ``-``
        removed, and the normalised tokens that were tagged.

    Raises:
        AssertionError: if the number of predictions differs from the number
            of tokens of a sentence.
    """
    count = 0
    all_ypreds = []
    all_tokens = []
    for line in doc.sentences:
        toks_text = [x.orth_ for x in line.tokens]
        # Sentences without tokens are skipped (WL edit: the check used to be
        # 'if line', which was crashing on '\n' lines).
        if toks_text:
            # Lowercase sentence
            if parameters['lower']:
                toks_text = [tok.lower() for tok in toks_text]
            # Replace all digits with zeros
            if parameters['zeros']:
                toks_text = [zero_digits(tok) for tok in toks_text]
            # Prepare input
            sentence = prepare_sentence(toks_text,
                                        word_to_id,
                                        char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # One tag per token is expected
            assert len(y_preds) == len(toks_text)

            # strip IOB prefixes
            y_preds = [x.split('-')[-1] for x in y_preds]

            all_ypreds.append(y_preds)
            all_tokens.append(toks_text)

        # Empty sentences are counted as well.
        count += 1
        if count % 100 == 0:
            # Function-call form prints the same on Python 2 and is valid
            # syntax on Python 3.
            print(count)

    return (all_ypreds, all_tokens)
    def predicts(self, line):
        """Tag ``line`` and return it with entities marked as ``<text:TYPE>``.

        The line is converted with ``self.to_bigram`` twice: once unchanged,
        to recover the original text for the output, and once after
        ``zero_digits`` to build the model input. The predicted tags are
        printed to stdout.

        Every entity is closed as soon as the following tag does not continue
        it, so single-token entities and entities directly followed by another
        entity also receive their ``:TYPE>`` suffix (previously their ``<``
        was left open).

        Returns the annotated string, or ``None`` when ``line`` is empty.
        """
        if not line:
            return None

        # Save original bigrams
        bigram_sent = self.to_bigram(line, 0).strip().split()

        # Replace all digits with zeros
        line = zero_digits(line)
        input_seq = self.to_bigram(line, 0).strip().split()

        # Prepare input
        sentence = prepare_sentence(input_seq,
                                    self.word_to_id,
                                    self.char_to_id,
                                    lower=self.parameters['lower'])
        input = create_input(sentence, self.parameters, False)
        if self.parameters['crf']:
            y_preds = np.array(self.f_eval(*input))[1:-1]
        else:
            y_preds = self.f_eval(*input).argmax(axis=1)
        tags = [self.id_to_tag[y_pred] for y_pred in y_preds]

        # Output tags in the IOB2 format
        if self.parameters['tag_scheme'] == 'iobes':
            tags = iobes_iob(tags)
        print(tags)

        # Make output form
        unigram_sent = self.bigrams_to_unigrams(bigram_sent)
        last = len(tags) - 1
        pieces = []
        for i, tag in enumerate(tags):
            if tag.startswith('B'):
                pieces.append('<')
            pieces.append(unigram_sent[i])
            # Close the entity on its final token: at the end of the sentence
            # or when the next tag is not an 'I' continuation.
            if tag.startswith(('B', 'I')) and (
                    i == last or not tags[i + 1].startswith('I')):
                pieces.append(':' + tag[2:] + '>')
        return ''.join(pieces)
Exemple #6
0
def NER_for_sentence(sentence):
    """Run the NER model on ``sentence`` and return ``(word, tag)`` pairs.

    The sentence is cleaned with ``utils.remove_numbers``,
    ``utils.remove_punctua`` and ``utils.remove_whitespace`` and then split on
    whitespace. The character sequences are ordered by decreasing length and
    zero-padded into one matrix before being passed to the model.

    Returns a list with one ``(word, tag)`` tuple per remaining word. The list
    is empty when no word is left after cleaning (this used to raise
    ``ValueError`` from ``max()`` on an empty list).
    """
    sentence = utils.remove_numbers(sentence)
    sentence = utils.remove_punctua(sentence)
    sentence = utils.remove_whitespace(sentence)
    str_words = sentence.split()
    if not str_words:
        return []

    data = loader.prepare_sentence(str_words, word_to_id, char_to_id,
                                   tag_to_id)
    sentence_in = torch.tensor(data['words'], dtype=torch.long)
    cap_in = torch.tensor(data['caps'], dtype=torch.long)

    chars2 = data['chars']
    # Original indices ordered by decreasing character count. ``sorted`` is
    # stable, so equally long words keep their relative order, exactly as in
    # ``sorted(chars2, key=len, reverse=True)``.
    order = sorted(range(len(chars2)),
                   key=lambda index: len(chars2[index]),
                   reverse=True)
    chars2_sorted = [chars2[index] for index in order]
    # d maps a position in the sorted list to the original word position.
    # This is the mapping the former nested equality scan produced (duplicate
    # words were matched in order), built without the cubic search and
    # inserted in original-index order as before.
    sorted_position = {original: position
                       for position, original in enumerate(order)}
    d = {sorted_position[original]: original
         for original in range(len(chars2))}

    chars2_length = [len(chars) for chars in chars2_sorted]
    char_maxl = max(chars2_length)
    # Zero-padded matrix: one row per word, in sorted order.
    chars2_mask = np.zeros((len(chars2_sorted), char_maxl), dtype='int')
    for row, chars in enumerate(chars2_sorted):
        chars2_mask[row, :chars2_length[row]] = chars
    chars2_mask = torch.tensor(chars2_mask, dtype=torch.long)

    _, predicted_id = model(sentence_in, chars2_mask, cap_in, chars2_length,
                            d)
    id_to_tag = {index: tag for tag, index in tag_to_id.items()}
    tags = [id_to_tag[tag_id.item()] for tag_id in predicted_id]
    return list(zip(str_words, tags))
Exemple #7
0
# Tag the file named by opts.input line by line: each line is normalised,
# converted to model input, decoded and mapped back to tag names.
# (Python 2 snippet; it is truncated after the final `if`.)
print 'Tagging...'
with codecs.open(opts.input, 'r', 'utf-8') as f_input:
    count = 0
    for line in f_input:
        # Tokens before normalisation (not used in the visible part of this
        # snippet).
        words_ini = line.rstrip().split()
        # NOTE(review): lines yielded by the file keep their line terminator,
        # so this test looks always true, even for blank lines -- confirm
        # whether `if words_ini:` was intended.
        if line:
            # Lowercase sentence
            if parameters['lower']:
                line = line.lower()
            # Replace all digits with zeros
            if parameters['zeros']:
                line = zero_digits(line)
            words = line.rstrip().split()
            # Prepare input
            sentence = prepare_sentence(words,
                                        word_to_id,
                                        char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding: the CRF output loses its first and last entries,
            # otherwise the arg-max over axis 1 is taken.
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # One predicted tag per token is expected
            assert len(y_preds) == len(words)

            # The body of this branch is outside the visible snippet.
            if opts.outputFormat == 'json':
Exemple #8
0
def ner():
    """Tag the words of a JSON request and return the entities as JSON.

    Reads ``model``, ``words`` and ``begin_end`` from ``request.json``. On the
    first call (while the global ``model`` is ``None``) the requested model is
    loaded from ``models/`` and kept, together with its parameters, mappings
    and evaluation function, in module-level globals.

    Returns a JSON string ``{"ents": [...]}`` holding one
    ``{"start_char", "end_char", "label"}`` entry for every word whose
    predicted tag is not ``"O"``; the offsets come from the ``begin_end`` pair
    at the same position.

    Raises ``AssertionError`` when the number of predictions differs from the
    number of words.
    """
    global model
    global f_eval
    global parameters
    global word_to_id
    global char_to_id
    global tag_to_id
    model_name = request.json["model"]
    words = request.json["words"]
    begin_end = request.json["begin_end"]
    # NOTE(review): once a model is loaded, later requests reuse it even when
    # they name a different ``model`` -- confirm this is intended.
    if model is None:
        # Model loading. The function-call form of print outputs the same
        # text on Python 2 and is valid syntax on Python 3.
        print("Loading model " + model_name + "..")
        model = Model(model_path="models/" + models[model_name])
        parameters = model.parameters

        # Load reverse mappings
        word_to_id, char_to_id, tag_to_id = [{
            v: k
            for k, v in x.items()
        } for x in [model.id_to_word, model.id_to_char, model.id_to_tag]]

        # Load the model
        _, f_eval = model.build(training=False, **parameters)
        model.reload()

    # Lowercase sentence
    if parameters['lower']:
        words = [w.lower() for w in words]
    # Replace all digits with zeros
    if parameters['zeros']:
        words = [zero_digits(w) for w in words]
    # Fully upper-case words are converted to title case.
    words = [w if not w.isupper() else w.title() for w in words]

    # Prepare input
    sentence = prepare_sentence(words,
                                word_to_id,
                                char_to_id,
                                lower=parameters['lower'])
    input = create_input(sentence, parameters, False)

    # Decoding
    if parameters['crf']:
        y_preds = np.array(f_eval(*input))[1:-1]
    else:
        y_preds = f_eval(*input).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]

    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)

    # One predicted tag per word is expected
    assert len(y_preds) == len(words)  # TODO:remove assert?

    ents = [{
        "start_char": b,
        "end_char": e,
        "label": label
    } for (b, e), label in zip(begin_end, y_preds) if label != "O"]

    return json.dumps({"ents": ents})
def run_tagging(model,
                f_eval,
                parameters,
                word_to_id,
                char_to_id,
                tag_to_id,
                opts_input="",
                opts_output="",
                opts_delimiter="__",
                opts_outputFormat=""):
    """Tag the file ``opts_input`` line by line and write ``opts_output``.

    Args:
        model: provides ``id_to_tag`` to turn predicted ids into tags.
        f_eval: evaluation function called with the prepared input.
        parameters: model parameters; ``lower``, ``zeros``, ``crf`` and
            ``tag_scheme`` are read.
        word_to_id: word -> id mapping passed to ``prepare_sentence``.
        char_to_id: character -> id mapping passed to ``prepare_sentence``.
        tag_to_id: accepted for interface compatibility; not used here.
        opts_input: path of the UTF-8 text file to tag, one sentence per line.
        opts_output: path of the UTF-8 file to write.
        opts_delimiter: string placed between each word and its tag.
        opts_outputFormat: ``'json'`` writes one JSON object per sentence
            (``text`` and ``ranges``); anything else writes
            ``word<delimiter>tag`` tokens separated by spaces.

    Returns:
        The string ``opts_output + " has been tagged!"``.

    Raises:
        AssertionError: if ``opts_delimiter`` is empty, ``opts_input`` is not
            a file, or a sentence gets a different number of tags than words.

    Lines without any token produce an empty output line and are not sent to
    the tagger. Both files are closed even when tagging fails.
    """
    # Check parameters validity
    assert opts_delimiter
    assert os.path.isfile(opts_input)

    start = time.time()
    logger.info('Tagging...')
    count = 0
    with codecs.open(opts_output, 'w', 'utf-8') as f_output, \
            codecs.open(opts_input, 'r', 'utf-8') as f_input:
        for line in f_input:
            words_ini = line.rstrip().split()
            # Lines read from a file keep their newline, so testing ``line``
            # itself never detects a blank line; test the tokens instead.
            if words_ini:
                # Lowercase sentence
                if parameters['lower']:
                    line = line.lower()
                # Replace all digits with zeros
                if parameters['zeros']:
                    line = zero_digits(line)
                words = line.rstrip().split()
                # Prepare input
                sentence = prepare_sentence(words,
                                            word_to_id,
                                            char_to_id,
                                            lower=parameters['lower'])
                input = create_input(sentence, parameters, False)
                # Decoding
                if parameters['crf']:
                    y_preds = np.array(f_eval(*input))[1:-1]
                else:
                    y_preds = f_eval(*input).argmax(axis=1)
                y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
                # Output tags in the IOB2 format
                if parameters['tag_scheme'] == 'iobes':
                    y_preds = iobes_iob(y_preds)
                # One predicted tag per word is expected
                assert len(y_preds) == len(words)

                if opts_outputFormat == 'json':
                    # NOTE(review): no newline is written between JSON
                    # objects -- confirm this is what consumers expect.
                    f_output.write(
                        json.dumps({
                            "text": ' '.join(words),
                            "ranges": iob_ranges(y_preds)
                        }))
                else:
                    # The original (un-normalised) words are written out.
                    f_output.write(
                        '%s\n' % ' '.join('%s%s%s' % (w, opts_delimiter, y)
                                          for w, y in zip(words_ini, y_preds)))
            else:
                f_output.write('\n')
            count += 1

    logger.info('---- %i lines tagged in %.4fs ----' %
                (count, time.time() - start))
    logger.info(opts_output)
    logger.info("")
    return opts_output + " has been tagged!"


# def main():
#     logger.info( "executed"

# if __name__ == '__main__':
#     main()
Exemple #10
0
        v: k
        for k, v in x.items()
    } for x in [
        model.id_to_word, model.id_to_slb, model.id_to_char, model.id_to_tag,
        model.id_to_pos
    ]]
    id_to_tag = model.id_to_tag

    # Load the model
    _, f_eval = model.build(training=False, **parameters)
    model.reload()

    start = time.time()

    print 'Running...NER'
    test_data = prepare_sentence(test_sentences, word_to_id, slb_to_id,
                                 char_to_id, pos_to_id)
    gazette_dict = make_gazette_to_dic(dict_path)

    gazette_dict_for, gazette_dict_len = dict(), dict()
    with open(dict_path, 'r') as f:
        for line in f.readlines():
            line = line.strip().split('\t')
            words, tag = line[0], line[1]

            if len(words) > 3:
                gazette_dict_len[words] = len(words)
                gazette_dict_for[words] = tag

    gazette_dict_len = sorted(gazette_dict_len.iteritems(),
                              key=itemgetter(1),
                              reverse=True)
Exemple #11
0
# Tag the file named by opts.input line by line: each line is normalised,
# converted to model input, decoded and mapped back to tag names.
# (Python 2 snippet; the visible part ends inside the JSON output branch.)
print 'Tagging...'
with codecs.open(opts.input, 'r', 'utf-8') as f_input:
    count = 0
    for line in f_input:
        # Tokens before normalisation (not used in the visible part of this
        # snippet).
        words_ini = line.rstrip().split()
        # NOTE(review): lines yielded by the file keep their line terminator,
        # so this test looks always true, even for blank lines -- confirm
        # whether `if words_ini:` was intended.
        if line:
            # Lowercase sentence
            if parameters['lower']:
                line = line.lower()
            # Replace all digits with zeros
            if parameters['zeros']:
                line = zero_digits(line)
            words = line.rstrip().split()
            # Prepare input
            sentence = prepare_sentence(words, word_to_id, char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding: the CRF output loses its first and last entries,
            # otherwise the arg-max over axis 1 is taken.
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # One predicted tag per token is expected
            assert len(y_preds) == len(words)
            
            # JSON output: the normalised text plus the ranges computed by
            # iob_ranges; no newline is written after the object.
            if opts.outputFormat == 'json':
                f_output.write(json.dumps({ "text": ' '.join(words), "ranges": iob_ranges(y_preds) }))
Exemple #12
0
def extract_tagger_predictions(model_path,
                               span_path,
                               output_path=None,
                               f_eval=None,
                               parameters=None,
                               return_raw_predictions=False):
    """Tag every sentence of the pickled documents found at ``span_path``.

    When ``f_eval`` is not given, the model at ``model_path`` is loaded and
    built and its own parameters are used. Otherwise the given ``f_eval`` and
    ``parameters`` are used and only the mappings are read from
    ``mappings.pkl`` inside ``model_path``.

    With ``return_raw_predictions`` the function returns a dict that maps each
    document name to per-sentence lists stored under ``words``, ``tags``,
    ``start`` and ``end``. Otherwise the tags go through
    ``resolve_inconsistencies``, the entities found by ``extract_entities``
    are collected per document and handed to ``write_predictions`` together
    with ``output_path``; nothing is returned in that case.

    Raises ``AssertionError`` when ``span_path`` does not exist or when a
    normalised sentence no longer lines up with its start/end offsets.
    """
    def _reverse(mapping):
        # Turn an id -> item mapping into item -> id.
        return {item: index for index, item in mapping.items()}

    assert file_exists(span_path)
    documents = read_pickle(span_path)

    if f_eval:
        # The evaluation function is supplied: only the mappings are needed.
        mappings = read_pickle(join_path(model_path, 'mappings.pkl'))
        id_to_word = mappings['id_to_word']
        id_to_char = mappings['id_to_char']
        id_to_tag = mappings['id_to_tag']
        id_to_pos = mappings['id_to_pos']
        id_to_ortho = mappings['id_to_ortho']
        id_to_segment = mappings['id_to_segment']
        id_to_word_1 = mappings['id_to_word_1']
        word_to_id, char_to_id, _tag_to_id = map(
            _reverse, (id_to_word, id_to_char, id_to_tag))
        pos_to_id, ortho_to_id, segment_to_id = map(
            _reverse, (id_to_pos, id_to_ortho, id_to_segment))
        word_to_id_1 = _reverse(id_to_word_1)
    else:
        model = Model(model_path=model_path)
        parameters = model.parameters
        if 'language_model' not in parameters:
            parameters['language_model'] = False
        word_to_id, char_to_id, _tag_to_id = map(
            _reverse, (model.id_to_word, model.id_to_char, model.id_to_tag))
        pos_to_id, ortho_to_id, segment_to_id = map(
            _reverse,
            (model.id_to_pos, model.id_to_ortho, model.id_to_segment))
        word_to_id_1 = _reverse(model.id_to_word_1)
        # Build the evaluation function, then load the trained weights.
        _, f_eval = model.build(training=False, **parameters)
        model.reload()
        id_to_tag = model.id_to_tag

    predictions = {}
    docs_count = 0
    for doc_name, sentences in documents.items():
        for spans in sentences:
            raw_words = [span['word'] for span in spans]
            start = [span['start'] for span in spans]
            end = [span['end'] for span in spans]
            pos = [span['pos'] for span in spans]
            # The features handed to prepare_sentence use the raw words.
            input_dict = {
                'words': raw_words,
                'pos': pos,
                'ortho': [get_ortho_feature(word) for word in raw_words],
                'doc_names': [doc_name] * len(raw_words)
            }

            # The reported words are normalised as one string and split back.
            normalised = ' '.join(raw_words)
            if parameters['lower']:
                normalised = normalised.lower()
            if parameters['zeros']:
                normalised = zero_digits(normalised)
            words = normalised.split(' ')
            assert len(words) == len(start) == len(end)

            prepared = prepare_sentence(input_dict,
                                        word_to_id,
                                        char_to_id,
                                        pos_to_id,
                                        ortho_to_id,
                                        segment_to_id,
                                        word_to_id_1,
                                        lower=parameters['lower'])
            model_input = create_input(prepared, parameters, add_label=False)

            # Decode: the CRF output loses its first and last entries,
            # otherwise the arg-max over axis 1 is taken.
            if parameters['crf']:
                tag_ids = np.array(f_eval(*model_input))[1:-1]
            else:
                tag_ids = f_eval(*model_input).argmax(axis=1)
            y_preds = [id_to_tag[tag_id] for tag_id in tag_ids]

            # Report tags in the IOB2 format.
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)

            if return_raw_predictions:
                raw = predictions.setdefault(
                    doc_name,
                    {'words': [], 'tags': [], 'start': [], 'end': []})
                raw['words'].append(words)
                raw['tags'].append(y_preds)
                raw['start'].append(start)
                raw['end'].append(end)
            else:
                y_preds = resolve_inconsistencies(y_preds)
                entities = extract_entities(words, y_preds, start, end)
                doc_entities = predictions.setdefault(doc_name, [])
                if len(entities) > 0:
                    doc_entities.extend(entities)

        docs_count += 1
        if docs_count % 100 == 0:
            print('{} documents processed'.format(docs_count))

    if return_raw_predictions:
        return predictions
    write_predictions(output_path, predictions)