Example #1
def test_ner(crf, test_sent):
    from tokenizer.tokenizer import Tokenizer
    from pyvi.pyvi import ViPosTagger

    # Word-segment the raw sentence, then POS-tag the segmented text.
    token = Tokenizer()
    token.run()
    arr_featurized_sent = []
    postaged_sent = ViPosTagger.postagging(token.predict(test_sent))
    print(postaged_sent)

    # Pair each word with its POS tag: [(word, tag), ...].
    test_arr = []
    for i in range(len(postaged_sent[0])):
        test_arr.append((postaged_sent[0][i], postaged_sent[1][i]))
    print(test_arr)

    # sent2features is the CRF feature-extraction helper defined elsewhere in the project.
    featurized_sent = sent2features(test_arr)
    arr_featurized_sent.append(featurized_sent)
    predict = crf.predict(arr_featurized_sent)
    return list(zip(test_arr, predict[0]))
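
A minimal usage sketch for test_ner, assuming the CRF was trained with sklearn-crfsuite and serialized with joblib (the model path and sample sentence below are placeholders, not from the source):

import joblib

# Load the serialized CRF model (hypothetical path).
crf = joblib.load("models/ner_crf.pkl")
for (word, tag), label in test_ner(crf, u"Hôm nay tôi đi Hà Nội"):
    print(word, tag, label)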
def first_stats():
    tokenizer = Tokenizer()
    tokenizer.run()
    question_vocabulary = Vocabulary()

    questions = load_questions()
    question_list = []  # token-id sequences for all questions
    cc = 0
    for question in questions:
        # Print a progress counter every 10 questions.
        if cc % 10 == 0:
            print("\r%s" % cc, end="", flush=True)
        cc += 1
        # Word-segment, lowercase, and map each token to its vocabulary id.
        sen = tokenizer.predict(question)
        sen = sen.lower()
        tokens = question_vocabulary.get_sentence_token_ids(sen)
        question_list.append(tokens)
    print("\n Saving...")
    question_vocabulary.save(Q_VOCAB_NAME)
    utils.pickle_save(question_list, "question_tokens.dat")

    print("Done")
Example #2
# -*- coding: utf-8 -*-
import re
import requests
import unicodedata
import os
import joblib  # sklearn.externals.joblib has been removed from scikit-learn; import joblib directly
import pandas as pd
from tokenizer.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.svm import SVC
from pyvi.pyvi import ViTokenizer

# Initialize the word tokenizer once at module import.
tokenizer = Tokenizer()
tokenizer.run()


def load_model(model):
    # Load a serialized model from disk; return None if the file is absent.
    print('loading model ...', model)
    if os.path.isfile(model):
        return joblib.load(model)
    return None
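
A quick usage sketch (the file name is hypothetical): load_model returns None when no serialized model exists, so callers can fall back to training:

clf = load_model("svm_model.pkl")
if clf is None:
    raise SystemExit("no saved model found; train one and save it with joblib.dump")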


def list_words(mes):
    # Normalize a message: lowercase it and collapse runs of whitespace to single spaces.
    words = mes.lower().split()
    return " ".join(words)