def test_ner(crf, test_sent): from tokenizer.tokenizer import Tokenizer token = Tokenizer() token.run() arr_featurized_sent = [] postaged_sent = ViPosTagger.postagging(token.predict(test_sent)) print postaged_sent test_arr = [] for i in xrange(len(postaged_sent[0])): test_arr.append((postaged_sent[0][i], postaged_sent[1][i])) print test_arr featurized_sent = sent2features(test_arr) arr_featurized_sent.append(featurized_sent) predict = crf.predict(arr_featurized_sent) return zip(test_arr, predict[0])
def first_stats(): tokenizer = Tokenizer() tokenizer.run() question_vocabulary = Vocabulary() questions = load_questions() cc = 0 for question in questions: #print question if cc % 10 == 0: print "\r%s" % cc, cc += 1 sen = tokenizer.predict(question) sen = sen.lower() tokens = question_vocabulary.get_sentence_token_ids(sen) question_list.append(tokens) print "\n Saving..." question_vocabulary.save(Q_VOCAB_NAME) utils.pickle_save(question_list, "question_tokens.dat") print "Done"
# -*- encoding: utf8 -*-
import re
import requests
import unicodedata
from tokenizer.tokenizer import Tokenizer
# NOTE(review): sklearn.externals.joblib is deprecated/removed in modern
# scikit-learn; migrating to the standalone `joblib` package would add a
# dependency, so the import is left unchanged here.
from sklearn.externals import joblib
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from pyvi.pyvi import ViTokenizer
from sklearn.metrics import confusion_matrix

# Module-level tokenizer, initialized once at import time and shared by
# the rest of the module.
tokenizer = Tokenizer()
tokenizer.run()


def load_model(model):
    """Load a persisted model from `model` (a file path) via joblib.

    Returns the deserialized object, or None when the file does not exist.
    """
    print('loading model ...', model)
    if not os.path.isfile(model):
        return None
    return joblib.load(model)


def list_words(mes):
    """Lowercase `mes` and collapse all whitespace runs to single spaces."""
    return " ".join(mes.lower().split())