Example #1
    def score_it(self, bigram, incoming_char):
        # Score attaching incoming_char after the given bigram: query the
        # maxent model for the probability of the positive label "T" and
        # return its base-10 logarithm. Relies on `math` and `feature_gen`
        # being imported at module level.
        feature = feature_gen(bigram, incoming_char)
        raw_score = self.eval([u_str.encode('utf-8') for u_str in feature],
                              "T")  # maxent_model only takes utf-8 strings as input
        # print('\t\tRaw score =', raw_score)

        return math.log(raw_score, 10)
def gen_instance_by_traversal_lattice(valid_state, backward_lattice, sent, dummy_end):
    # Walk the backward lattice position by position, pair every cached bigram
    # with the incoming character, and label the pair u"T" if it is a valid
    # state and u"F" otherwise. Returns a list of (label, feature) pairs.
    instance = []

    display_flag = False

    if display_flag: print("instance gen...")

    for i in range(1, len(backward_lattice) + 1):

        incoming_char = get_incoming_char(sent, i, dummy_end)

        if display_flag: print('\n\ni=', i)

        cached_bigram = backward_lattice[i - 1]

        for j in cached_bigram:

            if display_flag: print('\tj=', j)

            if j == 0:
                label = u"F"

                if (cached_bigram[0], incoming_char) in valid_state:
                    label = u"T"

                feature = feature_gen(cached_bigram[0], incoming_char)
                instance.append((label, feature))

                if display_flag:
                    print('\t## label/bigram/incoming_char=', label,
                          u"-".join(cached_bigram[0]), incoming_char,
                          'feature=', u"/".join(feature))

            else:

                for k in cached_bigram[j]:
                    bigram = cached_bigram[j][k]

                    label = u"F"
                    if (bigram, incoming_char) in valid_state:
                        label = u"T"

                    feature = feature_gen(bigram, incoming_char)
                    instance.append((label, feature))

                    if display_flag:
                        print('\t\tk=', k, 'label/bigram/incoming_char=', label,
                              u"-".join(bigram), incoming_char,
                              'feature=', u"/".join(feature))

    return instance
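
The two functions above only show how feature_gen is consumed: it receives a bigram (a tuple of strings, joined with "-" for display) plus the incoming character, and returns a list of unicode feature strings that are later joined with "/" or UTF-8 encoded for the maxent model. A minimal stand-in with that contract could look like the sketch below; the concrete feature templates are an assumption for illustration, not the project's real implementation.

# -*- coding: utf-8 -*-
# Hypothetical stand-in for feature_gen, inferred only from its call sites
# above; the actual feature templates used by the project are unknown.
def feature_gen(bigram, incoming_char):
    w1, w2 = bigram
    return [u"W1=" + w1,                            # left element of the bigram
            u"W2=" + w2,                            # right element of the bigram
            u"C=" + incoming_char,                  # the character being attached
            u"W2_C=" + w2 + u"_" + incoming_char]   # assumed conjunction feature

print(u"/".join(feature_gen((u"中国", u"人民"), u"银")))  # W1=中国/W2=人民/C=银/W2_C=人民_银
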
Example #4
def score_it(bigram, incoming_char):
    # Baseline scorer: ignore any model and simply penalize a candidate by
    # the total length of its generated feature strings.
    # return -1.0

    feature = feature_gen(bigram, incoming_char)
    print("feature:", u" ".join(feature))
    score = -len(u"".join(feature))

    return score
Example #6
    print(clf.best_estimator_)        # print the best estimator found by the search
    print(clf.score(X_test, y_test))  # print its accuracy on the held-out test set


l_1mer = list_1mer()
l_2mer = dict_2mer(l_1mer)
word_train = open(r'./temp/word_train.txt', mode='r', encoding='utf-8')
pinyin_train = open(r'./temp/pinyin_train.txt', mode='r', encoding='utf-8')
Data_Feature = []
Data_Label = []

content1 = word_train.readlines()
# take 2000 word samples at random for the parameter search
for i in getrandom(len(content1), 2000):
    Data_Feature.append(feature_gen(l_2mer, content1[i]))
    Data_Label.append(1)  # label for a word sample is 1

content2 = pinyin_train.readlines()
# take 2000 pinyin samples at random for the parameter search
for j in getrandom(len(content2), 2000):
    Data_Feature.append(feature_gen(l_2mer, content2[j]))
    Data_Label.append(0)  # label for a pinyin sample is 0

# train several estimators and search for the best parameters
SVC_search(Data_Feature, Data_Label)

print('Program running time:')
end = time.process_time()
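
Two helpers here appear only through their call sites: getrandom(n, k), which evidently draws k random indices from range(n), and SVC_search, which judging by the prints at the top of this example wraps a parameter search and reports clf.best_estimator_ and the test-set score. A possible shape for both, assuming scikit-learn's GridSearchCV and an 80/20 split (the parameter grid and the split ratio are guesses, not the author's values):

import random

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC


def getrandom(n, k):
    # Hypothetical helper matching its use above: k distinct random indices in range(n).
    return random.sample(range(n), k)


def SVC_search(features, labels):
    # Sketch of a parameter search consistent with the prints shown above;
    # the parameter grid and test_size are assumptions.
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)
    param_grid = {'C': [0.1, 1, 10],
                  'gamma': ['scale', 0.01, 0.1],
                  'kernel': ['rbf', 'linear']}
    clf = GridSearchCV(SVC(), param_grid, cv=5)
    clf.fit(X_train, y_train)
    print(clf.best_estimator_)        # print the best estimator
    print(clf.score(X_test, y_test))  # print its accuracy on the test set

With the default refit=True, GridSearchCV refits the best parameter combination on the full training split, so clf.best_estimator_ and clf.score(X_test, y_test) behave exactly as in the prints above.
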
Example #7
import time

import joblib
from feature_gen import feature_gen

start = time.process_time()

l_1mer = list_1mer()
l_2mer = dict_2mer(l_1mer)
classifile = open(r'./input/tokens.txt', mode='r', encoding='utf-8')
pinyin = open(r'./output/pinyin.txt', mode='w', encoding='utf-8')
words = open(r'./output/words.txt', mode='w', encoding='utf-8')

Data_Feature = []
# load the trained SVM classifier clf.pkl as clf_new
clf_new = joblib.load(r'./model/clf.pkl')
content = classifile.readlines()
for line in content:
    # generate the feature vector for each string in tokens.txt
    Data_Feature.append(feature_gen(l_2mer, line))

# predict whether each string in tokens.txt is a word or pinyin
Data_Label = clf_new.predict(Data_Feature)
#Data_Proba = clf_new.predict_proba(Data_Feature)  # view the prediction probabilities
for i in range(len(Data_Label)):
    if Data_Label[i] == 1:
        words.write(content[i])
        #words.write(content[i].strip() + ' ' + str(Data_Proba[i]) + '\n')  # also write the prediction probability into words.txt
    if Data_Label[i] == 0:
        pinyin.write(content[i])
        #pinyin.write(content[i].strip() + ' ' + str(Data_Proba[i]) + '\n')  # also write the prediction probability into pinyin.txt

classifile.close()
pinyin.close()
words.close()
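
One caveat if the commented-out predict_proba lines are re-enabled: scikit-learn's SVC only exposes predict_proba when it was constructed with probability=True, so clf.pkl must have been saved from such an estimator. A self-contained sketch of that requirement on synthetic data (this is not the project's actual training script, and the output path is simplified):

import joblib
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=40, n_features=5, random_state=0)
clf = SVC(probability=True)   # probability=True is what enables predict_proba
clf.fit(X, y)
joblib.dump(clf, 'clf.pkl')   # the example above loads its model from ./model/clf.pkl
print(joblib.load('clf.pkl').predict_proba(X[:2]))  # per-class probabilities
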
Example #8
from feature_gen import feature_gen
from mln_generator import mln_generator
import sys

# In this project feature_gen is a class: construct it, then run feature generation.
fg = feature_gen()
fg.generate_features()