コード例 #1
0
ファイル: train.py プロジェクト: Xeclipse/ATEC_nul_sim
def test():
    """Score the pickled test split and pass the results to ``error_analysis``.

    Loads the corpus plus the pickled ``./test_X`` / ``./test_Y`` data, runs
    ``predict`` on them, reduces both predictions and labels to class indices
    with ``np.argmax``, and hands the two index lists together with the corpus
    slice to ``error_analysis``.
    """
    # [-2000:-1] keeps the last 2000 corpus entries except the very last one.
    held_out = tp.loadPickle(corpus_save_file)[-2000:-1]
    features = tp.loadPickle('./test_X')
    targets = tp.loadPickle('./test_Y')
    scores = predict(features, targets)
    predicted_classes = [np.argmax(row) for row in scores]
    true_classes = [np.argmax(row) for row in targets]
    error_analysis(predicted_classes, true_classes, held_out)
コード例 #2
0
ファイル: dataprocess.py プロジェクト: Xeclipse/ATEC_nul_sim
def output_neg_pos():
    """Split the pickled corpus into ``./pos_case`` and ``./neg_case``.

    Every pair whose ``label`` equals the string ``'1'`` is written (one
    ``__str__`` rendering per line) to ``./pos_case``; every other pair goes
    to ``./neg_case``. Both files are truncated on each call.
    """
    pairs = tp.loadPickle(corpus_save_file)
    # The context manager closes both files even if a write or __str__ raises,
    # which the previous explicit close() calls did not guarantee.
    with open('./pos_case', 'w') as pos_out, open('./neg_case', 'w') as neg_out:
        for pair in pairs:
            target = pos_out if pair.label == '1' else neg_out
            target.write(pair.__str__() + '\n')
コード例 #3
0
def gen_templates():
    """Mine templates with ``Template_Finder`` and pickle them as an ``XTrie``.

    The tokenizer is unpickled from ``../ATEC/tokenizer``; the finder is
    trained and then run over fresh ``F()`` inputs. Templates reporting
    ``is_trivial()`` are dropped, and the rest are stored in the trie keyed by
    the tuple of their ``words``. The trie is saved to ``../ATEC/templates``.
    """
    tok = tp.loadPickle('../ATEC/tokenizer')
    finder = Template_Finder(tok.tokenize, window=3)
    finder.train(F())
    finder.find(F())

    # Keep only the non-trivial templates.
    kept = {}
    for template, value in finder.templates.items():
        if not template.is_trivial():
            kept[template] = value

    trie = XTrie()
    for template, value in kept.items():
        trie[tuple(template.words)] = value
    tp.savePickle(trie, '../ATEC/templates')
コード例 #4
0
ファイル: dataprocess.py プロジェクト: Xeclipse/ATEC_nul_sim
def output_pari(file):
    """Write both sentences of every corpus pair to ``file``, one per line.

    Args:
        file: Path of the output file; it is opened in ``'w'`` mode.
    """
    pairs = tp.loadPickle(corpus_save_file)
    with open(file, 'w') as out:
        for pair in pairs:
            # First sentence, then second, each on its own line.
            for sentence in (pair.first_sen, pair.second_sen):
                out.write(sentence + '\n')
コード例 #5
0
ファイル: dataprocess.py プロジェクト: Xeclipse/ATEC_nul_sim
from collections import Counter
from utils.structures import TrieTree, TrieTreeNode
import utils.textProcess as tp
from utils.nlp_zero import Tokenizer
# jieba.add_word("花呗", freq=100000)
# jieba.add_word("借呗", freq=100000)
# jieba.add_word("外卖", freq=100000)
# jieba.add_word("闲鱼", freq=100000)
# jieba.add_word("更改", freq=100000)

# Data locations, all relative to the current working directory.
csv_file = './data/atec_nlp_sim_train.csv'
csv_file2 = './data/atec_nlp_sim_train_add.csv'
word_dic_file = './data/processed_data/word_dic.dic'
char_dic_file = './data/processed_data/char_dic.dic'
# Pickle path that project functions read via tp.loadPickle(corpus_save_file).
corpus_save_file = './data/processed_data/corpus'
# NOTE: unpickled at import time, so './tokenizer' must exist before this
# module can be imported.
tokenizer= tp.loadPickle('./tokenizer')

class Pair:
    def __init__(self):
        self.id = -1
        self.first_sen = None
        self.second_sen = None
        self.label = 0

    def __init__(self, id, sen1, sen2, label):
        self.id = id
        self.first_sen = sen1.replace('***', '&')
        self.second_sen = sen2.replace('***', '&')
        self.label = label

    def cut_word(self):
コード例 #6
0
                fout.write(u' '.join(res).encode('utf-8') + '\n')


def gen_templates():
    """Mine templates with ``Template_Finder`` and pickle them as an ``XTrie``.

    The tokenizer is unpickled from ``../ATEC/tokenizer``; the finder is
    trained and then run over fresh ``F()`` inputs. Templates reporting
    ``is_trivial()`` are dropped, and the rest are stored in the trie keyed by
    the tuple of their ``words``. The trie is saved to ``../ATEC/templates``.
    """
    tok = tp.loadPickle('../ATEC/tokenizer')
    finder = Template_Finder(tok.tokenize, window=3)
    finder.train(F())
    finder.find(F())

    # Keep only the non-trivial templates.
    kept = {}
    for template, value in finder.templates.items():
        if not template.is_trivial():
            kept[template] = value

    trie = XTrie()
    for template, value in kept.items():
        trie[tuple(template.words)] = value
    tp.savePickle(trie, '../ATEC/templates')


if __name__ == '__main__':
    # Demo: parse two sample sentences with the pickled templates and report
    # how long loading-free parsing plus plotting takes.
    # Uncomment to regenerate '../ATEC/templates' first:
    # gen_templates()
    tokenizer = tp.loadPickle('../ATEC/tokenizer')
    templates = tp.loadPickle('../ATEC/templates')
    start = time.time()
    p = Parser(templates, tokenizer.tokenize)
    tree1 = p.parse(u'我的花呗是以前的手机号码,怎么更改成现在的支付宝的号码手机号').plot()
    tree2 = p.parse(u'怎么更改花呗手机号码').plot()

    end = time.time()
    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and, unlike it, also parses on Python 3.
    print(tree1)
    print(tree2)
    print(end - start)

# tp.savePickle(templates, './data/templates')
コード例 #7
0
ファイル: train.py プロジェクト: Xeclipse/ATEC_nul_sim
    return acc, precision, recall, f1


def extra_train_data(train_X, train_Y, count=100000):
    """Augment the training set in place with randomly mixed sentence pairs.

    For each of ``count`` attempts two row indices are drawn from the
    original rows; when they differ, a new sample made of the first element
    of each row is appended to ``train_X`` and ``[1.0, 0.0]`` is appended to
    ``train_Y`` (presumably the one-hot "not similar" label -- confirm
    against the label encoding used by the caller).

    Args:
        train_X: List of samples; each sample is indexed with ``[0]``.
        train_Y: List of labels, extended in step with ``train_X``.
        count: Number of sampling attempts (not the number of rows added,
            since attempts that draw the same index twice are skipped).

    Returns:
        The same ``train_X`` and ``train_Y`` list objects, after extension.
    """
    length = len(train_X)
    if length < 2:
        # Two distinct rows are required to build a mixed pair.
        return train_X, train_Y
    for _ in range(count):
        # randrange excludes ``length``; the former randint(0, length) could
        # return ``length`` itself, raising IndexError or sampling a row that
        # this very loop had appended.
        f = random.randrange(length)
        s = random.randrange(length)
        if f != s:
            train_X.append([train_X[f][0], train_X[s][0]])
            train_Y.append([1.0, 0.0])
    return train_X, train_Y


# corpus = tp.loadPickle(corpus_save_file)
# Unpickled once at import time from the path held in `char_dic_file`
# (presumably a character -> index vocabulary -- TODO confirm).
index_dic = tp.loadPickle(char_dic_file)

# train_X, train_Y = build_train_set()
# train_Y = [[i] for i in train_Y]
# train_Y = tp.oneHotLabels(train_Y, 1)
# test_X = train_X[-2000:-1]
# test_Y = train_Y[-2000:-1]
# train_X = train_X[0:-2000]
# train_Y = train_Y[0:-2000]
# tp.savePickle(train_X,'./train_X')
# tp.savePickle(train_Y,'./train_Y')
# tp.savePickle(test_X,'./test_X')
# tp.savePickle(test_Y,'./test_Y')


def train():