Example #1
0
def predict_(sentence):
    """Run the trained model on one raw sentence and return its prediction.

    The sentence is tokenized by ``extract_cn_jd`` (jieba-based segmenter,
    presumably — confirm), turned into a ``maxlen`` x ``word2vec_dim``
    embedding matrix, and fed to the global ``model`` as a batch of one.
    """
    tokens = extract_cn_jd(sentence).split(' ')
    matrix = doc2matrix(tokens, maxlen)
    # Model expects a 3-D batch: (batch_size=1, maxlen, word2vec_dim).
    batch = np.array(matrix).reshape((1, maxlen, word2vec_dim))
    return model.predict(batch)
Example #2
0
def _load_labeled(filename, label):
    """Read a one-sentence-per-line text file and attach a class label.

    NOTE(review): sep='\n' makes each whole line a single field; recent
    pandas versions reject the line terminator as a separator — confirm
    the pinned pandas version before upgrading.
    """
    df = pd.read_table(path.join(path.dirname(__file__), '..', 'data',
                                 filename),
                       header=None,
                       sep='\n',
                       encoding='utf8')
    df['label'] = label
    return df


pos = _load_labeled('pos_sim.txt', 1)  # positive samples, label 1
neg = _load_labeled('neg_sim.txt', 0)  # negative samples, label 0
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# (and long-available) equivalent with identical semantics here.
all_ = pd.concat([pos, neg], ignore_index=True)

# Tokenize each sentence with the jieba-based extractor (column 0 holds
# the raw text because the files were read with header=None).
all_['words'] = all_[0].apply(lambda s: extract_cn_jd(s).split(' '))
print(all_['words'])
w2v_model = KeyedVectors.load_word2vec_format(path.join(
    path.dirname(__file__), '..', 'data', 'w2v_onlycn_100_c_2.bin'),
                                              binary=True,
                                              unicode_errors='ignore')
word2vec_dim = 100  # dimensionality of the loaded word2vec embeddings

maxlen = 100  # truncate/pad each document to this many tokens

# Collect every token that actually has an embedding in the word2vec model.
# The except clause handling out-of-vocabulary words lies beyond this
# excerpt — presumably a bare pass/continue on KeyError; TODO confirm.
content = []
for word_list in all_['words']:
    for word in word_list:
        try:
            # Lookup raises KeyError for out-of-vocabulary words; the
            # vector itself is unused here — only membership matters.
            vec = w2v_model[word]
            content.append(word)