def predict_(sentence):
    """Score one raw sentence with the trained model.

    Tokenizes the sentence with extract_cn_jd, converts the token list into
    a padded/truncated embedding matrix, reshapes it into a single-sample
    batch of shape (1, maxlen, word2vec_dim), and returns model.predict's
    output for that batch.
    """
    tokens = extract_cn_jd(sentence).split(' ')
    matrix = doc2matrix(tokens, maxlen)
    # One-sample batch: (1, timesteps, embedding_dim) as the model expects.
    batch = np.array(matrix).reshape((1, maxlen, word2vec_dim))
    return model.predict(batch)
# Load the positive / negative sample files (one sentence per physical line)
# and attach the binary sentiment label.
pos = pd.read_table(path.join(path.dirname(__file__), '..', 'data', 'pos_sim.txt'),
                    header=None, sep='\n', encoding='utf8')
pos['label'] = 1
neg = pd.read_table(path.join(path.dirname(__file__), '..', 'data', 'neg_sim.txt'),
                    header=None, sep='\n', encoding='utf8')
neg['label'] = 0

# DataFrame.append was deprecated and removed in pandas 2.x; pd.concat with
# ignore_index=True produces the identical stacked frame.
all_ = pd.concat([pos, neg], ignore_index=True)

# Tokenize each sentence (jieba segmentation via extract_cn_jd, which returns
# a space-joined string) into a list of words.
all_['words'] = all_[0].apply(lambda s: extract_cn_jd(s).split(' '))
print(all_['words'])

# Pre-trained word2vec embeddings in binary C format; unicode_errors='ignore'
# skips undecodable vocab entries instead of raising.
w2v_model = KeyedVectors.load_word2vec_format(
    path.join(path.dirname(__file__), '..', 'data', 'w2v_onlycn_100_c_2.bin'),
    binary=True, unicode_errors='ignore')
word2vec_dim = 100
maxlen = 100  # truncate/pad each document to this many words

# Collect every token that exists in the embedding vocabulary.
# NOTE(review): the original wrapped the lookup in try/except (the except
# clause is cut off in this chunk); the membership test below keeps only
# in-vocabulary words with the same visible effect and no per-word exception
# cost — confirm the original handler did nothing beyond skipping misses.
content = []
for word_list in all_['words']:
    for word in word_list:
        if word in w2v_model:
            content.append(word)