def run(self):
    """Score each clause pair with the pretrained Siamese-LSTM model.

    Builds a DataFrame from ``self.clauses`` (rows appear to be
    ``(section_header, text_a, text_b)`` triples -- TODO confirm against
    the caller), embeds both text columns with word2vec, zero-pads to a
    fixed sequence length, and predicts pairwise similarity.

    Returns:
        list[tuple]: ``(section_header, prediction)`` pairs, e.g.
        ``('Section 1', [0.54])``. Materialized as a list so callers can
        iterate it more than once (the original returned a one-shot
        ``zip`` iterator that was silently empty on a second pass).
    """
    clause_df = pd.DataFrame(self.clauses)
    print("after:::", clause_df.head(5))

    # Rename column headers for processing by Siamese-LSTM.
    clause_df.columns = ['no', 'question1', 'question2']
    # The '_n' copies are the columns make_w2v_embeddings vectorizes in place.
    for q in ['question1', 'question2']:
        clause_df[q + '_n'] = clause_df[q]

    # Make word2vec embeddings.
    embedding_dim = 300
    max_seq_length = 20
    clause_df, embeddings = make_w2v_embeddings(
        clause_df, embedding_dim=embedding_dim, empty_w2v=False)

    # Split to left/right dicts and append zero padding.
    X_test = split_and_zero_padding(clause_df, max_seq_length)

    # Sanity check: both sides of the pair must have matching shapes.
    assert X_test['left'].shape == X_test['right'].shape

    model = tf.keras.models.load_model(
        './data/keras_model/SiameseLSTM.h5',
        custom_objects={'ManDist': ManDist})
    model.summary()

    prediction = model.predict([X_test['left'], X_test['right']], verbose=1)
    print(prediction)

    # Pair each section header with its model prediction,
    # e.g. 'Section 1' -> 0.54.
    return list(zip([x[0] for x in self.clauses], prediction.tolist()))
def compare_si(self, senti, input_sentence):
    """Return the 3 Yelp sentences most similar to ``input_sentence``.

    Args:
        senti: sentiment selector -- 0 reads ``yelp_0.txt``,
            1 reads ``yelp_1.txt`` from ``self.data_directory``.
        input_sentence: the sentence to compare against the corpus.

    Returns:
        list of up to 3 ``(input_sentence, corpus_sentence, score)``
        tuples sorted by descending similarity score.

    Raises:
        ValueError: if ``senti`` is not 0 or 1. (The original fell
        through with ``datafile`` unbound, producing a confusing
        ``NameError`` at the ``open`` call.)
    """
    if senti == 0:
        datafile = self.data_directory / 'yelp_0.txt'
    elif senti == 1:
        datafile = self.data_directory / 'yelp_1.txt'
    else:
        raise ValueError(f"senti must be 0 or 1, got {senti!r}")

    with open(datafile) as f:
        data = f.readlines()

    # Strip periods, newlines and exclamation marks from each corpus line.
    data = [line.replace('.', '').replace('\n', '').replace('!', '')
            for line in data]

    # Pair the input sentence with every corpus sentence.
    test_sentence_pairs = [(input_sentence, sentence) for sentence in data]

    embedding_dict = {}
    test_df = pd.DataFrame(test_sentence_pairs,
                           columns=['question1', 'question2'])
    # The '_n' copies are the columns make_w2v_embeddings vectorizes.
    for q in ['question1', 'question2']:
        test_df[q + '_n'] = test_df[q]

    test_df, embeddings = make_w2v_embeddings(embedding_dict, test_df,
                                              embedding_dim=300)
    # Zero-pad both sides to a max sequence length of 10.
    X_test = split_and_zero_padding(test_df, 10)
    assert X_test['left'].shape == X_test['right'].shape

    preds = list(
        self.model_similarity.predict([X_test['left'], X_test['right']]))
    results = [(x, y, z)
               for (x, y), z in zip(test_sentence_pairs, preds)]
    # Highest similarity first; keep the top three.
    results.sort(key=itemgetter(2), reverse=True)
    return results[0:3]
借 呗 还款 信息   借 呗 还款 日期   0
becomes:
question1          question2          is_duplicate   question1_n        question2_n
借 呗 还款 信息   借 呗 还款 日期   0              借 呗 还款 信息   借 呗 还款 日期
after converting to ids:
question1          question2          is_duplicate   question1_n   question2_n
借 呗 还款 信息   借 呗 还款 日期   0              [31, 639]     [31, 255]
'''

# Split the training set: features are the vectorized question columns,
# label is the duplicate flag; hold out 10% for validation.
X = train_df[['question1_n', 'question2_n']]
Y = train_df['is_duplicate']
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.1)
X_train = split_and_zero_padding(X_train, max_seq_length)
X_validation = split_and_zero_padding(X_validation, max_seq_length)

# Convert the labels to plain numeric arrays.
Y_train = Y_train.values
Y_validation = Y_validation.values

# Confirm the data is fully prepared and consistent.
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)


# ----------------- Basic functions ------------------ #
def shared_model(_input):
    # Word embedding layer. (Body continues beyond this chunk.)
# Collect the indices of rows flagged True, plus a sentinel index one
# past the end.
# NOTE(review): `is_true`, `demo_df` and `params` are defined earlier in
# the file, outside this chunk.
positive_indexes = []
for i, _is_true in enumerate(is_true):
    if _is_true:
        positive_indexes.append(i)
positive_indexes.append(len(is_true))

print("original dataframe size", demo_df.shape)

# The '_n' copies are the columns that get vectorized below.
for q in ['question1', 'question2']:
    demo_df[q + '_n'] = demo_df[q]

# Make word2vec embeddings.
max_seq_length = 20
test_df, _ = load_embedding_and_vectorize(demo_df)

# Split to left/right dicts and append zero padding.
X_test = split_and_zero_padding(test_df, max_seq_length)

# Make sure everything is ok.
assert X_test['left'].shape == X_test['right'].shape

# --
print(np.array(test_df["is_duplicate"]).shape)
print(X_test['left'].shape, X_test['right'].shape)

# Evaluate the pretrained Siamese model on the configured GPU.
with tf.device('/device:GPU:{}'.format(params.gpu)):
    model = tf.keras.models.load_model('./models/SiameseLSTM.h5',
                                       custom_objects={'ManDist': ManDist})
    model.summary()
    # NOTE(review): this call is continued beyond this chunk.
    print(
        model.evaluate([X_test['left'], X_test['right']],
                       batch_size=512,
    # Tail of a function opened earlier in the file: hand back the
    # vectorized frame together with the embedding matrix.
    return df, embeddings


# --- Notebook-exported demo cells below (cell markers preserved) ---

df_ = pd.DataFrame([[
    "What are the best career growth technologies for automation engineers apart from automation tools?",
    "Himalayan or Duke KTM 200 for touring?"
]], columns=["question1", "question2"])
# The '_n' copies are the columns make_w2v_embeddings vectorizes.
for q in ['question1', 'question2']:
    df_[q + '_n'] = df_[q]
df_.head()

train_df, embeddings = make_w2v_embeddings(word2vec=embeddings,
                                           df=df_,
                                           embedding_dim=embedding_dim)
split_df = split_and_zero_padding(train_df, max_seq_length)
print(split_df)


# In[15]:


assert split_df['left'].shape == split_df['right'].shape


# In[16]:


def find_similar_sentence(user_input):
    # NOTE(review): `user_input` is unused -- the prediction runs on the
    # module-level `split_df` built above; verify against callers.
    is_duplicate = model.predict([split_df['left'], split_df['right']])
    return is_duplicate


# In[17]: