def run(self):
    clause_df = pd.DataFrame(self.clauses)
    print("after:::", clause_df.head(5))

    # Rename the columns so the frame matches the Siamese-LSTM input format.
    clause_df.columns = ['no', 'question1', 'question2']
    for q in ['question1', 'question2']:
        clause_df[q + '_n'] = clause_df[q]

    # Make word2vec embeddings
    embedding_dim = 300
    max_seq_length = 20
    clause_df, embeddings = make_w2v_embeddings(clause_df,
                                                embedding_dim=embedding_dim,
                                                empty_w2v=False)

    # Split to dicts and append zero padding.
    X_test = split_and_zero_padding(clause_df, max_seq_length)

    # Make sure everything is ok
    assert X_test['left'].shape == X_test['right'].shape

    model = tf.keras.models.load_model('./data/keras_model/SiameseLSTM.h5',
                                       custom_objects={'ManDist': ManDist})
    model.summary()

    prediction = model.predict([X_test['left'], X_test['right']], verbose=1)
    print(prediction)

    # Zip each section header with its model prediction, e.g. 'Section 1 : 0.54'.
    result = zip([x[0] for x in self.clauses], prediction.tolist())
    return result
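# Loading the saved model requires the custom ManDist layer to be passed via
# `custom_objects`. For reference, here is a minimal sketch of such a layer,
# assuming the usual Siamese-LSTM similarity exp(-||left - right||_1); the
# real `util.ManDist` may differ in detail.
import tensorflow as tf
import tensorflow.keras.backend as K

class ManDistSketch(tf.keras.layers.Layer):
    """Manhattan-distance similarity: exp(-sum(|left - right|))."""

    def call(self, inputs):
        left, right = inputs
        # Similarity lies in (0, 1]; identical encodings score exactly 1.
        return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))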
def compare_si(self, senti, input_sentence):
    if senti == 0:
        datafile = self.data_directory / 'yelp_0.txt'
    elif senti == 1:
        datafile = self.data_directory / 'yelp_1.txt'
    with open(datafile) as f:
        data = f.readlines()
    # Strip periods, newlines and exclamation marks from the corpus lines.
    for i in range(len(data)):
        data[i] = data[i].replace('.', '').replace('\n', '').replace('!', '')

    # Pair the input sentence with every corpus sentence.
    test_sentence_pairs = [(input_sentence, sentence) for sentence in data]

    embedding_dict = {}
    test_df = pd.DataFrame(test_sentence_pairs, columns=['question1', 'question2'])
    for q in ['question1', 'question2']:
        test_df[q + '_n'] = test_df[q]

    test_df, embeddings = make_w2v_embeddings(embedding_dict, test_df, embedding_dim=300)
    X_test = split_and_zero_padding(test_df, 10)
    assert X_test['left'].shape == X_test['right'].shape

    preds = list(self.model_similarity.predict([X_test['left'], X_test['right']]))
    # Keep the three corpus sentences most similar to the input.
    results = [(x, y, z) for (x, y), z in zip(test_sentence_pairs, preds)]
    results.sort(key=itemgetter(2), reverse=True)
    return results[0:3]
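# Hypothetical usage of compare_si (the instance name `engine` is assumed
# for illustration): rank the positive-sentiment Yelp sentences against a
# query and print the three closest matches.
top3 = engine.compare_si(1, "the food was great and the staff was friendly")
for query, candidate, score in top3:
    print('{:.3f}  {}'.format(float(score), candidate))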
# Whether to use pre-trained word vectors; the default is randomly
# initialized vectors.
o = input("type yes or no for choosing pre-trained w2v or not:")
if o == 'yes':
    # Load the pre-trained word vectors.
    print("Loading word2vec model (it may take 2-3 mins) ...")
    embedding_dict = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
else:
    embedding_dict = {}

# Read and load the training set.
train_df = pd.read_csv(TRAIN_CSV)
for q in ['question1', 'question2']:
    train_df[q + '_n'] = train_df[q]

# Vectorize the training set.
train_df, embeddings = make_w2v_embeddings(flag, embedding_dict, train_df,
                                           embedding_dim=embedding_dim)
'''
This turns the training data from:

    question1          question2          is_duplicate
    借 呗 还款 信息     借 呗 还款 日期     0

into (the *_n columns start as copies of the text):

    question1          question2          is_duplicate   question1_n        question2_n
    借 呗 还款 信息     借 呗 还款 日期     0              借 呗 还款 信息     借 呗 还款 日期

and, after the words are replaced by ids:

    question1          question2          is_duplicate   question1_n   question2_n
    借 呗 还款 信息     借 呗 还款 日期     0              [31, 639]     [31, 255]
'''

# Split the training set.
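# `make_w2v_embeddings` is the central helper in these snippets (its exact
# signature varies between the versions shown here). A minimal sketch of
# the transformation described in the docstring above, hedged -- the real
# helper also handles segmentation and stopwords:
import numpy as np

def make_w2v_embeddings_sketch(word2vec, df, embedding_dim=300):
    vocab = {}  # word -> id; id 0 is reserved for padding
    for index, row in df.iterrows():
        for column in ['question1_n', 'question2_n']:
            ids = []
            for word in str(row[column]).split():
                if word not in vocab:
                    vocab[word] = len(vocab) + 1
                ids.append(vocab[word])
            df.at[index, column] = ids
    # One embedding row per id; row 0 stays all-zero for the padding id.
    embeddings = np.random.uniform(-1, 1, (len(vocab) + 1, embedding_dim))
    embeddings[0] = 0
    for word, idx in vocab.items():
        if word in word2vec:  # works for dicts and gensim KeyedVectors
            embeddings[idx] = word2vec[word]
    return df, embeddings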
import pandas as pd
import tensorflow as tf

from util import make_w2v_embeddings
from util import split_and_zero_padding
from util import ManDist

# File paths
TEST_CSV = './data/test.csv'

# Load the test set
test_df = pd.read_csv(TEST_CSV)
for q in ['question1', 'question2']:
    test_df[q + '_n'] = test_df[q]

# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20
test_df, embeddings = make_w2v_embeddings(test_df,
                                          embedding_dim=embedding_dim,
                                          empty_w2v=False)

# Split to dicts and append zero padding.
X_test = split_and_zero_padding(test_df, max_seq_length)

# Make sure everything is ok
assert X_test['left'].shape == X_test['right'].shape

# --

model = tf.keras.models.load_model('./data/SiameseLSTM.h5',
                                   custom_objects={'ManDist': ManDist})
model.summary()

prediction = model.predict([X_test['left'], X_test['right']])
print(prediction)
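# `split_and_zero_padding` is imported from util but never shown in these
# snippets. A minimal sketch, under the assumption that it splits the *_n
# id columns into a left/right dict and zero-pads each side:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def split_and_zero_padding_sketch(df, max_seq_length):
    # Left side holds the question1 ids, right side the question2 ids.
    X = {'left': df['question1_n'], 'right': df['question2_n']}
    # Pad every id sequence to the same fixed length.
    for side, sequences in X.items():
        X[side] = pad_sequences(sequences, padding='pre',
                                truncating='post', maxlen=max_seq_length)
    return X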
# File paths
TRAIN_CSV = './data/train.csv'

# Load training set
train_df = pd.read_csv(TRAIN_CSV)
for q in ['question1', 'question2']:
    train_df[q + '_n'] = train_df[q]

# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20
use_w2v = True

train_df, embeddings = make_w2v_embeddings(train_df,
                                           embedding_dim=embedding_dim,
                                           empty_w2v=not use_w2v)

# Split into train/validation sets
validation_size = int(len(train_df) * 0.1)
training_size = len(train_df) - validation_size

X = train_df[['question1_n', 'question2_n']]
Y = train_df['is_duplicate']

X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size)

X_train = split_and_zero_padding(X_train, max_seq_length)
X_validation = split_and_zero_padding(X_validation, max_seq_length)
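# The padded splits above feed a Siamese network. A minimal sketch of the
# usual shared-weight architecture, assuming `ManDist` is imported from
# util as in the other snippets; the hidden size is an assumption.
from tensorflow.keras.layers import Embedding, Input, LSTM
from tensorflow.keras.models import Model

n_hidden = 50  # assumed LSTM units

left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

# One embedding layer and one LSTM shared by both branches, so the two
# questions are encoded with identical weights.
embed = Embedding(len(embeddings), embedding_dim,
                  weights=[embeddings], trainable=False)
encode = LSTM(n_hidden)

malstm_distance = ManDist()([encode(embed(left_input)),
                             encode(embed(right_input))])
model = Model(inputs=[left_input, right_input], outputs=malstm_distance)
model.compile(loss='mean_squared_error', optimizer='adam',
              metrics=['accuracy'])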
import pandas as pd
import tensorflow as tf

from util import make_w2v_embeddings
from util import split_and_zero_padding
from util import ManDist

# File paths
TEST_CSV = './data/test-20.csv'
EMBEDDING_FILE = './data/GoogleNews-vectors-negative300.bin.gz'

# Load the test set
test_df = pd.read_csv(TEST_CSV)
for q in ['question1', 'question2']:
    test_df[q + '_n'] = test_df[q]

# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20
test_df, embeddings = make_w2v_embeddings(test_df,
                                          file=EMBEDDING_FILE,
                                          embedding_dim=embedding_dim,
                                          empty_w2v=False)

# Split to dicts and append zero padding.
X_test = split_and_zero_padding(test_df, max_seq_length)

# Make sure everything is ok
assert X_test['left'].shape == X_test['right'].shape

# --

model = tf.keras.models.load_model('./data/malstm.h5',
                                   custom_objects={'ManDist': ManDist})
model.summary()

prediction = model.predict([X_test['left'], X_test['right']])
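# A hedged follow-up: if test-20.csv carries an `is_duplicate` label column
# like the training data (an assumption), the predictions can be scored
# with a simple 0.5 threshold.
accuracy = ((prediction.ravel() > 0.5).astype(int)
            == test_df['is_duplicate'].values).mean()
print('accuracy: {:.3f}'.format(accuracy))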
# Parameters
max_features = 5000
maxlen = 50
gpus = 1
batch_size = 1024 * gpus
embedding_dims = 300
epochs = 10

print('Loading data...')
DATA_FILE = "~/.kaggle/datasets/uciml/news-aggregator-dataset/uci-news-aggregator.csv"
# DATA_FILE = "../uci-news-aggregator.csv"
df = pd.read_csv(DATA_FILE)
df['TITLE_n'] = df['TITLE']
df, embeddings = make_w2v_embeddings(df, embedding_dim=embedding_dims, empty_w2v=True)

# One-hot encode the news category labels.
y = OneHotEncoder().fit_transform(
    LabelEncoder().fit_transform(df['CATEGORY']).reshape(-1, 1)).toarray()
x_train, x_test, y_train, y_test = train_test_split(df['TITLE_n'], y, test_size=0.1)
# (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print(len(embeddings), 'embeddings input_dim')

print('Pad sequences (samples x time)')
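# A hedged sketch of the padding step announced by the print above, using
# the standard Keras helper; `maxlen` comes from the parameters block.
from tensorflow.keras.preprocessing import sequence

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)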
dataframe = pd.DataFrame({
    'question1': ["".join(sen1)],
    'question2': ["".join(sen2)]
})
dataframe.to_csv("./data/test.csv", index=False, sep=',', encoding='utf-8')
TEST_CSV = './data/test.csv'

# Read and load the test set.
test_df = pd.read_csv(TEST_CSV)
for q in ['question1', 'question2']:
    test_df[q + '_n'] = test_df[q]

# Vectorize the test set.
test_df, embeddings = make_w2v_embeddings(flag, embedding_dict, test_df,
                                          embedding_dim=embedding_dim)

# Preprocess: split into left/right and zero-pad.
X_test = split_and_zero_padding(test_df, max_seq_length)

# Confirm the data is ready and well-formed.
assert X_test['left'].shape == X_test['right'].shape

# Predict and evaluate accuracy.
prediction = model.predict([X_test['left'], X_test['right']])
print(prediction)
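# The comment above promises an accuracy check, but only the raw score is
# printed. A minimal hedged interpretation, assuming a 0.5 threshold on
# the similarity score:
is_duplicate = (prediction > 0.5).astype(int)
print('duplicate' if is_duplicate[0][0] else 'not duplicate')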
for q in ['question1', 'question2']:
    train_df[q + '_n'] = train_df[q]
    test_df[q + '_n'] = test_df[q]

test_df = test_df[train_df.columns]
use_w2v = True

print('-------------')
# print(train_df.head())
# print(test_df.head())
train_size = train_df.shape[0]
print('train size: {}'.format(train_size))
print('-------------')

if BUILD_EMBED:
    # Embed train and test together so they share one vocabulary.
    full_df = pd.concat([train_df, test_df], ignore_index=True)
    full_df, embeddings = make_w2v_embeddings(full_df,
                                              embedding_dim=embedding_dim,
                                              empty_w2v=not use_w2v)
    print("sentences embedded")
else:
    # Reload previously built embeddings instead of recomputing them.
    full_df = pd.read_csv('./data/full_embeddings_A1.csv')
    embeddings = np.load('./data/embeddings/embedding_matrix_A1.npy')
    print('embeddings loaded')

# Recover the original train/test split from the combined frame.
train_df = full_df.iloc[:train_size].copy()
test_df = full_df.iloc[train_size:].copy()

print('--------------------------')
# print(train_df.head())
# print(test_df.head())
print('--------------------------')
    del word2vec
    return df, embeddings


df_ = pd.DataFrame([[
    "What are the best career growth technologies for automation engineers apart from automation tools?",
    "Himalayan or Duke KTM 200 for touring?"
]], columns=["question1", "question2"])

for q in ['question1', 'question2']:
    df_[q + '_n'] = df_[q]
df_.head()

train_df, embeddings = make_w2v_embeddings(word2vec=embeddings, df=df_,
                                           embedding_dim=embedding_dim)
split_df = split_and_zero_padding(train_df, max_seq_length)
print(split_df)

# In[15]:

assert split_df['left'].shape == split_df['right'].shape

# In[16]:

def find_similar_sentence(user_input):
    is_duplicate = model.predict([split_df['left'], split_df['right']])
    return is_duplicate
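# Hypothetical call: the model scores the pair prepared in `split_df`
# above (note the `user_input` argument is currently unused by the body).
score = find_similar_sentence("Which bike suits long tours?")
print(score)  # values near 1.0 indicate near-duplicate questions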
# File paths
# TRAIN_CSV = './data/train.csv'
TRAIN_CSV = './data/quora.csv'

# Load training set
train_df = pd.read_csv(TRAIN_CSV)
for q in ['question1', 'question2']:
    train_df[q + '_n'] = train_df[q]

# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20
use_w2v = True

train_df, embeddings = make_w2v_embeddings(train_df, embedding_dim=embedding_dim)

# Split into train/validation sets
validation_size = int(len(train_df) * 0.1)
training_size = len(train_df) - validation_size

X = train_df[['question1_n', 'question2_n']]
Y = train_df['is_duplicate']

X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size)

X_train = split_and_zero_padding(X_train, max_seq_length)
X_validation = split_and_zero_padding(X_validation, max_seq_length)

# Convert labels to their numpy representations
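# A minimal sketch of the step the trailing comment announces: the pandas
# Series become plain numpy arrays so Keras can consume them in fit().
Y_train = Y_train.values
Y_validation = Y_validation.values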