def make_tfidf_combined_feature_cos_100_holdout(row_body_path, row_stance_path,
                                                row_test_body_path, row_test_stance_path,
                                                head_pkl, body_pkl, label_path,
                                                tfidf_cos_path):
    """Build the combined headline / body / cosine feature matrix.

    When any of the four cached files is missing, all of them are regenerated
    through ``make_tfidf_cos_feature_100_holdout`` before being loaded.

    :param row_body_path: raw body file, forwarded to the feature builder
    :param row_stance_path: raw stance file, forwarded to the feature builder
    :param row_test_body_path: second raw body file, forwarded to the builder
    :param row_test_stance_path: second raw stance file, forwarded to the builder
    :param head_pkl: cached headline vectors (loaded object must offer ``toarray()``)
    :param body_pkl: cached body vectors (loaded object must offer ``toarray()``)
    :param label_path: cached labels; only its existence is checked here
    :param tfidf_cos_path: cached cosine feature, appended as the last column(s)
    :return: array with headline, body and cosine columns joined along axis 1
    """
    print(tfidf_cos_path)

    cached_files = (body_pkl, head_pkl, label_path, tfidf_cos_path)
    if not all(os.path.exists(path) for path in cached_files):
        print('.pkl files not exist. We will make new TF-IDF .pkl vectors')
        make_tfidf_cos_feature_100_holdout(row_body_path, row_stance_path,
                                           row_test_body_path, row_test_stance_path,
                                           head_pkl, body_pkl, label_path,
                                           tfidf_cos_path)
        print('.pkl make finish!')

    body_vectors = load_model(body_pkl)
    head_vectors = load_model(head_pkl)
    cos_feature = load_model(tfidf_cos_path)
    print('shape : ', body_vectors.shape, head_vectors.shape)
    print(cos_feature.shape)

    # Column order is headline, then body, then cosine.
    head_and_body = np.concatenate(
        (head_vectors.toarray(), body_vectors.toarray()), axis=1)
    return np.concatenate((head_and_body, cos_feature), axis=1)
def make_NMF_300_feature(row_body_path, row_stance_path, head_tfidf_pkl,
                         body_tfidf_pkl, label_path, save_nmf_model_path,
                         save_head_path, save_body_path, cos_dist=False):
    """Return 300-component NMF features for headlines and bodies.

    Every stage is cached on disk and only recomputed when its file is missing:
    the TF-IDF inputs, the fitted NMF model, and the transformed matrices.

    :param row_body_path: raw body file, used only if TF-IDF inputs must be built
    :param row_stance_path: raw stance file, used only if TF-IDF inputs must be built
    :param head_tfidf_pkl: cached headline TF-IDF matrix
    :param body_tfidf_pkl: cached body TF-IDF matrix
    :param label_path: cached labels; only its existence is checked here
    :param save_nmf_model_path: cache file for the fitted NMF model
    :param save_head_path: cache file for the transformed headline matrix
    :param save_body_path: cache file for the transformed body matrix
    :param cos_dist: when true, append one column holding the per-row cosine
        distance between the headline and body topic vectors
    :return: headline and body topic columns joined along axis 1, plus the
        distance column when ``cos_dist`` is set
    """
    tfidf_inputs = (head_tfidf_pkl, body_tfidf_pkl, label_path)
    if not all(os.path.exists(path) for path in tfidf_inputs):
        make_tfidf_feature_5000(row_body_path, row_stance_path, head_tfidf_pkl,
                                body_tfidf_pkl, label_path, model_save=True)

    body_tfidf = load_model(body_tfidf_pkl)
    head_tfidf = load_model(head_tfidf_pkl)

    if not os.path.exists(save_nmf_model_path):
        # The topic model is fitted on headline rows stacked above body rows.
        stacked = np.concatenate(
            (head_tfidf.toarray(), body_tfidf.toarray()), axis=0)
        print('fit NMF topic model')
        started = time()
        topic_model = NMF(n_components=300, random_state=1, alpha=.1)
        topic_model.fit(stacked)
        print('done in {}'.format(time() - started))
        save_model(save_nmf_model_path, topic_model)
    # Always continue from the persisted model, even right after fitting.
    topic_model = load_model(save_nmf_model_path)

    if not (os.path.exists(save_head_path) and os.path.exists(save_body_path)):
        head_topics = topic_model.transform(head_tfidf)
        body_topics = topic_model.transform(body_tfidf)
        save_model(save_head_path, head_topics)
        print('saved model {}'.format(save_head_path))
        save_model(save_body_path, body_topics)
        print('saved model {}'.format(save_body_path))
    head_topics = load_model(save_head_path)
    body_topics = load_model(save_body_path)

    topic_features = np.concatenate((head_topics, body_topics), axis=1)
    if not cos_dist:
        return topic_features

    # One single-element list per row, so the result stacks as an (n, 1) column.
    distance_column = np.array([
        cosine_distances(
            np.array(head_row).reshape((1, -1)),
            np.array(body_topics[row_idx]).reshape((1, -1)),
        ).flatten().tolist()
        for row_idx, head_row in enumerate(head_topics)
    ])
    return np.concatenate((topic_features, distance_column), axis=1)
def get_tfidf_vocab_5000_holdout(self, test_body, test_stance):
    """Return the vocabulary used to build the hold-out TF-IDF vectors.

    The vocabulary is cached in ``../pickled_model/tfidf_holdout_vocab.pkl``.
    On a cache hit it is loaded and returned immediately. Otherwise a
    ``TfidfVectorizer`` limited to 5000 features is fitted on the training
    "body + headline" strings followed by the bodies and headlines read from
    the given dataset, and the resulting vocabulary is saved.

    :param test_body: body file handed to ``Dataset``
    :param test_stance: stance file handed to ``Dataset``
    :return: the vocabulary, which is also stored on ``self.vocab``
    """
    vocab_path = '../pickled_model/tfidf_holdout_vocab.pkl'

    # Look at the cache before doing anything else, so a hit skips both the
    # dataset read and the vectorizer fit.
    if os.path.exists(vocab_path):
        self.vocab = load_model(vocab_path)
        print('vocab loaded!')
        return self.vocab

    test_dataset = Dataset(test_body, test_stance)
    test_heads, test_bodies = test_dataset.read_tfidf_data()

    # Corpus order: one "body headline" string per training pair, then the
    # extra bodies, then the extra headlines.
    corpus = [b + " " + h for b, h in zip(self.body, self.head)]
    corpus.extend(test_bodies)
    corpus.extend(test_heads)

    model = TfidfVectorizer(max_features=5000, ngram_range=(1, 1),
                            stop_words='english', norm='l2', use_idf=False)
    # Only the learned vocabulary is needed, so there is nothing to transform.
    model.fit(corpus)

    self.vocab = model.vocabulary_
    save_model(vocab_path, self.vocab)
    return self.vocab
def tfidf_cos_save(self, head_path, body_path, filename, model_save=False):
    """Compute the row-wise cosine similarity between headline and body vectors.

    :param head_path: file holding the headline matrix (loaded object must
        offer ``toarray()``)
    :param body_path: file holding the body matrix (loaded object must offer
        ``toarray()``)
    :param filename: output file name, appended to ``self.save_path``
    :param model_save: when true, save the result instead of returning it
    :return: the similarity array when ``model_save`` is false; otherwise
        nothing is returned (``None``)
    """
    head_rows = load_model(head_path).toarray()
    body_rows = load_model(body_path).toarray()

    # Each pair is compared as two 1-row matrices; ``[0]`` keeps the single
    # result row, so every entry is a length-1 array.
    similarities = np.array([
        cosine_similarity(head_vec.reshape(1, -1), body_vec.reshape(1, -1))[0]
        for head_vec, body_vec in zip(head_rows, body_rows)
    ])

    if not model_save:
        return similarities

    saved_path = self.save_path + "/" + filename
    print('tfidf_cos saving......')
    save_model(saved_path, similarities)
    print('feature saving finished!')
    print('saved path : ', saved_path)
def load_tfidf_y(pkl_path):
    """Load and return whatever object is stored at ``pkl_path``.

    Thin wrapper around ``load_model``. The name suggests the file holds the
    label vector for the TF-IDF features -- confirm against the callers.

    :param pkl_path: path of the file to load
    :return: the object returned by ``load_model``
    """
    loaded = load_model(pkl_path)
    return loaded
mode = 'test' # mode='fold_test' embedding = np.load(base_feat_path + "/single_flat_LSTM_50d_100_embedding.npy") dense_size = 600 dropout_keep_prob = 0.8 fold_size = 5 # if mode == 'train': X_train_seq = get_sequence_data(train_body, train_stance, 'train') X_train_feat = make_tfidf_combined_feature_cos_100(train_body, train_stance, train_head_dir, train_body_dir, train_label, train_cos_dir) y_train = load_model(train_label) data_shuffle(X_train_seq, seed=12345) data_shuffle(X_train_feat, seed=12345) data_shuffle(y_train, seed=12345) # print(X_train_seq[:2]) # print(y_train[:10]) input_len = len(X_train_feat[0]) # elif mode == 'test': # X_test_seq = get_sequence_data(test_body, test_stance, 'test') # X_test_feat = make_tfidf_combined_feature_cos_100(test_body, test_stance, test_head_dir, test_body_dir, # test_label, test_cos_dir) # y_test = load_model(test_label) # # print(X_test_seq[:2]) # input_len = len(X_test_feat[0]) graph = tf.Graph()
+str(training_epoch)+"_batch"+str(batch_size)+"_holdout" # mode = 'train' mode = 'test' # mode='fold_test' embedding = np.load(base_feat_path+"/single_flat_LSTM_50d_100_embedding.npy") dense_size = 600 dropout_keep_prob = 0.8 fold_size = 5 if mode == 'train': X_train_seq = get_sequence_data(train_body, train_stance, 'train') X_train_feat = make_tfidf_combined_feature_cos_100_holdout(train_body, train_stance, test_body, test_stance, train_head_dir, train_body_dir, train_label, train_cos_dir) y_train = load_model(train_label) data_shuffle(X_train_seq, seed=12345) data_shuffle(X_train_feat, seed=12345) data_shuffle(y_train, seed=12345) input_len = len(X_train_feat[0]) elif mode == 'test': X_test_seq = get_sequence_data(test_body, test_stance, 'test') X_test_feat = make_tfidf_combined_feature_cos_100_holdout(test_body, test_stance, train_body, train_stance, test_head_dir, test_body_dir, test_label, test_cos_dir) y_test = load_model(test_label) input_len = len(X_test_feat[0]) graph = tf.Graph() with graph.as_default(): X_seq = tf.placeholder(tf.int32, [None, seq_len])