def read_dataset_train_head_body(datapath):
    """Load the training headlines and bodies as an index-keyed dictionary.

    Args:
        datapath: Forwarded to ``get_head_body_tuples`` as ``data_path``.

    Returns:
        A dict mapping a running integer index (starting at 0) to
        ``{'head': ..., 'body': ...}`` for every (head, body) pair. Pairing
        stops at the shorter of the two sequences.
    """
    heads, bodies = get_head_body_tuples(data_path=datapath)
    return {
        position: {'head': headline, 'body': article}
        for position, (headline, article) in enumerate(zip(heads, bodies))
    }
pkl.dump(combined, open(combined_file_name, 'wb'), pkl.HIGHEST_PROTOCOL) print('Saving finish Count head body vectors') print('Head : {}\nBody :{}\nCombined : {}\n'.format( head_file_name, body_file_name, combined_file_name)) if __name__ == "__main__": from Tree_models.utils.get_input_datas import get_head_body_tuples, get_head_body_tuples_test model_path = '../../pickled_data' max_features = 5000 # 메모리가 터질시 max_features를 낮게 조정 filename = 'count_1st_' + str(max_features) + '_vecterizer_model.pkl' head, body = get_head_body_tuples(data_path='../../data') head_test, body_test = get_head_body_tuples_test(data_path='../../data') count_vectorizer = CountVector_generator\ (max_features=max_features, analyzer='word', ngram_range=(1, 3), stop_words='english') count_vectorizer.fit(head, body) # 저장된 Vecterizer 모델이 있으면 load_model을 사용하면 됨 count_vectorizer.save_model(model_path=model_path, filename=filename) # count_vectorizer.load_model(model_path=model_path, filename=filename) # 저장된 TFIDF vector가 있으면 바로 해당 데이터로 training 하면 됨 # 변환된 train file 저장 count_vectorizer.transform_and_save_data(head, body, save_path=model_path,
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

from Tree_models.utils.get_input_datas import (
    get_head_body_tuples,
    get_head_body_tuples_test,
    get_y_labels,
)
from Tree_models.utils.score import report_score

# Raw headline/body texts and the labels for the train and test splits.
head_train, body_train = get_head_body_tuples()
head_test, body_test = get_head_body_tuples_test()
train_y, test_y = get_y_labels()

# One shared unigram vocabulary (capped at 2500 terms) is fitted on the
# training split, with each headline joined to its body as "head. body".
count_vec = CountVectorizer(
    max_features=2500,
    stop_words='english',
    ngram_range=(1, 1),
    analyzer='word',
)
count_vec.fit([". ".join((h, b)) for h, b in zip(head_train, body_train)])

print('count_vec ...')
# Replace every text collection by its count matrix, using the same vocabulary
# for headlines and bodies of both splits.
head_train, body_train, head_test, body_test = (
    count_vec.transform(texts)
    for texts in (head_train, body_train, head_test, body_test)
)
print('count_vec finish...')

# Each sample's feature row is its dense headline counts followed by its dense
# body counts.
train_data = np.hstack((head_train.toarray(), body_train.toarray()))
test_data = np.hstack((head_test.toarray(), body_test.toarray()))

# Train a decision tree with default hyper-parameters on the training rows.
clf = DecisionTreeClassifier()
clf.fit(train_data, train_y)
def _sum_word_vectors(text, model, dim):
    """Return the sum of the embeddings of all whitespace tokens of ``text``.

    Tokens that are not present in ``model`` are ignored, so a text without any
    known token yields a zero vector of length ``dim``.
    """
    total = np.zeros(dim)
    for word in text.split():
        if word in model:
            total += model[word]
    return total


def _sum_head_body_pairs(heads, bodies, model, dim):
    """Return ``(head_sums, body_sums)`` arrays with one row per (head, body) pair."""
    head_sums = []
    body_sums = []
    for head, body in tqdm(zip(heads, bodies)):
        # Head and body are summed independently: iterating their tokens in
        # lockstep would cut the longer text down to the length of the shorter.
        head_sums.append(_sum_word_vectors(head, model, dim))
        body_sums.append(_sum_word_vectors(body, model, dim))
    return np.array(head_sums), np.array(body_sums)


def save_summation_vectors(data_path, glove_path, glove_file, save_path, dim=50):
    """Build summed GloVe vectors for heads and bodies and pickle them.

    For every sample of the train and test splits, the embeddings of all known
    tokens of the headline are summed into one vector and those of the body
    into another. Per split, the head and body arrays are stacked horizontally
    and pickled to ``<save_path>/glove<dim>D_sum_head_body_<split>.pkl``.

    Args:
        data_path: Forwarded to ``get_head_body_tuples`` and
            ``get_head_body_tuples_test`` as ``data_path``.
        glove_path: Forwarded to ``load_Glove`` as ``glove_path``.
        glove_file: Forwarded to ``load_Glove`` as ``glove_file``.
        save_path: Directory prefix of the two output pickle files.
        dim: Length of the accumulated vectors; also used in the output file
            names. Presumably must equal the dimensionality of the loaded
            embeddings -- verify against ``load_Glove``.

    Returns:
        None. The results are written to disk and the array shapes and output
        paths are printed.
    """
    model = load_Glove(glove_path=glove_path, glove_file=glove_file)
    h_train, b_train = get_head_body_tuples(data_path=data_path)
    h_test, b_test = get_head_body_tuples_test(data_path=data_path)

    sum_head_train, sum_body_train = _sum_head_body_pairs(h_train, b_train, model, dim)
    sum_head_test, sum_body_test = _sum_head_body_pairs(h_test, b_test, model, dim)

    print(sum_head_train.shape)
    print(sum_body_train.shape)
    print(sum_head_test.shape)
    print(sum_body_test.shape)

    outputs = (
        ("train", sum_head_train, sum_body_train),
        ("test", sum_head_test, sum_body_test),
    )
    for split, head_sums, body_sums in outputs:
        file_name = save_path + "/glove{}D_sum_head_body_{}.pkl".format(dim, split)
        # The context manager guarantees the file is flushed and closed.
        with open(file_name, 'wb') as out_file:
            pkl.dump(np.hstack((head_sums, body_sums)), out_file, pkl.HIGHEST_PROTOCOL)
        # Report the path that was actually written (it depends on ``dim``).
        print('file saved {}'.format(file_name))