def main():
    distance_funcs = {
        'euclidean': Distances.euclidean_distance,
        'minkowski': Distances.minkowski_distance,
        'cosine_dist': Distances.cosine_similarity_distance,
    }
    scaling_classes = {
        'min_max_scale': MinMaxScaler,
        'normalize': NormalizationScaler,
    }

    x_train, y_train, x_val, y_val, x_test, y_test = data_processing()

    print('x_train shape = ', x_train.shape)
    print('y_train shape = ', y_train.shape)

    # tuner_without_scaling_obj = HyperparameterTuner()
    # tuner_without_scaling_obj.tuning_without_scaling(distance_funcs, x_train, y_train, x_val, y_val)
    #
    # print("**Without Scaling**")
    # print("k =", tuner_without_scaling_obj.best_k)
    # print("distance function =", tuner_without_scaling_obj.best_distance_function)

    tuner_with_scaling_obj = HyperparameterTuner()
    tuner_with_scaling_obj.tuning_with_scaling(distance_funcs, scaling_classes, x_train, y_train, x_val, y_val)

    print("\n**With Scaling**")
    print("k =", tuner_with_scaling_obj.best_k)
    print("distance function =", tuner_with_scaling_obj.best_distance_function)
    print("scaler =", tuner_with_scaling_obj.best_scaler)
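# A minimal sketch of the `Distances` helpers these drivers assume. The actual
# homework implementations may differ (e.g. the Minkowski power, or how a
# zero-norm vector is handled in the cosine distance); this is an assumption,
# not the course's code.
import numpy as np

class Distances:
    @staticmethod
    def euclidean_distance(p1, p2):
        # L2 distance between two feature vectors.
        return float(np.sqrt(np.sum((np.asarray(p1) - np.asarray(p2)) ** 2)))

    @staticmethod
    def minkowski_distance(p1, p2, p=3):
        # Minkowski distance; p=3 is assumed here.
        return float(np.sum(np.abs(np.asarray(p1) - np.asarray(p2)) ** p) ** (1.0 / p))

    @staticmethod
    def cosine_similarity_distance(p1, p2):
        # 1 - cosine similarity, guarding against zero-norm inputs.
        a, b = np.asarray(p1, dtype=float), np.asarray(p2, dtype=float)
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        return 1.0 if denom == 0 else 1.0 - float(np.dot(a, b) / denom)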
def main():
    distance_funcs = {
        'euclidean': Distances.euclidean_distance,
        'minkowski': Distances.minkowski_distance,
        'gaussian': Distances.gaussian_kernel_distance,
        'inner_prod': Distances.inner_product_distance,
        'cosine_dist': Distances.cosine_similarity_distance,
    }
    scaling_classes = {
        'min_max_scale': MinMaxScaler,
        'normalize': NormalizationScaler,
    }

    x_train, y_train, x_val, y_val, x_test, y_test = data_processing()

    print('x_train shape = ', x_train.shape)
    print('y_train shape = ', y_train.shape)
    print('x_val shape = ', x_val.shape)
    print('y_val shape = ', y_val.shape)
    print('x_test shape = ', x_test.shape)
    print('y_test shape = ', y_test.shape)

    tuner_without_scaling_obj = HyperparameterTuner()
    tuner_without_scaling_obj.tuning_without_scaling(distance_funcs, x_train, y_train, x_val, y_val)

    print("**Without Scaling**")
    print("k =", tuner_without_scaling_obj.best_k)
    print("distance function =", tuner_without_scaling_obj.best_distance_function)
    print("f1_score =", tuner_without_scaling_obj.best_f1_score)

    # Evaluate the selected model on the held-out test set.
    pred = tuner_without_scaling_obj.best_model.predict(x_test)
    correct = 0
    for i in range(len(pred)):
        if pred[i] == y_test[i]:
            correct += 1
    accuracy = float(correct) / len(pred)
    print("Accuracy is: ", accuracy)
    print("F1 Score: ", f1_score(y_test, pred))

    tuner_with_scaling_obj = HyperparameterTuner()
    tuner_with_scaling_obj.tuning_with_scaling(distance_funcs, scaling_classes, x_train, y_train, x_val, y_val)

    print("\n**With Scaling**")
    print("k =", tuner_with_scaling_obj.best_k)
    print("distance function =", tuner_with_scaling_obj.best_distance_function)
    print("scaler =", tuner_with_scaling_obj.best_scaler)
    print("f1_score =", tuner_with_scaling_obj.best_f1_score)

    pred_2 = tuner_with_scaling_obj.best_model.predict(x_test)
    correct_2 = 0
    for i in range(len(pred_2)):
        if pred_2[i] == y_test[i]:
            correct_2 += 1
    accuracy_2 = float(correct_2) / len(pred_2)
    print("Accuracy is: ", accuracy_2)
    print("F1 Score:", f1_score(y_test, pred_2))
def main():
    x_train, y_train, x_val, y_val, x_test, y_test = data_processing()

    print('x_train shape = ', x_train.shape)
    print('y_train shape = ', y_train.shape)

    # Fit the scaler on the training data, then apply the *same* instance to
    # the validation and test sets. Fitting a fresh MinMaxScaler on x_test
    # (as the original did) scales it inconsistently with the training data.
    scaler = MinMaxScaler()
    x_train = scaler(x_train)
    x_val = scaler(x_val)
    x_test = scaler(x_test)
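# A minimal sketch of the callable MinMaxScaler assumed above: the first call
# records per-feature min/max, and later calls reuse them. The actual utils
# implementation may differ, but this is why a single instance must be shared
# across the train/val/test splits.
import numpy as np

class MinMaxScaler:
    def __init__(self):
        self.min = None
        self.max = None

    def __call__(self, features):
        x = np.asarray(features, dtype=float)
        if self.min is None:
            # First call: remember the training min/max per feature.
            self.min, self.max = x.min(axis=0), x.max(axis=0)
        # Avoid division by zero for constant features.
        denom = np.where(self.max - self.min == 0, 1.0, self.max - self.min)
        return (x - self.min) / denom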
def main():
    distance_funcs = {
        'euclidean': Distances.euclidean_distance,
        'minkowski': Distances.minkowski_distance,
        'gaussian': Distances.gaussian_kernel_distance,
        'inner_prod': Distances.inner_product_distance,
        'cosine_dist': Distances.cosine_similarity_distance,
    }
    scaling_classes = {
        'min_max_scale': MinMaxScaler,
        'normalize': NormalizationScaler,
    }

    x_train, y_train, x_val, y_val, x_test, y_test = data_processing()

    print('x_train shape = ', x_train.shape)
    print('y_train shape = ', y_train.shape)

    # Debug prints: spot-check each distance function on the first two samples.
    print(list(distance_funcs.keys()).index('cosine_dist'))
    print(x_train[0])
    print(x_train[1])
    print(Distances.euclidean_distance(x_train[0], x_train[1]))
    print(Distances.minkowski_distance(x_train[0], x_train[1]))
    print(Distances.inner_product_distance(x_train[0], x_train[1]))
    print(Distances.cosine_similarity_distance(x_train[0], x_train[1]))
    print(Distances.gaussian_kernel_distance(x_train[0], x_train[1]))

    # Scratch data for quick sanity checks (unused below).
    a = [[1], [4], [4], [10], [11], [23], [0], [50]]
    b = [[3, 4], [1, -1], [0, 0]]
    labels = [0, 0, 0, 1, 1, 1, 0, 1]

    k = KNN(4, Distances.euclidean_distance)
    k.train(x_train, y_train)
    print(k.get_k_neighbors([2]))
    print(k.get_k_neighbors([28]))
    print(k.predict(x_val))

    tuner_without_scaling_obj = HyperparameterTuner()
    tuner_without_scaling_obj.tuning_without_scaling(distance_funcs, x_train, y_train, x_val, y_val)

    print("**Without Scaling**")
    print("k =", tuner_without_scaling_obj.best_k)
    print("distance function =", tuner_without_scaling_obj.best_distance_function)

    x = NormalizationScaler()

    tuner_with_scaling_obj = HyperparameterTuner()
    tuner_with_scaling_obj.tuning_with_scaling(distance_funcs, scaling_classes, x_train, y_train, x_val, y_val)

    print("\n**With Scaling**")
    print("k =", tuner_with_scaling_obj.best_k)
    print("distance function =", tuner_with_scaling_obj.best_distance_function)
    print("scaler =", tuner_with_scaling_obj.best_scaler)
def main():
    distance_funcs = {
        'canberra': Distances.canberra_distance,
        'minkowski': Distances.minkowski_distance,
        'euclidean': Distances.euclidean_distance,
        'gaussian': Distances.gaussian_kernel_distance,
        'inner_prod': Distances.inner_product_distance,
        'cosine_dist': Distances.cosine_similarity_distance,
    }

    x_train, y_train, x_val, y_val, x_test, y_test = data_processing()

    print('x_train shape = ', x_train.shape)
    print('y_train shape = ', y_train.shape)

    classifier = KNN(k=5, distance_function=distance_funcs['canberra'])
    classifier.train(x_train, y_train)
    pred = classifier.predict(x_val)

    # The original snippet ends with a bare accumulator `s = 0`; presumably it
    # was meant to count correct validation predictions, e.g.:
    s = 0
    for p, y in zip(pred, y_val):
        s += int(p == y)
    print('validation accuracy =', s / len(y_val))
def main():
    distance_funcs = {
        'euclidean': Distances.euclidean_distance,
        'minkowski': Distances.minkowski_distance,
        'gaussian': Distances.gaussian_kernel_distance,
        'inner_prod': Distances.inner_product_distance,
        'cosine_dist': Distances.cosine_similarity_distance,
    }
    scaling_classes = {
        'min_max_scale': MinMaxScaler,
        'normalize': NormalizationScaler,
    }

    x_train, y_train, x_val, y_val, x_test, y_test = data_processing()

    print('x_train shape = ', x_train.shape)
    print('y_train shape = ', y_train.shape)

    knn = KNN(11, Distances.euclidean_distance)
    knn.train(x_train, y_train)
    knn.predict(x_val)
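# A minimal sketch of the KNN interface these drivers assume (train / predict /
# get_k_neighbors). The homework's hw1_knn.KNN may differ, e.g. in tie-breaking
# or in whether get_k_neighbors returns labels or indices.
class KNN:
    def __init__(self, k, distance_function):
        self.k = k
        self.distance_function = distance_function

    def train(self, features, labels):
        # Lazy learner: just memorize the training set.
        self.features = features
        self.labels = labels

    def get_k_neighbors(self, point):
        # Labels of the k training points closest to `point`.
        dists = [self.distance_function(point, f) for f in self.features]
        order = sorted(range(len(dists)), key=dists.__getitem__)[:self.k]
        return [self.labels[i] for i in order]

    def predict(self, features):
        # Majority vote among the k nearest neighbors of each query point.
        return [max(set(nb), key=nb.count)
                for nb in (self.get_k_neighbors(p) for p in features)]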
def train_model(self, **kwargs):
    # Pass in the training/testing datasets. This runs on every call because
    # the training data might change between calls.
    data_dict = data_processing()
    X_train = data_dict['X_train']
    y_train = data_dict['y_train']
    X_test = data_dict['X_test']
    y_test = data_dict['y_test']

    # Build the model based on the model name given by the user.
    model = self.model_func
    model.set_params(**kwargs)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    self.params = kwargs
    self.accuracy_score = accuracy_score(y_test, y_pred)
    self.feature_importance = model.feature_importances_
    self.version += 1
    self.model_func = model

    # Pickle this object (model plus its current accuracy_score), appending a
    # new record to the version history file.
    try:
        with open(self.picklefile_name, 'ab') as f:
            pickle.dump(self, f)
    except Exception as e:
        raise Exception(
            f'Problem occurred when writing to {self.picklefile_name}') from e
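# Because train_model() appends ('ab' mode) one pickle record per call, reading
# the version history back means repeated pickle.load() calls until EOF. A
# hedged sketch, assuming the same picklefile_name layout as above; the helper
# name is hypothetical.
import pickle

def load_model_versions(picklefile_name):
    # Collect every appended snapshot in write order.
    versions = []
    with open(picklefile_name, 'rb') as f:
        while True:
            try:
                versions.append(pickle.load(f))
            except EOFError:
                break
    return versions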
def test_transform_model_selection():
    Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing()
    model_selection_with_transformation(distance_funcs, scaling_classes,
                                        Xtrain.tolist(), ytrain.tolist(),
                                        Xval.tolist(), yval.tolist())
import tensorflow as tf
import model as ml
import data

from configs import DEFINES

# Build the vocabulary.
char2idx, idx2char, vocabulary_length = data.load_vocabulary()

# Load the training and evaluation data.
train_input, train_label, eval_input, eval_label = data.load_data()

# Training data: build the encoder input, decoder input, and decoder target.
train_input_enc = data.data_processing(train_input, char2idx, DEFINES.enc_input)
train_input_dec = data.data_processing(train_label, char2idx, DEFINES.dec_input)
train_target_dec = data.data_processing(train_label, char2idx, DEFINES.dec_target)

# Evaluation data: build the encoder input, decoder input, and decoder target.
eval_input_enc = data.data_processing(eval_input, char2idx, DEFINES.enc_input)
eval_input_dec = data.data_processing(eval_label, char2idx, DEFINES.dec_input)
eval_target_dec = data.data_processing(eval_label, char2idx, DEFINES.dec_target)

# Configure the Estimator.
classifier = tf.estimator.Estimator(
    model_fn=ml.Model,
    model_dir=DEFINES.check_point_path,
    params={
        'embedding_size': DEFINES.embedding_size,
# For auto-reloading external modules, see
# http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
import numpy as np

from hw1_knn import KNN
from utils import euclidean_distance, gaussian_kernel_distance, inner_product_distance, cosine_sim_distance
from utils import f1_score, model_selection_without_normalization, model_selection_with_transformation

distance_funcs = {
    'euclidean': euclidean_distance,
    'gaussian': gaussian_kernel_distance,
    'inner_prod': inner_product_distance,
    'cosine_dist': cosine_sim_distance,
}

from data import data_processing
Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing()

# best_model, best_k, best_function = model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval)

from utils import NormalizationScaler, MinMaxScaler
scaling_classes = {
    'min_max_scale': MinMaxScaler,
    'normalize': NormalizationScaler,
}

# best_model, best_k, best_function, best_scaler = model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval)

import data
import hw1_dt as decision_tree
import utils as Utils
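# A minimal sketch of the binary f1_score these scripts import from utils. The
# course implementation may differ in detail; this assumes {0, 1} labels given
# as plain lists.
def f1_score(real_labels, predicted_labels):
    # F1 = 2*TP / (2*TP + FP + FN).
    tp = sum(1 for r, p in zip(real_labels, predicted_labels) if r == 1 and p == 1)
    fp = sum(1 for r, p in zip(real_labels, predicted_labels) if r == 0 and p == 1)
    fn = sum(1 for r, p in zip(real_labels, predicted_labels) if r == 1 and p == 0)
    denom = 2 * tp + fp + fn
    return 0.0 if denom == 0 else 2.0 * tp / denom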
def test_classify(model):
    from data import data_processing
    Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing()
    model.train(Xtrain, ytrain)
    predicted_labels = model.predict(Xtest)
        'model_hidden_size': DEFINES.model_hidden_size,
        'ffn_hidden_size': DEFINES.ffn_hidden_size,
        'attention_head_size': DEFINES.attention_head_size,
        'learning_rate': DEFINES.learning_rate,
        'vocabulary_length': vocabulary_length,
        'layer_size': DEFINES.layer_size,
        'max_sequence_length': DEFINES.max_sequence_length,
    })

print("\n# Preparing the chat...")
# Using a predictor avoids rebuilding/reloading the Estimator on every call.
estimator_predictor = tf.contrib.predictor.from_estimator(classifier, serving_input_fn)

# Run one dummy prediction to build the model and load the checkpoint.
print("# Building the model and loading the checkpoint...")
predic_input_enc = data.data_processing(["dummy"], char2idx, DEFINES.enc_input)
predic_output_dec = data.data_processing([""], char2idx, DEFINES.dec_input)
predictions = estimator_predictor({"input": predic_input_enc, "output": predic_output_dec})

print("# The chat is ready.")
print("# Type 'quit' to end the chat.")
for q in range(1000):
    question = input("Q: ")
    if question == 'quit':
        break
    predic_input_enc = data.data_processing([question], char2idx, DEFINES.enc_input)
    predic_output_dec = data.data_processing([""], char2idx, DEFINES.dec_input)
    predictions = estimator_predictor({"input": predic_input_enc, "output": predic_output_dec})
    sentence_string = [idx2char[index] for index in predictions['indexs'][0]]
# This Source Code Form is subject to the terms of the MIT
# License. If a copy of the same was not distributed with this
# file, You can obtain one at
# https://github.com/akhilpandey95/scholarlyimpact/blob/master/LICENSE.

import sys
from models import PredictCitationsExist
from evaluation import evaluate, clf_metrics
from data import data_processing, prepare_X_Y
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    # load the dataset
    data = data_processing('~/Downloads/sch_impact.csv')

    # prepare the X, Y
    X, Y = prepare_X_Y(data, 'target_exp_1')

    # build the train and test samples
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

    # build the model
    classifier = PredictCitationsExist()

    # train the model
    classifier = classifier.train(10, 64, X_train, X_test, Y_train, Y_test, stopping=False)

    # evaluate and print the training stats
    model_evaluation = evaluate(classifier, 'train', x_train=X_train, y_train=Y_train)

    # print training metrics
def pred_indices(preds, beta=1.0):
    # Temperature sampling: rescale log-probabilities by 1/beta, renormalize,
    # and draw one index from the resulting multinomial distribution.
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-9) / beta
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probs = np.random.multinomial(1, preds, 1)
    return np.argmax(probs)

# Start the chat.
for i in range(10):
    question = input("Q: ")
    if question == 'quit':
        break
    predic_input_enc = data.data_processing([question], word2idx, DEFINES.enc_input)
    predic_output_dec = data.data_processing([""], word2idx, DEFINES.dec_input)
    predic_target_dec = data.data_processing([""], word2idx, DEFINES.dec_target)

    # Each loop iteration rebuilds the network and reloads the checkpoint,
    # which slows things down; a trait of the Estimator API that needs work.
    predictions = classifier.predict(input_fn=lambda: data.input_fn(
        predic_input_enc, predic_output_dec, predic_target_dec, 1, 1))

    # Get the softmax probabilities for the answer sentence.
    prob = np.array([v['indexs'] for v in predictions])
    prob = np.squeeze(prob)

    # Stochastically sample the answer sentence's word indices.
    words_index = [pred_indices(p, beta=DEFINES.softmax_beta) for p in prob]
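# A quick usage sketch of pred_indices: since it raises probabilities to the
# power 1/beta, lower beta sharpens the distribution toward the argmax and
# higher beta flattens it toward uniform. The probabilities below are made up
# for illustration.
import numpy as np

np.random.seed(0)
dist = [0.1, 0.2, 0.7]
sharp = [pred_indices(dist, beta=0.5) for _ in range(1000)]  # mostly index 2
flat = [pred_indices(dist, beta=5.0) for _ in range(1000)]   # more spread out
print(np.bincount(sharp, minlength=3), np.bincount(flat, minlength=3))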
distance_funcs = {
    'euclidean': euclidean_distance,
    'gaussian': gaussian_kernel_distance,
    'inner_prod': inner_product_distance,
    'cosine_dist': cosine_sim_distance,
}
scaling_classes = {
    'min_max_scale': MinMaxScaler,
    'normalize': NormalizationScaler,
}

from data import data_processing

# Toy 2-D training set with three classes for a quick sanity check.
Xtrain = [[1, 1], [1, 1.25], [1.25, 1], [1.4, 1.75], [1.75, 1.75],
          [1.80, 1.75], [2, 1.75], [1.75, 2.25], [2, 2.5], [2, 3],
          [2.15, 3], [2.45, 3], [2.5, 3], [2.75, 3], [3, 3]]
ytrain = [[0], [0], [0], [0], [1], [1], [1], [1], [1], [2], [2], [2], [2], [2], [2]]

# data_processing() returns six arrays; only the test split is used here
# (the original bound the whole tuple to Xtest and left ytest undefined).
_, _, _, _, Xtest, ytest = data_processing()

best_model, best_k, best_function = model_selection_without_normalization(
    distance_funcs, Xtrain, ytrain, Xtrain, ytrain)
# `pr` is undefined in the original snippet, presumably some print-predictions
# helper.
print(pr(best_model, Xtest, ytest))

'''
from utils import Information_Gain, get_branches, get_amount_cls

features = [[2, 3, 4, 15, 20],
            [4, 6, 11, 13, 15],
            [2, 5, 10, 13, 20],
            [7, 10, 11, 18, 19],
            [11, 13, 16, 18, 19],
            [3, 5, 11, 15, 18],
            [3, 6, 8, 12, 15],
            [5, 11, 15, 17, 18],
            [2, 3, 4, 12, 18],
def test_model_selection():
    Xtrain, ytrain, Xval, yval, Xtest, ytest = data_processing()
    model_selection_without_normalization(distance_funcs, Xtrain.tolist(), ytrain.tolist(),
                                          Xval.tolist(), yval.tolist())
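# A hedged sketch of what model_selection_without_normalization plausibly does:
# grid-search the distance functions and odd k values, keeping the model with
# the best validation F1. The search range, ordering, and tie-breaking in the
# actual course utils may differ; this reuses the KNN and f1_score sketches
# above.
def model_selection_without_normalization(distance_funcs, Xtrain, ytrain, Xval, yval):
    best_model, best_k, best_function, best_f1 = None, None, None, -1.0
    for name, func in distance_funcs.items():
        for k in range(1, min(len(Xtrain), 30), 2):  # odd k avoids binary ties
            model = KNN(k, func)
            model.train(Xtrain, ytrain)
            f1 = f1_score(yval, model.predict(Xval))
            if f1 > best_f1:
                best_model, best_k, best_function, best_f1 = model, k, name, f1
    return best_model, best_k, best_function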
# This Source Code Form is subject to the terms of the MIT
# License. If a copy of the same was not distributed with this
# file, You can obtain one at
# https://github.com/akhilpandey95/altpred/blob/master/LICENSE.

import sys
from models import *
from evaluation import evaluate, clf_metrics
from data import data_processing, prepare_word_embeddings
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    # load the dataset
    #data = data_processing('altmetrics_j2014_full_gamma.csv', 'binary')
    data = data_processing('altmetrics_j2014_full_gamma.csv', 'binary-delta')

    # prepare the X, Y
    X, Y, max_words, max_len = prepare_word_embeddings(data, 'abstract', 'target')

    # build the train and test samples
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.35)

    # build the model
    classifier = AltpredTwitterGRU(max_words, max_len)
    #classifier = AltpredTwitterLSTM(max_words, max_len)
    #classifier = AltpredTwitterBiDirLSTM(max_words, max_len)

    # train the model
    classifier = classifier.train(5, 256, X_train, Y_train, stopping=False)