import os

from tensorflow.keras.optimizers import Adam

import data_process
# create_model() is assumed to be defined elsewhere in the project.


def train(nb_epochs, batch_size, learning_rate, save_path=os.getcwd(), split_data=True):
    model = create_model()
    adam = Adam(learning_rate=learning_rate)  # `lr` is deprecated in recent Keras releases
    model.compile(optimizer=adam, loss='mse', metrics=['acc'])

    train_path = os.path.join(os.getcwd(), "training")
    val_path = os.path.join(os.getcwd(), "validation")
    train_images_path = os.path.join(train_path, 'images')
    val_images_path = os.path.join(val_path, 'images')

    nb_train = data_process.get_number_of_data(train_images_path)
    nb_val = data_process.get_number_of_data(val_images_path)
    print('Number of training samples before the split:', nb_train)
    print('Number of test samples before the split:', nb_val)

    if split_data:
        # Move 10% of the training images into the validation set.
        data_process.split_data(train_images_path, val_images_path, 0.1)
        nb_train = data_process.get_number_of_data(train_images_path)
        nb_val = data_process.get_number_of_data(val_images_path)
        print('Number of training samples after the split:', nb_train)
        print('Number of test samples after the split:', nb_val)

    # The generator batch size must match the batch size used for the step counts below.
    train_generator = data_process.generator(batch_size, train_path)
    validation_generator = data_process.generator(batch_size, val_path)
    train_steps = data_process.get_number_of_data(train_images_path) // batch_size
    val_steps = data_process.get_number_of_data(val_images_path) // batch_size

    # batch_size must not be passed to fit() when the data comes from a generator.
    model.fit(train_generator,
              steps_per_epoch=train_steps,
              epochs=nb_epochs,
              validation_data=validation_generator,
              validation_steps=val_steps)

    save_path = os.path.join(save_path, 'model.h5')
    model.save(save_path)
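# Minimal usage sketch (values are illustrative, not from the source); assumes this
# module is run directly once the project-local create_model() and data_process
# helpers are importable.
if __name__ == "__main__":
    train(nb_epochs=20, batch_size=2, learning_rate=1e-4, split_data=True)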
import os
from sys import argv

import pandas as pd

# data_process, model, and accuracy_estimation are project-local modules.


def main():
    # Get the data
    data_train = pd.read_csv('dataset/train.csv')
    data_test = pd.read_csv('dataset/test.csv')

    # Transform and divide the features
    Id_test = data_test['PassengerId']
    selected_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    df_train, df_test = data_process.transform_features(data_train, data_test, selected_features)
    df_train, df_test = data_process.features_scaling(df_train, df_test, selected_features)
    X_train, Y_train, X_test, Y_test, test_X = data_process.split_data(df_train, df_test, selected_features)

    # Set parameters
    parameters = {}
    parameters['model_path'] = 'model/Titanic.ckpt'
    parameters['n_input'], parameters['n_features'] = X_train.shape
    parameters['n_hidden'] = 2
    parameters['hidden_dim'] = 40
    parameters['n_class'] = 1
    parameters['learning_rate'] = 0.01
    parameters['training_epochs'] = 15000
    parameters['visualize'] = False
    if (len(argv) > 1 and argv[1] == '-v') or (len(argv) > 2 and argv[2] == '-v'):
        parameters['visualize'] = True

    # Get the model & train it
    titanic_model = model.make_model(parameters)
    if (len(argv) > 1 and argv[1] == '-n') or (len(argv) > 2 and argv[2] == '-n'):
        model.neural_network(X_train, Y_train, parameters, titanic_model, X_test, Y_test)

    # Print accuracy
    if os.path.isfile(parameters['model_path']):
        accuracy_estimation.Accuracy(parameters, titanic_model, X_train, Y_train, X_test, Y_test)

    # Output the submission to estimation.csv
    if os.path.isfile(parameters['model_path']):
        accuracy_estimation.Estimation(parameters, titanic_model, test_X, Id_test)
    else:
        print("\nNo model found. Please create a new file named 'Titanic.ckpt' in a directory "
              "named 'model' and launch the program with the following command:\n"
              "'python3 main.py -n'\n")
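# Hypothetical command-line usage, inferred from the argv checks above; the script
# name main.py is taken from the error message, and the flags can appear in either order:
#   python3 main.py -n       # train a new network
#   python3 main.py -n -v    # same, with visualization enabled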
def t1():
    num_samp_per_class = 2
    dim = 2
    N_class = 4
    X, labels = gen_toy_data(dim, N_class, num_samp_per_class)
    X_norm, mean, std = normalize(X)
    X_norm, mean, U, S = PCA_white(X_norm)
    layer_param = [dim, 100, 100, N_class]
    overfit_tinydata(X_norm, labels, layer_param)
    X_train, labels_train, X_val, labels_val, X_test, labels_test = split_data(
        X_norm, labels)
    check_gradient(X, labels, [2, 100, 4], True)
def __init__(self, train_data_path, output_test_path, max_iter=50, max_time=10,
             C=9, tolerance=0.0001, kernel=SMO.linear_kernel):
    self.data = read_data(train_data_path)
    self.output_test_data = read_data(output_test_path)  # TODO change to submit format
    self.training_data, self.testing_data = split_data(self.data)
    self.train_X, self.train_Y = self.training_data[:, :-1], np.squeeze(self.training_data[:, -1:])
    self.test_X, self.test_Y = self.testing_data[:, :-1], np.squeeze(self.testing_data[:, -1:])
    # print(self.train_X.shape, self.train_Y.shape)
    # self.alphas = np.random.randn(len(self.train_X))
    self.alphas = np.zeros(len(self.train_X))
    self.b = 0.0
    self.m = len(self.train_X)
    self.max_iter = max_iter
    self.max_time = max_time
    self.kernel = kernel
    self.C = C
    self.tolerance = tolerance
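# Hypothetical instantiation sketch: the class name SMO is inferred from the default
# kernel argument above, and the file paths are placeholders, not from the source.
solver = SMO(train_data_path='data/train.csv',
             output_test_path='data/test.csv',
             max_iter=50, C=9, tolerance=1e-4)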
def t2():
    num_samp_per_class = 200
    dim = 2
    N_class = 4
    # Generate the data
    X, labels = gen_toy_data(dim, N_class, num_samp_per_class)
    X_norm, mean, std = normalize(X)
    X_norm, mean, U, S = PCA_white(X_norm)
    X_train, labels_train, X_val, labels_val, X_test, labels_test = split_data(
        X_norm, labels)
    lr = 10**(-2.1)
    lr_decay = 1
    reg = 10**(-4.3)
    mu = 0.9
    max_epoch = 10000
    # Train the network
    layer_param = [dim, 100, 100, N_class]
    train_net(X_train, labels_train, layer_param, lr, lr_decay, reg, mu, max_epoch,
              X_val, labels_val)
output, edges, edges_attr, se_name = load_data(args.modular_file, args.ddi_file, 'onehot')
print(len(list(output.keys())))
args.num_edge_features = edges_attr.size(1)
args.device = 'cpu'

# Split the data into train / val / test.
num_edges = edges_attr.size(0) // 2
train_num = int(num_edges * args.train_ratio)
val_num = int(num_edges * args.val_ratio)
test_num = int(num_edges * args.test_ratio)
nums = [train_num, val_num, test_num]

# Change the input to the side-effect name.
train_edges, train_edges_attr, val_edges, val_edges_attr, test_edges, test_edges_attr \
    = split_data(edges, se_name, nums)
# print(train_edges_attr)
train_name = train_edges_attr
val_name = val_edges_attr
test_name = test_edges_attr
train_edges_attr = name_to_feature(train_edges_attr)
val_edges_attr = name_to_feature(val_edges_attr)
test_edges_attr = name_to_feature(test_edges_attr)

# Read negative samples from file.
neg_train_edges, neg_train_attr, neg_val_edges, neg_val_attr, neg_test_edges, neg_test_attr = read_negative()
print('negative samples generated')
print(args.device)

if args.feature_type == 'onehot':
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np

from data_process import load_data, split_data

dat_dir = '../data/'
ratings = load_data(dat_dir + "data_train.csv")
print(np.shape(ratings))
_, train, test = split_data(ratings, p_test=0.1)

from SGD_helpers import init_MF, matrix_factorization_SGD
from MF_helpers import get_bias_train, get_bias_test

bias_train, overal_bias, bias_u_train, bias_i_train = get_bias_train(train)

# Ratings for the final submission
bias_test = get_bias_test(test, overal_bias, bias_u_train, bias_i_train)

# Grid search:
grid = np.zeros((3, 4, 4))
gamma = 0.025
num_features = np.array([20, 50, 100])
lambda_user = np.logspace(-3, 0, 4)[::-1]
lambda_item = np.logspace(-3, 0, 4)[::-1]
num_epochs = 20
best_user_features = []
best_item_features = []
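# Hedged sketch of the grid search that the arrays above set up: one RMSE per
# (num_features, lambda_user, lambda_item) cell of `grid`. The call to
# matrix_factorization_SGD is left as a placeholder comment because its signature
# is not shown in this snippet.
for i, k in enumerate(num_features):
    for j, lu in enumerate(lambda_user):
        for m, li in enumerate(lambda_item):
            # rmse = matrix_factorization_SGD(train, test, gamma, k, lu, li, num_epochs)  # assumed signature
            # grid[i, j, m] = rmse
            pass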
        6: [10, 11, 23, 32],
        7: [14, 25, 26, 35],
        8: [15, 18, 28, 37],
        9: [16, 21, 27, 36]
    }
else:
    cluster_disease = None

cluster_label = random_cluster(args.n_tasks, args.n_outputs, cluster_disease=cluster_disease)
train_data, test_data, vocabulary, embedding = \
    gen_data(train_path, test_path, args, fasttext_path)
split_train = split_data(train_data, args.n_tasks, cluster_label)
split_test = split_data(test_data, args.n_tasks, cluster_label)
print("Loading and preprocessing done")

memory_sent_data = []
memory_word_data = []
memory_sent_embed = []
memory_word_embed = []
save_word_embed = []
cur_model = None
word_alignment_model = None
sent_alignment_model = None
results = []

random.seed(args.seed)
for t in range(args.n_tasks):
import json  # we need the json package to load the data, since it is stored in JSON format

from data_process import split_data, preprocess, load_data, save_data

with open("data/reddit.json") as fp:
    data = json.load(fp)

# Now the data is loaded.
# It is a list of data points, where each data point is a dictionary with the following attributes:
#   popularity_score : a popularity score for this comment (based on the number of upvotes) (type: float)
#   children : the number of replies to this comment (type: int)
#   text : the text of this comment (type: string)
#   controversiality : a score for how "controversial" this comment is (automatically computed by Reddit)
#   is_root : if True, this comment is a direct reply to a post; if False, it is a direct reply to another comment

# Example:
data_point = data[0]  # select the first data point in the dataset

# Now we print all the information about this data point
for info_name, info_value in data_point.items():
    print(info_name + " : " + str(info_value))

features = ['text', 'is_root', 'controversiality', 'children']  # list of features to preprocess
train, val, test = split_data(data)
train_ = preprocess(train, feature_list=features, max=500)
val_ = preprocess(val, feature_list=features)
test_ = preprocess(test, feature_list=features)
save_data(train_, val_, test_)