def my_data(self):
    """Load the ``fmnist_*`` arrays and split them into train/test/validation.

    Reads ``fmnist_train_data.npy`` and ``fmnist_train_labels.npy`` from
    ``self.args.data_dir``, one-hot encodes the labels with depth 10 and
    divides the images by 255.  The test set is split off first (before any
    shuffling); the remainder is shuffled and the validation set is then
    split off from it.

    Returns:
        dict: arrays stored under the keys ``train_images``,
        ``train_labels``, ``test_images``, ``test_labels``,
        ``validation_images`` and ``validation_labels``.
    """
    data_dir = self.args.data_dir

    # load training data
    train_images = util.np.load(
        os.path.join(data_dir, 'fmnist_train_data.npy'))
    train_labels = util.np.load(
        os.path.join(data_dir, 'fmnist_train_labels.npy'))

    # Run the one-hot op inside a context manager so the session (and the
    # resources it holds) is released as soon as the labels are computed.
    with tf.Session() as session:
        train_labels = session.run(tf.one_hot(train_labels, 10))

    # normalize data
    train_images = train_images / 255

    # set up test set
    test_images, train_images = util.split_data(train_images,
                                                self.test_set_size)
    test_labels, train_labels = util.split_data(train_labels,
                                                self.test_set_size)

    # set up validation set
    train_images, train_labels = util.shuffler(train_images, train_labels)
    validation_images, train_images = util.split_data(
        train_images, self.validation_set_size)
    validation_labels, train_labels = util.split_data(
        train_labels, self.validation_set_size)

    return {
        "train_images": train_images,
        "train_labels": train_labels,
        "test_images": test_images,
        "test_labels": test_labels,
        "validation_images": validation_images,
        "validation_labels": validation_labels,
    }
def trial_init(recdr, logr):
    """Generate a random data set and build the solvers for one trial.

    A ``DataGenerator`` is configured from module-level settings, random
    users and messages are generated, and the resulting rows are split
    50/25/25 into train, calibration and test partitions.

    :param recdr: not referenced in this function.
        NOTE(review): the solver builder below receives the name
        ``recorder`` instead -- confirm whether ``recdr`` was intended.
    :param logr: logger exposing ``log(message, level)``.
    :return: tuple ``(train, test_users, b, solvers)`` where ``test_users``
        is the list of first elements of the test rows and ``solvers`` is
        the control solvers followed by the KNN treatments.
    """
    logr.log('Initializing new trial...', 'standard')
    b = DataGenerator()
    b.set_baseline_response_prob(baseline)
    b.add_random_user_attrs(num_user_atts, min_user_att_levels,
                            max_user_att_levels)
    b.add_random_inter_attrs(num_msg_atts, min_msg_att_levels,
                             max_msg_att_levels)
    # Called for its effect on the generator; the returned pair
    # (user templates, interaction templates) is not needed here.
    b.set_random_propensities(num_propensity_groups,
                              min_group_user_atts, max_group_user_atts,
                              min_group_msg_atts, max_group_msg_atts,
                              min_group_pos_prob, max_group_pos_prob)

    logr.log('Generating data...', 'standard')
    messages = b.gen_random_inters(num_test_messages)
    users = b.gen_random_users(num_users)
    rows = ut.unzip(b.gen_random_rows_from(users, messages))
    logr.log('Number of rows: ' + str(len(rows)), 'standard')

    # Split data into train, calibration, and test.
    train, calibrate, test = ut.split_data(rows, 0.5, 0.25, 0.25)
    # A comprehension replaces the tuple-parameter lambda (removed in
    # Python 3) and yields a real list on both Python 2 and Python 3.
    test_users = [u for (u, _m, _r) in test]

    controls = su.build_std_control_solvers(calibrate, b, messages, 15)
    treatments = su.build_std_knn_optims(train, calibrate, b, recorder, 1, 15)
    solvers = controls + treatments
    return (train, test_users, b, solvers)
def test_add_user_data(ident_service):
    """Adding user data appends a transaction whose payload verifies."""
    # The chain starts with no transactions, so the head is the base marker.
    assert ident_service.last_transaction_hash == b'Base'

    # Build the signed payload.
    payload = json.dumps({'first_name': 'Bob', 'last_name': 'Smith'}).encode()
    user_data = UserData(payload, sign(user_1_private_key, payload))

    # Store it and check that the returned hash is the new head of the chain.
    new_hash = ident_service.add_user_data(user_data, user_1_cert_string)
    assert new_hash == ident_service.last_transaction_hash

    # Fetch the stored transaction together with its encryption key.
    transaction = ident_service.get_transaction(new_hash)
    encryption_key = ident_service.get_key(new_hash)

    # The new transaction must link back to the previous head.
    assert transaction.hash_pointer == b'Base'

    # Decrypt the stored action and separate the message from the signature.
    plaintext = decrypt(encryption_key, transaction.action.get_data())
    stored_message, stored_signature = split_data(plaintext)

    # The stored message must carry the user's signature ...
    verify(user_1_cert, stored_message, stored_signature)
    # ... and be exactly what was uploaded.
    assert stored_message == payload
def share_data(self, user_data, user_cert, service_provider_cert):
    """Record a share of `user_data` with a service provider.

    The user's most recent transaction is decrypted and compared with the
    data to be shared.  Returns the value of ``add_transaction_to_chain``
    for the new share transaction, or ``None`` when the consistency check
    fails (nothing is added to the chain in that case).
    """
    # Locate the user's latest transaction and decrypt its payload.
    tx_hash = self.latest_tx_list[user_cert]
    stored_tx = self.transaction_pool[tx_hash]
    plaintext = decrypt(self.keys[tx_hash], stored_tx.action.get_data())
    message, _signature = split_data(plaintext)

    # Refuse to share anything that does not match the data on record.
    requested = json.loads(user_data.data)
    on_record = json.loads(message)
    if not check_data_consistency(requested, on_record):
        return None

    # Wrap the share in an action, chain it onto the current head and
    # hand back the resulting hash pointer.
    share_action = UserDataShareAction(user_data, service_provider_cert)
    return self.add_transaction_to_chain(
        Transaction(self.last_transaction_hash, share_action))
def trial_init(recdr, logr):
    """Generate a cross-product data set and build the solvers for one trial.

    A ``DataGenerator`` is configured from module-level settings, rows are
    generated for every (unique user, random message) combination, and the
    rows are split 50/25/25 into train, calibration and test partitions.

    :param recdr: not referenced in this function.
        NOTE(review): the solver builder below receives the name
        ``recorder`` instead -- confirm whether ``recdr`` was intended.
    :param logr: logger exposing ``log(message, level)``.
    :return: tuple ``(train, test_users, b, solvers)`` where ``test_users``
        is the list of first elements of the test rows and ``solvers`` is
        the control solvers followed by the KNN treatments.
    """
    logr.log('Initializing new trial...', 'standard')
    b = DataGenerator()
    b.set_baseline_response_prob(baseline)
    b.add_random_user_attrs(num_user_atts, min_user_att_levels,
                            max_user_att_levels)
    b.add_random_inter_attrs(num_msg_atts, min_msg_att_levels,
                             max_msg_att_levels)
    # Called for its effect on the generator; the returned pair
    # (user templates, interaction templates) is not needed here.
    b.set_random_propensities(num_propensity_groups,
                              min_group_user_atts, max_group_user_atts,
                              min_group_msg_atts, max_group_msg_atts,
                              min_group_pos_prob, max_group_pos_prob)

    logr.log('Generating data...', 'standard')
    messages = b.gen_random_inters(num_test_messages)
    rows = ut.unzip(b.gen_crossprod_rows(b.unique_users(), messages))
    logr.log('Number of rows: ' + str(len(rows)), 'standard')

    # Split data into train, calibration, and test.
    train, calibrate, test = ut.split_data(rows, 0.5, 0.25, 0.25)
    # A comprehension replaces the tuple-parameter lambda (removed in
    # Python 3) and yields a real list on both Python 2 and Python 3.
    test_users = [u for (u, _m, _r) in test]

    controls = su.build_std_control_solvers(calibrate, b, messages, 15)
    treatments = su.build_std_knn_optims(train, calibrate, b, recorder, 1, 15)
    solvers = controls + treatments
    return (train, test_users, b, solvers)
def test_tree_classifier():
    """
    Train and prune a ``ClassificationTree`` on the data returned by
    ``load_breast_cancer`` and print its accuracy on a held-out split,
    followed by the accuracy of ``tree.DecisionTreeClassifier`` on the
    same split for comparison.

    :return: None
    """
    data = load_breast_cancer()
    X = data.data
    Y = data.target.reshape(data.target.size)
    # Every attribute is treated as a float-valued feature.
    attr_types = [float for _ in range(X.shape[1])]
    Xtrain, Ytrain, Xtest, Ytest = split_data(X, Y, 0.8)

    model = ClassificationTree()
    print("Training..")
    model.train(Xtrain, Ytrain, attr_types)
    model.prune_tree(Xtrain, Ytrain)
    cY = model.predict(Xtest)
    print("Accuracy: {}".format(accuracy(Ytest, cY)))

    clf = tree.DecisionTreeClassifier()
    clf.fit(Xtrain, Ytrain.reshape(Ytrain.size))
    cY = clf.predict(Xtest)
    # The format string previously had no placeholder, so the reference
    # accuracy was computed but never shown.
    print("Scikit accuracy: {}".format(accuracy(Ytest, cY)))
# train(data, args): epoch loop that updates a local vector `x` and exchanges
# it with a kvstore.
#
# Inputs, as used by the code below:
#   data -- unpacked as (dataset, T, x, b); `dataset` is also referred to as A.
#           presumably A is a scipy sparse matrix (`getnnz` is called on it)
#           and x a numpy matrix (`u.getA()` is called on an update derived
#           from it) -- TODO confirm against the caller.
#   args -- provides `learning_rate`, `epoch_num` and `batch_size`.
#
# Outline:
#   * `util.split_data(dataset)` yields this worker's `shard` and an
#     `interval` that is later expanded into `range(*interval)` for the loss.
#   * `kv_x` / `kv_d` are (dim, 1) zero NDArrays; the kvstore is created from
#     `kv_x` with `util.create_kvstore`.
#   * For each of `args.epoch_num` epochs a vector `g` is computed once, then
#     the shard rows are visited in windows of `args.batch_size`.  Per-row
#     updates `u` are subtracted from `x` and accumulated (negated) in `kv_d`,
#     which is exchanged through `util.update_param`; `x` is then reloaded
#     from `kv_x`.  `util.check_cancel` / `util.need_restart` /
#     `util.reset_cancel` control whether a window is repeated.
#   * After each epoch a loss value is logged via `logging.info`.
#   * There is no return statement.
#
# NOTE(review): the whole definition sits on one physical line, so everything
# after the first '#' is parsed as a comment and the intended nesting of the
# while/for/break statements cannot be read from indentation.  Restore the
# original line breaks from version control before changing the logic.
def train(data, args): dataset, T, x, b = data dim = dataset.shape[0] shard, interval = util.split_data(dataset) size = shard.shape[0] kv_x = mx.nd.zeros((dim, 1)) kv_d = mx.nd.zeros((dim, 1)) # create kvstore kvstore = util.create_kvstore(kv_x) A = dataset lambda_ = np.dot((x.T * A.T), (A * x))[0][0] gamma = args.learning_rate logging.info('Start training.') for epoch in range(args.epoch_num): # gradient in this epoch t1 = lambda_ / dim * x - b / dim nnz = A.getnnz(0).reshape((dim, 1)) g = (np.multiply(nnz, t1) + t1 * dim + T * x) / dim start, end = 0, min(args.batch_size, size) while True: x_prime = x.copy() for i in range(start, end): if util.check_cancel(i, start, end): logging.info('restart computation') break # compute update u = lambda_ / dim * gamma * x - gamma * ( g - lambda_ / dim * x) - gamma * np.sum( shard[i] * (x - x_prime)) * shard[i].T # update local vector x = x - u kv_d = kv_d - mx.nd.array(u.getA()) # exchange with kvstore util.update_param(kvstore, kv_d, kv_x, pull_only=util.need_restart()) kv_d = mx.nd.zeros((dim, 1)) x = kv_x.asnumpy() if not util.need_restart(): start, end = end, min(end + args.batch_size, size) if start == end: break util.reset_cancel() # compute objective loss = size / dim * np.dot((lambda_ / 2 * x - b).T, x) for i in range(*interval): loss -= (A[i] * x)**2 / 2 logging.info('Epoch[{}] loss={}'.format(epoch, np.sum(loss) + 2))
def main(data_file, vocab_path):
    """Train and score Naive Bayes models on the federalist papers.

    Loads the essays and the function-word vocabulary, builds the feature
    matrix and the numeric label vector, splits them, and prints the
    accuracy of a multinomial NB model, a Bernoulli NB model and the
    zero-rule baseline on the held-out split.
    """
    authors, essays, essay_ids = parse_federalist_papers(data_file)
    function_words = load_function_words(vocab_path)

    # Feature matrix for the attributed essays.
    X = load_features(essays, function_words)

    # Map author names to the two numeric classes (0 and 1); the mapping is
    # printed so it can be checked by eye.
    labels_map = labels_to_key(authors)
    print(labels_map)

    # y holds one 0/1 label per essay, aligned with the essays.  It is the
    # gold standard used for both training and evaluation.
    y = np.asarray(labels_to_y(authors, labels_map))
    print(f"Numpy array has shape {X.shape} and dtype {X.dtype}")

    # Each half of the split is indexed as [0] -> features, [1] -> labels.
    # Per the original author's note, X can have fewer rows than y, so the
    # models are fitted on the label slice returned in `train` rather than
    # on y directly.
    train, test = split_data(X, y, 0.25)
    train_X = train[0]
    train_y = train[1]
    test_X = test[0]
    test_y = test[1]

    # Multinomial NB, evaluated on the held-out split.
    nbm = MultinomialNB()
    nbm.fit(train_X, train_y)
    preds_nbm = nbm.predict(test_X)
    accuracy = calculate_accuracy(preds_nbm, test_y)
    print(f" the accuracy for multinomial NB model is {accuracy}")

    # Bernoulli NB, evaluated on the same split.
    nbb = BernoulliNB()
    nbb.fit(train_X, train_y)
    preds_nbb = nbb.predict(test_X)
    accuracy = calculate_accuracy(preds_nbb, test_y)
    print(f" the accuracy for Bernoulli NB model is {accuracy}")

    # Zero-rule baseline: always predict the most frequent training class.
    most_frequent_class = find_zero_rule_class(train_y)
    print(f"the most frequent class is {most_frequent_class}")
    test_predictions = apply_zero_rule(test_X, most_frequent_class)
    test_accuracy = calculate_accuracy(test_predictions, test_y)
    print(f" the accuracy for the baseline is {test_accuracy}")
def trial_init(recdr, logr):
    """Split the module-level ``rows`` and build the solvers for one trial.

    Neither parameter is referenced in this function; ``rows``, ``b`` and
    ``recorder`` are read from the enclosing scope.
    NOTE(review): ``recorder`` may have been meant to be the ``recdr``
    parameter -- confirm.

    :return: tuple ``(train, test_users, b, solvers)`` where ``test_users``
        is the list of first elements of the test rows and ``solvers`` is
        the control solvers followed by the KNN treatments.
    """
    # Split data into train, calibration, and test.
    train, calibrate, test = ut.split_data(rows, 0.5, 0.25, 0.25)
    # A comprehension replaces the tuple-parameter lambda (removed in
    # Python 3) and yields a real list on both Python 2 and Python 3.
    test_users = [u for (u, _m, _r) in test]

    controls = su.build_std_control_solvers(calibrate, b, 100, 15)
    treatments = su.build_std_knn_optims(train, calibrate, b, recorder, 1, 15)
    solvers = controls + treatments
    return (train, test_users, b, solvers)
def my_data(self):
    """Load the ``cifar_*`` arrays and split them into train/test/validation.

    Reads ``cifar_images.npy`` and ``cifar_labels.npy`` from
    ``self.args.data_dir``, divides the images by 255, reshapes them to
    ``[-1, 32, 32, 3]``, shuffles images and labels together and one-hot
    encodes the labels with depth 100.  The test set is split off first,
    then the validation set.

    Returns:
        dict: arrays stored under the keys ``train_images``,
        ``train_labels``, ``test_images``, ``test_labels``,
        ``validation_images`` and ``validation_labels``.
    """
    data_dir = self.args.data_dir

    # load training data
    train_images = util.np.load(os.path.join(data_dir, 'cifar_images.npy'))
    # normalize data
    train_images = train_images / 255
    # reshape to fit input tensor;
    # `-1` means "everything not otherwise accounted for"
    train_images = np.reshape(train_images, [-1, 32, 32, 3])

    # load training labels
    train_labels = util.np.load(os.path.join(data_dir, 'cifar_labels.npy'))
    train_images, train_labels = util.shuffler(train_images, train_labels)

    # convert labels to one-hots; the context manager closes the session
    # (and frees what it holds) once the labels have been computed
    with tf.Session() as session:
        train_labels = session.run(tf.one_hot(train_labels, 100))

    # set up test set
    test_images, train_images = util.split_data(train_images,
                                                self.test_set_size)
    test_labels, train_labels = util.split_data(train_labels,
                                                self.test_set_size)

    # set up validation set
    validation_images, train_images = util.split_data(
        train_images, self.validation_set_size)
    validation_labels, train_labels = util.split_data(
        train_labels, self.validation_set_size)

    return {
        "train_images": train_images,
        "train_labels": train_labels,
        "test_images": test_images,
        "test_labels": test_labels,
        "validation_images": validation_images,
        "validation_labels": validation_labels,
    }
def gen_dataset(sentences, categories, max_words=78, train_test_split=True):
    '''
    Build (input, output) arrays for the tagger: each sentence becomes a
    (max_words, 300) block of word vectors and a (max_words, n_categories)
    block of one-hot categories.

    Args
    ----
    sentences : list
                list of sentences where each sentence is list of tokens
    categories : list
                 per-sentence category sequences, indexed like `sentences`
    max_words : integer
                maximum number of words allowed in sentence
    train_test_split : boolean
                       whether to split data into 2 sets

    Returns
    -------
    With `train_test_split`:
        (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
    otherwise:
        (X, y, K), param_dict
    where K holds the number of words of each sentence and param_dict
    carries 'max_words' and 'encoder'.
    '''
    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'),
        binary=True)

    def vectorizer(token):
        # Tokens missing from the embedding model map to the zero vector.
        if token in model:
            return model[token]
        return np.zeros(300)

    encoder = one_hot_encoding(categories)

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, len(encoder.keys())))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {'max_words': max_words, 'encoder': encoder}

    for sent_i in I:
        words = sentences[sent_i]
        cats = categories[sent_i]

        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))

        embedded, encoded = prepare_sentence(
            words, categories=cats, vectorizer=vectorizer,
            encoder=encoder, max_words=max_words)
        X[sent_i, :, :] = embedded
        y[sent_i, :, :] = encoded
        K[sent_i] = len(words)  # keep track of num words in sentence

    if not train_test_split:
        return (X, y, K), param_dict

    # Split the sentence indices alongside X so that y and K can be
    # partitioned with exactly the same assignment.
    (X_train, X_test), (I_train, I_test) = util.split_data(
        X, out_data=I, frac=0.80)
    y_train, y_test = y[I_train], y[I_test]
    K_train, K_test = K[I_train], K[I_test]
    return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
def train_lstm(inputs, outputs, state_size, batch_size=256, param_scale=0.001,
               num_epochs=5, step_size=0.001):
    """Fit LSTM parameters with Adam on mini-batches and report progress.

    The data is split 80/20 into a training and a holdout set.  Parameters
    are initialised with ``init_lstm_params`` (fixed seed 0) and optimised
    on the negative ``lstm_log_likelihood`` of one mini-batch per step.

    Args:
        inputs, outputs: arrays indexed as ``[:, batch, :]`` (axis 1 is
            sliced into mini-batches, axis 2 gives the feature size).
        state_size: size forwarded to ``init_lstm_params``.
        batch_size: number of axis-1 entries per mini-batch.
        param_scale: scale forwarded to ``init_lstm_params``.
        num_epochs: number of full passes over the training batches.
        step_size: Adam step size.

    Returns:
        The optimised parameters returned by ``adam``.
    """
    # split data (again) into a training and a validation set
    (tr_inputs, va_inputs), (tr_outputs, va_outputs) = util.split_data(
        inputs, out_data=outputs, frac=0.80)

    input_size = tr_inputs.shape[2]
    output_size = tr_outputs.shape[2]

    init_params = init_lstm_params(input_size, state_size, output_size,
                                   param_scale=param_scale,
                                   rs=npr.RandomState(0))

    num_batches = int(np.ceil(tr_inputs.shape[1] / batch_size))

    def batch_indices(iter):
        idx = iter % num_batches
        return slice(idx * batch_size, (idx + 1) * batch_size)

    # Define training objective
    def objective(params, iter):
        idx = batch_indices(iter)
        return -lstm_log_likelihood(
            params, tr_inputs[:, idx, :], tr_outputs[:, idx, :])

    # Get gradient of objective using autograd.
    objective_grad = grad(objective)

    print(
        " Epoch | Train accuracy | Train log-like | Holdout accuracy | Holdout log-like ")

    def print_perf(params, iter, gradient):
        # Report once per epoch; the first column is the epoch number.
        if iter % num_batches != 0:
            return
        train_acc = accuracy(params, tr_inputs, tr_outputs)
        # NOTE(review): the values shown under the "log-like" headings are
        # negated log-likelihoods -- confirm that this is intended.
        train_ll = -lstm_log_likelihood(params, tr_inputs, tr_outputs)
        valid_acc = accuracy(params, va_inputs, va_outputs)
        valid_ll = -lstm_log_likelihood(params, va_inputs, va_outputs)
        print("{:15}|{:20}|{:20}|{:20}|{:20}".format(
            iter//num_batches, train_acc, train_ll, valid_acc, valid_ll))

    # The optimizers provided can optimize lists, tuples, or dicts of
    # parameters.  One optimiser step consumes one mini-batch, so
    # `num_epochs` passes need num_epochs * num_batches steps (previously
    # only `num_epochs` steps were run in total).
    optimized_params = adam(objective_grad, init_params, step_size=step_size,
                            num_iters=num_epochs * num_batches,
                            callback=print_perf)
    return optimized_params
def gen_dataset(sentences, max_words=78, train_test_split=True):
    '''
    Build (input, output) arrays for the lemmatizer model: both the input
    and the output of each sentence are (max_words, 300) blocks produced by
    `prepare_sentence` from the word vectorizer and the WordNet lemmatizer.

    Args
    ----
    sentences : list
                list of sentences where each sentence is list of tokens
    max_words : integer
                maximum number of words allowed in sentence
    train_test_split : boolean
                       whether to split data into 2 sets

    Returns
    -------
    With `train_test_split`:
        (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
    otherwise:
        (X, y, K), param_dict
    where K holds the number of words of each sentence and param_dict
    carries 'max_words'.
    '''
    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        '../storage/pos_tagger/GoogleNews-vectors-negative300.bin',
        binary=True)

    def vectorizer(token):
        # Unknown tokens get a constant vector filled with ZERO_EPSILON.
        if token in model:
            return model[token]
        return np.ones(300) * ZERO_EPSILON

    lemmatizer = WordNetLemmatizer().lemmatize

    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, 300))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)

    param_dict = {'max_words': max_words}

    for sent_i, words in enumerate(sentences):
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))

        embedded, lemma_embedded = prepare_sentence(
            words, vectorizer=vectorizer, lemmatizer=lemmatizer,
            max_words=max_words)
        X[sent_i, :, :] = embedded
        y[sent_i, :, :] = lemma_embedded
        K[sent_i] = len(words)  # keep track of num words in sentence

    if not train_test_split:
        return (X, y, K), param_dict

    # Split the sentence indices alongside X so that y and K can be
    # partitioned with exactly the same assignment.
    (X_train, X_test), (I_train, I_test) = split_data(
        X, out_data=I, frac=0.80)
    y_train, y_test = y[I_train], y[I_test]
    K_train, K_test = K[I_train], K[I_test]
    return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
def opcion_1(data):
    """Train a new network on `data` and evaluate it on the held-out part.

    The data is pre-processed into a training and a test partition, each is
    separated into inputs and outputs, a network with two hidden layers
    (8 and 4 neurons) is trained for 500 epochs at learning rate 0.1, the
    loss curve is plotted and `opcion_4` is run on the test partition.

    Returns the tuple (network, test inputs, test outputs).
    """
    train_part, test_part = pre_proc(data, 70)
    entrada, salida = split_data(train_part, salida_columns)
    entrada_test, salida_test = split_data(test_part, salida_columns)

    # `topology` lists the number of neurons in each layer; the activation
    # function handed to the network is used by every layer.
    topology = [entrada[0].size, 8, 4, salida[0].size]
    epochs = 500
    learning_rate = 0.1

    print('Creando red neuronal de topologia: ', topology)
    network = neuronal_network.Network(topology, Sigmoid, MSE)

    print('Iniciando entrenamiento...')
    loss_history = network.train(entrada, salida, learning_rate, epochs)
    plot_loss(loss_history, error_path)
    print('Entrenamiento finalizado.')

    opcion_4(network, entrada_test, salida_test)
    return network, entrada_test, salida_test
def trainAndSaveModel(X_train, y_train, y_label_index, max_iterations=7000,
                      folds=False):
    """Train one multinomial logistic regression per fold and save the average.

    The folds come from ``util.split_data(y_index=y_label_index)``.  For
    every fold a classifier is fitted, its train/eval accuracy and
    predictions are recorded, and its coefficients and intercepts are
    accumulated.  The accumulated values are divided by ``util.K`` and the
    resulting dictionary is written with ``util.dumpVar`` to
    ``../models/avg_logistic_model``.

    Args:
        X_train: only its second dimension is used, to size the averaged
            coefficient matrix.
        y_train: not used; the labels come from the folds.
        y_label_index: forwarded to ``util.split_data`` as ``y_index``.
        max_iterations: ``max_iter`` of each ``LogisticRegression``.
        folds: when true, a confusion matrix is written for the train and
            eval part of every fold.

    Returns:
        None.
    """
    n_features = X_train.shape[1]
    n_classes = len(util.classes)
    avg_coefficients = np.zeros((n_classes, n_features))
    avg_intercepts = np.zeros(n_classes)

    data_kfold = util.split_data(y_index=y_label_index)

    train_accuracies = []
    eval_accuracies = []
    train_predictions = []
    eval_predictions = []

    # Fold-local names keep the function parameters from being shadowed.
    for i, (fold_X, fold_y, val_X, val_y) in enumerate(data_kfold):
        print("Fold", i + 1)
        clf = LogisticRegression(max_iter=max_iterations,
                                 multi_class='multinomial',
                                 solver='newton-cg')
        clf.fit(fold_X, fold_y)

        train_acc = clf.score(fold_X, fold_y)
        train_accuracies.append(train_acc)
        eval_acc = clf.score(val_X, val_y)
        eval_accuracies.append(eval_acc)

        avg_coefficients += clf.coef_
        avg_intercepts += clf.intercept_

        # Predict once per partition and reuse the result for the
        # confusion matrices instead of predicting a second time.
        fold_train_pred = clf.predict(fold_X)
        fold_eval_pred = clf.predict(val_X)
        train_predictions.append(fold_train_pred)
        eval_predictions.append(fold_eval_pred)

        if folds:
            util.outputConfusionMatrix(
                fold_train_pred, fold_y,
                "../figures/fold_" + str(i + 1) + "_train")
            util.outputConfusionMatrix(
                fold_eval_pred, val_y,
                "../figures/fold_" + str(i + 1) + "_eval")

        print("train accuracy:", train_acc)
        print("eval accuracy:", eval_acc)

    # presumably util.K equals the number of folds produced above -- TODO
    # confirm, otherwise this is not the mean over the folds.
    avg_coefficients /= util.K
    avg_intercepts /= util.K

    model = {
        "coeff_": avg_coefficients,
        "intercept_": avg_intercepts,
        "train_accuracies": train_accuracies,
        "eval_accuracies": eval_accuracies,
        "train_predictions": train_predictions,
        "eval_predictions": eval_predictions,
    }
    util.dumpVar("../models/avg_logistic_model", model)
def execute(data, training_data_ratio=2.0 / 3.0, k=1):
    """
    Execute the "Locally-Weighted" Linear Regression (using Closed-Form Linear Regression)

    :param data: Raw Data frame parsed from CSV; the last column is the output
    :param training_data_ratio: The percent (0.0 to 1.0) of input data to use in training.
    :param k: Smoothing parameter for local weight computation
    :return: The root mean squared error (RMSE) over the test samples
    """
    # 2. Randomize the data
    randomized_data = util.randomize_data(data)

    # 3. Select the first 2 / 3 (round up) of the data for training and the
    #    remaining for testing
    training_data, test_data = util.split_data(randomized_data,
                                               training_data_ratio)
    training_outputs = util.get_output(training_data)

    # 4. Standardize the data (except for the last column of course) using
    #    the training data
    standardized_training_data, mean, std = util.standardize_data(
        util.get_features(training_data))
    # Add offset column at the front
    standardized_training_data.insert(0, "Bias", 1)

    std_test_data, _, _ = util.standardize_data(
        util.get_features(test_data), mean, std)
    std_test_data.insert(0, "Bias", 1)

    squared_errors = []
    # 5. Then for each testing sample (`range` works on Python 2 and 3)
    for i in range(len(std_test_data)):
        testing_sample = std_test_data.iloc[i]
        # `.iloc[-1]` always means "last column", whatever the column
        # labels are; a bare `[-1]` is label-based on integer labels.
        expected_output = test_data.loc[testing_sample.name].iloc[-1]

        # (a) Compute the local model for this query point.
        theta_query = compute_theta_query(testing_sample,
                                          standardized_training_data,
                                          training_outputs, k)

        # (b) Evaluate the testing sample using the local model.
        actual_output = np.dot(testing_sample, theta_query)

        # (c) Compute the squared error of the testing sample.
        squared_errors.append(util.compute_se(expected_output, actual_output))

    # 6. Compute the root mean squared error (RMSE)
    mean_squared_error = sum(squared_errors) / len(squared_errors)
    rmse = math.sqrt(mean_squared_error)
    return rmse
def test_share_data(ident_service):
    """Sharing a subset of stored data yields a chained, decryptable share."""
    # Upload the user's full, signed record.
    full_payload = json.dumps({'first_name': 'Bob',
                               'last_name': 'Smith'}).encode()
    full_record = UserData(full_payload,
                           sign(user_1_private_key, full_payload))
    upload_hash = ident_service.add_user_data(full_record, user_1_cert_string)

    # Sign the subset that will be shared with the service provider.
    shared_payload = json.dumps({'first_name': 'Bob'}).encode()
    shared_record = UserData(shared_payload,
                             sign(user_1_private_key, shared_payload))

    # Register the share; it must become the head of the chain.
    share_hash = ident_service.share_data(shared_record, user_1_cert_string,
                                          sp_1_cert)
    assert ident_service.last_transaction_hash == share_hash

    # The share transaction must point at the upload that preceded it.
    share_tx = ident_service.get_transaction(share_hash)
    assert upload_hash == share_tx.hash_pointer

    # Acting as the service provider: unwrap the encryption key with the
    # provider's private key, then decrypt the shared payload.
    wrapped_key, ciphertext = split_data(share_tx.action.get_data())
    symmetric_key = decrypt_private(sp_1_private_key, wrapped_key)
    plaintext = decrypt(symmetric_key, ciphertext)
    share_message, share_signature = split_data(plaintext)

    # The payload must carry the user's signature ...
    verify(user_1_cert, share_message, share_signature)
    # ... and be exactly what the user asked to share.
    assert share_message == shared_payload
def execute(dataframe, training_data_ratio=2.0 / 3):
    """
    Run the multi-class SVM experiment (one-vs-many, then one-vs-one).

    :param dataframe: The input dataset containing the classifier as the last column
    :param training_data_ratio: The percentage of data to use for training (default: 2/3)
    :return: A tuple of (one-vs-many metrics, one-vs-one accuracy)
    """
    # Fixed seed so repeated runs give the same results.
    random.seed(0)

    # Shuffle, then keep the leading share of the rows for training.
    shuffled = util.randomize_data(dataframe)
    train_part, test_part = util.split_data(shuffled, training_data_ratio)

    # Standardise the training features and remember the statistics.
    train_features, train_targets = util.split_features_target(train_part)
    train_features_std, mean, std = util.standardize_data(train_features)
    # A zero standard deviation produces NaN entries; reset them to zero.
    train_features_std.fillna(0, inplace=True)

    # Standardise the test features with the training statistics.
    test_features, test_targets = util.split_features_target(test_part)
    test_features_std, _, _ = util.standardize_data(test_features, mean, std)
    test_features_std.fillna(0, inplace=True)

    target_classes = train_targets.unique()

    # One-vs-many evaluation.
    one_vs_many_metrics = execute_one_vs_many(
        test_features_std, train_features_std, target_classes,
        test_targets, train_targets)

    # One-vs-one evaluation returns the number of misclassified samples.
    misclassified = execute_one_vs_one(
        test_features_std, train_features_std, target_classes,
        test_targets, train_targets)

    total = len(test_features)
    one_vs_one_accuracy = (total - misclassified) / float(total)

    return one_vs_many_metrics, one_vs_one_accuracy
def load_data(data_file, test_ratio_offset):
    """Read a character-coded CSV and return a class-balanced 90/10 split.

    Every row holds the label in column 0 (``'p'`` maps to 0.0, anything
    else to 1.0) followed by single-character features, which are encoded
    as the float value of ``ord``.  A ``'?'`` feature is replaced by
    ``'a'`` before encoding.  Blank rows are skipped.

    The data is shuffled and split until each class makes up at least
    ``0.5 - test_ratio_offset`` of the test labels.

    NOTE(review): if that proportion can never be reached for the given
    data and offset, the loop does not terminate.

    :param data_file: path of the CSV file; I/O errors propagate.
    :param test_ratio_offset: allowed deviation from a 50/50 class balance
        in the test set.
    :return: tuple ``(x_train, x_test, y_train, y_test)``.
    """
    x = []
    y = []
    with open(data_file, 'r') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row:
                # csv yields [] for blank lines; there is nothing to encode.
                continue
            features = ['a' if value == '?' else value for value in row[1:]]
            x.append([float(ord(value)) for value in features])
            y.append([0. if row[0] == 'p' else 1.])

    split_ratio = 0.9
    min_class_share = 0.5 - test_ratio_offset

    while True:
        # Reshuffle before every attempt: repeating the split on unchanged
        # data could reproduce the same rejected test set forever.
        x, y = util.shuffle_data(x, y)
        x_train, x_test, y_train, y_test = util.split_data(x, y, split_ratio)

        # Both classes must be sufficiently represented in the test set so
        # that a meaningful confusion matrix can be built.
        required = min_class_share * len(y_test)
        negatives = sum(1 for label in y_test if label[0] == 0.)
        positives = sum(1 for label in y_test if label[0] == 1.)
        if negatives >= required and positives >= required:
            return x_train, x_test, y_train, y_test
def create_features_labels(save_to_disk=False):
    '''
    Use pipeline to generate the (input, output) pairs for machine learning

    Tokens and sentence labels are loaded from the Brown generator, turned
    into descriptor arrays and dummy-coded labels, grouped into 3-grams
    around the EOS_PUNC target tag and split 80/20.

    Args
    ----
    save_to_disk : boolean
                   write the four resulting arrays as .npy files under
                   ../storage/sentence_disambiguation/

    Returns
    -------
    (tr_inputs, te_inputs), (tr_outputs, te_outputs)
    '''
    lexicon = ctd.load_data(ctd.brown_generator(), return_sent_labels=True)

    # not efficient --> change me
    data = np.array([[t, l] for t, l in lexicon])
    tokens = data[:, 0]
    labels = data[:, 1]

    # get features
    darrays = get_descriptor_arrays(tokens)
    labels = get_dummies(labels)

    # put into grams (give context)
    didx, darrays, labels = make_grams(darrays, 3, labels=labels,
                                       target_tag=EOS_PUNC)

    (tr_inputs, te_inputs), (tr_outputs, te_outputs) = split_data(
        darrays, out_data=labels, frac=0.80)

    if save_to_disk:
        # np.save takes (file, array).  The array used to be passed to
        # local_ref by mistake, leaving np.save with a single argument.
        targets = (
            ('../storage/sentence_disambiguation/X_train.npy', tr_inputs),
            ('../storage/sentence_disambiguation/X_test.npy', te_inputs),
            ('../storage/sentence_disambiguation/y_train.npy', tr_outputs),
            ('../storage/sentence_disambiguation/y_test.npy', te_outputs),
        )
        for relative_path, array in targets:
            np.save(local_ref(relative_path), array)

    return (tr_inputs, te_inputs), (tr_outputs, te_outputs)
def execute(self, dataframe):
    """
    Execute the Binary-Artificial Neural Network problem

    Progress messages are written with single-argument ``print(...)``
    calls, which produce the same text on Python 2 and Python 3.

    :param dataframe: Input raw data
    :return: (final test error, list of training errors for each training iteration)
    """
    # 2. Randomizes the data.
    print("Randomizing Data")
    random_data = util.randomize_data(dataframe)

    # 3. Selects the first 2/3 (round up) of the data for training and the
    #    remaining for testing
    print("Splitting Test and Training Data")
    training_data, test_data = util.split_data(random_data,
                                               self._training_data_ratio)

    # 4. Standardizes the data (except for the last column of course as
    #    well as the bias feature) using the training data
    print("Standardizing Training Data")
    standardized_training_data, mean, std = util.standardize_data(
        self.__select_features(training_data))

    # 5. Trains an artificial neural network using the training data.
    #    Our last column is the label column.
    # 6. During the training process, compute the training error after
    #    each iteration.  It is used to plot the training error vs.
    #    iteration number.
    expected_training_outputs = self.__select_target_labels(
        training_data).values.reshape(-1, 1)

    print("Training Neural Network")
    training_errors = self._network.train_binary(
        standardized_training_data, expected_training_outputs,
        self._iterations)

    # 7. Classifies the testing data using the trained neural network.
    print("Classifying Testing Data")
    expected_test_output = self.__select_target_labels(test_data)
    std_test_data, _, _ = util.standardize_data(
        self.__select_features(test_data), mean, std)
    actual_test_output = self._network.evaluate(std_test_data.values)

    # 8. Compute the testing error.
    print("Computing Metrics")
    self.__update_metrics(expected_test_output, actual_test_output)
    test_error = self._metrics.calculate_error()

    # The two spaces reproduce the output of the former statement
    # `print "Test Error: ", test_error`.
    print("Test Error:  {}".format(test_error))

    return test_error, training_errors
def train_nn(
        inputs,
        outputs,
        num_hiddens,  # don't include inputs and outputs
        batch_size=256,
        param_scale=0.1,
        num_epochs=5,
        step_size=0.001,
        L2_reg=1.0):
    """Fit a feed-forward network with Adam and report progress per epoch.

    The data is split 80/20 into a training and a holdout set.  Layer
    sizes are the input width, the entries of `num_hiddens` and the output
    width.  Each optimiser step minimises the negative `log_posterior` of
    one mini-batch; `num_epochs * num_batches` steps are run and a
    progress row is printed at the start of every epoch.

    Returns the optimised parameters returned by `adam`.
    """
    # split data (again) into a training and a validation set
    train_split, target_split = util.split_data(
        inputs, out_data=outputs, frac=0.80)
    tr_inputs, va_inputs = train_split
    tr_outputs, va_outputs = target_split

    layer_sizes = [tr_inputs.shape[1]] + num_hiddens + [tr_outputs.shape[1]]
    init_params = init_random_params(param_scale, layer_sizes)

    num_batches = int(np.ceil(tr_inputs.shape[0] / batch_size))

    def batch_indices(iter):
        # Cycle through the batches; the last one may be shorter.
        first = (iter % num_batches) * batch_size
        return slice(first, first + batch_size)

    # Define training objective
    def objective(params, iter):
        batch = batch_indices(iter)
        return -log_posterior(
            params, tr_inputs[batch], tr_outputs[batch], L2_reg)

    # Get gradient of objective using autograd.
    objective_grad = grad(objective)

    print(
        " Epoch | Train accuracy | Train log-like | Holdout accuracy | Holdout log-like ")

    def print_perf(params, iter, gradient):
        # Only report on the first step of each epoch.
        if iter % num_batches != 0:
            return
        train_acc = accuracy(params, tr_inputs, tr_outputs)
        train_ll = log_posterior(params, tr_inputs, tr_outputs, L2_reg)
        valid_acc = accuracy(params, va_inputs, va_outputs)
        valid_ll = log_posterior(params, va_inputs, va_outputs, L2_reg)
        print("{:15}|{:20}|{:20}|{:20}|{:20}".format(
            iter//num_batches, train_acc, train_ll, valid_acc, valid_ll))

    # The optimizers provided can optimize lists, tuples, or dicts of
    # parameters.
    return adam(
        objective_grad,
        init_params,
        step_size=step_size,
        num_iters=num_epochs * num_batches,
        callback=print_perf)
def main(data_file):
    """Fit and score the zero-rule baseline on the federalist papers.

    Loads the essays, converts the author labels to integers, splits the
    data, learns the most frequent class on the training part and prints
    the accuracy of always predicting that class on the held-out part.

    :param data_file: path handed to ``parse_federalist_papers``.
    """
    print(data_file)

    # load the data
    authors, essays, essay_ids = parse_federalist_papers(data_file)
    num_essays = len(essays)
    print(f"Working with {num_essays} reviews")

    # create a key that links author id string -> integer
    author_key = labels_to_key(authors)
    print(len(author_key))
    print(author_key)

    # convert all the labels using the key
    y = labels_to_y(authors, author_key)
    assert y.size == len(
        authors
    ), f"Size of label array (y.size) must equal number of labels {len(authors)}"

    # shuffle and split the data
    train, test = split_data(essays, y, 0.3)
    data_size_after = len(train[1]) + len(test[1])
    assert data_size_after == y.size, f"Number of datapoints after split {data_size_after} must match size before {y.size}"
    print(f"{len(train[0])} in train; {len(test[0])} in test")

    # learn zero rule on train
    train_y = train[1]
    most_frequent_class = find_zero_rule_class(train_y)
    print(most_frequent_class)

    # lookup label string from class: invert the author key so the integer
    # class can be reported by name (this mapping was commented out, which
    # left the lookup below referring to an undefined name)
    reverse_author_key = {v: k for k, v in author_key.items()}
    print(
        f"The most frequent class is {reverse_author_key[most_frequent_class]}"
    )

    # apply zero rule to test reviews
    test_predictions = apply_zero_rule(test[0], most_frequent_class)
    print(f"Zero rule predictions on held-out data: {test_predictions}")

    # score accuracy
    test_y = test[1]
    test_accuracy = calculate_accuracy(test_predictions, test_y)
    print(f"Accuracy of zero rule: {test_accuracy:0.03f}")
def __split_dataset(self, df):
    """Split `df` into training / CV / test parts and write each as CSV.

    The destination directory and file names come from the synthetic or
    the regular constants in `const`, depending on `self.sintetic`.  The
    files are written without the index column.
    """
    if self.sintetic:
        dir_src, training_name, test_name, cv_name = (
            const.DIR_SINTETIC_DATASET,
            const.SINTETIC_FILE_TRAINING,
            const.SINTETIC_FILE_TEST,
            const.SINTETIC_FILE_CV,
        )
    else:
        dir_src, training_name, test_name, cv_name = (
            const.DIR_DATASET,
            const.FILE_TRAINING,
            const.FILE_TEST,
            const.FILE_CV,
        )

    training, cv, test = util.split_data(df,
                                         train_perc=const.TRAINING_PERC,
                                         cv_perc=const.CV_PERC,
                                         test_perc=const.TEST_PERC)

    # Same write order as before: training, test, cross-validation.
    for frame, file_name in ((training, training_name),
                             (test, test_name),
                             (cv, cv_name)):
        frame.to_csv(dir_src + '/' + file_name, index=False)
def load_data(data_file):
    """Read a numeric CSV and return a shuffled 90/10 train/test split.

    Every column but the last is parsed as a float feature; the last
    column is the float target, stored as a one-element list.  Blank rows
    are skipped.

    :param data_file: path of the CSV file; I/O and parse errors propagate
        to the caller.
    :return: tuple ``(x_train, x_test, y_train, y_test)``.
    """
    x = []
    y = []
    with open(data_file, 'r') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row:
                # csv yields [] for blank lines; there is no target to read.
                continue
            x.append([float(value) for value in row[:-1]])
            y.append([float(row[-1])])

    x, y = util.shuffle_data(x, y)
    x_train, x_test, y_train, y_test = util.split_data(x, y, 0.9)
    return x_train, x_test, y_train, y_test
def execute(data):
    """
    Fit closed-form linear regression on a random 2/3 of *data* and score it
    on the remainder.

    :param data: Raw Data frame parsed from CSV
    :return: tuple ``(weights, rmse)`` - the fitted weights and the root mean
             squared error measured on the test portion
    """
    # 2. Shuffle, then 3. keep the first 2/3 for training, the rest for testing
    train_fraction = 2.0 / 3.0
    shuffled = util.randomize_data(data)
    train_frame, test_frame = util.split_data(shuffled, train_fraction)

    # The last column holds the value to predict
    target_column = train_frame.columns[-1]
    train_targets = train_frame[target_column]

    # 4. Standardize the training features; mean/std are reused on the test set
    train_features = util.get_features(train_frame)
    design, feature_mean, feature_std = util.standardize_data(train_features)

    # Prepend the constant offset column
    design.insert(0, "Bias", 1)

    # 5. Closed-form solution
    weights = find_weights(design, train_targets)

    # 6. Predict the test samples with the training statistics
    test_features = util.get_features(test_frame)
    expected = util.get_output(test_frame)
    predicted = apply_solution(test_features, feature_mean, feature_std,
                               weights)

    # 7. Root mean squared error
    return weights, util.compute_rmse(expected, predicted)
# Set the propensity value for each (user template, message template) pair:
# 0.99 for pair 4, 0.5 for the others.
b.set_user_inter_propensity(ut1, mt1, 0.5)
b.set_user_inter_propensity(ut2, mt2, 0.5)
b.set_user_inter_propensity(ut3, mt3, 0.5)
b.set_user_inter_propensity(ut4, mt4, 0.99)
b.set_user_inter_propensity(ut5, mt5, 0.5)

# 100 rows generated from each template pair, plus 2000 random rows.
rows = []
rows += ut.unzip(b.gen_random_rows_from_template(ut1, mt1, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut2, mt2, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut3, mt3, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut4, mt4, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut5, mt5, 100))
rows += ut.unzip(b.gen_random_rows(2000))

train, test = ut.split_data(rows, 0.7, 0.3)

# First element of every test row (the user).  Tuple-parameter lambdas are a
# SyntaxError on Python 3, and the result is consumed twice below, so build a
# real list instead of relying on map() returning one.
test_users = list(map(lambda row: row[0], test))

op = KNNOptimizer()
op.set_data_rows(train)
op.set_similarity_f(match_count)

best_msgs = su.n_best_messages(test_users, b, 100, 15)
msgs = su.n_best_messages(test_users, b, 100, 100)

# Controls: always the first of best_msgs / random from msgs / random from
# best_msgs.  The user argument is ignored by all three.
ctrl_1 = lambda u: best_msgs[0]
ctrl_2 = lambda u: rd.sample(msgs, 1)[0]
ctrl_3 = lambda u: rd.sample(best_msgs, 1)[0]

# Selectors built from a constant weight and from a 10**x weight.
asf_1 = build_weighted_mode_selector(lambda x: 1)
asf_2 = build_weighted_mode_selector(lambda x: 10**x)

f_3_1 = lambda u: op.optimize(u, 3, asf_1)
import os
import sys

from util import (
    subsample_raw_data,
    split_data,
    get_feat_dict,
)

if __name__ == '__main__':
    # The pipeline cannot run without the raw inputs; stop here instead of
    # failing later inside the helpers.
    if not os.path.isdir('raw_data') or \
            not os.path.exists('raw_data/train.txt') or \
            not os.path.exists('raw_data/test.txt'):
        print("Please put raw data in '/raw_data'/train.txt and '/raw_data'/test.txt")
        sys.exit(1)

    # Make sure the output directories exist.
    for out_dir in ('train_data', 'test_data', 'aid_data'):
        if not os.path.isdir(out_dir):
            os.mkdir(out_dir)

    data_size = 100000

    # Remove outputs of a previous run so they are regenerated from scratch.
    stale_files = (
        'subsampled_raw_data/subsampled_train_' + str(data_size) + ".txt",
        'train_data/train.txt',
        'test_data/test.txt',
    )
    for stale in stale_files:
        if os.path.exists(stale):
            os.remove(stale)

    subsample_raw_data(data_size)
    split_data(data_size)
    get_feat_dict()
    print('Done!')
def train_network(sess, x, y, cfg):
    """Train a new network on (x, y) and save it under a random model name.

    The data is split 90/10 into training and validation parts.  Each epoch
    runs one gradient-descent step on the training part and then *evaluates*
    (without updating any weight) the same loss on the validation part.
    Training stops early when the training error reaches the configured
    threshold, or when the train/validation gap exceeds ``valid_threshold``
    while still growing.

    :param sess: TensorFlow session used to run the graph
    :param x: input samples (fed to a ``[None, 22]`` float32 placeholder)
    :param y: target values (fed to a ``[None, 1]`` float32 placeholder)
    :param cfg: configuration dict; ``cfg['nn']`` must provide
        ``parameters`` (``neurons``, ``epochs``, ``learning_rate``),
        ``error_threshold``, ``model_dir``, ``avg_factor``, ``save_epoch``
        and ``valid_threshold``
    :return: ``(model, model_name, stats)`` where ``model`` is the value
        returned by ``util.save_model`` and ``stats`` is a dict with
        ``num_layers``, ``layer_width``, ``learning_rate``,
        ``time_to_train``, ``train_errors`` and ``valid_errors``
    """
    # Alias our training config to reduce code
    t_cfg = cfg['nn']

    # Alias config vars to reduce code
    neurons = t_cfg['parameters']['neurons']
    epochs = t_cfg['parameters']['epochs']
    learning_rate = t_cfg['parameters']['learning_rate']
    err_thresh = t_cfg['error_threshold']
    model_dir = t_cfg['model_dir']
    avg_factor = t_cfg['avg_factor']
    save_epoch = t_cfg['save_epoch']
    valid_thresh = t_cfg['valid_threshold']

    print(
        '[ANN] \tTraining parameters: epochs={0}, learning_rate={1:.2f}, neurons={2}'
        .format(epochs, learning_rate, neurons))

    # Create validation set
    x_train, x_valid, y_train, y_valid = util.split_data(x, y, 0.9)
    x_valid, y_valid = util.shuffle_data(x_valid, y_valid)

    # Create placeholders for tensors
    x_ = tf.placeholder(tf.float32, [None, 22], name='x_placeholder')
    y_ = tf.placeholder(tf.float32, [None, 1], name='y_placeholder')

    # Generate new random weights for new network
    weights = {
        'fc1': tf.Variable(tf.random_normal([22, neurons]), name='w_fc1'),
        'fc2': tf.Variable(tf.random_normal([neurons, neurons]), name='w_fc2'),
        'fc3': tf.Variable(tf.random_normal([neurons, 1]), name='w_fc3'),
    }

    # Generate new random biases for new network
    biases = {
        'fc1': tf.Variable(tf.random_normal([neurons]), name='b_fc1'),
        'fc2': tf.Variable(tf.random_normal([neurons]), name='b_fc2'),
        'fc3': tf.Variable(tf.random_normal([1]), name='b_fc3'),
    }

    # Construct our network and return the last layer to output the result
    final_layer = construct_network(x_, weights, biases, neurons)

    # Define error function (same loss for both data sets; which one is
    # measured depends only on what is fed in)
    cost_train = tf.reduce_mean(
        tf.losses.mean_squared_error(labels=y_, predictions=final_layer))
    cost_valid = tf.reduce_mean(
        tf.losses.mean_squared_error(labels=y_, predictions=final_layer))

    # Define optimiser and minimise error function task.  Only the training
    # loss gets an optimiser: validation data must never update the weights.
    optimiser_train = tf.train.GradientDescentOptimizer(
        learning_rate=learning_rate).minimize(cost_train)

    # Initialise global variables of the session
    sess.run(tf.global_variables_initializer())

    # Create error logging storage
    train_errors = []
    valid_errors = []

    # Setup our continous plot
    plt.figure()
    plt.title('Error vs Epoch')
    plt.plot(train_errors[:epochs], color='r', label='training')
    plt.plot(valid_errors[:epochs], color='b', label='validation')
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid()
    plt.ion()
    plt.show()

    # Measure training time
    t_start = time.time()

    # Train/validation gap and its first and second differences
    diff_err = 1.
    vel_err = 0.
    acc_err = 0.

    # Generate a new random model name for new network model
    model_name = ''.join(
        random.choice(string.ascii_lowercase + string.digits)
        for _ in range(4))

    for i in range(epochs):
        # One optimisation step on the training set ...
        _, train_error = sess.run([optimiser_train, cost_train],
                                  feed_dict={
                                      x_: x_train,
                                      y_: y_train
                                  })
        # ... then a pure evaluation on the validation set (no optimiser op
        # is fetched, so the weights are left untouched).
        valid_error = sess.run(cost_valid,
                               feed_dict={
                                   x_: x_valid,
                                   y_: y_valid
                               })

        # If we're at a save epoch, save!
        if i % save_epoch == 0:
            model = util.save_model(
                sess, weights, biases, neurons, train_errors,
                os.path.join(model_dir, model_name + "_model"))

        # Add new errors to list
        train_errors.append(train_error)
        valid_errors.append(valid_error)

        # If we have at least an averageable amount of samples
        if i > avg_factor:
            # Average over the last avg_factor epochs (newest first)
            avg_train_error = sum(
                train_errors[i - j] for j in range(avg_factor)) / avg_factor
            avg_valid_error = sum(
                valid_errors[i - j] for j in range(avg_factor)) / avg_factor

            gap = abs(avg_valid_error - avg_train_error)

            # Calculate change in error difference
            # (positive -> convergence, negative -> divergence)
            new_vel_err = diff_err - gap

            # Calculate change in velocity of error difference
            acc_err = vel_err - new_vel_err
            vel_err = new_vel_err

            # Calculate error difference between validation and training
            diff_err = gap

        # print('[ANN] Epoch: {0:4d}, Δerr = {1:7.4f}, 𝛿(Δerr) = {2:7.4f}, 𝛿(𝛿(Δerr)) = {3:7.4f}'.format(i, diff_err, vel_err, acc_err))  # DEBUG

        # If we already have our target error, terminate early
        if train_error <= err_thresh or (diff_err > valid_thresh
                                         and vel_err < 0.):
            break

        # Set plot settings
        if i > 0:
            plt.plot(train_errors[:epochs], color='r', label='training')
            plt.plot(valid_errors[:epochs], color='b', label='validation')
            plt.axis([0, i, 0., 1.])
            plt.draw()
            plt.pause(0.001)

    plt.ioff()

    t_elapsed = time.time() - t_start

    # Calculate new simple accuracy from final error
    accuracy = 1 - train_error

    # Save model to file
    model = util.save_model(sess, weights, biases, neurons, train_errors,
                            os.path.join(model_dir, model_name + "_model"))

    print('\n[ANN] Training Completed:')

    # Calculate number of minutes, seconds and milliseconds elapsed
    t_m = t_elapsed / 60
    t_s = t_elapsed % 60
    t_ms = (t_s % 1) * 1000

    print('[ANN]\tModel name: {0}'.format(model_name))
    print('[ANN]\tSimple model accuracy: {0:.3f}%'.format(accuracy * 100))
    print('[ANN]\tTime elapsed: {0:2d}m {1:2d}s {2:3d}ms'.format(
        int(t_m), int(t_s), int(t_ms)))

    return model, model_name, {
        'num_layers': len(weights),
        'layer_width': neurons,
        'learning_rate': learning_rate,
        'time_to_train': t_elapsed,
        'train_errors': [float(err) for err in train_errors],
        'valid_errors': [float(err) for err in valid_errors]
    }
def get_bank_data():
    """Preprocess the bank data and return the result of splitting it.

    :return: whatever ``split_data`` returns for the preprocessed
        ``(features, response)`` pair
    """
    inputs, response = preprocess_bank_data()
    split = split_data(inputs, response)
    return split
import tensorflow as tf
import pandas as pd
from random import randint
from model import train_function
from util import split_data

# NOTE(review): `np` is used below but no numpy import is visible here -
# confirm it is imported elsewhere in the file.

# load data
images=np.load('/work/cse496dl/shared/homework/01/fmnist_train_data.npy')
labels=np.load('/work/cse496dl/shared/homework/01/fmnist_train_labels.npy')

#one hot encode labels: one row per label, one column per class index
# (0 .. max label), with a 1 in the column of each label
labels_oh = np.zeros((labels.astype(int).size, labels.astype(int).max()+1))
labels_oh[np.arange(labels.size),labels.astype(int)] = 1

# split into train, validation and test (0.7 / 0.1 / 0.2); images and labels
# are split with the same arguments.
# presumably 123 is a random seed that keeps both splits aligned - verify
# against util.split_data
train_images, val_images, test_images = split_data(images, 0.7, 0.1, .2, 123)
train_labels, val_labels, test_labels = split_data(labels_oh, 0.7, 0.1, .2, 123)

#variables specification
filepath='/work/cse496dl/dmle/'
c=0
# candidate values iterated by the nested loops below
hiddenlayers=[5]
batchsize=[128]
learningrate=[0.001]
regularization=[tf.contrib.layers.l2_regularizer(scale=0.01)]
results=pd.DataFrame()
for h in hiddenlayers:
    for b in batchsize:
        for l in learningrate:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import util

# Loading and Cleaning
x, y = util.load_inputs_and_outputs("data/ENB2012_data.csv")
x_train, x_test, y_train, y_test = util.split_data(x, y)

# Feature selection and Visualisation (uses the training split only)
util.visualise(x_train, y_train)
util.spearman(x_train, y_train)

# Remove X8 from features as discussed in report
# (index 7 along axis 1, applied identically to train and test)
x_train = np.delete(x_train, 7, 1)
x_test = np.delete(x_test, 7, 1)

# Training and optimisation
util.plot_depth_accuracy(x_train, y_train)

# Evaluation: one depth-6 tree per output column, both with a fixed
# random_state
y1_model = DecisionTreeRegressor(max_depth=6, random_state=42)
y1_model.fit(x_train, y_train[:, 0])
y2_model = DecisionTreeRegressor(max_depth=6, random_state=42)
y2_model.fit(x_train, y_train[:, 1])
y1_test = y_test[:, 0]
def execute(data, training_data_ratio=2.0 / 3):
    """
    Execute the Naive Bayes classification

    The data is randomized and split; a normal model (mean and standard
    deviation) is fitted per feature and per class on the standardized
    training features, and every test sample is assigned the class with the
    highest value returned by ``compute_posterior``.

    Written so that it runs unchanged on Python 2 and Python 3.

    :param data: Dataframe containing training and test data
    :param training_data_ratio: value passed to ``util.split_data`` to size
        the training portion
    :return: ``BinaryClassifierMetric`` built from the true/false
        positive/negative counts on the test portion (class 1 is "spam")
    """
    spam_class_name = 1
    not_spam_class_name = 0

    # 2. Randomize the data.
    print("Randomizing Data")
    randomized_data = util.randomize_data(data)

    # 3. Split the data in for training and testing
    print("Splitting Data for Test and Training")
    training_data, test_data = util.split_data(randomized_data,
                                               training_data_ratio)

    # 4. Standardize Training Data (except for class labels)
    print("Standardizing Training Data")
    training_features, training_data_target = util.split_features_target(
        training_data)
    std_training_features, mean, std = util.standardize_data(
        training_features)

    # 5. Divides the training data into two groups: Spam samples, Non-Spam
    #    samples.
    target_groups = training_data_target.groupby(training_data_target)
    total_training_size = float(len(training_data))

    print("Computing probability of priors")
    data_class_probability = {
        class_name: len(target_group) / total_training_size
        for (class_name, target_group) in target_groups
    }

    # 6. Creates Normal models for each feature for each class.
    print("Creating normal models for each feature, for each class")
    models = {}
    for class_name, target_group in target_groups:
        models[class_name] = {}
        for feature_name in training_features.columns:
            # Standardized values of this feature for the rows of this class
            dataset = std_training_features.loc[
                target_group.index][feature_name]
            models[class_name][feature_name] = {
                "mean": dataset.mean(),
                "standard_deviation": dataset.std(),
            }

    # 7. Classify each testing sample using these models and choosing the
    #    class label based on which class probability is higher.
    print("Evaluating models for each test data point")
    test_features, test_targets = util.split_features_target(test_data)
    # Test features are standardized with the training mean/std
    std_test_features, _, _ = util.standardize_data(test_features, mean, std)

    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0

    for i in range(len(std_test_features)):
        probability_per_class = compute_posterior(models,
                                                  data_class_probability,
                                                  std_test_features.iloc[i])

        # Select the class label of the class with highest probability
        assigned_class = max(probability_per_class.items(),
                             key=operator.itemgetter(1))[0]
        expected_class = test_targets.iloc[i]

        # Tally up each of our counters for performance measurements
        if expected_class == spam_class_name:
            if assigned_class == spam_class_name:
                true_positives += 1
            else:  # assigned_class == not_spam_class_name
                false_negatives += 1
        else:  # expected_class == not_spam_class_name
            if assigned_class == not_spam_class_name:
                true_negatives += 1
            else:  # assigned_class == spam_class_name
                false_positives += 1

    # 8. Computes the statistics using the testing data results
    metrics = BinaryClassifierMetric(true_positives, false_positives,
                                     true_negatives, false_negatives)
    return metrics
mt5 = {'IA_2':'L_4', 'IA_4':'L_3'}

# Set the propensity value for each (user template, message template) pair:
# 0.99 for pair 4, 0.5 for the others.
b.set_user_inter_propensity(ut1, mt1, 0.5)
b.set_user_inter_propensity(ut2, mt2, 0.5)
b.set_user_inter_propensity(ut3, mt3, 0.5)
b.set_user_inter_propensity(ut4, mt4, 0.99)
b.set_user_inter_propensity(ut5, mt5, 0.5)

# 100 rows generated from each template pair, plus 2000 random rows.
rows = []
rows += ut.unzip(b.gen_random_rows_from_template(ut1, mt1, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut2, mt2, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut3, mt3, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut4, mt4, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut5, mt5, 100))
rows += ut.unzip(b.gen_random_rows(2000))

log = su.BasicLogger()
recorder = su.ScenarioRecorder()

# Split data into train, calibration, and test.
train, calibrate, test = ut.split_data(rows, 0.5, 0.25, 0.25)

# First element of every row (the user).  Tuple-parameter lambdas are a
# SyntaxError on Python 3; list(map(...)) yields a list on both 2 and 3.
calibration_users = list(map(lambda row: row[0], calibrate))
test_users = list(map(lambda row: row[0], test))

controls = su.build_std_control_solvers(calibrate, b, 100, 15)
treatments = su.build_std_knn_optims(train, calibrate, b, recorder, 1, 15)
solvers = controls + treatments

su.execute_trial(train, test_users, b, solvers, recorder, logger = log)
# Set the propensity value for each (user template, message template) pair:
# 0.99 for pair 4, 0.5 for the others.
b.set_user_inter_propensity(ut1, mt1, 0.5)
b.set_user_inter_propensity(ut2, mt2, 0.5)
b.set_user_inter_propensity(ut3, mt3, 0.5)
b.set_user_inter_propensity(ut4, mt4, 0.99)
b.set_user_inter_propensity(ut5, mt5, 0.5)

# 100 rows generated from each template pair, plus 1500 random rows.
rows = []
rows += ut.unzip(b.gen_random_rows_from_template(ut1, mt1, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut2, mt2, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut3, mt3, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut4, mt4, 100))
rows += ut.unzip(b.gen_random_rows_from_template(ut5, mt5, 100))
rows += ut.unzip(b.gen_random_rows(1500))

train, test = ut.split_data(rows, 0.95, 0.05)

# First element of every test row (the user).  Tuple-parameter lambdas are a
# SyntaxError on Python 3, and the result is consumed twice below, so build a
# real list instead of relying on map() returning one.
test_users = list(map(lambda row: row[0], test))

op = KNNOptimizer()
op.set_data_rows(train)
op.set_distance_f(hamming)

best_msgs = su.n_best_messages(test_users, b, 100, 15)
msgs = su.n_best_messages(test_users, b, 100, 100)

# Controls: always the first of best_msgs / random from msgs / random from
# best_msgs.  The user argument is ignored by all three.
ctrl_1 = lambda u: best_msgs[0]
ctrl_2 = lambda u: rd.sample(msgs, 1)[0]
ctrl_3 = lambda u: rd.sample(best_msgs, 1)[0]

# KNN solvers; the second argument of optimize is the value the names call k.
knn_k3_f1 = lambda u: op.optimize(u, 3, op.f1)
knn_k6_f1 = lambda u: op.optimize(u, 6, op.f1)
# NOTE(review): named k9 but passes 500 - confirm whether this is intended.
knn_k9_f1 = lambda u: op.optimize(u, 500, op.f1)
knn_k3_f2 = lambda u: op.optimize(u, 3, op.f2)