def xval(self, datafp):
    """Run a Bayesian hyper-parameter search driven by ``self._run_xval``.

    The dataset is loaded twice, without and with ``profile``, and both
    variants are stored on the instance (presumably for use by the
    objective ``self._run_xval``; confirm against that method).  The search
    uses ``fmin`` with ``tpe.suggest`` for ``self.evals`` evaluations, after
    which the ``Trials`` object is pickled to ``{self.basename}.pkl`` and
    the best parameters and best trial result are logged.

    Keyword Arguments
    :param datafp: the filepath of the CSV dataset, joined onto ``self.path``
    """
    # The banner describes the search that is actually run below (TPE via
    # ``fmin`` bounded by ``self.evals``), matching the sibling ``fixed``.
    logger.info(f'BAYESIAN SEARCH on the model "{self.model}" '
                f'for {self.evals} parameter combinations '
                f'with {self.repeat} repetition')

    datafp = os.path.join(self.path, datafp)
    self.x_arys, self.y_arys = load_data(
        datafp, target='all', profile=False)
    self.x_arys_p, self.y_arys_p = load_data(
        datafp, target='all', profile=True)

    trials = Trials()
    best = fmin(self._run_xval,
                space=get_param_dists(self.model),
                algo=tpe.suggest,
                max_evals=self.evals,
                trials=trials,
                verbose=1)

    # Persist the full search history, not just the winner.
    with open(f'{self.basename}.pkl', 'wb') as f:
        pickle.dump(trials, f)

    logger.info(f'the best parameters are: {best}')
    logger.info(f'the f1 of which is: {trials.best_trial["result"]}')
def fixedf(self, trainfp, testfp):
    """Train and score ``self.model`` on a fixed train/test split.

    Both CSV files are loaded once with ``self.target`` and
    ``self.profile``; ``run_fixed_fast`` is then called ``self.repeat``
    times and the collected macro-F scores are summarised.

    Keyword Arguments
    :param trainfp: the filepath of the CSV training dataset,
        joined onto ``self.path``
    :param testfp: the filepath of the CSV testing dataset,
        joined onto ``self.path``
    :return: tuple of (average, standard deviation) of the macro-F scores
    """
    logger.info(f'training {self.model} for {self.repeat} iterations')

    train_path = os.path.join(self.path, trainfp)
    test_path = os.path.join(self.path, testfp)
    scores = []

    x_train, y_train = load_data(
        train_path, target=self.target, profile=self.profile)
    x_test, y_test = load_data(
        test_path, target=self.target, profile=self.profile)

    for run_no in range(1, self.repeat + 1):
        # Only announce individual runs when there is more than one.
        if self.repeat > 1:
            logger.info(f'iteration: {run_no}')
        scores.append(
            run_fixed_fast(self.model,
                           x_train, y_train,
                           x_test, y_test,
                           self.wvfp, self.profile,
                           params=self.params))

    mean_score = np.average(scores)
    score_std = np.std(scores)
    logger.info(f'total iterations: {self.repeat}; '
                f'fmacro average: {mean_score:.4}; '
                f'fmacro std dev: {score_std:.4}')
    return mean_score, score_std
def fixed(self, trainfp, testfp):
    """Run a Bayesian hyper-parameter search on a fixed train/test split.

    The training and testing sets are each loaded without and with
    ``profile`` and stored on the instance before ``fmin`` is run with
    ``self._run_fixed`` as the objective.  The resulting ``Trials`` object
    is pickled to ``{self.basename}.pkl`` and the best parameters and best
    trial result are logged.

    Keyword Arguments
    :param trainfp: the filepath of the CSV training dataset,
        joined onto ``self.path``
    :param testfp: the filepath of the CSV testing dataset,
        joined onto ``self.path``
    """
    logger.info(f'BAYESIAN SEARCH on the model "{self.model}" '
                f'for {self.evals} parameter combinations '
                f'with {self.repeat} repetition')

    train_path = os.path.join(self.path, trainfp)
    test_path = os.path.join(self.path, testfp)

    # Training data: plain first, then the profile variant.
    self.x_train_arys, self.y_train_arys = load_data(
        train_path, target='all', profile=False)
    self.x_train_arys_p, self.y_train_arys_p = load_data(
        train_path, target='all', profile=True)

    # Testing data, in the same order.
    self.x_test_arys, self.y_test_arys = load_data(
        test_path, target='all', profile=False)
    self.x_test_arys_p, self.y_test_arys_p = load_data(
        test_path, target='all', profile=True)

    history = Trials()
    best_params = fmin(self._run_fixed,
                       space=get_param_dists(self.model),
                       algo=tpe.suggest,
                       max_evals=self.evals,
                       trials=history,
                       verbose=1)

    # Keep the whole search history on disk.
    with open(f'{self.basename}.pkl', 'wb') as pkl_file:
        pickle.dump(history, pkl_file)

    logger.info(f'the best parameters are: {best_params}')
    logger.info(f'the f1 of which is: {history.best_trial["result"]}')
def xval(self, datafp):
    """Cross-validate every parameter combination in ``self.param_grid``.

    The dataset is loaded once without and once with ``profile``.  For each
    combination not rejected by ``filter_useless_combs``, the ``'profile'``
    entry is popped from the combination to choose which variant is passed
    to ``run_xval``, and the outcome is handed to ``self._result2csvrow``.
    ``self._csvf`` is closed when the method exits, whether it finishes
    normally or an error propagates.

    Keyword Arguments
    :param datafp: the filepath of the CSV dataset, joined onto ``self.path``
    """
    if self.rand:
        searchname = f'RANDOM SEARCH on the model "{self.model}" '
    else:
        searchname = f'Exhaustive GRID SEARCH on the model "{self.model}" '
    logger.info(
        searchname +
        f'for {len(self.param_grid)} parameter combinations '
        f'with {self.repeat} repetition')

    try:
        datafp = os.path.join(self.path, datafp)
        x_arys, y_arys = load_data(
            datafp, target='all', profile=False)
        x_arys_p, y_arys_p = load_data(
            datafp, target='all', profile=True)

        for params in self.param_grid:
            if filter_useless_combs(params):
                continue
            logger.info(str(params))

            # 'profile' is removed from params and passed to run_xval as
            # its own argument; it also selects the matching data variant.
            profile = params.pop('profile', None)
            if profile:
                x_data, y_data = x_arys_p, y_arys_p
            else:
                x_data, y_data = x_arys, y_arys

            fmacro_list = run_xval(self.model, x_data, y_data,
                                   self.wvfp, profile,
                                   cv=self.cv, params=params)
            self._result2csvrow(params, fmacro_list)
    finally:
        # Always release the CSV handle, even when a run fails part-way,
        # so rows already written are not left in an unclosed file.
        self._csvf.close()
def xval(self, datafp, outfp):
    """Cross-validate ``self.model`` on one dataset and summarise the scores.

    The data is loaded with ``self.target`` and ``self.profile`` and passed
    to ``run_xval``, which returns the macro-F scores and a report frame.
    When ``outfp`` is truthy, the column means of that report are appended
    to it as one CSV row without header or index.

    Keyword Arguments
    :param datafp: the filepath of the CSV dataset, joined onto ``self.path``
    :param outfp: the filepath of the CSV file to append the averaged
        report to; nothing is written when it is falsy
    :return: tuple of (average, standard deviation) of the macro-F scores
    """
    logger.info(f'running {self.model} X-val for {self.repeat} iterations')

    dataset_path = os.path.join(self.path, datafp)
    features, labels = load_data(
        dataset_path, target=self.target, profile=self.profile)

    scores, report = run_xval(self.model, features, labels,
                              self.wvfp, self.profile,
                              cv=self.cv, params=self.params)

    mean_score = np.average(scores)
    score_std = np.std(scores)
    logger.info(f'total CV iterations: {self.cv}; '
                f'target-wise average: {mean_score:.4}; '
                f'target-wise std dev: {score_std:.4}')

    summary = (mean_score, score_std)
    if not outfp:
        return summary

    # Collapse the report to a single row holding each column's mean.
    report_means = pd.DataFrame(report.mean()).transpose()
    with open(outfp, "a") as out_file:
        report_means.to_csv(out_file, header=False, index=False)
    return summary
def fixed(self, trainfp, testfp):
    """Evaluate every combination in ``self.param_grid`` on a fixed split.

    The training and testing sets are each loaded without and with
    ``profile``.  For every combination not rejected by
    ``filter_useless_combs``, the ``'profile'`` entry is popped from the
    combination to select the data variant, ``run_fixed_fast`` is called
    ``self.repeat`` times, and the collected scores are handed to
    ``self._result2csvrow``.  ``self._csvf`` is closed when the method
    exits, whether it finishes normally or an error propagates.

    Keyword Arguments
    :param trainfp: the filepath of the CSV training dataset,
        joined onto ``self.path``
    :param testfp: the filepath of the CSV testing dataset,
        joined onto ``self.path``
    """
    logger.info(
        f'Exhaustive GRID SEARCH on the model "{self.model}" '
        f'for {len(self.param_grid)} parameter combinations '
        f'with {self.repeat} repetition')

    try:
        trainfp = os.path.join(self.path, trainfp)
        testfp = os.path.join(self.path, testfp)

        x_train_arys, y_train_arys = \
            load_data(trainfp, target='all', profile=False)
        x_train_arys_p, y_train_arys_p = \
            load_data(trainfp, target='all', profile=True)
        x_test_arys, y_test_arys = \
            load_data(testfp, target='all', profile=False)
        x_test_arys_p, y_test_arys_p = \
            load_data(testfp, target='all', profile=True)

        for params in self.param_grid:
            # TODO: error tolerance
            if filter_useless_combs(params):
                continue
            logger.info(str(params))

            # 'profile' is removed from params and passed on as its own
            # argument; it also selects the matching data variant.
            profile = params.pop('profile', None)
            if profile:
                data = (x_train_arys_p, y_train_arys_p,
                        x_test_arys_p, y_test_arys_p)
            else:
                data = (x_train_arys, y_train_arys,
                        x_test_arys, y_test_arys)

            fmacro_list: List[float] = []
            for i in range(self.repeat):
                if self.repeat > 1:
                    logger.info(f'iteration: {i+1}')
                fmacro = run_fixed_fast(self.model, *data,
                                        self.wvfp, profile,
                                        params=params)
                fmacro_list.append(fmacro)
            self._result2csvrow(params, fmacro_list)
    finally:
        # Always release the CSV handle, even when a run fails part-way,
        # so rows already written are not left in an unclosed file.
        self._csvf.close()
def train(self, trainfp, testfp, outfp=None, combined=False):
    """Train and score ``self.model`` on a fixed split, optionally saving a report.

    Both CSV files are loaded once with ``self.target`` and
    ``self.profile``; ``run_train`` is then called ``self.repeat`` times.
    The report frame returned by the last run receives a ``'combined'``
    column built from the average score and, when ``outfp`` is truthy, is
    appended to that file as CSV without header or index.

    Keyword Arguments
    :param trainfp: the filepath of the CSV training dataset,
        joined onto ``self.path``
    :param testfp: the filepath of the CSV testing dataset,
        joined onto ``self.path``
    :param outfp: the filepath to append the report to in CSV format
    :param combined: whether or not to combine different companies into
        the same train set; forwarded to ``run_train``
    :return: tuple of (average, standard deviation) of the macro-F scores
    """
    logger.info(f'training {self.model} for {self.repeat} iterations')

    train_path = os.path.join(self.path, trainfp)
    test_path = os.path.join(self.path, testfp)
    scores = []

    x_train, y_train = load_data(
        train_path, target=self.target, profile=self.profile)
    x_test, y_test = load_data(
        test_path, target=self.target, profile=self.profile)

    for run_no in range(1, self.repeat + 1):
        # Only announce individual runs when there is more than one.
        if self.repeat > 1:
            logger.info(f'iteration: {run_no}')
        score, report = run_train(self.model,
                                  x_train, y_train,
                                  x_test, y_test,
                                  self.wvfp, self.profile,
                                  params=self.params,
                                  combined=combined)
        scores.append(score)

    mean_score = np.average(scores)
    score_std = np.std(scores)
    logger.info(f'total iterations: {self.repeat}; '
                f'fmacro average: {mean_score:.4}; '
                f'fmacro std dev: {score_std:.4}')

    # Only the report of the final run survives the loop; it is tagged
    # with the average whether or not it ends up being written.
    report['combined'] = pd.Series(mean_score)
    if outfp:
        with open(outfp, "a") as out_file:
            report.to_csv(out_file, header=False, index=False)
    return mean_score, score_std
def main():
    """Train a ``DynamicClassifier`` on ``datasetC.txt`` and print its test accuracy.

    Data is loaded and split by ``load_data``, inputs are encoded with
    ``emb_encode`` and targets with ``one_hot_encode``.  The model is then
    trained inside a ``tf.Session`` for up to ``training_steps`` steps,
    printing loss and accuracy on step 1 and every ``display_step`` steps.
    A ``KeyboardInterrupt`` stops training early; accuracy on the first
    5000 test rows is printed afterwards.
    """
    # Symbol tables for the inputs and the class labels.
    # NOTE(review): the input alphabet reads "...qrtsu..." ("t" before
    # "s"); left as found, confirm whether that order is intended.
    vocabulary_input = "abcdefghijklmnopqrtsuvwxyz"
    vocabulary_target = "0123"

    # Training schedule.
    training_steps = 1000000
    display_step = 1000
    batch_size = 32

    # Model dimensions.
    vocab_size = 27
    input_embedded_size = 27
    output_classes = 4
    num_units = 1024
    num_layers = 2

    DROPOUT_KEEP_PROB_TRAIN = 1.0
    DROPOUT_KEEP_PROB_TEST = 1.0

    # Alternative datasets that were tried before:
    # "dataset_diff_lengths_test.txt", "dset.txt"
    file_name = "datasetC.txt"
    ratio = np.array([0.8, 0.9])
    columns = np.array([1, 2])

    # The validation split is returned by load_data but not used here.
    (X_train, X_train_lengths,
     _X_val, _X_val_lengths,
     X_test, X_test_lengths,
     y_train, _y_val, y_test,
     train_batch_sizes, test_batch_sizes) = load_data(
        file_name, ratio, columns, batch_size)

    X_train_emb = emb_encode(
        X_train, vocabulary_input, train_batch_sizes, batch_size)
    X_test_emb = emb_encode(
        X_test, vocabulary_input, test_batch_sizes, batch_size)

    y_train_one_hot = np.squeeze(
        one_hot_encode(y_train, vocabulary_target), axis=1)
    y_test_one_hot = np.squeeze(
        one_hot_encode(y_test, vocabulary_target), axis=1)

    model = DynamicClassifier(vocab_size, input_embedded_size,
                              output_classes, num_units, num_layers)

    def make_feed(inputs, lengths, targets, keep_prob):
        """Build the feed dict binding one batch to the model's inputs."""
        return {
            model.inputs: inputs,
            model.outputs: targets,
            model.keep_prob: keep_prob,
            model.sequence_length: lengths,
        }

    batch_index = 0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        try:
            tick = t.time()
            for step in range(1, training_steps + 1):
                lo = batch_size * batch_index
                hi = batch_size * (batch_index + 1)
                train_feed = make_feed(X_train_emb[lo:hi, :],
                                       X_train_lengths[lo:hi],
                                       y_train_one_hot[lo:hi, :],
                                       DROPOUT_KEEP_PROB_TRAIN)

                # Wrap around to the first batch.
                # NOTE(review): with this bound the index resets before
                # the last slice is ever fed; confirm that is intended.
                batch_index += 1
                if batch_index >= (X_train_emb.shape[0] / batch_size) - 1:
                    batch_index = 0

                err, _, acc = sess.run(
                    [model.error, model.train, model.accuracy],
                    feed_dict=train_feed)

                if step == 1 or step % display_step == 0:
                    # Re-evaluate loss and accuracy on the same batch,
                    # this time without running the training op.
                    err, acc = sess.run(
                        [model.error, model.accuracy],
                        feed_dict=train_feed)
                    tock = t.time()
                    status = ("Step " + str(step) + ", Minibatch Loss= " +
                              "{:.4f}".format(err) +
                              ", Training Accuracy= " +
                              "{:.3f}".format(acc))
                    print(status)
                    print("Epoch Time: ", tock - tick)
                    tick = t.time()
        except KeyboardInterrupt:
            print('training interrupted')

        # Final evaluation on (at most) the first 5000 test rows.
        test_feed = make_feed(X_test_emb[0:5000, :],
                              X_test_lengths[0:5000],
                              y_test_one_hot[0:5000, :],
                              DROPOUT_KEEP_PROB_TEST)
        acc = sess.run(model.accuracy, feed_dict=test_feed)
        print("Testing Accuracy:" + str(acc))