def train_one_epoch(model, param_dict, input_names, output_name, X_shuffled, Y_shuffled):
    batch_size = int(param_dict['batch_size'])
    l = len(X_shuffled)
    train_loss = 0
    train_acc = 0
    for i in range(0, l, batch_size):
        batch_end = min(i + batch_size, l)
        Xs = X_shuffled[i:batch_end]
        Ys = Y_shuffled[i:batch_end]
        batchsize = len(Xs)
        batch = gd.get_batch(Xs, param_dict)
        batch = da.augment_data(batch, param_dict, "train")
        # If single stream model, we have 1 input_name, otherwise 2
        fit_input = make_model_input_dict(input_names, batch)
        # Train on a single batch
        history = model.fit(fit_input, {output_name: Ys}, batch_size=batchsize)
        train_loss += float(history.history['loss'][0]) * (batchsize / l)
        train_acc += float(history.history['acc'][0]) * (batchsize / l)
    return model, train_loss, train_acc
def validate_one_epoch(model, param_dict, input_names, output_name, X_val, Y_val):
    X_val_data = gd.get_batch(X_val, param_dict)
    X_val_augmented = da.augment_data(X_val_data, param_dict, "val")
    # Evaluate on validation data
    print("Evaluating On Validation Data...")
    fit_input = make_model_input_dict(input_names, X_val_augmented)
    val_loss, val_acc = model.evaluate(fit_input, {output_name: Y_val})
    return val_loss, val_acc
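
# The two helpers above rely on make_model_input_dict to map the model's input layer
# names onto the (possibly multi-stream) batch. The sketch below is a hypothetical
# minimal version, assuming the batch is either a single array (one input name) or a
# list/tuple of per-stream arrays ordered like input_names; the real project may differ.
def make_model_input_dict(input_names, batch):
    """Build the dict Keras expects for model.fit/evaluate/predict."""
    if len(input_names) == 1:
        # Single-stream model: one input tensor.
        return {input_names[0]: batch}
    # Multi-stream model: pair each input name with its stream, in order.
    return {name: stream for name, stream in zip(input_names, batch)}
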
def test_augment_data(self):
    original_data = [
        np.random.rand(128, 3).tolist(),
        np.random.rand(66, 2).tolist(),
        np.random.rand(9, 1).tolist()
    ]
    original_label = ["data", "augmentation", "test"]
    augmented_data, augmented_label = augment_data(original_data, original_label)
    self.assertEqual(25 * len(original_data), len(augmented_data))
    self.assertIsInstance(augmented_data, list)
    self.assertEqual(25 * len(original_label), len(augmented_label))
    self.assertIsInstance(augmented_label, list)
    for i in range(len(original_label)):
        self.assertEqual(augmented_label[25 * i], original_label[i])
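
# The test above pins down the contract of augment_data: every (sequence, label) pair is
# expanded into 25 entries, with the original kept first so that augmented_label[25 * i]
# equals original_label[i]. A minimal sketch satisfying that contract; the 24 noise-jittered
# copies are an assumption, the real augmentation strategy may differ.
import copy
import numpy as np

def augment_data(data, labels, copies=25):
    augmented_data, augmented_labels = [], []
    for sample, label in zip(data, labels):
        augmented_data.append(copy.deepcopy(sample))  # keep the original sample first
        augmented_labels.append(label)
        arr = np.asarray(sample, dtype=float)
        for _ in range(copies - 1):
            jittered = arr + np.random.normal(scale=0.01, size=arr.shape)  # small noise
            augmented_data.append(jittered.tolist())
            augmented_labels.append(label)
    return augmented_data, augmented_labels
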
def get_data_file(self, data_path, data_type):
    """Get train, valid and test data from files."""
    data = []
    label = []
    with open(data_path, "r") as f:
        lines = f.readlines()
        for idx, line in enumerate(lines):  # pylint: disable=unused-variable
            dic = json.loads(line)
            data.append(dic[DATA_NAME])
            label.append(dic[LABEL_NAME])
    if data_type == "train":
        data, label = augment_data(data, label)
    length = len(label)
    print(data_type + "_data_length:" + str(length))
    return data, label, length
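
# get_data_file expects a JSON-Lines file: one JSON object per line, holding the record
# under the DATA_NAME key and its label under LABEL_NAME. The key names and values below
# are only illustrative, assuming DATA_NAME = "data" and LABEL_NAME = "label":
#
#   {"data": [0.12, 0.05, 0.33], "label": 1}
#   {"data": [0.91, 0.44, 0.27], "label": 0}
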
from pydash import flatten
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

from data.utils import load_data
from preprocessing import preprocess_data
from visualization import plot_learning_curves, get_errors_input
from metrics import custom_map_at_k
from feature_selection import get_features_extractor
from data_augmentation import augment_data

print('Augmenting training data set')
augment_data('train.csv', 'train_augmented.csv')

print('Loading training and testing set')
train_data = load_data('train_augmented.csv')
test_data = load_data('test.csv')

print('Preprocessing')
X_train, Y_train = preprocess_data(train_data)
X_test, Y_test = preprocess_data(test_data)

model_name = 'lr'
# print('Loading model')
# model = joblib.load('./models/' + model_name + '_classifier.pkl')

print('Fitting model')
model = Pipeline([
def main(dataset_dir):
    augment_both = True  # augment the RGB and target (edge_map) image at the same time
    augment_data(base_dir=dataset_dir, augment_both=augment_both, use_all_type=True)
def train(self,
          xdata, ydata, zdata, x_lengths, y_lengths,
          xdevdata, ydevdata, zdevdata, xdev_lengths, ydev_lengths,
          xxdata, yydata, zzdata, xx_lengths, yy_lengths,
          MAXITER):
    merged_sum = tf.summary.merge_all()
    # writer = tf.train.SummaryWriter("./logs/%s" % "modeldir", self.sess.graph_def)
    tf.initialize_all_variables().run()
    start_time = time.time()
    best_val_loss = 1e100
    best_val_acc = 0.0
    for ITER in range(MAXITER):
        total_acc = 0.0
        print('**************EPOCH****************\n', str(ITER))
        epoch_start_time = time.time()
        total_loss = 0
        # xdata, ydata, zdata, x_lengths, y_lengths = joint_shuffle(xdata, ydata, zdata, x_lengths, y_lengths)
        for i in range(0, len(xdata), self.batch_size):
            x, y, z, xlen, ylen = (xdata[i:i + self.batch_size],
                                   ydata[i:i + self.batch_size],
                                   zdata[i:i + self.batch_size],
                                   x_lengths[i:i + self.batch_size],
                                   y_lengths[i:i + self.batch_size])
            x, y, z, xlen, ylen = augment_data(x, y, z, xlen, ylen)
            feed_dict = {self.x: x,
                         self.y: y,
                         self.target: z,
                         self.x_length: xlen,
                         self.y_length: ylen,
                         self.is_training: 1,
                         self.dropout_keep_prob: 1}
            att, _, loss, acc, summ = self.sess.run(
                [self.att, self.optim, self.loss, self.acc, merged_sum],
                feed_dict=feed_dict)
            total_loss += loss
            total_acc += acc
        print("Epoch Time: ", time.time() - epoch_start_time)
        total_loss = total_loss / float(len(xdata))
        total_acc = total_acc / float(len(xdata) / self.batch_size)
        print("Loss", total_loss, "Accuracy On Training", total_acc)
        total_val_loss, total_val_acc = self.validate(
            xdevdata, ydevdata, zdevdata, xdev_lengths, ydev_lengths, ITER)
        if best_val_loss >= total_val_loss or best_val_acc <= total_val_acc:
            if best_val_loss >= total_val_loss:
                best_val_loss = total_val_loss
            if best_val_acc <= total_val_acc:
                best_val_acc = total_val_acc
            self.test(xxdata, yydata, zzdata, xx_lengths, yy_lengths, ITER)
    elapsed_time = time.time() - start_time
    print("Total Time", elapsed_time)
def main():
    """
    Main entry point. This file preprocesses the news articles, i.e. "cleans the data"
    in general terms. It applies the operations defined in transform_string: first it runs
    transform_string, then it computes the example outputs, then it removes the least
    frequent words, and finally it saves the resulting data.
    """
    # dfr = pd.read_csv("newsDatabaseComplete14.csv", header=0, index_col=0)
    # dfr = pd.read_csv("newsDatabaseComplete14_filtered.csv", header=0, index_col=0)
    dfr = pd.read_csv("newsDatabaseComplete14_filtered_mixed.csv", header=0, index_col=0)
    # dfr = pd.read_csv("newsDatabaseComplete14_filtered_augmented.csv", header=0, index_col=0)
    words_in_glove = read_embedd_vectors(0)  # change for different embedding
    supported_langs = ['en']
    classes = [-1.0, 0.0, 1.0]

    # eliminate examples without a class
    dfr.dropna(subset=['classes'], inplace=True)
    dfr.index = np.arange(dfr.shape[0])
    dftr, dfte = split_uniformly(dfr, 0.8, classes)

    # augment_data
    import data_augmentation
    dftr, n_perms = data_augmentation.augment_data(dftr)

    # implementation asking people for classes
    for i in range(dftr.shape[0]):
        lang = detect(dftr['content'][i])
        if lang in supported_langs:
            tmp = get_raw_data(dftr['title'][i], dftr['content'][i])
            dftr.loc[i, 'content'] = transform_string(tmp, words_in_glove, lang, dftr['source'][i])
        else:
            print('language: %s not supported. Notice id: %d' % (lang, i))
            dftr.loc[i, 'content'] = ''

    for i in range(dfte.shape[0]):
        lang = detect(dfte['content'][i])
        if lang in supported_langs:
            tmp = get_raw_data(dfte['title'][i], dfte['content'][i])
            dfte.loc[i, 'content'] = transform_string(tmp, words_in_glove, lang, dfte['source'][i])
        else:
            print('language: %s not supported. Notice id: %d' % (lang, i))
            dfte.loc[i, 'content'] = ''

    # train
    word_to_frecuency = get_word_to_frecuency(dftr['content'])
    # dfr = eliminate_less_frequent_words(dfr, 5, word_to_frecuency)
    dftr = eliminate_less_frequent_words(dftr, 5 * n_perms, word_to_frecuency)
    # eliminate empty strings from dataframe
    dftr['content'].replace('', np.nan, inplace=True)
    dftr.dropna(subset=['content'], inplace=True)
    dftr.index = np.arange(dftr.shape[0])
    # formatting problem with pytorch
    dftr['classes'].replace(1, 2, inplace=True)
    dftr['classes'].replace(0, 1, inplace=True)
    dftr['classes'].replace(-1, 0, inplace=True)
    # dfr.to_csv('data14Deps.csv')
    dftr.to_csv('data14Glove_train.csv')  # change for different embedding

    # test
    word_to_frecuency = get_word_to_frecuency(dfte['content'])
    dfte = eliminate_less_frequent_words(dfte, 5, word_to_frecuency)
    # eliminate empty strings from dataframe
    dfte['content'].replace('', np.nan, inplace=True)
    dfte.dropna(subset=['content'], inplace=True)
    dfte.index = np.arange(dfte.shape[0])
    # formatting problem with pytorch
    dfte['classes'].replace(1, 2, inplace=True)
    dfte['classes'].replace(0, 1, inplace=True)
    dfte['classes'].replace(-1, 0, inplace=True)
    dfte.to_csv('data14Glove_test.csv')  # change for different embedding

    vals = dftr.classes.value_counts()
    sns.barplot(x=[0, 1, 2], y=[vals[0], vals[1], vals[2]])
    plt.show()
weights_pre = model.get_weights()
model = tu.load_weights(model, param_dict, stream, False, "test")
weights_after = model.get_weights()
weight_names = [weight.name for layer in model.layers for weight in layer.weights]
tu.check_weights(weights_pre, weights_after, weight_names)

# If the frame at second 1 is used (time of shot in B3SD dataset), then
# we only get that particular frame for each video.
if sec1_frame:
    print("1sec_frame set, so choosing that frame for each video")
    result_file.write("1sec_frame set, so choosing that frame for each video\n")
    X = gd.get_batch(X, param_dict)
    augmented_X = da.augment_data(X, param_dict, "test")
    fit_input = tu.make_model_input_dict(input_names, augmented_X)
    predictions_categorical = model.predict(fit_input, verbose=1)
    print(predictions_categorical.shape)
    predictions = np.asarray([np.argmax(pred) for pred in predictions_categorical])
    print(predictions.shape)
    acc, correct_predictions = tpu.calculate_accuracy(predictions, Y, len(predictions))
# If sec1_frame is false, then we select x frames from each video for a
# more averaged guess for each video.
else:
    print("Augmenting each input " + str(repeats) + " times")
print('Selecting and assigning validation set...')
print('-' * 30)
train_indices, val_indices = train_test_split(np.arange(trainingFeatures.shape[0]),
                                              test_size=pm.validation_fraction)
valFeatures = trainingFeatures[val_indices]
trainingFeatures = trainingFeatures[train_indices]
valLabels = trainingLabels[val_indices]
trainingLabels = trainingLabels[train_indices]

# Augment the training data and add it to the training data set
if pm.data_augm:
    print('-' * 30)
    print('Augmenting data...')
    print('-' * 30)
    augm_trainingFeatures, augm_trainingLabels = \
        augment_data(trainingFeatures, trainingLabels,
                     pm.nb_augm_samples, pm.augm_transformations)
    trainingFeatures = np.concatenate(
        (trainingFeatures, augm_trainingFeatures), axis=0)
    trainingLabels = np.concatenate(
        (trainingLabels, augm_trainingLabels), axis=0)

# Run the main function
train_and_predict()

# Calculate and save the run time
end_time = datetime.now()
total_time = time_diff_format(start_time, end_time)
pm.time_list.append(total_time)

# Save the data to an xlsx-file and an image.
write_save_data()
initial_learning_rate = 0.001
cumulative_loss = 0.0
BATCH_SIZE = 20
TRAIN_SIZE = 60000
NUM_EPOCHS = 100
EPOCH_SIZE = TRAIN_SIZE / BATCH_SIZE
num_iterations = int(NUM_EPOCHS * EPOCH_SIZE)

# Run training loop
with sess.as_default():
    for i in range(1, num_iterations + 1):
        current_learning_rate = initial_learning_rate * (1.0 - i / (num_iterations + 5))
        batch = mnist_data.train.next_batch(BATCH_SIZE)
        train, train_labels = da.augment_data(batch[0], batch[1],
                                              use_random_zoom=False,
                                              use_random_shift=False)
        _, loss_val = sess.run(
            [train_step, loss],
            feed_dict={
                img: train,
                labels: train_labels,
                is_train: True,
                lr: current_learning_rate
            })
        cumulative_loss = cumulative_loss + loss_val
        if i % EPOCH_SIZE == 0:
            print(str(cumulative_loss / EPOCH_SIZE))
            cumulative_loss = 0.0
        if i % (10 * EPOCH_SIZE) == 0:
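
# A hypothetical sketch of the da.augment_data helper called above: it perturbs a flat batch
# of MNIST images (shape [N, 784]) and passes the labels through unchanged. Zoom and shift
# are both switched off in the call, so this sketch only implements the small random
# rotation assumed to be the always-on default; the real module may do more.
import numpy as np
from scipy import ndimage

def augment_data(images, labels, use_random_zoom=True, use_random_shift=True):
    augmented = np.empty_like(images)
    for n, flat in enumerate(images):
        img = np.asarray(flat, dtype=np.float32).reshape(28, 28)
        # Small random rotation around the centre, keeping the 28x28 shape.
        img = ndimage.rotate(img, np.random.uniform(-10.0, 10.0), reshape=False, order=1)
        if use_random_shift:
            # Random translation of up to +/-2 pixels in each direction.
            img = ndimage.shift(img, np.random.uniform(-2.0, 2.0, size=2), order=1)
        if use_random_zoom:
            # Zooming would change the image size; omitted here since the call above
            # disables it, and a real implementation would crop/pad back to 28x28.
            pass
        augmented[n] = img.reshape(-1)
    return augmented, labels
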
# initialize all global variables, which includes weights and biases
sess.run(tf.global_variables_initializer())

# training start
for epoch in range(0, NUM_EPOCHS):
    total_cost = 0
    for i in range(0, int(NUM_EXAMPLES / BATCH_SIZE)):
        batch_x = get_batch(dataset_train_features, i, BATCH_SIZE)  # get batch of features of size BATCH_SIZE
        batch_y = get_batch(dataset_train_labels, i, BATCH_SIZE)  # get batch of labels of size BATCH_SIZE
        batch_x, batch_y = augment_data(batch_x, batch_y, augmentation_factor=1)  # augment the data
        _, batch_cost = sess.run(
            [training, loss],
            feed_dict={
                x: batch_x,
                y: batch_y
            })  # train on the given batch of features and labels
        total_cost += batch_cost
        if i % 25 == 0:
            print(i)
    print("Epoch:", epoch, "\tCost:", total_cost)

    # predict validation accuracy after every epoch
    sum_accuracy_validation = 0.0
    sum_i = 0
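
# The get_batch helper used above is not shown; this is a minimal sketch, assuming the
# dataset is a NumPy array (or list) indexed by example and that i is the batch index
# rather than an element offset.
def get_batch(dataset, batch_index, batch_size):
    """Return the batch_index-th slice of batch_size consecutive examples."""
    start = batch_index * batch_size
    return dataset[start:start + batch_size]
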
weights_pre = model_part1.get_weights()
model_part1 = tu.load_weights(model_part1, param_dict_part1, stream_part1, True, "test")
weights_after = model_part1.get_weights()
weight_names = [weight.name for layer in model_part1.layers for weight in layer.weights]
tu.check_weights(weights_pre, weights_after, weight_names)

# If the frame at second 1 is used (time of shot in B3SD dataset), then
# we only get that particular frame for each video.
print("1sec_frame set, so choosing that frame for each video")
result_file.write("1sec_frame set, so choosing that frame for each video\n")
X2_data = gd.get_batch(X2, param_dict_part1)
augmented_X2 = da.augment_data(X2_data, param_dict_part1, "test")
fit_input2 = tu.make_model_input_dict(input_part1, augmented_X2)
predictions_categorical2 = model_part1.predict(fit_input2, verbose=1)
print(predictions_categorical2.shape)
predictions2 = np.asarray([np.argmax(pred) for pred in predictions_categorical2])
print(predictions2.shape)
acc2, correct_predictions2 = tpu.calculate_accuracy(predictions2, Y2, len(predictions2))

correct = 0
ones = 0
actual_shots = 0
actual_no_shots = 0