def get_error(self, data_set: DataSet):
    """Return the error of this model over ``data_set``.

    Each ``(example, expected)`` pair yielded by ``data_set.get_data()`` is
    fed through ``self.run``. The squared differences between the output and
    the expected value are summed, and the square root of that sum is divided
    by the number of examples.
    """
    # Accumulate the squared difference between actual and expected values.
    total_squared_diff = sum(
        (self.run(example) - target) ** 2
        for example, target in data_set.get_data()
    )
    return math.sqrt(total_squared_diff) / len(data_set.get_data())
def mds_and_plot(model):
    """Embed the ``dense_1`` outputs of ``model`` with MDS and scatter-plot them.

    The 'train' frames are loaded from a fresh ``DataSet``, pushed through a
    sub-model that ends at the ``dense_1`` layer, and fitted with ``MDS``.
    Every sample is drawn with the marker whose position matches the index of
    the first ``1`` in its label row; that index is also printed.
    """
    dataset = DataSet()
    x, y, _ = dataset.get_test_frames('train')

    # Truncate the network at 'dense_1' so predict() yields its activations.
    feature_model = Model(inputs=model.input,
                          outputs=model.get_layer('dense_1').output)
    activations = feature_model.predict(x)

    reducer = MDS()
    reducer.fit(activations)
    embedding = reducer.embedding_

    markers = ['or', 'ob', 'og', 'oy', 'ok', '+r', 'sr', 'dr', '<r', 'pr']
    for row, label_row in enumerate(y):
        # Position of the first entry equal to 1 (the row length if none is).
        marker_index = 0
        for flag in label_row:
            if flag == 1:
                break
            marker_index += 1
        plt.plot([embedding[row:row + 1, 0]], [embedding[row:row + 1, 1]],
                 markers[marker_index], markersize=5)
        print(marker_index)
    plt.show()
def get_accuracy(self, data_set: DataSet):
    """Return the fraction of examples in ``data_set`` classified correctly.

    An example counts as correct when ``self.run(example)`` compares equal to
    its expected class.
    """
    # Count the examples whose output matches the expected class.
    hits = sum(
        1
        for example, target in data_set.get_data()
        if self.run(example) == target
    )
    # Normalise by the total number of examples.
    return hits / len(data_set.get_data())
def main(input_train, input_test, output_train, output_test):
    """Load the raw train and test sets and pre-process each of them.

    A ``DataSet`` is built from ``input_train`` and ``input_test``; its train
    and test frames are then handed to ``TitanicPreProcessing`` together with
    ``output_train`` and ``output_test`` respectively.
    """
    log = logging.getLogger(__name__)
    log.info('making final data set from raw data')

    raw = DataSet(input_train, input_test)
    # Fetch both frames first, then pre-process them in train/test order.
    frames = (raw.get_train_set(), raw.get_test_set())
    for frame, destination in zip(frames, (output_train, output_test)):
        TitanicPreProcessing(frame, destination)
def test_rnn(src, model):
    """Evaluate ``model`` on the 'train' frames of the data set at ``src``.

    Prints, in order: the elapsed prediction time in seconds, the micro- and
    macro-averaged precision, the micro- and macro-averaged recall, and the
    weighted F1 score.

    Args:
        src: Argument forwarded to the ``DataSet`` constructor.
        model: Object whose ``predict(x)`` returns an array of scores that
            supports boolean-mask assignment.
    """
    data = DataSet(src)
    x, y, _ = data.get_test_frames('train')

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    start = time.perf_counter()
    y_pred = model.predict(x)
    print(time.perf_counter() - start)

    # Binarise the scores in place: below 0.7 -> 0, otherwise -> 1.
    y_pred[y_pred < 0.7] = 0
    y_pred[y_pred >= 0.7] = 1

    print(metrics.precision_score(y, y_pred, average='micro', zero_division=0))
    print(metrics.precision_score(y, y_pred, average='macro', zero_division=0))
    print(metrics.recall_score(y, y_pred, average='micro', zero_division=0))
    print(metrics.recall_score(y, y_pred, average='macro', zero_division=0))
    print(metrics.f1_score(y, y_pred, average='weighted', zero_division=0))
def main(input_data, output_model):
    """Tune a model on the training data and save it.

    The train set is loaded through ``DataSet(train_dir=input_data)``. The
    entries at index 4 of ``models`` and ``params`` are passed to
    ``Model.tune``, and the result is saved to ``output_model`` followed by
    the model's ``name``.
    """
    log = logging.getLogger(__name__)
    log.info('training model')

    dataset = DataSet(train_dir=input_data)
    train_frame = dataset.get_train_set()
    features = dataset.get_features(train_frame)
    labels = dataset.get_label(train_frame)

    # The same index selects both the estimator and its parameter grid.
    selected = 4
    tuned = Model.tune(models[selected], features, labels, params[selected])
    tuned.save(output_model + tuned.name)
def main(input_filepath, output_filepath):
    """Clean the raw train and test CSV files and hand them on for storage.

    Reads ``train.csv`` and ``test.csv`` under ``input_filepath``, applies
    ``DataWrangling.apply_preprocessing`` (with ``target='Survived'``) to
    both, and passes the results to ``processed_train_data`` and
    ``processed_test_data`` of a wrangler configured with
    ``train_clean.csv`` and ``test_clean.csv`` under ``output_filepath``.
    """
    log = logging.getLogger(__name__)
    log.info('making final data set from raw data')

    raw = DataSet(
        train_dir=input_filepath + '/train.csv',
        test_dir=input_filepath + '/test.csv',
    )
    wrangler = DataWrangling(
        train_dir=output_filepath + '/train_clean.csv',
        test_dir=output_filepath + '/test_clean.csv',
    )

    train_raw = raw.get_train_set()
    test_raw = raw.get_test_set()

    train_clean = wrangler.apply_preprocessing(train_raw, target='Survived')
    test_clean = wrangler.apply_preprocessing(test_raw, target='Survived')

    wrangler.processed_train_data(train_clean)
    wrangler.processed_test_data(test_clean)
def extract_features():
    """Extract and store a feature sequence for every train and test video.

    For each video listed in ``data.data['train']`` and ``data.data['test']``
    the frame paths are run through the extractor, and the resulting list of
    features is saved with ``np.save`` as ``<video>-features.npy`` inside
    ``PROCESSED_SEQUENCES_DATA_DIR``. Videos whose ``.npy`` file already
    exists are skipped, so an interrupted run can be resumed.
    """
    data = DataSet()
    model = Extractor(SAVED_CNN_EXTRACTOR_MODEL)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(PROCESSED_SEQUENCES_DATA_DIR, exist_ok=True)

    for folder in ('train', 'test'):
        print(f'Extracting features from {folder} videos...')
        video_filenames = list(data.data[folder].keys())

        # One progress bar per folder; the context manager closes it even
        # when extraction raises part-way through.
        with tqdm(total=len(video_filenames)) as pbar:
            for video_filename in video_filenames:
                # np.save appends '.npy' to this path on its own.
                path = os.path.join(PROCESSED_SEQUENCES_DATA_DIR,
                                    video_filename + '-features')

                # Only do the work when the sequence is not stored yet.
                if not os.path.isfile(path + '.npy'):
                    frames = data.get_frames_paths(folder, video_filename)
                    sequence = [model.extract(image) for image in frames]
                    np.save(path, sequence)

                pbar.update(1)
def get_generators():
    """Return the ``(train, validation)`` sequence generators.

    Both generators share one ``DataSet`` and are created with a batch size
    of 256 and shuffling enabled; the validation generator reads the "test"
    split.
    """
    dataset = DataSet()
    return tuple(
        SequenceDataGenerator(dataset, split, batch_size=256, shuffle=True)
        for split in ("train", "test")
    )
def main(input_train, input_test, input_model, output_prediction):
    """Predict outcomes for the test set and write them to a CSV file.

    The model is loaded from ``input_model + 'XGBClassifier'``. Its
    predictions are paired with the ``PassengerId`` column of the test set
    and written, without the index, to
    ``output_prediction + 'submission_<model name>.csv'``.
    """
    log = logging.getLogger(__name__)
    log.info('predicting outcomes')

    dataset = DataSet(train_dir=input_train, test_dir=input_test)
    test_frame = dataset.get_test_set()
    features = dataset.get_features(test_frame)

    classifier = Model.load(input_model + 'XGBClassifier')
    predictions = classifier.predict(features)

    submission = pd.DataFrame({
        'PassengerId': test_frame['PassengerId'],
        'Survived': predictions,
    })
    destination = output_prediction + 'submission_{}.csv'.format(classifier.name)
    submission.to_csv(destination, index=False)
def train(batch_size, nb_epoch, data_type, seq_len, categories, feature_len, saved_model=None):
    """Fit the pre-train model on the 'train' generator, validating on 'valid'.

    Three callbacks are attached: a ``ModelCheckpoint`` writing ``v1.hdf5``
    with ``save_best_only=True``, a ``TensorBoard`` logger, and a
    ``CSVLogger`` whose file name carries the current timestamp.

    Args:
        batch_size: Batch size of both generators; also the divisor of the
            step counts.
        nb_epoch: Number of epochs passed to ``fit_generator``.
        data_type, seq_len, feature_len, saved_model: Forwarded to ``MLModel``.
        categories: Only its length is used; it is forwarded to ``MLModel``.
    """
    # Callback: checkpoint file, always written under the same name.
    checkpoint_path = os.path.join(util.OUTPUT_CHECKPOINT_FOLDER, 'v1.hdf5')
    checkpointer = ModelCheckpoint(
        filepath=checkpoint_path,
        verbose=1,
        save_best_only=True)

    # Callback: TensorBoard logs.
    tensorboard = TensorBoard(log_dir=util.OUTPUT_LOG)

    # Callback: per-run CSV log, made unique by the timestamp.
    log_name = 'training-' + str(time.time()) + '.log'
    csv_logger = CSVLogger(os.path.join(util.OUTPUT_LOG, log_name))

    data = DataSet(util.SCRIPT_EXTRACT_SEQ_SPLIT_PATH)

    # True division: the step counts may be fractional.
    steps_per_epoch = data.len_data() / batch_size

    train_batches = data.frame_generator(batch_size, 'train')
    valid_batches = data.frame_generator(batch_size, 'valid')

    # Build the network and train it.
    network = MLModel(len(categories), data_type, seq_len, saved_model, feature_len)
    pre_trained = network.create_pre_train_model()
    pre_trained.model.fit_generator(
        generator=train_batches,
        steps_per_epoch=steps_per_epoch,
        epochs=nb_epoch,
        verbose=1,
        callbacks=[tensorboard, csv_logger, checkpointer],
        validation_data=valid_batches,
        validation_steps=200 / batch_size)
def _extract_private_test_video_filenames():
    """Return the processed-frame entries that are in neither data split.

    Entries of ``PROCESSED_FRAMES_DATA_DIR`` that appear as keys of the
    'train' or 'test' split are removed, and so are names starting with a
    dot. The result is a list in no particular order.
    """
    on_disk = set(os.listdir(PROCESSED_FRAMES_DATA_DIR))

    dataset = DataSet()
    known = set(dataset.data['train'].keys()) | set(dataset.data['test'].keys())

    # Drop hidden entries (leading '.') from what remains.
    return [name for name in on_disk - known if name[0] != '.']
def test_get_num_frames(self):
    """The extracted frame count must equal the number of 'test' targets."""
    video_filename = "79-30-960x720"
    self.assertEqual(
        len(DataSet.get_targets('test', video_filename)),
        VideoHelper._extract_num_frames(video_filename),
    )
"""
Create 2-D histograms of different subsets of the Aff-Wild2 dataset.

By choosing 'balancing_mode' we can visualize the whole dataset or a
downsampled subset.
By choosing 'train_test' we can visualize the train or the test samples of
the dataset.
"""
# NOTE: hist2d, cm, np, colorbar and savefig are not imported explicitly in
# this script; presumably they all come from the pylab star import below.
from pylab import *

from src.data import DataSet

dataset = DataSet()

# Alternative selections, kept for reference:
# valences,arousals = dataset.get_val_ar(balancing_mode='balanced',max_mode='mean')
valences, arousals = dataset.get_val_ar(balancing_mode='all',
                                        train_test='test')
# valences,arousals = dataset.get_val_ar(mode='neg_ar_pos_val')

# Print how many samples went into the plot.
print(len(valences))
print(len(arousals))

# 40 bins along each axis, drawn with the jet colour map.
res_hist = hist2d(valences, arousals, bins=40, cmap=cm.jet)
# First element of the hist2d result divided by the sample count, and its
# total. Neither value is used again in the visible script.
density = res_hist[0] / len(valences)
s = np.sum(density)
colorbar().ax.tick_params(axis='y', direction='out')

# Output targets used for the other selections above:
# savefig("/Users/denisrangulov/Google Drive/EmotionRecognition/figures/b_mean_train_frames.png", bbox_inches='tight')
# savefig("/Users/denisrangulov/Google Drive/EmotionRecognition/figures/train_neg_400_train_frames.png", bbox_inches='tight')
# savefig("/Users/denisrangulov/Google Drive/EmotionRecognition/figures/train_neg_ar_pos_val_400_train_frames.png", bbox_inches='tight')
# NOTE(review): the destination is a hard-coded absolute path on one machine.
savefig(
    "/Users/denisrangulov/Google Drive/EmotionRecognition/figures/all_test_frames.png",
    bbox_inches='tight')
def get_main_params(train_test):
    """Return ``(list_IDs, targets, train_test)`` for the requested split.

    The IDs and targets come from ``DataSet().get_partition`` called with
    ``balanced=True``; ``train_test`` is passed back unchanged.
    """
    ids, targets = DataSet().get_partition(train_test, balanced=True)
    return ids, targets, train_test
vbar = tqdm(total=len(private_test_video_filenames)) for video_filename in private_test_video_filenames: prediction_path = os.path.join(PREDICTIONS, video_filename + '.txt') # Check if we already have it. if os.path.isfile(prediction_path + '.txt'): vbar.update(1) continue num_frames = video_helper.get_num_frames(video_filename) predictions = np.full((num_frames, 2), -5, dtype=np.float32) sequence = [] fbar = tqdm(total=num_frames) for frame_idx in range(num_frames): frame_path = DataSet.get_frame_path(video_filename, frame_idx) if os.path.isfile(frame_path): feature_vector = cnn_extractor_model.extract(frame_path) sequence.append(feature_vector) elif len(sequence) > 0: # Uncomment to predict first less than 'RNN_WINDOW_SIZE' frames using CNN # num_cnn_predictions = min(len(sequence), RNN_WINDOW_SIZE) # x = np.asarray(sequence[:num_cnn_predictions]) # prediction = cnn_extractor_model.predict(x) # predictions[frame_idx - len(sequence):frame_idx - len(sequence) + len(prediction)] = prediction if len(sequence) > RNN_WINDOW_SIZE: x = prepare_sequence_for_rnn(sequence) prediction = rnn_model.predict(x) predictions[frame_idx - len(prediction):frame_idx] = prediction sequence = [] if frame_idx == num_frames - 1 and len(sequence) > 0:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from src import metrics
from src.config import PREDICTIONS
from src.data import DataSet

# Score the stored predictions of one video against its 'test' targets.
video_filename = "79-30-960x720"
path = os.path.join(PREDICTIONS, video_filename + '.txt')

pred_df = pd.read_csv(path, sep=",")

# Rows whose valence is -5 are treated as missing: blank them out, fill the
# gaps by linear interpolation, then forward/backward fill whatever is left
# at the edges.
pred_df[pred_df['valence'] == -5] = np.nan
pred_df = pred_df.interpolate(method='linear', axis=0).ffill().bfill()

pred = pred_df[['valence', 'arousal']].values
true = DataSet.get_targets('test', video_filename)

# Compare only the common prefix when the two lengths differ.
r = min(len(pred), len(true))
pred = tf.convert_to_tensor(pred[:r], np.float32)
true = tf.convert_to_tensor(true[:r], np.float32)

ccc_v = metrics.ccc_v(true, pred)
ccc_a = metrics.ccc_a(true, pred)
print(f'ccc_v: {ccc_v}, ccc_a: {ccc_a}')