def get_data(class_type):
    file_list = os.listdir(f'./data/images/syn/{class_type}/true')
    if '.ipynb_checkpoints' in file_list:
        file_list.remove('.ipynb_checkpoints')
    true_paths = [f'./data/images/syn/{class_type}/true/{file}'
                  for file in file_list]
    true_labels = [1 for i in range(len(true_paths))]

    file_list = os.listdir(f'./data/images/syn/{class_type}/false')
    if '.ipynb_checkpoints' in file_list:
        file_list.remove('.ipynb_checkpoints')
    false_paths = [f'./data/images/syn/{class_type}/false/{file}'
                   for file in file_list]
    false_labels = [0 for i in range(len(false_paths))]

    labels = np.array(true_labels + false_labels)
    print(f'{class_type.upper()} Value Counts')
    print(pd.Series(labels).value_counts())
    paths = np.array(true_paths + false_paths)
    # labels = to_categorical(labels)
    if class_type == 'train':
        paths, labels = sklearn_shuffle(paths, labels)
    return paths, labels
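# The pattern used throughout these examples: sklearn_shuffle
# (sklearn.utils.shuffle) permutes any number of equal-length arrays with
# the same random permutation, so features and labels stay aligned. A
# minimal self-contained sketch with toy data (names are illustrative):
import numpy as np
from sklearn.utils import shuffle as sklearn_shuffle

paths = np.array(['a.png', 'b.png', 'c.png', 'd.png'])
labels = np.array([1, 1, 0, 0])
paths, labels = sklearn_shuffle(paths, labels, random_state=0)
# Each path still lines up with its original label, just in a new order.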
def __init__(self, encoder_inp_data, decoder_inp_data, target_data, batch_size):
    self.batch_size = batch_size

    # Shuffle the data
    shuffled_enc_inp, shuffled_dec_inp, shuffled_target = sklearn_shuffle(
        encoder_inp_data, decoder_inp_data, target_data)

    # 80% train, 20% test split
    self.train_length = len(shuffled_enc_inp) * 80 // 100
    # print("train_length : ", self.train_length)
    self.train_enc_inp = shuffled_enc_inp[:self.train_length]
    self.test_enc_inp = shuffled_enc_inp[self.train_length:]
    self.train_dec_inp = shuffled_dec_inp[:self.train_length]
    self.test_dec_inp = shuffled_dec_inp[self.train_length:]
    self.train_target = shuffled_target[:self.train_length]
    self.test_target = shuffled_target[self.train_length:]
    # self.sequence_length = self.create_sequence_length(self.train_enc_inp, self.batch_size)

    # Iterator state
    self.iter = 0
    self.test = 0
def train(
    self,
    training_trackers: List[TrackerWithCachedStates],
    domain: Domain,
    interpreter: NaturalLanguageInterpreter,
    **kwargs: Any,
) -> None:
    tracker_state_features, label_ids = self.featurize_for_training(
        training_trackers, domain, interpreter, **kwargs)
    training_data, zero_state_features = model_data_utils.convert_to_data_format(
        tracker_state_features)
    self.zero_state_features = zero_state_features

    self._train_params.update(kwargs)
    model = self.model_architecture(**self._train_params)
    score = None
    # Note: clone is called throughout to avoid mutating default arguments.
    self.label_encoder = clone(self.label_encoder).fit(label_ids)
    X = self._preprocess_data(training_data)
    y = self.label_encoder.transform(label_ids)

    if self.shuffle:
        X, y = sklearn_shuffle(X, y)

    if self.cv is None:
        model = clone(model).fit(X, y)
    else:
        param_grid = self.param_grid or {}
        model, score = self._search_and_score(model, X, y, param_grid)

    self.model = model
    logger.info("Done fitting sklearn policy model")
    if score is not None:
        logger.info(f"Cross validation score: {score:.5f}")
def __init__(self, datasets, labels, hdf5_file=None, batch_size=32,
             dim=(128, 128), shuffle=True, subset=None, random_seed=42):
    """
    Initialize generator.

    Parameters:
        datasets (list): List of file paths to the NPZ files containing the
            images created using :func:`generate_train_images`.
        labels (list): List of integer labels corresponding to the images
            in ``datasets``.
        hdf5_file (h5py.File): Opened HDF5 file containing images as datasets.
        batch_size (int, optional): Batch size.
        dim (tuple, optional): Image/2D array dimensions.
        shuffle (bool, optional): Shuffle data after every epoch?
        subset (str, optional): Either 'train' or 'valid' to select the
            corresponding split; ``None`` uses all data.
        random_seed (int, optional): Random seed for splitting and shuffling.
    """
    self.batch_size = batch_size
    self.filenames = datasets  # Yes, we know this is badly named!
    self.labels = labels  # For binary classification
    self.shuffle = shuffle  # Shuffles data after every epoch
    self.dim = dim  # Image/2D array dimensions
    # Fixed random seed ensures the train/validation split is the same every time:
    self.seed = random_seed

    # Number of unique labels:
    self.num_classes = len(np.unique(labels))

    # Open the HDF5 file in read-only mode:
    if isinstance(hdf5_file, h5py.File):
        self.hdf = hdf5_file
    else:
        self.hdf = h5py.File(hdf5_file, 'r')

    # Create list of indices and optionally shuffle them:
    self.indexes = np.arange(len(self.filenames), dtype=int)
    if shuffle:
        self.indexes = sklearn_shuffle(self.indexes, random_state=self.seed)

    # This is a hacky way to do it, but the only way under the current framework
    if subset is not None:
        train_indices, valid_indices = train_test_split(
            self.indexes,
            test_size=0.2,  # FIXME: Should this be allowed to change?
            stratify=labels,
            random_state=self.seed)
        if subset == 'train':
            self.indexes = train_indices
        elif subset == 'valid':
            self.indexes = valid_indices
        else:
            raise ValueError("subset keyword not set properly")
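# Why `random_seed` is fixed above: two generator instances (one with
# subset='train', one with subset='valid') must see disjoint, complementary
# index sets. A sketch of that usage, with `DataGenerator` as a hypothetical
# name for the class this __init__ belongs to (the real class name is not
# shown in this snippet):
#
#   train_gen = DataGenerator(files, labels, hdf5_file='images.h5',
#                             subset='train', random_seed=42)
#   valid_gen = DataGenerator(files, labels, hdf5_file='images.h5',
#                             subset='valid', random_seed=42)
#
# Because both calls pass the same seed to sklearn_shuffle and
# train_test_split, the two splits are reproducible and never overlap.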
def _extract_training_data(
    self, training_data: DialogueTrainingData
) -> Tuple[np.ndarray, np.ndarray]:
    # Transform y from one-hot encoding to class indices
    X, y = training_data.X, training_data.y.argmax(axis=-1)
    if self.shuffle:
        X, y = sklearn_shuffle(X, y)
    return X, y
def split_into_batches(self, X, y, size=16, shuffle=True):
    if shuffle:
        X, y = sklearn_shuffle(X, y, random_state=0)
    n, m = X.shape
    n_batches = ceil(n / size)
    X_b = [X[i * size:(i + 1) * size, :] for i in range(n_batches)]
    y_b = [y[i * size:(i + 1) * size] for i in range(n_batches)]
    return X_b, y_b
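# A self-contained version of the shuffle-then-batch pattern above, runnable
# as-is (toy data; the variable names are illustrative, not from the
# original code):
import numpy as np
from math import ceil
from sklearn.utils import shuffle as sklearn_shuffle

X = np.arange(20, dtype=float).reshape(10, 2)
y = np.arange(10)
X, y = sklearn_shuffle(X, y, random_state=0)  # keep rows and labels aligned
size = 4
n_batches = ceil(X.shape[0] / size)
X_b = [X[i * size:(i + 1) * size] for i in range(n_batches)]
y_b = [y[i * size:(i + 1) * size] for i in range(n_batches)]
assert sum(len(b) for b in y_b) == len(y)  # the last batch may be smaller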
def create_data_generators(train_dir, val_dir, image_shape, batch_size):
    train_datagen = ImageDataGenerator(rotation_range=30,
                                       width_shift_range=0.125,
                                       height_shift_range=0.125,
                                       depth_shift_range=0.125,
                                       zoom_range=0.125,
                                       horizontal_flip=True,
                                       vertical_flip=False,
                                       depth_flip=False)
    val_datagen = ImageDataGenerator()

    train_neg_files = list(get_data_files(os.path.join(train_dir, '0')))
    train_pos_files = list(get_data_files(os.path.join(train_dir, '1')))
    total = len(train_neg_files) + len(train_pos_files)
    X_train = np.zeros((total, *image_shape))
    for i, f in enumerate(train_neg_files + train_pos_files):
        patch = np.load(f)
        X_train[i, :, :, :] = patch
    y_train = np.hstack((np.zeros(len(train_neg_files)),
                         np.ones(len(train_pos_files)))).astype(bool)
    X_train, y_train = sklearn_shuffle(X_train, y_train)

    val_neg_files = list(get_data_files(os.path.join(val_dir, '0')))
    val_pos_files = list(get_data_files(os.path.join(val_dir, '1')))
    total = len(val_neg_files) + len(val_pos_files)
    X_val = np.zeros((total, *image_shape))
    for i, f in enumerate(val_neg_files + val_pos_files):
        patch = np.load(f)
        X_val[i, :, :, :] = patch
    y_val = np.hstack((np.zeros(len(val_neg_files)),
                       np.ones(len(val_pos_files)))).astype(bool)
    X_val, y_val = sklearn_shuffle(X_val, y_val)

    train_generator = train_datagen.flow(X_train, y_train, batch_size=batch_size)
    val_generator = val_datagen.flow(X_val, y_val, batch_size=batch_size)
    return train_generator, val_generator
def _get_iterator(self, subfolder, epochs, batch_size,
                  preprocessing_function, shuffle):
    filenames, labels = self._get_files_and_labels(subfolder)
    if shuffle:
        filenames, labels = sklearn_shuffle(filenames, labels)

    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
    dataset = dataset.repeat(epochs)
    dataset = dataset.map(_parse_function, num_parallel_calls=20)
    if preprocessing_function:
        dataset = dataset.map(preprocessing_function, num_parallel_calls=20)
    if batch_size:
        dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(10)
    iterator = dataset.make_initializable_iterator()
    return iterator
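# In the pipeline above, the (filenames, labels) pairs are shuffled once in
# NumPy before tf.data ever sees them. An in-pipeline alternative (assuming
# TF 1.x, as the initializable iterator implies) would be:
#
#   dataset = dataset.shuffle(buffer_size=len(filenames))
#
# Pre-shuffling with sklearn_shuffle keeps the filename/label pairing
# explicit in Python and makes the order easy to inspect or seed.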
def load_most_rated_movies(self, n_movies, shuffle=False):
    """
    Finds the n most rated movies.

    :param n_movies: number of movies
    :param shuffle: if True, the result is shuffled
    :return: the n most rated movies as a pandas DataFrame
    """
    movies_df = self.load_movies()
    if n_movies <= 0 or n_movies > len(movies_df):
        raise RuntimeError("Invalid number of movies requested")
    most_rated_movie_ids = self.__load_most_rated_movie_ids(n_movies)
    most_rated_movies = movies_df.loc[movies_df["movieId"].isin(
        most_rated_movie_ids)].set_index("movieId")
    if shuffle:
        most_rated_movies = sklearn_shuffle(most_rated_movies)
    return most_rated_movies
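# sklearn.utils.shuffle also accepts pandas objects and returns the same
# type, so the DataFrame's index ("movieId" above) travels with the rows.
# A toy demonstration (illustrative data only):
import pandas as pd
from sklearn.utils import shuffle as sklearn_shuffle

movies = pd.DataFrame({"title": ["A", "B", "C"]},
                      index=pd.Index([10, 20, 30], name="movieId"))
shuffled = sklearn_shuffle(movies, random_state=1)
assert set(shuffled.index) == {10, 20, 30}  # rows reordered, index intact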
def train(model, trainset_csr_pkl_path, labels_pkl_path=None,
          testset_csr_pkl_path=None, n_epoch=5, batch_size=256,
          train_set_percent=0.75, should_split_by_field=False,
          field_sizes_pkl_path=None, should_early_stop=True,
          early_stop_interval=10, batch_eval_interval=-1,
          should_dump_model=False, model_dump_path="", shuffle_trainset=True,
          eval_interval=1, train_log_path="", ctr_or_recommend=True,
          predict_batch_size=10000, min_rec_pred=1, max_rec_pred=5, **kwargs):
    util.log.log("Start to train model")
    util.log.log("Loading trainset and labels")
    if testset_csr_pkl_path is None:
        dataset = joblib.load(trainset_csr_pkl_path)
        labels = pd.read_csv(labels_pkl_path, header=None)
        train_set_size = int(train_set_percent * labels.shape[0])
        util.log.log("Start to split trainset and testset")
        if not isinstance(dataset, list):
            train_set = dataset[:train_set_size]
            test_set = dataset[train_set_size:]
        else:
            train_set = [field[:train_set_size] for field in dataset]
            test_set = [field[train_set_size:] for field in dataset]
        util.log.log("Start to split trainset and testset labels")
        train_labels = labels[:train_set_size]
        test_labels = labels[train_set_size:]
        if not ctr_or_recommend:
            train_labels = np.clip(train_labels, min_rec_pred, max_rec_pred)
            test_labels = np.clip(test_labels, min_rec_pred, max_rec_pred)
        train_data = (train_set, train_labels)
        test_data = (test_set, test_labels)
    else:
        train_data = joblib.load(trainset_csr_pkl_path)
        test_data = joblib.load(testset_csr_pkl_path)

    util.log.log("Handling field size")
    field_sizes = joblib.load(field_sizes_pkl_path) \
        if field_sizes_pkl_path is not None else None
    if field_sizes is not None:
        if should_split_by_field and not isinstance(train_data[0], list):
            field_idxs = util.preprocess.get_field_idxs_from_field_size(
                field_sizes)
            util.log.log("Splitting data by field")
            train_data = util.train.split_data_by_field(train_data, field_idxs)
            test_data = util.train.split_data_by_field(test_data, field_idxs)

    history_infos = []
    history_eval_scores = []
    best_eval_score = -1
    train_score = 999
    test_score = 999
    best_batch_eval_score = -1
    for i in range(n_epoch):
        util.log.log("Train in epoch %d" % i)
        fetches = [model.optimizer, model.loss]
        losses = []
        if batch_size > 0:
            losses = []
            inst_size = train_data[0].shape[0] \
                if not isinstance(train_data[0], list) \
                else train_data[0][0].shape[0]
            n_iter = inst_size // batch_size
            if n_iter * batch_size != inst_size:
                n_iter += 1
            if shuffle_trainset:
                # Shuffle the order in which batches are visited each epoch
                shuffle_idxs = sklearn_shuffle(np.arange(n_iter))
            for j in range(n_iter):
                if j % 10000 == 0:
                    util.log.log("Train in epoch %d iter %d" % (i, j))
                idx = j
                if shuffle_trainset:
                    idx = shuffle_idxs[j]
                X, y = util.train.slice(
                    train_data, idx * batch_size,
                    min(batch_size, inst_size - idx * batch_size))
                _, loss = model.run(fetches, X, y)
                if batch_eval_interval > 0 \
                        and j % batch_eval_interval == batch_eval_interval // 2:
                    train_preds = predict(model, train_data, predict_batch_size)
                    test_preds = predict(model, test_data, predict_batch_size)
                    if ctr_or_recommend:
                        train_score = roc_auc_score(train_data[1], train_preds)
                        test_score = roc_auc_score(test_data[1], test_preds)
                        if best_batch_eval_score == -1 \
                                or test_score < best_batch_eval_score:
                            best_batch_eval_score = test_score
                        train_loss = log_loss(train_data[1], train_preds)
                        test_loss = log_loss(test_data[1], test_preds)
                        msg = ("[%d-%d]\tavg-loss:%f\ttrain-auc:%f\teval-auc:%f"
                               "\ttrain-loss:%f\teval-loss:%f\tmin-eval-auc:%f"
                               % (i, j, np.mean(losses), train_score,
                                  test_score, train_loss, test_loss,
                                  best_batch_eval_score))
                        util.log.log(msg)
                        print(msg)
                    else:
                        train_preds = np.clip(train_preds, min_rec_pred,
                                              max_rec_pred)
                        test_preds = np.clip(test_preds, min_rec_pred,
                                             max_rec_pred)
                        train_score = np.sqrt(
                            mean_squared_error(train_data[1], train_preds))
                        test_score = np.sqrt(
                            mean_squared_error(test_data[1], test_preds))
                        if best_batch_eval_score == -1 \
                                or test_score < best_batch_eval_score:
                            best_batch_eval_score = test_score
                        msg = ("[%d-%d]\tavg-loss:%f\ttrain-rmse:%f"
                               "\teval-rmse:%f\tmin-eval-rmse:%f"
                               % (i, j, np.mean(losses), train_score,
                                  test_score, best_batch_eval_score))
                        util.log.log(msg)
                        print(msg)
                losses.append(loss)
        elif batch_size == -1:
            X, y = util.train.slice(train_data)
            _, loss = model.run(fetches, X, y)
            losses = [loss]

        if (i + 1) % eval_interval == 0:
            util.log.log("Evaluate in epoch %d" % i)
            train_preds = predict(model, train_data, predict_batch_size)
            util.log.log("Predict Test Set")
            test_preds = predict(model, test_data, predict_batch_size)
            util.log.log("Cal Evaluation")
            if ctr_or_recommend:
                train_score = roc_auc_score(train_data[1], train_preds)
                test_score = roc_auc_score(test_data[1], test_preds)
                if best_eval_score == -1 or test_score < best_eval_score:
                    best_eval_score = test_score
                train_loss = log_loss(train_data[1], train_preds)
                test_loss = log_loss(test_data[1], test_preds)
                msg = ("[%d]\tavg-loss:%f\ttrain-auc:%f\teval-auc:%f"
                       "\ttrain-loss:%f\teval-loss:%f\tmin-eval-auc:%f"
                       % (i, np.mean(losses), train_score, test_score,
                          train_loss, test_loss, best_eval_score))
                util.log.log(msg)
                print(msg)
            else:
                train_preds = np.clip(train_preds, min_rec_pred, max_rec_pred)
                test_preds = np.clip(test_preds, min_rec_pred, max_rec_pred)
                train_score = np.sqrt(
                    mean_squared_error(train_data[1], train_preds))
                test_score = np.sqrt(
                    mean_squared_error(test_data[1], test_preds))
                if best_eval_score == -1 or test_score < best_eval_score:
                    best_eval_score = test_score
                msg = ("[%d]\tavg-loss:%f\ttrain-rmse:%f\teval-rmse:%f"
                       "\tmin-eval-rmse:%f"
                       % (i, np.mean(losses), train_score, test_score,
                          best_eval_score))
                util.log.log(msg)
                print(msg)
        else:
            if ctr_or_recommend:
                train_score = -1
                test_score = -1
                train_loss = -1
                test_loss = -1
            else:
                pass

        if ctr_or_recommend:
            history_infos.append({
                "losses": losses,
                "avg-loss": np.mean(losses),
                "train-auc": train_score,
                "test-auc": test_score,
                "train-loss": train_loss,
                "test-loss": test_loss
            })
        else:
            history_infos.append({
                "losses": losses,
                "avg-loss": np.mean(losses),
                "train-rmse": train_score,
                "test-rmse": test_score,
            })
        history_eval_scores.append(test_score)
        if ctr_or_recommend:
            best_test_auc_epoch = np.argmax(history_eval_scores)
        else:
            best_test_auc_epoch = np.argmin(history_eval_scores)
        if should_early_stop and i - best_test_auc_epoch >= early_stop_interval:
            print("Early stop\nbest iteration:\n[%d]\teval-auc: %f"
                  % (best_test_auc_epoch,
                     history_eval_scores[best_test_auc_epoch]))
            break

    if should_dump_model:
        model.dump(model_dump_path)

    if len(train_log_path) != 0:
        json_log = {
            "conf": kwargs,
            "eval_log": history_infos,
            "best_eval_score": best_eval_score,
            "best_batch_eval_score": best_batch_eval_score
        }
        param_str = ""
        if kwargs['model_name'] == "biasedMF":
            param_str += "." + str(kwargs['model_params']['embd_size'])
            param_str += "." + str(
                kwargs['model_params']['learning_rate']).replace('.', 'p')
            param_str += "." + str(
                kwargs['model_params']['reg_rate']).replace('.', 'p')
        else:
            param_str += "." + "_".join(
                [str(l) for l in kwargs['model_params']['layer_sizes'][1:]])
            param_str += "." + str(kwargs['model_params']['layer_acts'][2])
            param_str += "." + str(
                kwargs['model_params']['learning_rate']).replace('.', 'p')
            param_str += "." + str(
                kwargs['model_params']['kernel_l2']).replace('.', 'p')
            param_str += "." + str(
                kwargs['model_params']['layer_keeps'][1]).replace('.', 'p')
        param_str += "." + str(trainset_csr_pkl_path.split('/')[2])
        if field_sizes is None:
            param_str += "." + str(1)
        else:
            param_str += "." + str(len(field_sizes))
        if not trainset_csr_pkl_path.endswith(".pkl"):
            param_str += "." + trainset_csr_pkl_path[-5:].replace('.', 'p')
        train_log_path += param_str
        train_log_path += "." + str(
            min(best_eval_score,
                best_batch_eval_score if best_batch_eval_score != -1
                else best_eval_score)).replace('.', 'p')
        train_log_path += "." + str(test_score).replace('.', 'p')
        fo = open(train_log_path, "w")
        json.dump(json_log, fo, indent=True,
                  default=util.json_util.json_numpy_serialzer)
        fo.close()
        util.log.log("log json in %s" % train_log_path)
    return model
def _extract_training_data(self, training_data):
    # Transform y from one-hot encoding to class indices
    X, y = training_data.X, training_data.y.argmax(axis=-1)
    if self.shuffle:
        X, y = sklearn_shuffle(X, y)
    return X, y
# Decode with subselected neurons
decode_subselects[j] = classify(
    pop_vector[np.ix_(incl_trials, np.isin(cluster_ids, use_neurons))],
    trial_ids, clf, cv)[0]

null_iterations = np.empty(ITERATIONS)
for k in range(ITERATIONS):
    # Estimate chance level
    if CHANCE_LEVEL == 'shuffle':
        null_iterations[k] = classify(
            pop_vector[np.ix_(incl_trials,
                              np.isin(cluster_ids, use_neurons))],
            sklearn_shuffle(trial_ids), clf, cv)[0]
    elif CHANCE_LEVEL == 'pseudo-session':
        pseudo_trials = generate_pseudo_session(trials)
        _, pseudo_incl_trials, pseudo_trial_ids = trial_vectors(
            pseudo_trials, TARGET)
        null_iterations[k] = classify(
            pop_vector[np.ix_(pseudo_incl_trials,
                              np.isin(cluster_ids, use_neurons))],
            pseudo_trial_ids, clf, cv)[0]
    elif CHANCE_LEVEL == 'none':
        null_iterations = []
    else:
        raise Exception(
            "CHANCE_LEVEL must be 'shuffle', 'pseudo-session' or 'none'")
def _shuffle(self):
    # Random undersampling followed by a joint shuffle of X and y
    self.X_u, self.y_u = self.rus.fit_resample(self.X, self.y)
    self.X_u, self.y_u = sklearn_shuffle(self.X_u, self.y_u)
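# `self.rus` above is presumably an imblearn RandomUnderSampler
# (fit_resample is its API). A standalone sketch of the same
# undersample-then-shuffle step, assuming imbalanced-learn is installed:
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle as sklearn_shuffle

X = np.arange(12).reshape(6, 2)
y = np.array([0, 0, 0, 0, 1, 1])          # imbalanced: four 0s, two 1s
X_u, y_u = RandomUnderSampler(random_state=0).fit_resample(X, y)
X_u, y_u = sklearn_shuffle(X_u, y_u)       # re-mix: resampling may group classes
print(np.bincount(y_u))                    # [2 2] -> balanced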
# Decode
if VALIDATION[6:] == 'interleaved':
    y_pred, y_probs = decode(this_pop_vector, trial_ids, NUM_SPLITS, True)
else:
    y_pred, y_probs = decode(this_pop_vector, trial_ids, NUM_SPLITS, False)
accuracy[k] = accuracy_score(trial_ids, y_pred)
pred[k, :] = y_pred
prob[k, :] = y_probs

# Decode shuffled data
if VALIDATION[6:] == 'interleaved':
    y_pred, y_probs = decode(this_pop_vector, sklearn_shuffle(trial_ids),
                             NUM_SPLITS, True)
else:
    y_pred, y_probs = decode(this_pop_vector, sklearn_shuffle(trial_ids),
                             NUM_SPLITS, False)
accuracy_shuffle[k] = accuracy_score(trial_ids, y_pred)
pred_shuffle[k, :] = y_pred
prob_shuffle[k, :] = y_probs

# Get average probability per trial
unique_trial_numbers = np.unique(trial_numbers)
prob_per_trial = np.empty(unique_trial_numbers.shape[0])
for t, trial in enumerate(unique_trial_numbers):
    prob_per_trial[t] = np.mean(prob[np.where(trial_numbers == trial)])
def main():
    args = parse_arguments()
    data_root = args.dataroot
    experiment_root = args.experiment_root

    # Set the Python, numpy, and TensorFlow random seeds.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)

    # Load data needed for training and save all parameters/mappings to make
    # sure experiments are reproducible
    questions_train_all, answers_train_all, images_train_all = load_train_data(
        data_root)

    # Since we are simplifying the problem of Visual QA to a classification
    # problem in this baseline, we want to limit the number of possible
    # answers, and have the model simply pick the most appropriate one.
    max_answers = 1000
    questions_train_all, answers_train_all, images_train_all = \
        select_frequent_answers(questions_train_all, answers_train_all,
                                images_train_all, max_answers)

    # Encode the remaining (top max_answers) answers and save the mapping.
    labelencoder = preprocessing.LabelEncoder()
    labelencoder.fit(answers_train_all)
    nb_classes = len(list(labelencoder.classes_))
    with open(pjoin(experiment_root, 'labelencoder.pkl'), 'wb') as pfile:
        pickle.dump(labelencoder, pfile)

    # The initial shuffle ensures that the train-val split is randomized
    # depending on the random seed, and not fixed every time (which would be
    # very bad).
    print("Performing initial shuffle...")
    questions_train_all, answers_train_all, images_train_all = sklearn_shuffle(
        questions_train_all, answers_train_all, images_train_all)

    train_all_count = len(questions_train_all)
    valid_count = int(train_all_count * args.valid_ratio)
    train_count = train_all_count - valid_count
    print("We have {0} total Q-A pairs. Will use {1:.2f}% for validation, "
          "which is {2} data points. {3} data points will be used for "
          "actual training.".format(train_all_count, args.valid_ratio * 100.0,
                                    valid_count, train_count))
    questions_train = questions_train_all[:train_count]
    answers_train = answers_train_all[:train_count]
    images_train = images_train_all[:train_count]

    # Note again that this is NOT the official validation set, but just a
    # fraction (`args.valid_ratio`) of the training set. The full validation
    # set evaluation is performed separately.
    questions_valid = questions_train_all[train_count:]
    answers_valid = answers_train_all[train_count:]
    images_valid = images_train_all[train_count:]

    # Construct the model.
    final_model, lang_model, img_model = construct_model(
        args, data_root, experiment_root, nb_classes)
    model = final_model.model

    # Compute val error K times per epoch.
    val_per_epoch = 4
    eval_valid_every = int((train_count / args.batch_size) / val_per_epoch)

    # Perform Tensorboard-friendly dumps.
    # TODO(andrei): This only works when using Keras's 'fit' method directly.
    # tensorboard_log_dir = pjoin(experiment_root, 'logs')
    # tensorboard_cb = keras.callbacks.TensorBoard(log_dir=tensorboard_log_dir,
    #                                              histogram_freq=0,
    #                                              write_graph=True,
    #                                              write_images=False)

    # The training part starts here.
    print('Training started...')
    last_valid_loss = 10
    for epoch in range(args.num_epochs):
        epoch_start_ms = int(time.time() * 1000)
        # Shuffle the data points before going through them.
        questions_train, answers_train, images_train = sklearn_shuffle(
            questions_train, answers_train, images_train)
        progbar = generic_utils.Progbar(len(questions_train))
        batches = batchify(args.batch_size, questions_train, answers_train,
                           images_train)
        for batch_idx, (qu_batch, an_batch, im_batch) in enumerate(batches):
            # Extract batch vectors to train on. Converts the answers to
            # their index (we're just doing classification at this point).
            y_batch = get_answers_matrix(an_batch, labelencoder)

            # Train on language only, or on both language and image.
            if args.language_only:
                x_q_batch = lang_model.process_input(qu_batch)
                loss = model.train_on_batch(x_q_batch, y_batch)
            else:
                x_q_batch = lang_model.process_input(qu_batch)
                x_i_batch = img_model.process_input(im_batch)
                loss = model.train_on_batch([x_q_batch, x_i_batch], y_batch)

            if (batch_idx + 1) % eval_valid_every == 0:
                # It's time to validate on the held-out part of the training
                # dataset.
                batch_val_losses = []
                val_batches = batchify(args.batch_size, questions_valid,
                                       answers_valid, images_valid)
                for (qu_val_batch, an_val_batch, im_val_batch) in val_batches:
                    y_val_batch = get_answers_matrix(an_val_batch,
                                                     labelencoder)
                    if args.language_only:
                        val_loss = model.test_on_batch(
                            lang_model.process_input(qu_val_batch),
                            y_val_batch)
                    else:
                        val_loss = model.test_on_batch([
                            lang_model.process_input(qu_val_batch),
                            img_model.process_input(im_val_batch)
                        ], y_val_batch)
                    batch_val_losses.append(val_loss)

                # The validation loss is just the average of the individual
                # losses computed for each batch of the validation data.
                last_valid_loss = np.mean(batch_val_losses)

            # if batch_idx % progress_update_every == 0:
            # Important: the progress bar averages these values, so the
            # reported validation loss will have a bit of lag.
            progbar.add(args.batch_size,
                        values=[("tra-loss", loss),
                                ("val-loss", last_valid_loss)])

        epoch_end_ms = int(time.time() * 1000)
        epoch_delta_s = (epoch_end_ms - epoch_start_ms) / 1000.0
        print("Epoch {0}/{1} took {2:.1f}s.".format(
            (epoch + 1), args.num_epochs, epoch_delta_s))
        print("Latest validation loss: {0:4f}".format(last_valid_loss))

        # Dump a checkpoint periodically.
        if (epoch + 1) % args.model_save_interval == 0:
            model_dump_fname = pjoin(experiment_root,
                                     'weights_{0}.hdf5'.format(epoch + 1))
            print('Saving model to file: {0}'.format(model_dump_fname))
            model.save_weights(model_dump_fname)

        # Compute overall accuracy periodically on the OFFICIAL full
        # validation set (but not too often, as it can get quite slow).
        if (epoch + 1) % args.model_eval_full_valid_interval == 0:
            # TODO(andrei): Implement this in a neat way.
            pass

    # TODO(Bernhard): catch Ctrl+C and store the last parameters...

    # Final checkpoint dump.
    model.save_weights(
        pjoin(experiment_root, 'weights_{0}_final.hdf5'.format(epoch + 1)))
def normalize_dataset(dataset):
    minmax = dataset_minmax(dataset)
    for row in dataset:
        for i in range(len(row)):
            if isinstance(row[i], str):
                break
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])


sys.path.append('../..')
from project_tools import *

data_p = os.path.join(os.getcwd(), 'data', 'ecoli.data')
df = pd.read_csv(data_p, header=None, delim_whitespace=True,
                 usecols=[2, 3, 4, 5, 6, 7, 8])
n_samples = len(df)
print(n_samples)
df = sklearn_shuffle(df)
dataset = np.array(df)
normalize_dataset(dataset)
find_same_columns(dataset)

max_eucl = 0.0
max_eucl_k = -1
max_manh = 0.0
max_manh_k = -1
for k_size in range(1, n_samples - 1):
    try:
        acc_euclid = KNN.evaluate_knn(dataset, KNN.euclidean_distance,
                                      k_size, 0.1)
        acc_manh = KNN.evaluate_knn(dataset, KNN.manhettan_dist, k_size, 0.1)
        print('KNN ACC EUCLID: {} | KNN ACC MANHETTAN: {} | k_size = {}'.format(
            acc_euclid, acc_manh, k_size))
    except IndexError:
        pass
    if acc_euclid > max_eucl:
def decode(spike_times, spike_clusters, event_times, event_groups,
           pre_time=0, post_time=0.5, classifier='bayes',
           cross_validation='kfold', num_splits=5, prob_left=None,
           custom_validation=None, n_neurons='all', iterations=1,
           shuffle=False, phase_rand=False):
    """
    Use decoding to classify groups of trials (e.g. stim left/right).
    Classification is done using the population vector of summed spike counts
    from the specified time window. Cross-validation is achieved using n-fold
    cross-validation or leave-one-out cross-validation. Decoders can decode
    any number of groups.

    When providing the classifier with an imbalanced dataset (not the same
    number of trials in each group) the chance level will not be 1/groups.
    In that case, to compare the classification performance against chance,
    one has to either determine the chance level by decoding a shuffled
    dataset, or use the 'auroc' metric as readout (this metric is robust
    against imbalanced datasets).

    Parameters
    ----------
    spike_times : 1D array
        spike times (in seconds)
    spike_clusters : 1D array
        cluster ids corresponding to each event in `spikes`
    event_times : 1D array
        times (in seconds) of the events from the two groups
    event_groups : 1D array
        group identities of the events, can be any number of groups, accepts
        integers and strings
    pre_time : float
        time (in seconds) preceding the event times
    post_time : float
        time (in seconds) following the event times
    classifier : string or sklearn object
        which decoder to use, either input a scikit-learn clf object directly
        or a string. When it's a string, the options are (all classifiers are
        used with default options):
            'bayes'       Naive Bayes
            'forest'      Random forest
            'regression'  Logistic regression
            'lda'         Linear Discriminant Analysis
    cross_validation : string
        which cross-validation method to use, options are:
            'none'           No cross-validation
            'kfold'          K-fold cross-validation
            'leave-one-out'  Leave out the trial that is being decoded
            'block'          Leave out the block the to-be-decoded trial is in
            'custom'         Any custom cross-validation provided by the user
    num_splits : integer
        ** only for 'kfold' cross-validation **
        Number of splits to use for k-fold cross-validation. A value of 5
        means that the decoder will be trained on 4/5th of the data and used
        to predict the remaining 1/5th. This process is repeated five times
        so that all data has been used as both training and test set.
    prob_left : 1D array
        ** only for 'block' cross-validation **
        the probability of the stimulus appearing on the left for each trial
        in event_times
    custom_validation : generator
        ** only for 'custom' cross-validation **
        a generator object with the splits to be used for cross-validation
        using this format:
            ((split1_train_idxs, split1_test_idxs),
             (split2_train_idxs, split2_test_idxs),
             (split3_train_idxs, split3_test_idxs),
             ...)
    n_neurons : string or integer
        number of neurons to randomly subselect from the population
        (default is 'all')
    iterations : int
        number of times to repeat the decoding (especially useful when
        subselecting neurons)
    shuffle : boolean
        whether to shuffle the trial labels each decoding iteration
    phase_rand : boolean
        whether to use phase randomization of the activity over trials to use
        as a "chance" predictor

    Returns
    -------
    results : dict
        dictionary with decoding results:
        accuracy : float
            accuracy of the classifier in percentage correct
        f1 : float
            F1 score of the classifier
        auroc : float
            the area under the ROC curve of the classification performance
        confusion_matrix : 2D array
            normalized confusion matrix
        predictions : 2D array with dimensions iterations x trials
            predicted group label for all trials in every iteration
        probabilities : 2D array with dimensions iterations x trials
            classification probability for all trials in every iteration
    """
    # Check input
    if isinstance(classifier, str):
        assert classifier in ['bayes', 'forest', 'regression', 'lda']
    assert cross_validation in ['none', 'kfold', 'leave-one-out', 'block',
                                'custom']
    assert event_times.shape[0] == event_groups.shape[0]
    if cross_validation == 'block':
        assert event_times.shape[0] == prob_left.shape[0]
    if cross_validation == 'custom':
        assert isinstance(custom_validation, types.GeneratorType)

    # Get matrix of all neuronal responses
    times = np.column_stack(((event_times - pre_time),
                             (event_times + post_time)))
    pop_vector, cluster_ids = _get_spike_counts_in_bins(
        spike_times, spike_clusters, times)
    pop_vector = pop_vector.T

    # Exclude the last trial if the number of trials is even and phase
    # randomization is used
    if (phase_rand is True) & (event_groups.shape[0] % 2 == 0):
        event_groups = event_groups[:-1]
        pop_vector = pop_vector[:-1]

    # Initialize classifier
    if isinstance(classifier, str):
        if classifier == 'forest':
            clf = RandomForestClassifier()
        elif classifier == 'bayes':
            clf = GaussianNB()
        elif classifier == 'regression':
            clf = LogisticRegression()
        elif classifier == 'lda':
            clf = LinearDiscriminantAnalysis()
    else:
        clf = classifier

    # Pre-allocate variables
    acc = np.zeros(iterations)
    f1 = np.zeros(iterations)
    auroc = np.zeros(iterations)
    conf_matrix_norm = np.zeros((np.shape(np.unique(event_groups))[0],
                                 np.shape(np.unique(event_groups))[0],
                                 iterations))
    pred = np.zeros([iterations, pop_vector.shape[0]])
    prob = np.zeros([iterations, pop_vector.shape[0]])

    for i in range(iterations):

        # Pre-allocate variables for this iteration
        y_pred = np.zeros(event_groups.shape)
        y_probs = np.zeros(event_groups.shape)

        # Get neurons to use for this iteration
        if n_neurons == 'all':
            sub_pop_vector = pop_vector
        else:
            use_neurons = np.random.choice(pop_vector.shape[1], n_neurons,
                                           replace=False)
            sub_pop_vector = pop_vector[:, use_neurons]

        # Shuffle trial labels if necessary
        if shuffle is True:
            event_groups = sklearn_shuffle(event_groups)

        # Perform phase randomization of activity over trials if necessary
        if phase_rand is True:
            if i == 0:
                original_pop_vector = sub_pop_vector
            rand_pop_vector = np.empty(original_pop_vector.shape)
            frequencies = int((original_pop_vector.shape[0] - 1) / 2)
            fsignal = sp.fft.fft(original_pop_vector, axis=0)
            power = np.abs(fsignal[1:1 + frequencies])
            phases = 2 * np.pi * np.random.rand(frequencies)
            for k in range(original_pop_vector.shape[1]):
                newfsignal = fsignal[0, k]
                newfsignal = np.append(newfsignal,
                                       np.exp(1j * phases) * power[:, k])
                newfsignal = np.append(
                    newfsignal, np.flip(np.exp(-1j * phases) * power[:, k]))
                newsignal = sp.fft.ifft(newfsignal)
                rand_pop_vector[:, k] = np.abs(newsignal.real)
            sub_pop_vector = rand_pop_vector

        if cross_validation == 'none':
            # Fit the model on all the data and predict
            clf.fit(sub_pop_vector, event_groups)
            y_pred = clf.predict(sub_pop_vector)

            # Get the probability of the prediction for ROC analysis
            probs = clf.predict_proba(sub_pop_vector)
            y_probs = probs[:, 1]  # keep positive only
        else:
            # Perform cross-validation
            if cross_validation == 'leave-one-out':
                cv = LeaveOneOut().split(sub_pop_vector)
            elif cross_validation == 'kfold':
                cv = KFold(n_splits=num_splits).split(sub_pop_vector)
            elif cross_validation == 'block':
                block_lengths = [sum(1 for i in g)
                                 for k, g in groupby(prob_left)]
                blocks = np.repeat(np.arange(len(block_lengths)),
                                   block_lengths)
                cv = LeaveOneGroupOut().split(sub_pop_vector, groups=blocks)
            elif cross_validation == 'custom':
                cv = custom_validation

            # Loop over the splits into train and test
            for train_index, test_index in cv:

                # Fit the model to the training data
                clf.fit(sub_pop_vector[train_index],
                        event_groups[train_index])

                # Predict the test data
                y_pred[test_index] = clf.predict(sub_pop_vector[test_index])

                # Get the probability of the prediction for ROC analysis
                probs = clf.predict_proba(sub_pop_vector[test_index])
                y_probs[test_index] = probs[:, 1]  # keep positive only

        # Calculate performance metrics and confusion matrix
        acc[i] = accuracy_score(event_groups, y_pred)
        f1[i] = f1_score(event_groups, y_pred)
        auroc[i] = roc_auc_score(event_groups, y_probs)
        conf_matrix = confusion_matrix(event_groups, y_pred)
        conf_matrix_norm[:, :, i] = (conf_matrix
                                     / conf_matrix.sum(axis=1)[:, np.newaxis])

        # Add prediction and probability to matrix
        pred[i, :] = y_pred
        prob[i, :] = y_probs

    # Make scalars from arrays when there's only one iteration
    if iterations == 1:
        acc = acc[0]
        f1 = f1[0]
        auroc = auroc[0]

    # Add to results dictionary
    if cross_validation == 'kfold':
        results = dict({'accuracy': acc, 'f1': f1, 'auroc': auroc,
                        'predictions': pred, 'probabilities': prob,
                        'confusion_matrix': conf_matrix_norm,
                        'n_groups': np.shape(np.unique(event_groups))[0],
                        'classifier': classifier,
                        'cross_validation': '%d-fold' % num_splits,
                        'iterations': iterations,
                        'shuffle': shuffle})
    else:
        results = dict({'accuracy': acc, 'f1': f1, 'auroc': auroc,
                        'predictions': pred, 'probabilities': prob,
                        'confusion_matrix': conf_matrix_norm,
                        'n_groups': np.shape(np.unique(event_groups))[0],
                        'classifier': classifier,
                        'cross_validation': cross_validation,
                        'iterations': iterations,
                        'shuffle': shuffle})
    return results
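# A hedged usage sketch of decode() above. The spike/event arrays are toy
# stand-ins (real inputs would come from a recording); only keyword names
# defined in the signature above are used:
import numpy as np

rng = np.random.default_rng(0)
spike_times = np.sort(rng.uniform(0, 60, 5000))       # 5000 spikes in 60 s
spike_clusters = rng.integers(0, 20, 5000)            # 20 toy neurons
event_times = np.arange(1, 59, 2.0)                   # 29 trials
event_groups = rng.integers(0, 2, event_times.size)   # two trial groups

results = decode(spike_times, spike_clusters, event_times, event_groups,
                 pre_time=0, post_time=0.5, classifier='bayes',
                 cross_validation='kfold', num_splits=5,
                 iterations=10, shuffle=True)
# With shuffle=True the labels are permuted each iteration, so
# results['accuracy'] approximates the chance-level distribution.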
def _extract_training_data(self, training_data):
    X, y = training_data.X, training_data.y
    if self.shuffle:
        X, y = sklearn_shuffle(X, y)
    return X, y
def fit(self, x, y, batch_size=1024, epochs=50, validation_split=0.0,
        validation_data=None, val_size=2**18, shuffle=True, initial_epoch=0,
        min_display=50, max_iter=-1):
    if validation_split < 0 or validation_split >= 1:
        raise ValueError(
            "validation_split must be a float number >= 0 and < 1")
    n_samples = x.shape[0]
    iters = (n_samples - 1) // batch_size + 1
    self.tr_loss_list = []
    self.val_loss_list = []
    print(iters, "steps per epoch")
    print(batch_size, "samples per step")
    start_time = time.time()
    stop_flag = False
    self.best_loss = np.inf
    self.best_ckpt = None
    if not validation_data and validation_split > 0:
        x, val_x, y, val_y = train_test_split(x, y,
                                              test_size=validation_split,
                                              random_state=self.seed)
        validation_data = [(val_x, val_y)]
    for i in range(epochs):
        if i < initial_epoch:
            continue
        if shuffle:
            x, y = sklearn_shuffle(x, y, random_state=self.seed)
        for j in range(iters):
            batch_x = x[j * batch_size:(j + 1) * batch_size]
            batch_y = y[j * batch_size:(j + 1) * batch_size]
            self.train_on_batch(batch_x, batch_y)
            if j % min_display == 0:
                tr_loss = self.evaluate(x, y, val_size)
                self.tr_loss_list.append(tr_loss)
                total_time = time.time() - start_time
                if validation_data is None:
                    print("Epoch {0: 2d} Step {1: 4d}: tr_loss {2: 0.6f} "
                          "tr_time {3: 0.1f}".format(i, j, tr_loss,
                                                     total_time))
                else:
                    val_loss = self.evaluate(validation_data[0][0],
                                             validation_data[0][1], val_size)
                    self.val_loss_list.append(val_loss)
                    print("Epoch {0: 2d} Step {1: 4d}: tr_loss {2: 0.6f} "
                          "va_loss {3: 0.6f} tr_time {4: 0.1f}".format(
                              i, j, tr_loss, val_loss, total_time))
                    if val_loss < self.best_loss:
                        self.best_loss = val_loss
                        # self.save_model(self.checkpoint_path + 'best')
            # self.save_model(self.checkpoint_path)
            if (i * iters) + j == max_iter:
                stop_flag = True
                break
        if stop_flag:
            break
# Check if there are enough neurons in this brain region
if np.unique(clus_region).shape[0] < MIN_NEURONS:
    continue

# Get population response matrix of all trials
dlc_matrix = []  ## INPUT MATRIX HERE

# Decode
accuracy = classify(dlc_matrix, trial_ids, clf, cv)[0]
null_iterations = np.empty(ITERATIONS)
for k in range(ITERATIONS):
    # Estimate chance level
    if CHANCE_LEVEL == 'shuffle':
        null_iterations[k] = classify(dlc_matrix,
                                      sklearn_shuffle(trial_ids),
                                      clf, cv)[0]
    elif CHANCE_LEVEL == 'pseudo-session':
        pseudo_trials = generate_pseudo_session(trials)
        pseudo_incl = ((pseudo_trials.probabilityLeft == 0.8)
                       | (pseudo_trials.probabilityLeft == 0.2))
        trial_times = pseudo_trials.stimOn_times[pseudo_incl]
        probability_left = pseudo_trials.probabilityLeft[pseudo_incl]
        pseudo_trial_ids = (
            pseudo_trials.probabilityLeft[pseudo_incl] == 0.2).astype(int)
        null_iterations[k] = classify(dlc_matrix, pseudo_trial_ids,
                                      clf, cv)[0]
    elif CHANCE_LEVEL == 'none':
        null_iterations = []
    else:
        raise Exception(
            "CHANCE_LEVEL must be 'shuffle', 'pseudo-session' or 'none'")

# Calculate p-value
p_value = np.sum(null_iterations > accuracy) / null_iterations.shape[0]
def on_epoch_end(self):
    # Shuffle indices after every epoch
    self.indexes = np.arange(len(self.filenames), dtype=int)
    if self.shuffle:
        self.indexes = sklearn_shuffle(self.indexes, random_state=self.seed)
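# Note on the pattern above: passing a fixed random_state makes
# sklearn_shuffle deterministic, so every epoch reuses the same permutation.
# A quick self-contained check (toy indices only):
import numpy as np
from sklearn.utils import shuffle as sklearn_shuffle

idx = np.arange(8)
first = sklearn_shuffle(idx, random_state=42)
second = sklearn_shuffle(idx, random_state=42)
assert np.array_equal(first, second)  # identical order on each call
# Omit random_state (as several snippets above do) to get a fresh
# permutation on every call instead.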