Example #1
def get_data(class_type):
    file_list = os.listdir(f'./data/images/syn/{class_type}/true')
    if '.ipynb_checkpoints' in file_list:
        idx = file_list.index('.ipynb_checkpoints')
        file_list.pop(idx)

    true_paths = [
        f'./data/images/syn/{class_type}/true/{file}' for file in file_list
    ]
    true_labels = [1 for i in range(len(true_paths))]

    file_list = os.listdir(f'./data/images/syn/{class_type}/false')
    if '.ipynb_checkpoints' in file_list:
        idx = file_list.index('.ipynb_checkpoints')
        file_list.pop(idx)

    false_paths = [
        f'./data/images/syn/{class_type}/false/{file}' for file in file_list
    ]
    false_labels = [0 for i in range(len(false_paths))]

    labels = np.array(true_labels + false_labels)
    print(f'{class_type.upper()} Value Counts')
    print(pd.Series(labels).value_counts())
    paths = np.array(true_paths + false_paths)
    #labels = to_categorical(labels)
    if class_type == 'train':
        paths, labels = sklearn_shuffle(paths, labels)
    return paths, labels
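Note: every snippet on this page calls `sklearn_shuffle`, which by convention is scikit-learn's shuffle utility imported under an alias. A minimal sketch of that assumed import, showing that multiple arrays receive the same permutation:

from sklearn.utils import shuffle as sklearn_shuffle
import numpy as np

paths = np.array(['a.png', 'b.png', 'c.png'])
labels = np.array([1, 0, 1])
# Both arrays are shuffled in unison, so rows stay aligned.
paths, labels = sklearn_shuffle(paths, labels)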
Example #2
    def __init__(self, encoder_inp_data, decoder_inp_data, target_data,
                 batch_size):
        self.batch_size = batch_size
        # shuffle the data
        shuffled_enc_inp, shuffled_dec_inp, shuffled_target = sklearn_shuffle(
            encoder_inp_data, decoder_inp_data, target_data)

        # 80% train / 20% test split
        self.train_length = len(shuffled_enc_inp) * 80 // 100

        #print("train_length : ", self.train_length)

        self.train_enc_inp = shuffled_enc_inp[:self.train_length]
        self.test_enc_inp = shuffled_enc_inp[self.train_length:]

        self.train_dec_inp = shuffled_dec_inp[:self.train_length]
        self.test_dec_inp = shuffled_dec_inp[self.train_length:]

        self.train_target = shuffled_target[:self.train_length]
        self.test_target = shuffled_target[self.train_length:]

        #self.sequence_length = self.create_sequence_length(self.train_enc_inp, self.batch_size)

        # iterator
        self.iter = 0
        self.test = 0
Example #3
    def train(
        self,
        training_trackers: List[TrackerWithCachedStates],
        domain: Domain,
        interpreter: NaturalLanguageInterpreter,
        **kwargs: Any,
    ) -> None:
        tracker_state_features, label_ids = self.featurize_for_training(
            training_trackers, domain, interpreter, **kwargs)
        training_data, zero_state_features = model_data_utils.convert_to_data_format(
            tracker_state_features)
        self.zero_state_features = zero_state_features

        self._train_params.update(kwargs)
        model = self.model_architecture(**self._train_params)
        score = None
        # Note: clone is called throughout to avoid mutating default arguments.
        self.label_encoder = clone(self.label_encoder).fit(label_ids)
        X = self._preprocess_data(training_data)
        y = self.label_encoder.transform(label_ids)

        if self.shuffle:
            X, y = sklearn_shuffle(X, y)

        if self.cv is None:
            model = clone(model).fit(X, y)
        else:
            param_grid = self.param_grid or {}
            model, score = self._search_and_score(model, X, y, param_grid)

        self.model = model
        logger.info("Done fitting sklearn policy model")
        if score is not None:
            logger.info(f"Cross validation score: {score:.5f}")
Example #4
    def __init__(self,
                 datasets,
                 labels,
                 hdf5_file=None,
                 batch_size=32,
                 dim=(128, 128),
                 shuffle=True,
                 subset=None,
                 random_seed=42):
        """
		Initialize generator.

		Parameters:
			datasets (list): List of file paths to the NPZ files containing the images created
				using :func:`generate_train_images`.
			labels (list): List of integer labels corresponding to the images in ``filenames``.
			hdf5_file (h5py.File): Opened HDF5 file containing images as datasets.
			batch_size (int, optional): Batch size.
			dim (tuple, optional): Image/2D array dimensions.
			shuffle (bool, optional): Shuffle data after every epoch?
			subset (str, optional): Which subset to use, either 'train' or 'valid'; ``None`` uses all data.
			random_seed (int, optional): Random seed for splitting and shuffling.
		"""
        self.batch_size = batch_size
        self.filenames = datasets  # Yes, we know this is badly named!
        self.labels = labels  # for binary classification
        self.shuffle = shuffle  # shuffles data after every epoch
        self.dim = dim  # image/2D array dimensions
        self.seed = random_seed  # Add random seed to ensure split for validation and training set is the same

        # Number of unique labels:
        self.num_classes = len(np.unique(labels))

        # Open the HDF5 file in read-only mode:
        if isinstance(hdf5_file, h5py.File):
            self.hdf = hdf5_file
        else:
            self.hdf = h5py.File(hdf5_file, 'r')

        # Create list of indices and optionally shuffle them:
        self.indexes = np.arange(len(self.filenames), dtype=int)
        if shuffle:
            self.indexes = sklearn_shuffle(self.indexes,
                                           random_state=self.seed)

        # This is a hacky way to do it, but the only way under the current framework
        if subset is not None:
            train_indices, valid_indices = train_test_split(
                self.indexes,
                test_size=0.2,  # FIXME: Should this be allowed to change?
                stratify=labels,
                random_state=self.seed)

            if subset == 'train':
                self.indexes = train_indices
            elif subset == 'valid':
                self.indexes = valid_indices
            else:
                raise ValueError("subset keyword not set properly")
Example #5
 def _extract_training_data(
         self, training_data: DialogueTrainingData
 ) -> Tuple[np.ndarray, np.ndarray]:
     # transform y from one-hot to num_classes
     X, y = training_data.X, training_data.y.argmax(axis=-1)
     if self.shuffle:
         X, y = sklearn_shuffle(X, y)
     return X, y
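The `argmax(axis=-1)` converts one-hot label rows back to integer class ids before shuffling; a quick illustration:

import numpy as np

y_onehot = np.array([[0, 1, 0],
                     [1, 0, 0],
                     [0, 0, 1]])
y = y_onehot.argmax(axis=-1)  # -> array([1, 0, 2])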
Example #6
    def split_into_batches(self, X, y, size=16, shuffle=True):
        if shuffle:
            X, y = sklearn_shuffle(X, y, random_state=0)

        n, m = X.shape
        n_batches = ceil(n / size)

        X_b = [X[i * size: (i + 1) * size, :] for i in range(n_batches)]
        y_b = [y[i * size: (i + 1) * size] for i in range(n_batches)]

        return X_b, y_b
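A hedged usage sketch for this batching helper; the instance name `model` below is a placeholder, not from the source:

import numpy as np

X = np.random.rand(100, 8)
y = np.random.randint(0, 2, size=100)
X_b, y_b = model.split_into_batches(X, y, size=16, shuffle=True)
# ceil(100 / 16) = 7 batches; the last batch holds the 4 leftover rows.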
Example #7
def create_data_generators(train_dir, val_dir, image_shape, batch_size):
    train_datagen = ImageDataGenerator(rotation_range=30,
                                       width_shift_range=0.125,
                                       height_shift_range=0.125,
                                       depth_shift_range=0.125,
                                       zoom_range=0.125,
                                       horizontal_flip=True,
                                       vertical_flip=False,
                                       depth_flip=False)
    val_datagen = ImageDataGenerator()

    train_neg_files = list(get_data_files(os.path.join(train_dir, '0')))
    train_pos_files = list(get_data_files(os.path.join(train_dir, '1')))
    total = len(train_neg_files) + len(train_pos_files)
    X_train = np.zeros((total, *image_shape))
    for i, f in enumerate(train_neg_files + train_pos_files):
        patch = np.load(f)
        X_train[i, :, :, :] = patch
    y_train = np.hstack((np.zeros(len(train_neg_files)),
                         np.ones(len(train_pos_files)))).astype(bool)  # np.bool was removed in modern NumPy
    X_train, y_train = sklearn_shuffle(X_train, y_train)

    val_neg_files = list(get_data_files(os.path.join(val_dir, '0')))
    val_pos_files = list(get_data_files(os.path.join(val_dir, '1')))
    total = len(val_neg_files) + len(val_pos_files)
    X_val = np.zeros((total, *image_shape))
    for i, f in enumerate(val_neg_files + val_pos_files):
        patch = np.load(f)
        X_val[i, :, :, :] = patch
    y_val = np.hstack((np.zeros(len(val_neg_files)),
                       np.ones(len(val_pos_files)))).astype(bool)  # np.bool was removed in modern NumPy
    X_val, y_val = sklearn_shuffle(X_val, y_val)

    train_generator = train_datagen.flow(X_train,
                                         y_train,
                                         batch_size=batch_size)
    val_generator = val_datagen.flow(X_val, y_val, batch_size=batch_size)

    return train_generator, val_generator
Example #8
 def _get_iterator(self, subfolder, epochs, batch_size, preprocessing_function, shuffle):
     filenames, labels = self._get_files_and_labels(subfolder)
     if shuffle:
         filenames, labels = sklearn_shuffle(filenames, labels)
     dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
     dataset = dataset.repeat(epochs)
     dataset = dataset.map(_parse_function, num_parallel_calls=20)
     if preprocessing_function:
         dataset = dataset.map(preprocessing_function, num_parallel_calls=20)
     if batch_size:
         dataset = dataset.batch(batch_size)
     dataset = dataset.prefetch(10)
     iterator = dataset.make_initializable_iterator()
     return iterator
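Note that `make_initializable_iterator` is TensorFlow 1.x API; under TF 2.x the equivalent pattern (a sketch, assuming eager execution and the same `filenames`/`labels` tensors) is simply to iterate the dataset:

import tensorflow as tf

# TF 2.x sketch: datasets are plain Python iterables under eager execution.
dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
dataset = dataset.shuffle(len(filenames)).batch(32).prefetch(tf.data.AUTOTUNE)
for batch_files, batch_labels in dataset:
    pass  # consume batches here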
Example #9
    def load_most_rated_movies(self, n_movies, shuffle=False):
        """
        Finds the n most rated movies.

        :param n_movies:    number of movies
        :param shuffle:     if True then result is shuffled
        :return:            the n most rated movies, pd dataframe
        """
        movies_df = self.load_movies()

        if n_movies <= 0 or n_movies > len(movies_df):
            raise RuntimeError("Invalid number of movies requested")

        most_rated_movie_ids = self.__load_most_rated_movie_ids(n_movies)
        most_rated_movies = movies_df.loc[movies_df["movieId"].isin(
            most_rated_movie_ids)].set_index("movieId")

        if shuffle:
            most_rated_movies = sklearn_shuffle(most_rated_movies)
        return most_rated_movies
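`sklearn_shuffle` also accepts pandas objects: it returns a row-permuted copy of the DataFrame and keeps the original index labels. A small demonstration:

import pandas as pd
from sklearn.utils import shuffle as sklearn_shuffle

movies = pd.DataFrame({'title': ['A', 'B', 'C']}, index=[10, 20, 30])
shuffled = sklearn_shuffle(movies, random_state=0)
# Rows are reordered; call shuffled.reset_index(drop=True) if a fresh
# 0..n-1 index is needed.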
Example #10
def train(model,
          trainset_csr_pkl_path,
          labels_pkl_path=None,
          testset_csr_pkl_path=None,
          n_epoch=5,
          batch_size=256,
          train_set_percent=0.75,
          should_split_by_field=False,
          field_sizes_pkl_path=None,
          should_early_stop=True,
          early_stop_interval=10,
          batch_eval_interval=-1,
          should_dump_model=False,
          model_dump_path="",
          shuffle_trainset=True,
          eval_interval=1,
          train_log_path="",
          ctr_or_recommend=True,
          predict_batch_size=10000,
          min_rec_pred=1,
          max_rec_pred=5,
          **kwargs):
    util.log.log("Start to train model")
    util.log.log("Loading trainset and labels")
    if testset_csr_pkl_path is None:
        dataset = joblib.load(trainset_csr_pkl_path)
        labels = pd.read_csv(labels_pkl_path, header=None)
        train_set_size = int(train_set_percent * labels.shape[0])
        util.log.log("Start to split trainset and testset")
        if not isinstance(dataset, list):
            train_set = dataset[:train_set_size]
            test_set = dataset[train_set_size:]
        else:
            train_set = [field[:train_set_size] for field in dataset]
            test_set = [field[train_set_size:] for field in dataset]
        util.log.log("Start to split trainset and testset labels")
        train_labels = labels[:train_set_size]

        test_labels = labels[train_set_size:]
        if not ctr_or_recommend:
            train_labels = np.clip(train_labels, min_rec_pred, max_rec_pred)
            test_labels = np.clip(test_labels, min_rec_pred, max_rec_pred)
        train_data = (train_set, train_labels)
        test_data = (test_set, test_labels)
    else:
        train_data = joblib.load(trainset_csr_pkl_path)
        test_data = joblib.load(testset_csr_pkl_path)
    util.log.log("Handling field size")
    field_sizes = joblib.load(field_sizes_pkl_path) \
        if field_sizes_pkl_path is not None else None
    if field_sizes is not None:
        if should_split_by_field and not isinstance(train_data[0], list):
            field_idxs = util.preprocess.get_field_idxs_from_field_size(
                field_sizes)
            util.log.log("Spliting Data by field")
            train_data = util.train.split_data_by_field(train_data, field_idxs)
            test_data = util.train.split_data_by_field(test_data, field_idxs)

    history_infos = []
    history_eval_scores = []
    best_eval_score = -1
    train_score = 999
    test_score = 999
    best_batch_eval_score = -1
    for i in range(n_epoch):
        util.log.log("Train in epoch %d" % i)
        fetches = [model.optimizer, model.loss]
        losses = []
        if batch_size > 0:
            losses = []
            inst_size = train_data[0].shape[0] \
                if not isinstance(train_data[0], list) else train_data[0][0].shape[0]
            n_iter = inst_size // batch_size
            if n_iter * batch_size < inst_size:
                n_iter = n_iter + 1
            if shuffle_trainset:
                shuffle_idxs = sklearn_shuffle(list(range(n_iter)))
            for j in range(n_iter):
                if j % 10000 == 0:
                    util.log.log("Train in epoch %d iter %d" % (i, j))
                idx = j
                if shuffle_trainset:
                    idx = shuffle_idxs[j]
                X, y = util.train.slice(
                    train_data, idx * batch_size,
                    min(batch_size, inst_size - idx * batch_size))
                _, loss = model.run(fetches, X, y)
                if batch_eval_interval > 0 and j % batch_eval_interval == batch_eval_interval // 2:
                    train_preds = predict(model, train_data,
                                          predict_batch_size)
                    test_preds = predict(model, test_data, predict_batch_size)
                    if ctr_or_recommend:
                        train_score = roc_auc_score(train_data[1], train_preds)
                        test_score = roc_auc_score(test_data[1], test_preds)
                        if best_batch_eval_score == -1 or test_score < best_batch_eval_score:
                            best_batch_eval_score = test_score
                        train_loss = log_loss(train_data[1], train_preds)
                        test_loss = log_loss(test_data[1], test_preds)
                        util.log.log(
                            "[%d-%d]\tavg-loss:%f\ttrain-auc:%f\teval-auc:%f\ttrain-loss:%f\teval-loss:%f\tmin-eval-auc:%f"
                            % (i, j, np.mean(losses), train_score, test_score,
                               train_loss, test_loss, best_batch_eval_score))
                        print "[%d-%d]\tavg-loss:%f\ttrain-auc:%f\teval-auc:%f\ttrain-loss:%f\teval-loss:%f\tmin-eval-auc:%f"\
                              %(i, j, np.mean(losses), train_score, test_score, train_loss, test_loss, best_batch_eval_score)
                    else:
                        train_preds = np.clip(train_preds, min_rec_pred,
                                              max_rec_pred)
                        test_preds = np.clip(test_preds, min_rec_pred,
                                             max_rec_pred)
                        train_score = np.sqrt(
                            mean_squared_error(train_data[1], train_preds))
                        test_score = np.sqrt(
                            mean_squared_error(test_data[1], test_preds))
                        if best_batch_eval_score == -1 or test_score < best_batch_eval_score:
                            best_batch_eval_score = test_score
                        util.log.log(
                            "[%d-%d]\tavg-loss:%f\ttrain-rmse:%f\teval-rmse:%f\tmin-eval-rmse:%f"
                            % (i, j, np.mean(losses), train_score, test_score,
                               best_batch_eval_score))
                        print "[%d-%d]\tavg-loss:%f\ttrain-rmse:%f\teval-rmse:%f\tmin-eval-rmse:%f"\
                              %(i, j, np.mean(losses), train_score, test_score, best_batch_eval_score)

                losses.append(loss)
        elif batch_size == -1:
            X, y = util.train.slice(train_data)
            _, loss = model.run(fetches, X, y)
            losses = [loss]
        if (i + 1) % eval_interval == 0:
            util.log.log("Evaluate in epoch %d" % i)
            train_preds = predict(model, train_data, predict_batch_size)
            util.log.log("Predict Test Set")
            test_preds = predict(model, test_data, predict_batch_size)
            util.log.log("Cal Evaluation")
            if ctr_or_recommend:
                train_score = roc_auc_score(train_data[1], train_preds)
                test_score = roc_auc_score(test_data[1], test_preds)
                if best_eval_score == -1 or test_score < best_eval_score:
                    best_eval_score = test_score
                train_loss = log_loss(train_data[1], train_preds)
                test_loss = log_loss(test_data[1], test_preds)
                util.log.log(
                    "[%d]\tavg-loss:%f\ttrain-auc:%f\teval-auc:%f\ttrain-loss:%f\teval-loss:%f\tmin-eval-auc:%f"
                    % (i, np.mean(losses), train_score, test_score, train_loss,
                       test_loss, best_eval_score))
                print "[%d]\tavg-loss:%f\ttrain-auc:%f\teval-auc:%f\ttrain-loss:%f\teval-loss:%f\tmin-eval-auc:%f"\
                      %(i, np.mean(losses), train_score, test_score, train_loss, test_loss, best_eval_score)
            else:
                train_preds = np.clip(train_preds, min_rec_pred, max_rec_pred)
                test_preds = np.clip(test_preds, min_rec_pred, max_rec_pred)
                train_score = np.sqrt(
                    mean_squared_error(train_data[1], train_preds))
                test_score = np.sqrt(
                    mean_squared_error(test_data[1], test_preds))
                if best_eval_score == -1 or test_score < best_eval_score:
                    best_eval_score = test_score
                util.log.log(
                    "[%d]\tavg-loss:%f\ttrain-rmse:%f\teval-rmse:%f\tmin-eval-rmse:%f"
                    % (i, np.mean(losses), train_score, test_score,
                       best_eval_score))
                print "[%d]\tavg-loss:%f\ttrain-rmse:%f\teval-rmse:%f\tmin-eval-rmse:%f"\
                      %(i, np.mean(losses), train_score, test_score, best_eval_score)
        else:
            if ctr_or_recommend:
                train_score = -1
                test_score = -1
                train_loss = -1
                test_loss = -1
            else:
                pass
        if ctr_or_recommend:
            history_infos.append({
                "losses": losses,
                "avg-loss": np.mean(losses),
                "train-auc": train_score,
                "test-auc": test_score,
                "train-loss": train_loss,
                "test-loss": test_loss
            })
        else:
            history_infos.append({
                "losses": losses,
                "avg-loss": np.mean(losses),
                "train-rmse": train_score,
                "test-rmse": test_score,
            })
        history_eval_scores.append(test_score)
        if ctr_or_recommend:
            best_test_auc_epoch = np.argmax(history_eval_scores)
        else:
            best_test_auc_epoch = np.argmin(history_eval_scores)
        if should_early_stop and i - best_test_auc_epoch >= early_stop_interval:
            print "Early stop\nbest iteration:\n[%d]\teval-auc: %f" % (
                best_test_auc_epoch, history_eval_scores[best_test_auc_epoch])
            break
    if should_dump_model:
        model.dump(model_dump_path)
    if len(train_log_path) != 0:
        json_log = {
            "conf": kwargs,
            "eval_log": history_infos,
            "best_eval_score": best_eval_score,
            "best_batch_eval_score": best_batch_eval_score
        }

        param_str = ""
        if kwargs['model_name'] == "biasedMF":
            param_str += "." + str(kwargs['model_params']['embd_size'])
            param_str += "." + str(
                kwargs['model_params']['learning_rate']).replace('.', 'p')
            param_str += "." + str(kwargs['model_params']['reg_rate']).replace(
                '.', 'p')
        else:
            param_str += "." + "_".join(
                [str(l) for l in kwargs['model_params']['layer_sizes'][1:]])
            param_str += "." + str(kwargs['model_params']['layer_acts'][2])
            param_str += "." + str(
                kwargs['model_params']['learning_rate']).replace('.', 'p')
            param_str += "." + str(
                kwargs['model_params']['kernel_l2']).replace('.', 'p')
            param_str += "." + str(
                kwargs['model_params']['layer_keeps'][1]).replace('.', 'p')

        param_str += "." + str(trainset_csr_pkl_path.split('/')[2])
        if field_sizes is None:
            param_str += "." + str(1)
        else:
            param_str += "." + str(len(field_sizes))
        if not trainset_csr_pkl_path.endswith(".pkl"):
            param_str += "." + trainset_csr_pkl_path[-5:].replace('.', 'p')
        train_log_path += param_str
        train_log_path += "." + str(
            min(
                best_eval_score, best_batch_eval_score if best_batch_eval_score
                != -1 else best_eval_score)).replace('.', 'p')
        train_log_path += "." + str(test_score).replace('.', 'p')
        fo = open(train_log_path, "w")
        json.dump(json_log,
                  fo,
                  indent=True,
                  default=util.json_util.json_numpy_serialzer)
        fo.close()
        util.log.log("log json in %s" % train_log_path)
    return model
Example #11
 def _extract_training_data(self, training_data):
     # transform y from one-hot to num_classes
     X, y = training_data.X, training_data.y.argmax(axis=-1)
     if self.shuffle:
         X, y = sklearn_shuffle(X, y)
     return X, y
Example #12
                # Decode with subselecting neurons
                decode_subselects[j] = classify(
                    pop_vector[np.ix_(incl_trials,
                                      np.isin(cluster_ids, use_neurons))],
                    trial_ids, clf, cv)[0]

                null_iterations = np.empty(ITERATIONS)
                for k in range(ITERATIONS):
                    # Estimate chance level
                    if CHANCE_LEVEL == 'shuffle':
                        null_iterations[k] = classify(
                            pop_vector[np.ix_(
                                incl_trials, np.isin(cluster_ids,
                                                     use_neurons))],
                            sklearn_shuffle(trial_ids), clf, cv)[0]
                    elif CHANCE_LEVEL == 'pseudo-session':
                        pseudo_trials = generate_pseudo_session(trials)
                        _, pseudo_incl_trials, pseudo_trial_ids = trial_vectors(
                            pseudo_trials, TARGET)
                        null_iterations[k] = classify(
                            pop_vector[np.ix_(
                                pseudo_incl_trials,
                                np.isin(cluster_ids, use_neurons))],
                            pseudo_trial_ids, clf, cv)[0]
                    elif CHANCE_LEVEL == 'none':
                        null_iterations = []
                    else:
                        raise Exception(
                            'CHANCE_LEVEL must be shuffle, pseudo-session or none')
Example #13
 def _shuffle(self):
     self.X_u, self.y_u = self.rus.fit_resample(self.X, self.y)
     self.X_u, self.y_u = sklearn_shuffle(self.X_u, self.y_u)
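`self.rus` here is presumably an imbalanced-learn `RandomUnderSampler` (its `fit_resample` signature matches); a minimal standalone sketch of the same undersample-then-shuffle pattern, under that assumption and given a feature matrix `X` and labels `y`:

# Assumes the imbalanced-learn package; fit_resample downsamples the
# majority class, then sklearn_shuffle re-randomizes the row order.
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle as sklearn_shuffle

rus = RandomUnderSampler(random_state=0)
X_u, y_u = rus.fit_resample(X, y)
X_u, y_u = sklearn_shuffle(X_u, y_u)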
Example #14
        # Decode
        if VALIDATION[6:] == 'interleaved':
            y_pred, y_probs = decode(this_pop_vector, trial_ids, NUM_SPLITS,
                                     True)
        else:
            y_pred, y_probs = decode(this_pop_vector, trial_ids, NUM_SPLITS,
                                     False)
        accuracy[k] = accuracy_score(trial_ids, y_pred)
        pred[k, :] = y_pred
        prob[k, :] = y_probs

        # Decode shuffled data
        if VALIDATION[6:] == 'interleaved':
            y_pred, y_probs = decode(this_pop_vector,
                                     sklearn_shuffle(trial_ids), NUM_SPLITS,
                                     True)
        else:
            y_pred, y_probs = decode(this_pop_vector,
                                     sklearn_shuffle(trial_ids), NUM_SPLITS,
                                     False)
        accuracy_shuffle[k] = accuracy_score(trial_ids, y_pred)
        pred_shuffle[k, :] = y_pred
        prob_shuffle[k, :] = y_probs

    # Get average probability per trial
    unique_trial_numbers = np.unique(trial_numbers)
    prob_per_trial = np.empty(unique_trial_numbers.shape[0])
    for t, trial in enumerate(unique_trial_numbers):
        prob_per_trial[t] = np.mean(prob[np.where(trial_numbers == trial)])
Example #15
def main():
    args = parse_arguments()

    data_root = args.dataroot
    experiment_root = args.experiment_root

    # Set the Python, NumPy, and TensorFlow random seeds.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)

    # Load data needed for training and save all parameters/mappings to make
    # sure experiments are reproducible
    questions_train_all, answers_train_all, images_train_all = load_train_data(
        data_root)

    # Since we are simplifying the problem of Visual QA to a classification
    # problem in this baseline, we want to limit the number of possible
    # answers, and have the model simply pick the most appropriate one.
    max_answers = 1000
    questions_train_all, answers_train_all, images_train_all = \
        select_frequent_answers(questions_train_all, answers_train_all,
                                images_train_all, max_answers)

    # Encode the remaining (top max_answers) answers and save the mapping.
    labelencoder = preprocessing.LabelEncoder()
    labelencoder.fit(answers_train_all)
    nb_classes = len(list(labelencoder.classes_))
    with open(pjoin(experiment_root, 'labelencoder.pkl'), 'wb') as pfile:
        pickle.dump(labelencoder, pfile)

    # The initial shuffle ensures that the train-val split is randomized
    # depending on the random seed, and not fixed every time (which would be
    # very bad).
    print("Performing initial shuffle...")
    questions_train_all, answers_train_all, images_train_all = sklearn_shuffle(
        questions_train_all, answers_train_all, images_train_all)

    train_all_count = len(questions_train_all)
    valid_count = int(train_all_count * args.valid_ratio)
    train_count = train_all_count - valid_count

    print("We have {0} total Q-A pairs. Will use {1:.2f}% for validation, "
          "which is {2} data points. {3} data points will be used for "
          "actual training.".format(train_all_count, args.valid_ratio * 100.0,
                                    valid_count, train_count))

    questions_train = questions_train_all[:train_count]
    answers_train = answers_train_all[:train_count]
    images_train = images_train_all[:train_count]
    # Note again that this is NOT the official validation set, but just a
    # fraction (`args.valid_ratio`) of the training set. The full validation
    # set evaluation is performed separately.
    questions_valid = questions_train_all[train_count:]
    answers_valid = answers_train_all[train_count:]
    images_valid = images_train_all[train_count:]

    # construct the model
    final_model, lang_model, img_model = construct_model(
        args, data_root, experiment_root, nb_classes)
    model = final_model.model

    # Compute val error K times per epoch.
    val_per_epoch = 4
    eval_valid_every = int((train_count / args.batch_size) / val_per_epoch)

    # Perform Tensorboard-friendly dumps.
    # TODO(andrei): This only works when using Keras's 'fit' method directly.
    # tensorboard_log_dir = pjoin(experiment_root, 'logs')
    # tensorboard_cb = keras.callbacks.TensorBoard(log_dir=tensorboard_log_dir,
    #                                              histogram_freq=0,
    #                                              write_graph=True,
    #                                              write_images=False)

    # The training part starts here
    print('Training started...')
    last_valid_loss = 10
    for epoch in range(args.num_epochs):
        epoch_start_ms = int(time.time() * 1000)
        # shuffle the data points before going through them
        questions_train, answers_train, images_train = sklearn_shuffle(
            questions_train, answers_train, images_train)
        progbar = generic_utils.Progbar(len(questions_train))
        batches = batchify(args.batch_size, questions_train, answers_train,
                           images_train)
        for batch_idx, (qu_batch, an_batch, im_batch) in enumerate(batches):
            # Extract batch vectors to train on
            # Converts the answers to their index (we're just doing
            # classification at this point)
            y_batch = get_answers_matrix(an_batch, labelencoder)

            # train on language only or language and image both
            if args.language_only:
                x_q_batch = lang_model.process_input(qu_batch)
                loss = model.train_on_batch(x_q_batch, y_batch)
            else:
                x_q_batch = lang_model.process_input(qu_batch)
                x_i_batch = img_model.process_input(im_batch)
                loss = model.train_on_batch([x_q_batch, x_i_batch], y_batch)

            if (batch_idx + 1) % eval_valid_every == 0:
                # It's time to validate on the held-out part of the training
                # dataset.
                batch_val_losses = []
                val_batches = batchify(args.batch_size, questions_valid,
                                       answers_valid, images_valid)
                for (qu_val_batch, an_val_batch, im_val_batch) in val_batches:
                    y_val_batch = get_answers_matrix(an_val_batch,
                                                     labelencoder)
                    if args.language_only:
                        val_loss = model.test_on_batch(
                            lang_model.process_input(qu_val_batch),
                            y_val_batch)
                    else:
                        val_loss = model.test_on_batch([
                            lang_model.process_input(qu_val_batch),
                            img_model.process_input(im_val_batch)
                        ], y_val_batch)

                    batch_val_losses.append(val_loss)

                # The validation loss is just the average of the individual
                # losses computed for each batch of the validation data.
                last_valid_loss = np.mean(batch_val_losses)

            # if batch_idx % progress_update_every == 0:
            # Important: the progress bar averages these values, so the
            # reported validation loss will have a bit of lag.
            progbar.add(args.batch_size,
                        values=[("tra-loss", loss),
                                ("val-loss", last_valid_loss)])

        epoch_end_ms = int(time.time() * 1000)
        epoch_delta_s = (epoch_end_ms - epoch_start_ms) / 1000.0
        print("Epoch {0}/{1} took {2:.1f}s.".format(
            (epoch + 1), args.num_epochs, epoch_delta_s))
        print("Latest validation loss: {0:4f}".format(last_valid_loss))

        # Dump a checkpoint periodically.
        if (epoch + 1) % args.model_save_interval == 0:
            model_dump_fname = pjoin(experiment_root,
                                     'weights_{0}.hdf5'.format(epoch + 1))
            print('Saving model to file: {0}'.format(model_dump_fname))
            model.save_weights(model_dump_fname)

        # Compute overall accuracy periodically on OFFICIAL full validation
        # set (but not too often, as it can get quite slow).
        if (epoch + 1) % args.model_eval_full_valid_interval == 0:
            # TODO(andrei): Implement this in a neat way.
            pass

    # TODO(Bernhard): catch control+c and store last parameters...
    # Final checkpoint dump.
    model.save_weights(
        pjoin(experiment_root, 'weights_{0}_final.hdf5'.format(epoch + 1)))
Example #16
def normalize_dataset(dataset):
    minmax = dataset_minmax(dataset)
    for row in dataset:
        for i in range(len(row)):
            if isinstance(row[i], str):
                break
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

sys.path.append('../..')
from project_tools import *
data_p = os.path.join(os.getcwd(), 'data', 'ecoli.data')
df = pd.read_csv(data_p, header=None, delim_whitespace=True, usecols=[2,3,4,5,6,7,8])
n_samples = len(df)
print(n_samples)
df = sklearn_shuffle(df)
dataset = np.array(df)
normalize_dataset(dataset)
find_same_columns(dataset)
max_eucl = 0.0
max_eucl_k = -1
max_manh = 0.0
max_manh_k = -1
for k_size in range(1, n_samples - 1):
    try:
        acc_euclid = KNN.evaluate_knn(dataset, KNN.euclidean_distance, k_size, 0.1)
        acc_manh = KNN.evaluate_knn(dataset, KNN.manhettan_dist, k_size, 0.1)
        print('KNN ACC EUCLID: {} | KNN ACC MANHETTAN: {} | k_size = {}'.format(acc_euclid, acc_manh, k_size))
    except IndexError:
        continue  # skip k values the evaluator cannot handle
    if acc_euclid > max_eucl:
        max_eucl = acc_euclid
        max_eucl_k = k_size
    if acc_manh > max_manh:
        max_manh = acc_manh
        max_manh_k = k_size
Example #17
def decode(spike_times,
           spike_clusters,
           event_times,
           event_groups,
           pre_time=0,
           post_time=0.5,
           classifier='bayes',
           cross_validation='kfold',
           num_splits=5,
           prob_left=None,
           custom_validation=None,
           n_neurons='all',
           iterations=1,
           shuffle=False,
           phase_rand=False):
    """
    Use decoding to classify groups of trials (e.g. stim left/right). Classification is done using
    the population vector of summed spike counts from the specified time window. Cross-validation
    is achieved using n-fold cross validation or leave-one-out cross validation. Decoders can
    decode any number of groups. When providing the classifier with an imbalanced dataset (not
    the same number of trials in each group) the chance level will not be 1/groups. In that case,
    to compare the classification performance against chance one has to either determine chance
    level by decoding a shuffled dataset or use the 'auroc' metric as readout (this metric is
    robust against imbalanced datasets).

    Parameters
    ----------
    spike_times : 1D array
        spike times (in seconds)
    spike_clusters : 1D array
        cluster ids corresponding to each event in `spikes`
    event_times : 1D array
        times (in seconds) of the events from the two groups
    event_groups : 1D array
        group identities of the events, can be any number of groups, accepts integers and strings
    pre_time : float
        time (in seconds) preceding the event times
    post_time : float
        time (in seconds) following the event times
    classifier : string or sklearn object
        which decoder to use, either input a scikit learn clf object directly or a string.
        When it's a string options are (all classifiers are used with default options):
            'bayes'         Naive Bayes
            'forest'        Random forest
            'regression'    Logistic regression
            'lda'           Linear Discriminant Analysis
    cross_validation : string
        which cross-validation method to use, options are:
            'none'              No cross-validation
            'kfold'             K-fold cross-validation
            'leave-one-out'     Leave out the trial that is being decoded
            'block'             Leave out the block the to-be-decoded trial is in
            'custom'            Any custom cross-validation provided by the user
    num_splits : integer
        ** only for 'kfold' cross-validation **
        Number of splits to use for k-fold cross validation, a value of 5 means that the decoder
        will be trained on 4/5th of the data and used to predict the remaining 1/5th. This process
        is repeated five times so that all data has been used as both training and test set.
    prob_left : 1D array
        ** only for 'block' cross-validation **
        the probability of the stimulus appearing on the left for each trial in event_times
    custom_validation : generator
        ** only for 'custom' cross-validation **
        a generator object with the splits to be used for cross validation using this format:
            (
                (split1_train_idxs, split1_test_idxs),
                (split2_train_idxs, split2_test_idxs),
                (split3_train_idxs, split3_test_idxs),
             ...)
    n_neurons : string or integer
        number of neurons to randomly subselect from the population (default is 'all')
    iterations : int
        number of times to repeat the decoding (especially useful when subselecting neurons)
    shuffle : boolean
        whether to shuffle the trial labels each decoding iteration
    phase_rand : boolean
        whether to use phase randomization of the activity over trials to use as a "chance"
        predictor

    Returns
    -------
    results : dict
        dictionary with decoding results

        accuracy : float
            accuracy of the classifier in percentage correct
        f1 : float
            F1 score of the classifier
        auroc : float
            the area under the ROC curve of the classification performance
        confusion_matrix : 2D array
            normalized confusion matrix
        predictions : 2D array with dimensions iterations x trials
            predicted group label for all trials in every iteration
        probabilities : 2D array with dimensions iterations x trials
            classification probability for all trials in every iteration
    """

    # Check input (a classifier passed as an sklearn object skips the string check,
    # as the docstring permits)
    if isinstance(classifier, str):
        assert classifier in ['bayes', 'forest', 'regression', 'lda']
    assert cross_validation in [
        'none', 'kfold', 'leave-one-out', 'block', 'custom'
    ]
    assert event_times.shape[0] == event_groups.shape[0]
    if cross_validation == 'block':
        assert event_times.shape[0] == prob_left.shape[0]
    if cross_validation == 'custom':
        assert isinstance(custom_validation, types.GeneratorType)

    # Get matrix of all neuronal responses
    times = np.column_stack(
        ((event_times - pre_time), (event_times + post_time)))
    pop_vector, cluster_ids = _get_spike_counts_in_bins(
        spike_times, spike_clusters, times)
    pop_vector = pop_vector.T

    # Exclude last trial if the number of trials is even and phase shuffling
    if (phase_rand is True) & (event_groups.shape[0] % 2 == 0):
        event_groups = event_groups[:-1]
        pop_vector = pop_vector[:-1]

    # Initialize classifier
    if isinstance(classifier, str):
        if classifier == 'forest':
            clf = RandomForestClassifier()
        elif classifier == 'bayes':
            clf = GaussianNB()
        elif classifier == 'regression':
            clf = LogisticRegression()
        elif classifier == 'lda':
            clf = LinearDiscriminantAnalysis()
    else:
        clf = classifier

    # Pre-allocate variables
    acc = np.zeros(iterations)
    f1 = np.zeros(iterations)
    auroc = np.zeros(iterations)
    conf_matrix_norm = np.zeros(
        (np.shape(np.unique(event_groups))[0],
         np.shape(np.unique(event_groups))[0], iterations))
    pred = np.zeros([iterations, pop_vector.shape[0]])
    prob = np.zeros([iterations, pop_vector.shape[0]])

    for i in range(iterations):

        # Pre-allocate variables for this iteration
        y_pred = np.zeros(event_groups.shape)
        y_probs = np.zeros(event_groups.shape)

        # Get neurons to use for this iteration
        if n_neurons == 'all':
            sub_pop_vector = pop_vector
        else:
            use_neurons = np.random.choice(pop_vector.shape[1],
                                           n_neurons,
                                           replace=False)
            sub_pop_vector = pop_vector[:, use_neurons]

        # Shuffle trial labels if necessary
        if shuffle is True:
            event_groups = sklearn_shuffle(event_groups)

        # Perform phase randomization of activity over trials if necessary
        if phase_rand is True:
            if i == 0:
                original_pop_vector = sub_pop_vector
            rand_pop_vector = np.empty(original_pop_vector.shape)
            frequencies = int((original_pop_vector.shape[0] - 1) / 2)
            fsignal = sp.fft.fft(original_pop_vector, axis=0)
            power = np.abs(fsignal[1:1 + frequencies])
            phases = 2 * np.pi * np.random.rand(frequencies)
            for k in range(original_pop_vector.shape[1]):
                newfsignal = fsignal[0, k]
                newfsignal = np.append(newfsignal,
                                       np.exp(1j * phases) * power[:, k])
                newfsignal = np.append(
                    newfsignal, np.flip(np.exp(-1j * phases) * power[:, k]))
                newsignal = sp.fft.ifft(newfsignal)
                rand_pop_vector[:, k] = np.abs(newsignal.real)
            sub_pop_vector = rand_pop_vector

        if cross_validation == 'none':

            # Fit the model on all the data and predict
            clf.fit(sub_pop_vector, event_groups)
            y_pred = clf.predict(sub_pop_vector)

            #  Get the probability of the prediction for ROC analysis
            probs = clf.predict_proba(sub_pop_vector)
            y_probs = probs[:, 1]  # keep positive only

        else:
            # Perform cross-validation
            if cross_validation == 'leave-one-out':
                cv = LeaveOneOut().split(sub_pop_vector)
            elif cross_validation == 'kfold':
                cv = KFold(n_splits=num_splits).split(sub_pop_vector)
            elif cross_validation == 'block':
                block_lengths = [
                    sum(1 for i in g) for k, g in groupby(prob_left)
                ]
                blocks = np.repeat(np.arange(len(block_lengths)),
                                   block_lengths)
                cv = LeaveOneGroupOut().split(sub_pop_vector, groups=blocks)
            elif cross_validation == 'custom':
                cv = custom_validation

            # Loop over the splits into train and test
            for train_index, test_index in cv:

                # Fit the model to the training data
                clf.fit(sub_pop_vector[train_index], event_groups[train_index])

                # Predict the test data
                y_pred[test_index] = clf.predict(sub_pop_vector[test_index])

                # Get the probability of the prediction for ROC analysis
                probs = clf.predict_proba(sub_pop_vector[test_index])
                y_probs[test_index] = probs[:, 1]  # keep positive only

        # Calculate performance metrics and confusion matrix
        acc[i] = accuracy_score(event_groups, y_pred)
        f1[i] = f1_score(event_groups, y_pred)
        auroc[i] = roc_auc_score(event_groups, y_probs)
        conf_matrix = confusion_matrix(event_groups, y_pred)
        conf_matrix_norm[:, :,
                         i] = conf_matrix / conf_matrix.sum(axis=1)[:,
                                                                    np.newaxis]

        # Add prediction and probability to matrix
        pred[i, :] = y_pred
        prob[i, :] = y_probs

    # Make integers from arrays when there's only one iteration
    if iterations == 1:
        acc = acc[0]
        f1 = f1[0]
        auroc = auroc[0]

    # Add to results dictionary
    if cross_validation == 'kfold':
        results = dict({
            'accuracy': acc,
            'f1': f1,
            'auroc': auroc,
            'predictions': pred,
            'probabilities': prob,
            'confusion_matrix': conf_matrix_norm,
            'n_groups': np.shape(np.unique(event_groups))[0],
            'classifier': classifier,
            'cross_validation': '%d-fold' % num_splits,
            'iterations': iterations,
            'shuffle': shuffle
        })

    else:
        results = dict({
            'accuracy': acc,
            'f1': f1,
            'auroc': auroc,
            'predictions': pred,
            'probabilities': prob,
            'confusion_matrix': conf_matrix_norm,
            'n_groups': np.shape(np.unique(event_groups))[0],
            'classifier': classifier,
            'cross_validation': cross_validation,
            'iterations': iterations,
            'shuffle': shuffle
        })
    return results
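A hypothetical call of this decoder (the input arrays are placeholders assumed to be loaded elsewhere, not from the source):

results = decode(spike_times, spike_clusters,
                 event_times, event_groups,
                 pre_time=0, post_time=0.5,
                 classifier='bayes', cross_validation='kfold',
                 num_splits=5, iterations=100, shuffle=True)
# With iterations > 1, 'accuracy' is an array with one entry per iteration.
print('mean accuracy over iterations: %.3f' % results['accuracy'].mean())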
Example #18
 def _extract_training_data(self, training_data):
     X, y = training_data.X, training_data.y
     if self.shuffle:
         X, y = sklearn_shuffle(X, y)
     return X, y
Example #19
    def fit(self,
            x,
            y,
            batch_size=1024,
            epochs=50,
            validation_split=0.0,
            validation_data=None,
            val_size=2**18,
            shuffle=True,
            initial_epoch=0,
            min_display=50,
            max_iter=-1):

        if validation_split < 0 or validation_split >= 1:
            raise ValueError(
                "validation_split must be a float number >= 0 and < 1")

        n_samples = x.shape[0]
        iters = (n_samples - 1) // batch_size + 1
        self.tr_loss_list = []
        self.val_loss_list = []
        print(iters, "steps per epoch")
        print(batch_size, "samples per step")
        start_time = time.time()
        stop_flag = False
        self.best_loss = np.inf
        self.best_ckpt = None
        if not validation_data and validation_split > 0:
            x, val_x, y, val_y = train_test_split(x,
                                                  y,
                                                  test_size=validation_split,
                                                  random_state=self.seed)
            validation_data = [(val_x, val_y)]

        for i in range(epochs):
            if i < initial_epoch:
                continue
            if shuffle:
                x, y = sklearn_shuffle(x, y, random_state=self.seed)
            for j in range(iters):
                batch_x = x[j * batch_size:(j + 1) * batch_size]
                batch_y = y[j * batch_size:(j + 1) * batch_size]

                self.train_on_batch(batch_x, batch_y)
                if j % min_display == 0:
                    tr_loss = self.evaluate(x, y, val_size)
                    self.tr_loss_list.append(tr_loss)
                    total_time = time.time() - start_time
                    if validation_data is None:
                        print(
                            "Epoch {0: 2d} Step {1: 4d}: tr_loss {2: 0.6f} tr_time {3: 0.1f}"
                            .format(i, j, tr_loss, total_time))
                    else:
                        val_loss = self.evaluate(validation_data[0][0],
                                                 validation_data[0][1],
                                                 val_size)
                        self.val_loss_list.append(val_loss)
                        print(
                            "Epoch {0: 2d} Step {1: 4d}: tr_loss {2: 0.6f} va_loss {3: 0.6f} tr_time {4: 0.1f}"
                            .format(i, j, tr_loss, val_loss, total_time))

                        if val_loss < self.best_loss:
                            self.best_loss = val_loss
                            # self.save_model(self.checkpoint_path+'best')

                # self.save_model(self.checkpoint_path)

                if (i * iters) + j == max_iter:
                    stop_flag = True
                    break
            if stop_flag:
                break
Example #20
            # Check if there are enough neurons in this brain region
            if np.unique(clus_region).shape[0] < MIN_NEURONS:
                continue

            # Get population response matrix of all trials
            dlc_matrix = []  ## INPUT MATRIX HERE

            # Decode
            accuracy = classify(dlc_matrix, trial_ids, clf, cv)[0]

            null_iterations = np.empty(ITERATIONS)
            for k in range(ITERATIONS):
                # Estimate chance level
                if CHANCE_LEVEL == 'shuffle':
                    null_iterations[k] = classify(dlc_matrix, sklearn_shuffle(trial_ids),
                                                  clf, cv)[0]
                elif CHANCE_LEVEL == 'pseudo-session':
                    pseudo_trials = generate_pseudo_session(trials)
                    pseudo_incl = (pseudo_trials.probabilityLeft == 0.8) | (pseudo_trials.probabilityLeft == 0.2)
                    trial_times = pseudo_trials.stimOn_times[pseudo_incl]
                    probability_left = pseudo_trials.probabilityLeft[pseudo_incl]
                    pseudo_trial_ids = (pseudo_trials.probabilityLeft[pseudo_incl] == 0.2).astype(int)
                    null_iterations[k] = classify(dlc_matrix, pseudo_trial_ids, clf, cv)[0]
                elif CHANCE_LEVEL == 'none':
                    null_iterations = []
                else:
                    raise Exception('CHANCE_LEVEL must be shuffle, pseudo-session or none')

            # Calculate p-value
            p_value = np.sum(null_iterations > accuracy) / null_iterations.shape[0]
Example #21
 def _extract_training_data(self, training_data):
     # transform y from one-hot to num_classes
     X, y = training_data.X, training_data.y.argmax(axis=-1)
     if self.shuffle:
         X, y = sklearn_shuffle(X, y)
     return X, y
Example #22
 def on_epoch_end(self):
     # Shuffles indices after every epoch
     self.indexes = np.arange(len(self.filenames), dtype=int)
     if self.shuffle:
         self.indexes = sklearn_shuffle(self.indexes,
                                        random_state=self.seed)
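This `on_epoch_end` hook is where Keras `Sequence` generators conventionally reshuffle; a minimal self-contained sketch of the pattern (the `_load_batch` helper is hypothetical):

import numpy as np
from sklearn.utils import shuffle as sklearn_shuffle
from tensorflow.keras.utils import Sequence

class ShuffledSequence(Sequence):
    def __init__(self, filenames, batch_size=32, seed=42, shuffle=True):
        self.filenames = filenames
        self.batch_size = batch_size
        self.seed = seed
        self.shuffle = shuffle
        self.indexes = np.arange(len(filenames), dtype=int)

    def __len__(self):
        # Number of batches per epoch.
        return int(np.ceil(len(self.filenames) / self.batch_size))

    def __getitem__(self, idx):
        batch = self.indexes[idx * self.batch_size:(idx + 1) * self.batch_size]
        return self._load_batch(batch)  # hypothetical loading helper

    def on_epoch_end(self):
        # Reshuffle the index array between epochs; note that a fixed
        # random_state reproduces the same permutation each epoch, as in
        # Example #22 above.
        if self.shuffle:
            self.indexes = sklearn_shuffle(self.indexes, random_state=self.seed)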