Code example #1
def test_balanced_batch_generator_function_no_return_indices():
    with pytest.raises(ValueError, match='needs to return the indices'):
        balanced_batch_generator(X,
                                 y,
                                 sampler=ClusterCentroids(),
                                 batch_size=10,
                                 random_state=42)
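These test snippets assume module-level X and y fixtures plus the corresponding imports. A minimal, self-contained sketch of that setup (the toy data is an assumption, modeled on imbalanced-learn's own test suite):

# Hypothetical setup for the test snippets in this collection.
import pytest
from scipy import sparse
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelBinarizer
from imblearn.datasets import make_imbalance
from imblearn.keras import balanced_batch_generator
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import RandomOverSampler

iris = load_iris()
# build a deliberately imbalanced three-class problem
X, y = make_imbalance(iris.data, iris.target,
                      sampling_strategy={0: 30, 1: 50, 2: 40})
y = LabelBinarizer().fit_transform(y)  # one-hot targets for the Keras examples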
Code example #2
def __run_balanced_batch_generator(self):
    self.generator, self.steps_per_epoch = balanced_batch_generator(
        self.__reshape_data_to_original(),
        self.y,
        sampler=RandomOverSampler(),
        batch_size=self.batch_size,
        keep_sparse=True)
Code example #3
def test_balanced_batch_generator_function(sampler, sample_weight):
    model = _build_keras_model(y.shape[1], X.shape[1])
    training_generator, steps_per_epoch = balanced_batch_generator(
        X, y, sample_weight=sample_weight, sampler=sampler, batch_size=10,
        random_state=42)
    model.fit_generator(generator=training_generator,
                        steps_per_epoch=steps_per_epoch,
                        epochs=10)
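fit_generator is deprecated in TensorFlow 2.x and removed in later releases; the same generator can be passed straight to Model.fit. A sketch under that assumption:

model.fit(training_generator,
          steps_per_epoch=steps_per_epoch,
          epochs=10)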
Code example #5
    def classify(
        self,
        total_epoch_count=30,
        warmup_epoch_count=10
    ):  #, X, type: str, classifier: str, test_prop: float, res: None, res_method: None):

        if self.type == "binary":
            self.train_y[np.where(self.train_y == 1)] = 0
            self.train_y[np.where(self.train_y == 2)] = 1
            self.test_y[np.where(self.test_y == 1)] = 0
            self.test_y[np.where(self.test_y == 2)] = 1

        #log_dir = ".log/movie_reviews/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%s")
        #tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)
        training_generator, steps_per_epoch = balanced_batch_generator(
            self.train_x, self.train_y, batch_size=48, random_state=100)
        #total_epoch_count = 30
        # model.fit(x=(data.train_x, data.train_x_token_types), y=data.train_y,
        self.model.fit(
            training_generator,
            epochs=total_epoch_count,
            steps_per_epoch=steps_per_epoch,
            # validation_split=0.1,
            callbacks=[
                # keras.callbacks.LearningRateScheduler(time_decay, verbose=1),
                # lrate,
                self.create_learning_rate_scheduler(
                    max_learn_rate=1e-5,
                    end_learn_rate=5e-8,
                    warmup_epoch_count=warmup_epoch_count,
                    total_epoch_count=total_epoch_count)
                #,

                #keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)
                #    tensorboard_callback
            ])

        self.model.save_weights('./movie_reviews.h5', overwrite=True)
        Y_pred_probabilities = self.model.predict(self.test_x)
        Y_pred = np.argmax(Y_pred_probabilities, axis=-1)
        self.pred_y = Y_pred
        # Accuracy Percentage
        print(
            f"Accuracy is {round(accuracy_score(self.test_y, Y_pred) * 100, 2)}%"
        )

        # Classification Report
        print(classification_report(self.test_y, Y_pred))  # y_true first, then y_pred

        # Matthew's Correlation Coefficient
        print(
            f"Matthew's Correlation Coefficient is {matthews_corrcoef(self.test_y, Y_pred)}"
        )

        # Plots of Confusion Matrix and ROC Curve
        plot_confusion_matrix(self.test_y, Y_pred, figsize=(10, 10))
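The in-place label remapping near the top of classify is order-sensitive: rewriting 2 → 1 before 1 → 0 would collapse every label to 0. Assuming the labels really are exactly {1, 2}, an order-independent sketch:

# hypothetical alternative: shift labels {1, 2} down to {0, 1} in one step
self.train_y = self.train_y - 1
self.test_y = self.test_y - 1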
Code example #6
def test_balanced_batch_generator_function_sparse(keep_sparse):
    training_generator, steps_per_epoch = balanced_batch_generator(
        sparse.csr_matrix(X), y, keep_sparse=keep_sparse, batch_size=10,
        random_state=42)
    for idx in range(steps_per_epoch):
        X_batch, y_batch = next(training_generator)
        if keep_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)
Code example #7
def test_balanced_batch_generator_function_sparse(is_sparse):
    training_generator, steps_per_epoch = balanced_batch_generator(
        sparse.csr_matrix(X), y, sparse=is_sparse, batch_size=10,
        random_state=42)
    for idx in range(steps_per_epoch):
        X_batch, y_batch = next(training_generator)
        if is_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)
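Examples #6 and #7 appear to be the same test written against different imbalanced-learn releases: the keyword sparse in #7 and keep_sparse in #6 control the same behavior (whether sparse batches are returned as-is or densified).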
Code example #8
def __init__(self, x, y, datagen, batch_size=32):
    self.datagen = datagen
    self.batch_size = batch_size
    self._shape = x.shape
    datagen.fit(x)
    self.gen, self.steps_per_epoch = balanced_batch_generator(
        x.reshape(x.shape[0], -1),
        y,
        sampler=RandomOverSampler(),
        batch_size=self.batch_size,
        keep_sparse=True)
Code example #9
def __init__(self, x, y, datagen, batch_size=32):
    self.datagen = datagen
    self.batch_size = min(batch_size, x.shape[0])
    # datagen.fit(x)
    self.gen, self.steps_per_epoch = balanced_batch_generator(
        x.reshape(x.shape[0], -1),
        y,
        sampler=RandomOverSampler(random_state=42),
        batch_size=self.batch_size,
        keep_sparse=True)
    self._shape = (self.steps_per_epoch * batch_size, *x.shape[1:])
Code example #10
File: trainer.py  Project: yangxhcaf/sdmdl
    def train_model(self, model, X_train, X_test, y_train, y_test):
        """Training a model to predict the presence or absence of a species. Various instance variables are used to
        define how the model trains, like: batch size, random seed and number of epochs.

        :param model: Keras Model Object. Initialized model ready for training.
        :param X_train: Array. Contains training data.
        :param X_test: Array. Contains testing data.
        :param y_train: Array. Contains training (ground truth) labels.
        :param y_test: Array. Contains testing (ground truth) labels.

        :return: Tuple containing:
        float 'AUC': performance metric between 0 and 1 (0 = 100% wrong, 1 = 100% right);
        keras model 'model': a model with an architecture identical to the input 'model'
        but with trained weights.
        """

        training_generator, steps_per_epoch = balanced_batch_generator(
            X_train,
            y_train,
            sampler=NearMiss(),
            batch_size=self.batch,
            random_state=self.random_seed)
        model.fit_generator(generator=training_generator,
                            steps_per_epoch=steps_per_epoch,
                            epochs=self.epoch,
                            verbose=0)
        score = model.evaluate(X_test, y_test, verbose=0)
        predictions = model.predict(X_test)
        fpr, tpr, thresholds = roc_curve(y_test[:, 1], predictions[:, 1])
        len_tpr = int(len(tpr) / 2)
        self.test_loss.append(score[0])
        self.test_acc.append(score[1])
        self.test_AUC.append(roc_auc_score(y_test[:, 1], predictions[:, 1]))
        self.test_tpr.append(tpr[len_tpr])
        AUC = roc_auc_score(y_test[:, 1], predictions[:, 1])
        n_bootstraps = 1000
        y_pred = predictions[:, 1]
        y_true = y_test[:, 1]
        bootstrapped_scores = []
        rng = np.random.RandomState(self.random_seed)
        for i in range(n_bootstraps):
            indices = rng.randint(0, len(y_pred), len(y_pred))  # high is exclusive; len(y_pred) keeps the last sample reachable
            if len(np.unique(y_true[indices])) < 2:
                continue
            score = roc_auc_score(y_true[indices], y_pred[indices])
            bootstrapped_scores.append(score)
        sorted_scores = np.array(bootstrapped_scores)
        sorted_scores.sort()
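        # the 5th and 95th percentiles of the sorted bootstrap scores below
        # form a 90% confidence interval for the AUC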
        ci_lower = sorted_scores[int(0.05 * len(sorted_scores))]
        ci_upper = sorted_scores[int(0.95 * len(sorted_scores))]
        self.test_lci.append(ci_lower)
        self.test_uci.append(ci_upper)
        return AUC, model
Code example #11
def test_balanced_batch_generator_function_sparse(data, keep_sparse):
    X, y = data
    training_generator, steps_per_epoch = balanced_batch_generator(
        sparse.csr_matrix(X),
        y,
        keep_sparse=keep_sparse,
        batch_size=10,
        random_state=42)
    for _ in range(steps_per_epoch):
        X_batch, _ = next(training_generator)
        if keep_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)
Code example #12
def __init__(self,
             x,
             y,
             datagen,
             batch_size=32,
             crop_length=224,
             resize=300,
             balance=True):
    self.datagen = datagen
    self.batch_size = batch_size
    self._shape = x.shape
    self.crop_length = crop_length
    self.resize = resize
    datagen.fit(x)
    self.gen, self.steps_per_epoch = balanced_batch_generator(
        x.reshape(x.shape[0], -1),
        y,
        sampler=RandomOverSampler() if balance else None,
        batch_size=self.batch_size)
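Note that sampler=None does not disable balancing here: when sampler is None, balanced_batch_generator falls back to its default sampler (a RandomUnderSampler), so the balance=False branch still balances the batches, just by under-sampling rather than over-sampling.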
Code example #13
def test_balanced_batch_generator_function_no_return_indices():
    with pytest.raises(ValueError, match='needs to return the indices'):
        balanced_batch_generator(
            X, y, sampler=ClusterCentroids(), batch_size=10, random_state=42)
Code example #14
    def data_loader(self):
        '''
        This can also be used to evaluate the input variables with an ANOVA F-test.
        '''
        ## read data from files
        logging.debug("Loading data")
        config_base_dir = os.path.join(self.__dataload_config["base_dir"],
                                       self.__dataload_config["plot_config"])

        # create the model directory
        utc_seconds = str(datetime.datetime.now().timestamp()).split(".")[0]
        logging.info(utc_seconds)
        self.__model_dir = os.path.join(config_base_dir,
                                        self.__dataload_config["cut"], "steps",
                                        utc_seconds)
        os.makedirs(self.__model_dir, exist_ok=True)

        # load numpy
        samples_dir = os.path.join(config_base_dir,
                                   self.__dataload_config["cut"], "samples",
                                   self.__dataload_config["samples_version"])
        signal = pickle.load(
            open(os.path.join(samples_dir, "for_training/signal_balanced.pkl"),
                 "rb"))
        bkg = pickle.load(
            open(
                os.path.join(samples_dir,
                             "for_training/background_balanced.pkl"), "rb"))

        # Keep only the first "input-dim" columns
        self.__dataload_config["cols"] = self.__dataload_config[
            "cols"][:self.__model_config["input_dim"]]
        logging.debug(self.__dataload_config["cols"])

        ## create numpy arrays
        X_sig = signal[self.__dataload_config["cols"]].values
        X_bkg = bkg[self.__dataload_config["cols"]].values
        Y_sig = np.ones(len(X_sig))
        Y_bkg = np.zeros(len(X_bkg))
        W_sig = (signal["weight_norm"]).values
        W_bkg = (bkg["weight_norm"]).values
        Wnn_sig = (signal["weight_"]).values
        Wnn_bkg = (bkg["weight_"]).values

        X = np.vstack([X_sig, X_bkg])
        Y = np.hstack([Y_sig, Y_bkg])
        W = np.hstack([W_sig, W_bkg])
        Wnn = np.hstack([Wnn_sig, Wnn_bkg])

        ## import scaler configuration
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        pickle.dump(scaler, open(f"{self.__model_dir}/scaler_model.pkl", "wb"))

        ## Balance
        X_train, X_test, y_train, y_test, W_train, W_test, Wnn_train, Wnn_test = train_test_split(
            X_scaled,
            Y,
            W,
            Wnn,
            test_size=self.__dataload_config["test_ratio"],
            random_state=42,
            stratify=Y)
        X_train, X_val, y_train, y_val, W_train, W_val, Wnn_train, Wnn_val = train_test_split(
            X_train,
            y_train,
            W_train,
            Wnn_train,
            test_size=self.__dataload_config["val_ratio"],
            random_state=42,
            stratify=y_train)

        data_split = {
            "X_train": X_train,
            "X_test": X_test,
            "X_val": X_val,
            "y_train": y_train,
            "y_test": y_test,
            "y_val": y_val,
            "W_train": W_train,
            "W_test": W_test,
            "W_val": W_val,
            "Wnn_train": Wnn_train,
            "Wnn_test": Wnn_test,
            "Wnn_val": Wnn_val,
        }

        # for dataset_name, dataset in data_split.items():
        #     print(dataset_name + " " + str(float(dataset.nbytes) / (1024. * 1024.)))

        ## Oversampling
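        # W_train fills balanced_batch_generator's sample_weight argument
        # (positional in the release this code targets), so each batch is an
        # (X_batch, y_batch, w_batch) triple that Keras uses to weight the loss.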
        training_generator, steps_per_epoch_train = balanced_batch_generator(
            X_train,
            y_train,
            W_train,
            batch_size=self.__model_config["batch_size"],
            sampler=RandomOverSampler())
        #validation_generator, steps_per_epoch_val   = balanced_batch_generator(X_val,   y_val,   W_val,   batch_size=self.__model_config["batch_size"], sampler=RandomOverSampler()) ## test != val
        validation_generator, steps_per_epoch_val = balanced_batch_generator(
            X_val,
            y_val,
            W_val,
            batch_size=self.__model_config["batch_size"],
            sampler=RandomOverSampler())  ## test == val

        generators = {
            "training_generator": training_generator,
            "steps_per_epoch_train": steps_per_epoch_train,
            "validation_generator": validation_generator,
            "steps_per_epoch_val": steps_per_epoch_val,
        }

        return data_split, generators
Code example #15
def test_balanced_batch_generator_function_no_return_indices(data):
    with pytest.raises(ValueError, match='needs to have an attribute'):
        balanced_batch_generator(
            *data, sampler=ClusterCentroids(), batch_size=10, random_state=42)
Code example #16
def test_balanced_batch_generator_function_no_return_indices(data):
    with pytest.raises(ValueError, match="needs to have an attribute"):
        balanced_batch_generator(*data,
                                 sampler=ClusterCentroids(),
                                 batch_size=10,
                                 random_state=42)
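Examples #15 and #16 repeat the test from examples #1 and #13 against a newer imbalanced-learn, where the error message changed: the sampler is now required to expose a sample_indices_ attribute, hence the match string 'needs to have an attribute'.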
Code example #17
def train(_train_data,
          test_data,
          embedding_matrix,
          batch_size=BATCH_SIZE,
          epoch_count=100,
          max_length=MAX_LENGTH,
          model_name="model",
          learning_rate=LR):
    random.shuffle(_train_data)
    model_filepath = f"../model/model_{model_name}.h5"
    result_dir = f"../result/{model_name}"
    ml.save_hyparameters(result_dir, model_name, epoch_count, batch_size,
                         learning_rate)

    train_data = []
    validation_data = []

    validation_data_rate = 0.25  # fraction of the data used for validation

    categorized_data = {
        category_dict[category]: []
        for category in CATEGORIES
    }  # data grouped by category
    for target_value, input_value in _train_data:
        categorized_data[target_value].append((target_value, input_value))

    # give the train, validation, and test data the same category composition
    for category_id in categorized_data:
        data_len = len(categorized_data[category_id])
        validation_boundary = int(data_len * validation_data_rate)

        validation_data += categorized_data[category_id][0:validation_boundary]
        train_data += categorized_data[category_id][validation_boundary:]

    test_inputs = []
    test_targets = []
    train_inputs = []
    train_targets = []
    validation_inputs = []
    validation_targets = []

    for target_value, input_value in test_data:
        test_inputs.append(input_value)
        test_targets.append([
            1 if i == target_value else 0 for i in range(len(category_dict))
        ])  # one-hot array: 1 only at the true label

    for target_value, input_value in validation_data:
        validation_inputs.append(input_value)
        validation_targets.append([
            1 if i == target_value else 0 for i in range(len(category_dict))
        ])  # one-hot array: 1 only at the true label

    for target_value, input_value in train_data:
        train_inputs.append(input_value)
        train_targets.append([
            1 if i == target_value else 0 for i in range(len(category_dict))
        ])  # one-hot array: 1 only at the true label

    # print the number of samples in each category
    count_by_categories = {
        key: np.concatenate([
            np.argmax(test_targets, 1),
            np.argmax(validation_targets, 1),
            np.argmax(train_targets, 1)
        ]).tolist().count(category_dict[key])
        for key in CATEGORIES
    }
    test_count_by_categories = {
        key: np.argmax(test_targets, 1).tolist().count(category_dict[key])
        for key in CATEGORIES
    }
    validation_count_by_categories = {
        key: np.argmax(validation_targets,
                       1).tolist().count(category_dict[key])
        for key in CATEGORIES
    }
    train_count_by_categories = {
        key: np.argmax(train_targets, 1).tolist().count(category_dict[key])
        for key in CATEGORIES
    }
    print("ALL: ", count_by_categories)
    print("TRAIN: ", train_count_by_categories)
    print("VALIDATION: ", validation_count_by_categories)
    print("TEST: ", test_count_by_categories)

    test_inputs = np.array(test_inputs)
    test_targets = np.array(test_targets)
    train_inputs = np.array(train_inputs)
    train_targets = np.array(train_targets)
    validation_inputs = np.array(validation_inputs)
    validation_targets = np.array(validation_targets)

    # Bootstrap Sampling
    train_inputs, train_targets = resample(train_inputs,
                                           train_targets,
                                           n_samples=len(train_inputs))

    # vocabulary size and embedding dimension
    num_words, word_vec_size = embedding_matrix.shape
    # build the model
    model = Sequential([
        Embedding(num_words,
                  word_vec_size,
                  weights=[embedding_matrix],
                  input_length=max_length,
                  trainable=False,
                  mask_zero=True),
        Conv1D(16, 5, activation='relu'),
        AveragePooling1D(7),
        Dropout(0.5),
        Conv1D(16, 5, activation='relu'),
        GlobalAveragePooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(len(CATEGORIES), activation='softmax')
    ])

    model.summary()

    # keep the Embedding layer frozen (not trained)
    model.layers[0].trainable = False

    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate),
                  metrics=['accuracy'])

    # model.compile(loss='categorical_crossentropy',
    #           optimizer=keras.optimizers.rmsprop(learning_rate),
    #           metrics=['accuracy'])

    # checkpoint settings
    checkpoint = ModelCheckpoint(
        filepath=model_filepath,
        monitor='val_loss',
        save_best_only=True,
        period=1,
    )

    # gradually decay the learning rate
    # start = learning_rate
    # stop = learning_rate * 0.1
    # learning_rates = np.linspace(start, stop, epoch_count)

    # mitigate the class imbalance in the data
    training_generator, steps_per_epoch = balanced_batch_generator(
        train_inputs, train_targets, batch_size=batch_size, random_state=42)
    # validation_generator, validation_steps = balanced_batch_generator(
    #     validation_inputs, validation_targets, batch_size=batch_size, random_state=42)

    # training
    history = model.fit_generator(
        generator=training_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=epoch_count,
        verbose=1,
        validation_data=(validation_inputs, validation_targets),
        # validation_data=validation_generator,
        # validation_steps=validation_steps,
        # callbacks=[checkpoint, LearningRateScheduler(lambda epoch: learning_rates[epoch])],
        callbacks=[checkpoint],
        shuffle=True)

    # load the model that achieved the best result
    model = load_model(model_filepath)

    ml.model_evaluate(model, test_inputs, test_targets, result_dir)
    ml.classification_evaluate(model, test_inputs, test_targets, result_dir)
    ml.visualize_model(model, result_dir)
    ml.save_history(history, result_dir)
Code example #18
        return model


# Model Fitting

baseline = MiniVGGNet('baseline', IMAGE_DIMS, n_classes, INIT_LR, EPOCHS, BS)

# takes some time to run, depending on hardware (resume here)
baseline.fit(X_train, y_train_bin, X_test, y_test_bin)

ros = MiniVGGNet('baseline_ros', IMAGE_DIMS, n_classes, INIT_LR, EPOCHS, BS)

# ValueError: Found array with dim 4. Estimator expected <= 2.
ros_generator, steps_per_epoch_ros = balanced_batch_generator(
    X_train,
    y_train_bin,
    sampler=RandomOverSampler(),
    batch_size=BS,
    random_state=0)

ros.fit_generator(X_train, y_train_bin, X_test, y_test_bin, ros_generator, steps_per_epoch_ros)
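The ValueError noted above is raised because the sampler is handed a 4-D image tensor; a sketch of the flattening workaround used in examples #2 and #8 (all other arguments unchanged):

ros_generator, steps_per_epoch_ros = balanced_batch_generator(
    X_train.reshape(X_train.shape[0], -1),  # flatten (n, h, w, c) to (n, h*w*c)
    y_train_bin,
    sampler=RandomOverSampler(),
    batch_size=BS,
    random_state=0)

Note that the batches then come back flattened and would need reshaping before being fed to a convolutional network, as the _shape bookkeeping in examples #8 and #9 suggests.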

img_datagen = MiniVGGNet('baseline_datagen', IMAGE_DIMS, n_classes, INIT_LR, EPOCHS, BS)

img_data_generator = ImageDataGenerator(
    rotation_range=25,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode="nearest")