Beispiel #1
0
def create_h5_data(input_dir, output_path, train_yaml, model_yaml):
    """Load the tiles of a directory with ``load_data`` and dump them to HDF5.

    The two arrays returned by ``load_data`` are written to ``output_path`` as
    the datasets ``data_X`` and ``data_y``.

    TODO: be careful if the normalization changes -- the scale dictionary
    returned by ``load_data`` is not written to the file.

    Args:
        input_dir: directory handed to ``load_data`` as the data source.
        output_path: path of the HDF5 file to write (opened in ``'w'`` mode).
        train_yaml: training configuration dictionary; the keys read here are
            "normalization", "dict_band_x", "dict_band_label",
            "dict_rescale_type", "s2_scale", "s1_scale", "s2bands",
            "s1bands" and "lim_train_tile".
        model_yaml: model configuration dictionary; the keys read here are
            "input_shape" and "dim_gt_image".

    Returns:
        ``output_path``, unchanged.
    """
    data_X, data_y, _scale_dict_train = load_data(
        input_dir,
        x_shape=model_yaml["input_shape"],
        label_shape=model_yaml["dim_gt_image"],
        normalization=train_yaml["normalization"],
        dict_band_X=train_yaml["dict_band_x"],
        dict_band_label=train_yaml["dict_band_label"],
        dict_rescale_type=train_yaml["dict_rescale_type"],
        fact_s2=train_yaml["s2_scale"],
        fact_s1=train_yaml["s1_scale"],
        s2_bands=train_yaml["s2bands"],
        s1_bands=train_yaml["s1bands"],
        lim=train_yaml["lim_train_tile"])

    # The context manager closes the file even if writing a dataset fails,
    # which the previous open()/close() pair did not guarantee.
    with h5py.File(output_path, 'w') as hf:
        hf.create_dataset('data_X', data=data_X)
        hf.create_dataset('data_y', data=data_y)
    return output_path
Beispiel #2
0
def main(test_name, dataset_dir, input_dataset):
    """Run one of the manual data-loading checks and print its results.

    Only the ``"read_csv_stats"`` check is handled here; for any other name
    the function prints the banner line and returns.

    Args:
        test_name: name of the check to run.
        dataset_dir: directory given to ``find_csv`` to locate the "B2" csv.
        input_dataset: dataset directory; ``XDIR`` is appended to it to find
            the npy tiles, and it is passed as-is to ``load_data``.

    Returns:
        None
    """
    print("[INFO] test {} is going to be runned".format(test_name))
    if test_name == "read_csv_stats":
        tile_id = extract_tile_id(
            find_image_indir(input_dataset + XDIR, "npy")[0])
        path_csv = find_csv(dataset_dir, "B2")
        val_min, val_max = get_minmax_fromcsv(
            tile_id.split(".")[0] + ".tif", path_csv, "B2")
        print("TEST for image {} the min_max from csv is {}".format(
            tile_id, (val_min, val_max)))

        print("load_from_dir function")
        data_array, path_tile, ldict_stat = load_from_dir(
            input_dataset + XDIR, DICT_SHAPE[XDIR])
        assert ldict_stat is not None, "Wrong output should be a list"
        # isinstance is the idiomatic type check (and also accepts subclasses)
        assert isinstance(
            ldict_stat,
            list), "The ouput of the function should be a list not {}".format(
                type(ldict_stat))
        assert data_array.shape[0] == len(
            ldict_stat
        ), "The batch size and the len of ldict_stat dos not match {}".format(
            len(ldict_stat))
        print(ldict_stat)
        assert isinstance(
            ldict_stat[0],
            dict), "Inside the list should be dict not {}".format(ldict_stat[0])
        print("[TEST] load_data function")
        dataX, data_label = load_data(input_dataset,
                                      x_shape=None,
                                      label_shape=None,
                                      normalization=True,
                                      dict_band_X=None,
                                      dict_band_label=None,
                                      dict_rescale_type=None)
        print("Using the csv stats for s2 and normalize",
              np.mean(dataX[0, :, :, 4]), np.mean(data_label[0, :, :, 0]))
        # NOTE(review): this call uses exactly the same arguments as the one
        # above although the message below announces a different
        # normalization method -- confirm which argument was meant to change.
        dataX, data_label = load_data(input_dataset,
                                      x_shape=None,
                                      label_shape=None,
                                      normalization=True,
                                      dict_band_X=None,
                                      dict_band_label=None,
                                      dict_rescale_type=None)
        print("Using the previous normalization method",
              np.mean(dataX[0, :, :, 4]), np.mean(data_label[0, :, :, 0]))
Beispiel #3
0
    def __init__(self, train_yaml, model_yaml, sess):
        """Populate the training state from the two configuration dictionaries.

        :param train_yaml: dictionary of training parameters
        :param model_yaml: dictionary of model parameters
        :param sess: session object, stored as ``self.sess``
        """
        self.k_step = train_yaml["k_step"]
        for config in (train_yaml, model_yaml):
            print(config)
        # SHAPE PARAMETER
        self.img_rows, self.img_cols, self.channels = 256, 256, CHANNEL
        self.img_shape = (self.img_rows, self.img_cols, self.channels)
        print(type(train_yaml["lr"]))
        # PATH: <training_dir><model_name>/training_<number>/<sub directory>/
        model_name = model_yaml["model_name"]
        model_dir = train_yaml["training_dir"] + model_name + "/"
        run_dir = model_dir + "training_{}/".format(
            train_yaml["training_number"])
        self.model_name = model_name
        self.model_dir = model_dir
        self.this_training_dir = run_dir
        self.saving_image_path = run_dir + "saved_training_images/"
        self.saving_logs_path = run_dir + "logs/"
        self.checkpoint_dir = run_dir + "checkpoints/"
        # TRAIN PARAMETER
        self.epoch, self.batch_size = (train_yaml["epoch"],
                                       train_yaml["batch_size"])
        self.sess = sess
        self.learning_rate, self.fact_g_lr, self.beta1 = (
            train_yaml["lr"], train_yaml["fact_g_lr"], train_yaml["beta1"])
        loaded_X, loaded_y = load_data(train_yaml["train_directory"])
        self.data_X, self.data_y = loaded_X, loaded_y

        self.num_batches = loaded_X.shape[0] // self.batch_size
        self.model_yaml = model_yaml
        self.saving_step = train_yaml["im_saving_step"]

        # LOSSES: the wasserstein flag overrides the loss names of the config
        use_wasserstein = train_yaml["wasserstein"]
        self.wasserstein = use_wasserstein
        generator_loss_name = ("wasser_gene_loss" if use_wasserstein else
                               train_yaml["generator_loss"])
        self.generator_loss = load_loss(generator_loss_name)
        discriminator_loss_name = ("wasser_discri_loss" if use_wasserstein
                                   else train_yaml["discriminator_loss"])
        self.discriminator_loss = load_loss(discriminator_loss_name)
        print(self.discriminator_loss)
        # test: number of generated images to be saved
        self.sample_num = train_yaml["n_train_image_saved"]

        # REDUCE THE DISCRIMINATOR PERFORMANCE
        self.val_lambda = train_yaml["lambda"]
        self.real_label_smoothing = tuple(train_yaml["real_label_smoothing"])
        self.fake_label_smoothing = tuple(train_yaml["fake_label_smoothing"])
        for sigma_key in ("sigma_init", "sigma_step", "sigma_decay"):
            setattr(self, sigma_key, train_yaml[sigma_key])
        self.ite_train_g = train_yaml["train_g_multiple_time"]
def create_h5_data(input_dir, output_path, train_yaml, model_yaml):
    """Load the tiles of a directory with ``load_data`` and dump them to HDF5.

    The two arrays returned by ``load_data`` are written to ``output_path`` as
    the datasets ``data_X`` and ``data_y``; the third returned value (the
    scale dictionary) is not stored.

    Args:
        input_dir: directory handed to ``load_data`` as the data source.
        output_path: path of the HDF5 file to write (opened in ``'w'`` mode).
        train_yaml: training configuration dictionary; the keys read here are
            "normalization", "dict_band_x", "dict_band_label",
            "dict_rescale_type", "s2_scale", "s1_scale", "s2bands",
            "s1bands" and "lim_train_tile".
        model_yaml: model configuration dictionary; the keys read here are
            "input_shape" and "dim_gt_image".

    Returns:
        ``output_path``, unchanged.
    """
    data_X, data_y, _scale_dict_train = load_data(
        input_dir,
        x_shape=model_yaml["input_shape"],
        label_shape=model_yaml["dim_gt_image"],
        normalization=train_yaml["normalization"],
        dict_band_X=train_yaml["dict_band_x"],
        dict_band_label=train_yaml["dict_band_label"],
        dict_rescale_type=train_yaml["dict_rescale_type"],
        fact_s2=train_yaml["s2_scale"],
        fact_s1=train_yaml["s1_scale"],
        s2_bands=train_yaml["s2bands"],
        s1_bands=train_yaml["s1bands"],
        lim=train_yaml["lim_train_tile"])

    # The context manager closes the file even if writing a dataset fails,
    # which the previous open()/close() pair did not guarantee.
    with h5py.File(output_path, 'w') as hf:
        hf.create_dataset('data_X', data=data_X)
        hf.create_dataset('data_y', data=data_y)
    return output_path
Beispiel #5
0
    plt.savefig(history_img_path)
    plt.show()


if __name__ == "__main__":
    # Fetch the training configuration and the corpus information
    configs = get_config()
    dataset_information = get_dataset_information(configs["preprocess"]["dataset_information_path"])

    epochs = configs["train"]["train_epochs"]
    data_path = configs["train"]["data_path"]
    num_examples = configs["train"]["num_examples"]
    dataset_name = configs["preprocess"]["dataset_name"]

    # Load the training data
    train_audio_data_path_list, train_text_list = load_data(dataset_name, data_path, num_examples)

    valid_data_path = configs["valid"]["data_path"]
    # If a validation data path is configured, load the validation set from
    # it; otherwise hold out the last valid_percent percent of the training
    # data as validation data.
    if valid_data_path:
        valid_num_examples = configs["valid"]["num_examples"]
        valid_audio_data_path_list, valid_text_list = load_data(dataset_name,
                                                                valid_data_path,
                                                                valid_num_examples)
    else:
        valid_percent = configs["valid"]["valid_percent"]
        pos = ceil(len(train_audio_data_path_list) * valid_percent / 100)
        # Slice from an explicit split index instead of -pos: with pos == 0
        # the slices [-0:] / [:-0] would move every sample into the
        # validation set and leave the training set empty.
        # Assumes both lists have the same length -- TODO confirm.
        split = max(len(train_audio_data_path_list) - pos, 0)
        valid_audio_data_path_list, valid_text_list = train_audio_data_path_list[split:], train_text_list[split:]
        train_audio_data_path_list, train_text_list = train_audio_data_path_list[:split], train_text_list[:split]

    # Build train_data and valid_data
Beispiel #6
0
def save_model(model_, path):
    """
    Persist a trained model by delegating to its own ``save_model`` method.

    :param model_: trained model exposing ``save_model(path)``
    :param path: destination path handed to the model
    :return: None
    """
    persist = model_.save_model
    persist(path)


if __name__ == '__main__':
    args = parser.parse_args()
    # make sure the working directory path ends with a slash
    if args.dir_path[-1] == '/':
        path_to_dir = args.dir_path
    else:
        path_to_dir = args.dir_path + '/'
    # load the training and validation sets
    Xtrain, ytrain = load_data(path_to_dir + 'data_split/train.csv')
    Xval, yval = load_data(path_to_dir + 'data_split/val.csv')
    # load the file with the labels and their corresponding classes
    with open(path_to_dir + 'data_split/attacks_lable.json') as file:
        attacks = json.load(file)
    # define the model hyperparameters
    model_params = {
        'boosting_type': args.boosting_type,
        'objective': 'multiclass',
        'num_class': len(attacks.values()),
        'num_leaves': args.num_leaves,
        'learning_rate': args.learning_rate,
        'feature_fraction': args.feature_fraction,
        'bagging_fraction': args.bagging_fraction,
        'bagging_freq': args.bagging_freq,
        'verbose': args.verbose,
    }
    train_params = dict(num_boost_round=args.num_boost_round,
Beispiel #7
0
    fig.tight_layout()
    fig.savefig(dir_path + 'plots/confusion_matrix_booster.png')


# function that plots the feature-importance chart of the model
def plot_feature_importance(model_, num_features, dir_path):
    """
    Draw the feature importances of the model and save the figure to disk.

    :param model_: trained lgbm model
    :param num_features: Max number of features to plot
    :param dir_path: prefix of the output path; the figure is saved to
        dir_path + 'plots/feature_importance_booster.png'
    :return: None
    """
    figure, axes = plt.subplots(figsize=(15, 15))
    lgb.plot_importance(model_, max_num_features=num_features, ax=axes)
    target_path = dir_path + 'plots/feature_importance_booster.png'
    figure.savefig(target_path)


if __name__ == '__main__':
    args = parser.parse_args()
    # make sure the working directory path ends with a slash
    if args.dir_path[-1] == '/':
        path_to_dir = args.dir_path
    else:
        path_to_dir = args.dir_path + '/'
    # load the saved model
    booster_file = path_to_dir + 'model/booster.txt'
    model = lgb.Booster(model_file=booster_file)
    # load the test set
    test_csv = path_to_dir + 'data_split/test.csv'
    Xtest, ytest = load_data(test_csv)
    # build the prediction vector
    predictions = predict_(model, Xtest)
    # build the confusion matrix and the feature-importance plot
    test_results(ytest, predictions, path_to_dir)
    plot_feature_importance(model, args.num_features_importance, path_to_dir)
    def predict_on_iter(self,
                        batch,
                        path_save,
                        l_image_id=None,
                        un_rescale=True):
        """Predict on a batch with the generator, reusing stored predictions.

        If ``path_save`` is an existing directory, the arrays found there are
        loaded with ``load_from_dir`` and returned without running the
        generator. Otherwise the generator predicts on ``batch`` and, when
        ``path_save`` is given, each prediction is saved with ``np.save``.

        :param batch: either a string (path to a dataset directory, which is
            loaded with ``load_data``) or an array holding the batch to
            predict on
        :param path_save: prefix/directory used to store the predictions, or
            None to skip both the lookup and the saving
        :param l_image_id: names of the images of the batch; replaced by the
            file list when ``batch`` is a path, and by ``0..n-1`` when it is
            omitted for an array batch
        :param un_rescale: currently ignored -- the inverse rescaling step is
            disabled; the parameter is kept for interface compatibility
        :return: the loaded array if ``path_save`` already exists, otherwise
            the generator output for the batch
        """
        if isinstance(batch, str):
            # The parameter is a path: the batch is loaded from this directory
            l_image_id = find_image_indir(batch + XDIR, "npy")
            # Starred unpacking keeps only the input array whether load_data
            # returns (X, y) or (X, y, scale_dict).
            batch, *_ = load_data(batch,
                                  x_shape=self.model_yaml["input_shape"],
                                  label_shape=self.model_yaml["dim_gt_image"],
                                  normalization=self.normalization,
                                  dict_band_X=self.dict_band_X,
                                  dict_band_label=self.dict_band_label,
                                  dict_rescale_type=self.dict_rescale_type,
                                  dict_scale=self.scale_dict_train,
                                  fact_s2=self.fact_s2,
                                  fact_s1=self.fact_s1,
                                  s2_bands=self.s2bands,
                                  s1_bands=self.s1bands,
                                  clip_s2=False)
        elif l_image_id is None:
            print("We defined our own index for image name")
            l_image_id = list(range(batch.shape[0]))
        assert len(l_image_id) == batch.shape[
            0], "Wrong size of the name of the images is {} should be {} ".format(
                len(l_image_id), batch.shape[0])
        # os.path.isdir raises TypeError on None, so test for None first
        if path_save is not None and os.path.isdir(path_save):
            print(
                "[INFO] the directory where to store the image already exists")
            data_array, _path_tile, _ = load_from_dir(
                path_save, self.model_yaml["dim_gt_image"])
            return data_array
        if path_save is not None:
            create_safe_directory(path_save)
        batch_res = self.generator.predict(batch)
        # Expected shape first, actual shape second (they were swapped before)
        assert batch_res.shape[0] == batch.shape[
            0], "Wrong prediction should have shape {} but has shape {}".format(
                batch.shape, batch_res.shape)
        if path_save is not None:
            # we store the data at path_save
            for i in range(batch_res.shape[0]):
                # str() because the ids generated above are ints, on which
                # .split would raise AttributeError
                image_name = str(l_image_id[i]).split("/")[-1]
                np.save("{}_image_{}".format(path_save, image_name),
                        batch_res[i, :, :, :])
        return batch_res
    def __init__(self, model_yaml, train_yaml):
        """Read the configuration, load the data and build the model.

        The constructor copies the parameters of both dictionaries onto the
        instance, loads the training and validation arrays with
        ``load_data``, creates the two optimizers and calls
        ``self.build_model()`` inside a ``tf.distribute.MirroredStrategy``
        scope, and finally opens a summary writer on the logs directory.

        Args:
            model_yaml: dictionary with the model parameters
            train_yaml: dictionary with the training parameters
        """
        self.sigma_val = 0
        self.model_yaml = model_yaml
        # NOTE(review): hard-coded 28x28x1 image shape, while load_data below
        # receives model_yaml["input_shape"] -- confirm img_shape is still used
        self.img_rows = 28
        self.img_cols = 28
        self.channels = 1
        self.img_shape = (self.img_rows, self.img_cols, self.channels)
        # The band / rescale dictionaries are optional as a group: they are
        # only read when "dict_band_x" is present in the training config.
        if "dict_band_x" not in train_yaml:
            self.dict_band_X = None
            self.dict_band_label = None
            self.dict_rescale_type = None
        else:
            self.dict_band_X = train_yaml["dict_band_x"]
            self.dict_band_label = train_yaml["dict_band_label"]
            self.dict_rescale_type = train_yaml["dict_rescale_type"]
        self.s1bands = train_yaml["s1bands"]
        self.s2bands = train_yaml["s2bands"]
        # self.latent_dim = 100
        # PATH: <training_dir><model_name>/training_<number>/<sub directory>/
        self.model_name = model_yaml["model_name"]
        self.model_dir = train_yaml["training_dir"] + self.model_name + "/"
        self.this_training_dir = self.model_dir + "training_{}/".format(
            train_yaml["training_number"])
        self.saving_image_path = self.this_training_dir + "saved_training_images/"
        self.saving_logs_path = self.this_training_dir + "logs/"
        self.checkpoint_dir = self.this_training_dir + "checkpoints/"
        self.previous_checkpoint = train_yaml["load_model"]
        # TRAIN PARAMETER
        self.normalization = train_yaml["normalization"]
        self.epoch = train_yaml["epoch"]
        self.batch_size = train_yaml["batch_size"]
        # self.sess = sess
        self.learning_rate = train_yaml["lr"]
        self.fact_g_lr = train_yaml["fact_g_lr"]
        self.beta1 = train_yaml["beta1"]
        self.val_directory = train_yaml["val_directory"]
        self.fact_s2 = train_yaml["s2_scale"]
        self.fact_s1 = train_yaml["s1_scale"]
        # Training data: the scale dictionary returned here is kept on the
        # instance and passed to the validation load below.
        self.data_X, self.data_y, self.scale_dict_train = load_data(
            train_yaml["train_directory"],
            x_shape=model_yaml["input_shape"],
            label_shape=model_yaml["dim_gt_image"],
            normalization=self.normalization,
            dict_band_X=self.dict_band_X,
            dict_band_label=self.dict_band_label,
            dict_rescale_type=self.dict_rescale_type,
            fact_s2=self.fact_s2,
            fact_s1=self.fact_s1,
            s2_bands=self.s2bands,
            s1_bands=self.s1bands,
            lim=train_yaml["lim_train_tile"])
        # Validation data: same settings, plus dict_scale set to the training
        # scale dictionary; the dictionary returned by this call is not kept.
        self.val_X, self.val_Y, scale_dict_val = load_data(
            self.val_directory,
            x_shape=model_yaml["input_shape"],
            label_shape=model_yaml["dim_gt_image"],
            normalization=self.normalization,
            dict_band_X=self.dict_band_X,
            dict_band_label=self.dict_band_label,
            dict_rescale_type=self.dict_rescale_type,
            dict_scale=self.scale_dict_train,
            fact_s2=self.fact_s2,
            fact_s1=self.fact_s1,
            s2_bands=self.s2bands,
            s1_bands=self.s1bands,
            lim=train_yaml["lim_val_tile"])
        print("Loading the data done dataX {} dataY {}".format(
            self.data_X.shape, self.data_y.shape))
        self.gpu = train_yaml["n_gpu"]
        # Floor division: a trailing incomplete batch is not counted
        self.num_batches = self.data_X.shape[0] // self.batch_size
        # NOTE(review): model_yaml was already stored at the top of this
        # method; this second assignment is redundant.
        self.model_yaml = model_yaml
        self.im_saving_step = train_yaml["im_saving_step"]
        self.w_saving_step = train_yaml["weights_saving_step"]
        self.val_metric_step = train_yaml["metric_step"]
        # REDUCE THE DISCRIMINATOR PERFORMANCE
        self.val_lambda = train_yaml["lambda"]
        self.real_label_smoothing = tuple(train_yaml["real_label_smoothing"])
        self.fake_label_smoothing = tuple(train_yaml["fake_label_smoothing"])
        self.sigma_init = train_yaml["sigma_init"]
        self.sigma_step = train_yaml['sigma_step']
        self.sigma_decay = train_yaml["sigma_decay"]
        self.ite_train_g = train_yaml["train_g_multiple_time"]

        self.max_im = 10
        self.strategy = tf.distribute.MirroredStrategy()
        print('Number of devices: {}'.format(
            self.strategy.num_replicas_in_sync))
        self.buffer_size = self.data_X.shape[0]

        # Per-replica batch size multiplied by the number of replicas
        self.global_batch_size = self.batch_size * self.strategy.num_replicas_in_sync
        # The optimizers and the model are created inside the strategy scope;
        # the generator optimizer uses the learning rate scaled by fact_g_lr.
        with self.strategy.scope():
            self.d_optimizer = Adam(self.learning_rate, self.beta1)
            self.g_optimizer = Adam(self.learning_rate * self.fact_g_lr,
                                    self.beta1)

            self.build_model()

        # Summary writer pointing at the logs directory of this training run
        self.model_writer = tf.summary.create_file_writer(
            self.saving_logs_path)
0
import sys
sys.path.append("..")
from utils.load_dataset import load_data
from utils.audio_process import get_max_audio_length
from utils.text_process import get_process_text_list, get_max_label_length, tokenize

if __name__ == "__main__":
    configs = get_config()
    preprocess_cfg = configs["preprocess"]
    train_cfg = configs["train"]

    dataset_name = preprocess_cfg["dataset_name"]
    data_path = train_cfg["data_path"]
    text_row_style = preprocess_cfg["text_row_style"]
    num_examples = train_cfg["num_examples"]

    # Collect the list of all audio paths and the list of texts of the corpus
    audio_data_path_list, text_list = load_data(
        dataset_name, data_path, num_examples)

    # Split the texts according to the configured mode
    mode = preprocess_cfg["text_process_mode"]
    process_text_list = get_process_text_list(text_list, mode)

    # Turn the texts into their corresponding sequences of token numbers
    text_int_sequences, tokenizer = tokenize(process_text_list)

    # Get the maximum audio and text lengths, used to pad the data
    audio_feature_type = configs["other"]["audio_feature_type"]
    max_input_length = get_max_audio_length(
        audio_data_path_list, audio_feature_type)
    max_label_length = get_max_label_length(text_int_sequences)

    # Write the dataset information to the dataset_information.json file
Beispiel #11
0
    # Load the model checkpoint
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    manager = tf.train.CheckpointManager(
        checkpoint,
        directory=configs["checkpoint"]['directory'],
        max_to_keep=configs["checkpoint"]['max_to_keep'])
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)

    dataset_name = configs["preprocess"]["dataset_name"]
    test_data_path = configs["test"]["data_path"]
    num_examples = configs["test"]["num_examples"]

    # Load the test set data (audio_data_path_list, text_list)
    test_data = load_data(dataset_name, test_data_path, num_examples)

    batch_size = configs["test"]["batch_size"]
    batchs = ceil(len(test_data[0]) / batch_size)
    audio_feature_type = configs["other"]["audio_feature_type"]
    max_input_length = dataset_information["max_input_length"]

    # Build the test data generator
    test_data_generator = test_generator(test_data, batchs, batch_size,
                                         audio_feature_type, max_input_length)

    # Get index_word
    index_word = dataset_information["index_word"]
    text_process_mode = configs["preprocess"]["text_process_mode"]

    # Compute the metrics and print them