def create_h5_data(input_dir, output_path, train_yaml, model_yaml):
    """Load the tiles of ``input_dir`` with ``load_data`` and store them in an HDF5 file.

    The two arrays returned by ``load_data`` are written to ``output_path``
    as the datasets ``data_X`` and ``data_y``.

    #TODO be careful if you change of normalization: the scale dictionary
    returned by ``load_data`` is not written to the file.

    Args:
        input_dir: directory forwarded to ``load_data``.
        output_path: path of the HDF5 file to write. It is opened in ``'w'``
            mode, so an existing file at that path is overwritten.
        train_yaml: dictionary of training parameters. Keys read here:
            ``normalization``, ``dict_band_x``, ``dict_band_label``,
            ``dict_rescale_type``, ``s2_scale``, ``s1_scale``, ``s2bands``,
            ``s1bands`` and ``lim_train_tile``.
        model_yaml: dictionary of model parameters. Keys read here:
            ``input_shape`` and ``dim_gt_image``.

    Returns:
        ``output_path``, unchanged.
    """
    data_X, data_y, _scale_dict_train = load_data(
        input_dir,
        x_shape=model_yaml["input_shape"],
        label_shape=model_yaml["dim_gt_image"],
        normalization=train_yaml["normalization"],
        dict_band_X=train_yaml["dict_band_x"],
        dict_band_label=train_yaml["dict_band_label"],
        dict_rescale_type=train_yaml["dict_rescale_type"],
        fact_s2=train_yaml["s2_scale"],
        fact_s1=train_yaml["s1_scale"],
        s2_bands=train_yaml["s2bands"],
        s1_bands=train_yaml["s1bands"],
        lim=train_yaml["lim_train_tile"])
    # The context manager closes the file even when create_dataset raises,
    # so no handle is leaked on a failed write.
    with h5py.File(output_path, 'w') as hf:
        hf.create_dataset('data_X', data=data_X)
        hf.create_dataset('data_y', data=data_y)
    return output_path
def main(test_name, dataset_dir, input_dataset):
    """Run the manual checks of the dataset-loading helpers.

    Args:
        test_name: name of the test to run; only ``"read_csv_stats"`` is
            handled here.
        dataset_dir: directory passed to ``find_csv``.
        input_dataset: dataset path; ``XDIR`` is appended to it to locate the
            input tiles, and it is passed as-is to ``load_data``.
    """
    print("[INFO] test {} is going to be runned".format(test_name))
    # NOTE(review): the source this was recovered from had lost its
    # indentation; every check below is assumed to belong to this branch.
    # Confirm against the original file.
    if test_name == "read_csv_stats":
        tile_id = extract_tile_id(
            find_image_indir(input_dataset + XDIR, "npy")[0])
        path_csv = find_csv(dataset_dir, "B2")
        val_min, val_max = get_minmax_fromcsv(
            tile_id.split(".")[0] + ".tif", path_csv, "B2")
        print("TEST for image {} the min_max from csv is {}".format(
            tile_id, (val_min, val_max)))

        print("load_from_dir function")
        data_array, path_tile, ldict_stat = load_from_dir(
            input_dataset + XDIR, DICT_SHAPE[XDIR])
        assert ldict_stat is not None, "Wrong output should be a list"
        # isinstance is the idiomatic type check and also accepts subclasses.
        assert isinstance(ldict_stat, list), \
            "The ouput of the function should be a list not {}".format(
                type(ldict_stat))
        assert data_array.shape[0] == len(ldict_stat), \
            "The batch size and the len of ldict_stat dos not match {}".format(
                len(ldict_stat))
        print(ldict_stat)
        assert isinstance(ldict_stat[0], dict), \
            "Inside the list should be dict not {}".format(ldict_stat[0])

        print("[TEST] load_data function")
        dataX, data_label = load_data(input_dataset,
                                      x_shape=None,
                                      label_shape=None,
                                      normalization=True,
                                      dict_band_X=None,
                                      dict_band_label=None,
                                      dict_rescale_type=None)
        print("Using the csv stats for s2 and normalize",
              np.mean(dataX[0, :, :, 4]), np.mean(data_label[0, :, :, 0]))
        # NOTE(review): this call uses exactly the same arguments as the one
        # above although the message announces another normalization method.
        dataX, data_label = load_data(input_dataset,
                                      x_shape=None,
                                      label_shape=None,
                                      normalization=True,
                                      dict_band_X=None,
                                      dict_band_label=None,
                                      dict_rescale_type=None)
        print("Using the previous normalization method",
              np.mean(dataX[0, :, :, 4]), np.mean(data_label[0, :, :, 0]))
def __init__(self, train_yaml, model_yaml, sess):
    """Configure the trainer from its two dictionaries and load the training data.

    :param train_yaml: dictionary of training parameters
    :param model_yaml: dictionary of model parameters
    :param sess: object stored as ``self.sess``
    """
    self.k_step = train_yaml["k_step"]
    print(train_yaml)
    print(model_yaml)

    # --- image shape -----------------------------------------------------
    self.img_rows = 256
    self.img_cols = 256
    self.channels = CHANNEL
    self.img_shape = (self.img_rows, self.img_cols, self.channels)
    print(type(train_yaml["lr"]))

    # --- output locations ------------------------------------------------
    self.model_name = model_yaml["model_name"]
    self.model_dir = train_yaml["training_dir"] + self.model_name + "/"
    run_dir = self.model_dir + "training_{}/".format(
        train_yaml["training_number"])
    self.this_training_dir = run_dir
    self.saving_image_path = run_dir + "saved_training_images/"
    self.saving_logs_path = run_dir + "logs/"
    self.checkpoint_dir = run_dir + "checkpoints/"

    # --- optimisation settings -------------------------------------------
    self.epoch = train_yaml["epoch"]
    self.batch_size = train_yaml["batch_size"]
    self.sess = sess
    self.learning_rate = train_yaml["lr"]
    self.fact_g_lr = train_yaml["fact_g_lr"]
    self.beta1 = train_yaml["beta1"]

    # --- training data ---------------------------------------------------
    loaded_x, loaded_y = load_data(train_yaml["train_directory"])
    self.data_X = loaded_x
    self.data_y = loaded_y
    # Integer division: a trailing incomplete batch is not counted.
    self.num_batches = self.data_X.shape[0] // self.batch_size
    self.model_yaml = model_yaml
    self.saving_step = train_yaml["im_saving_step"]

    # --- losses ----------------------------------------------------------
    self.wasserstein = train_yaml["wasserstein"]
    if not self.wasserstein:
        self.generator_loss = load_loss(train_yaml["generator_loss"])
        self.discriminator_loss = load_loss(
            train_yaml["discriminator_loss"])
    else:
        self.generator_loss = load_loss("wasser_gene_loss")
        self.discriminator_loss = load_loss("wasser_discri_loss")
    print(self.discriminator_loss)

    # number of generated images to be saved
    self.sample_num = train_yaml["n_train_image_saved"]

    # --- settings that reduce the discriminator performance --------------
    self.val_lambda = train_yaml["lambda"]
    self.real_label_smoothing = tuple(train_yaml["real_label_smoothing"])
    self.fake_label_smoothing = tuple(train_yaml["fake_label_smoothing"])
    self.sigma_init = train_yaml["sigma_init"]
    self.sigma_step = train_yaml['sigma_step']
    self.sigma_decay = train_yaml["sigma_decay"]
    self.ite_train_g = train_yaml["train_g_multiple_time"]
def create_h5_data(input_dir, output_path, train_yaml, model_yaml):
    """Write the arrays loaded from ``input_dir`` into an HDF5 file.

    ``load_data`` is called on ``input_dir`` and its first two results are
    stored in ``output_path`` under the dataset names ``data_X`` and
    ``data_y``. The third result (the scale dictionary) is not saved.

    Args:
        input_dir: directory forwarded to ``load_data``.
        output_path: destination HDF5 file, opened in ``'w'`` mode (an
            existing file is overwritten).
        train_yaml: dictionary of training parameters; supplies the
            normalization, band, rescale, scale-factor and tile-limit
            settings passed to ``load_data``.
        model_yaml: dictionary of model parameters; supplies
            ``input_shape`` and ``dim_gt_image``.

    Returns:
        ``output_path``, unchanged.
    """
    data_X, data_y, _scale_dict_train = load_data(
        input_dir,
        x_shape=model_yaml["input_shape"],
        label_shape=model_yaml["dim_gt_image"],
        normalization=train_yaml["normalization"],
        dict_band_X=train_yaml["dict_band_x"],
        dict_band_label=train_yaml["dict_band_label"],
        dict_rescale_type=train_yaml["dict_rescale_type"],
        fact_s2=train_yaml["s2_scale"],
        fact_s1=train_yaml["s1_scale"],
        s2_bands=train_yaml["s2bands"],
        s1_bands=train_yaml["s1bands"],
        lim=train_yaml["lim_train_tile"])
    # ``with`` guarantees the handle is closed if a write fails part-way.
    with h5py.File(output_path, 'w') as hf:
        hf.create_dataset('data_X', data=data_X)
        hf.create_dataset('data_y', data=data_y)
    return output_path
# NOTE(review): the two statements below are the tail of a definition whose
# header is above this excerpt; their original indentation is not visible.
plt.savefig(history_img_path)
plt.show()


if __name__ == "__main__":
    # Get the training configuration and the corpus information
    configs = get_config()
    dataset_information = get_dataset_information(configs["preprocess"]["dataset_information_path"])

    epochs = configs["train"]["train_epochs"]
    data_path = configs["train"]["data_path"]
    num_examples = configs["train"]["num_examples"]
    dataset_name = configs["preprocess"]["dataset_name"]

    # Load the training data
    train_audio_data_path_list, train_text_list = load_data(dataset_name, data_path, num_examples)

    valid_data_path = configs["valid"]["data_path"]
    # If a validation dataset is configured, load it; otherwise split off a
    # percentage of the end of the training data to use as validation data
    if valid_data_path:
        valid_num_examples = configs["valid"]["num_examples"]
        valid_audio_data_path_list, valid_text_list = load_data(dataset_name, valid_data_path, valid_num_examples)
    else:
        valid_percent = configs["valid"]["valid_percent"]
        pos = ceil(len(train_audio_data_path_list) * valid_percent / 100)
        valid_audio_data_path_list, valid_text_list = train_audio_data_path_list[-pos:], train_text_list[-pos:]
        train_audio_data_path_list, train_text_list = train_audio_data_path_list[:-pos], train_text_list[:-pos]

    # Build train_data and valid_data
def save_model(model_, path):
    """
    Save a trained model by calling its ``save_model`` method.

    :param model_: trained model
    :param path: path for saving model
    :return: None
    """
    model_.save_model(path)


if __name__ == '__main__':
    args = parser.parse_args()
    # Make sure the directory path ends with a slash before appending to it
    path_to_dir = args.dir_path if args.dir_path[
        -1] == '/' else args.dir_path + '/'
    # load the training and validation sets
    Xtrain, ytrain = load_data(path_to_dir + 'data_split/train.csv')
    Xval, yval = load_data(path_to_dir + 'data_split/val.csv')
    # load the file with the labels and their corresponding classes
    with open(path_to_dir + 'data_split/attacks_lable.json') as file:
        attacks = json.load(file)
    # define the model hyperparameters
    model_params = dict(boosting_type=args.boosting_type,
                        objective='multiclass',
                        num_class=len(attacks.values()),
                        num_leaves=args.num_leaves,
                        learning_rate=args.learning_rate,
                        feature_fraction=args.feature_fraction,
                        bagging_fraction=args.bagging_fraction,
                        bagging_freq=args.bagging_freq,
                        verbose=args.verbose)
    train_params = dict(num_boost_round=args.num_boost_round,
# NOTE(review): the two statements below are the tail of a definition whose
# header is above this excerpt; their original indentation is not visible.
fig.tight_layout()
fig.savefig(dir_path + 'plots/confusion_matrix_booster.png')


# function that plots the feature importance of the model
def plot_feature_importance(model_, num_features, dir_path):
    """
    Plot the feature importance of a model and save the figure.

    :param model_: trained lgbm model
    :param num_features: Max number of features to plot
    :param dir_path: prefix of the output path; the figure is written to
        dir_path + 'plots/feature_importance_booster.png'
    :return: None
    """
    fig, ax = plt.subplots(figsize=(15, 15))
    lgb.plot_importance(model_, ax=ax, max_num_features=num_features)
    fig.savefig(dir_path + 'plots/feature_importance_booster.png')


if __name__ == '__main__':
    args = parser.parse_args()
    # Make sure the directory path ends with a slash before appending to it
    path_to_dir = args.dir_path if args.dir_path[
        -1] == '/' else args.dir_path + '/'
    # load the saved model
    model = lgb.Booster(model_file=path_to_dir + 'model/booster.txt')
    # load the test set
    Xtest, ytest = load_data(path_to_dir + 'data_split/test.csv')
    # build the vector of predictions
    predictions = predict_(model, Xtest)
    # build the confusion matrix and the feature-importance plot
    test_results(ytest, predictions, path_to_dir)
    plot_feature_importance(model, args.num_features_importance, path_to_dir)
def predict_on_iter(self, batch, path_save, l_image_id=None, un_rescale=True):
    """Run the generator on a batch, reusing predictions already stored in ``path_save``.

    :param batch: either a string, the path to a dataset that is loaded with
        ``load_data``, or an array holding the batch to predict on
    :param path_save: used both as a cache location and as the prefix of the
        saved files. When it is an existing directory, its content is loaded
        with ``load_from_dir`` and returned without running the generator.
        ``None`` disables both the cache lookup and the saving.
    :param l_image_id: names of the images of the batch. Ignored when
        ``batch`` is a string (the names then come from ``find_image_indir``);
        when omitted for an array batch, the indices ``0..n-1`` are used.
    :param un_rescale: currently unused, the inverse rescaling of the
        predictions is disabled
    :return: the predicted batch, or the array loaded from ``path_save``
    """
    if isinstance(batch, str):
        # The parameter is a path: load the batch from this directory.
        l_image_id = find_image_indir(batch + XDIR, "npy")
        batch, _ = load_data(batch,
                             x_shape=self.model_yaml["input_shape"],
                             label_shape=self.model_yaml["dim_gt_image"],
                             normalization=self.normalization,
                             dict_band_X=self.dict_band_X,
                             dict_band_label=self.dict_band_label,
                             dict_rescale_type=self.dict_rescale_type,
                             dict_scale=self.scale_dict_train,
                             fact_s2=self.fact_s2,
                             fact_s1=self.fact_s1,
                             s2_bands=self.s2bands,
                             s1_bands=self.s1bands,
                             clip_s2=False)
    elif l_image_id is None:
        print("We defined our own index for image name")
        l_image_id = list(range(batch.shape[0]))
    # NOTE(review): the source had lost its indentation; this check is
    # assumed to apply to both branches above.
    assert len(l_image_id) == batch.shape[0], \
        "Wrong size of the name of the images is {} should be {} ".format(
            len(l_image_id), batch.shape[0])

    if path_save is not None:
        # os.path.isdir(None) raises TypeError, so guard it explicitly.
        if os.path.isdir(path_save):
            print(
                "[INFO] the directory where to store the image already exists")
            data_array, path_tile, _ = load_from_dir(
                path_save, self.model_yaml["dim_gt_image"])
            return data_array
        create_safe_directory(path_save)

    batch_res = self.generator.predict(batch)
    assert batch_res.shape[0] == batch.shape[0], \
        "Wrong prediction should have shape {} but has shape {}".format(
            batch_res.shape, batch.shape)

    if path_save is not None:
        # Store every prediction, using path_save as the file-name prefix.
        for image_id, prediction in zip(l_image_id, batch_res):
            # str() keeps this working for the integer ids generated above,
            # which have no .split method.
            image_name = str(image_id).split("/")[-1]
            np.save("{}_image_{}".format(path_save, image_name), prediction)
    return batch_res
def __init__(self, model_yaml, train_yaml):
    """Set up the trainer: settings, train/validation data, optimizers and model.

    Args:
        model_yaml: dictionary with the model parameters
        train_yaml: dictionary with the training parameters
    """
    self.sigma_val = 0
    self.model_yaml = model_yaml

    # --- image shape -----------------------------------------------------
    self.img_rows = 28
    self.img_cols = 28
    self.channels = 1
    self.img_shape = (self.img_rows, self.img_cols, self.channels)

    # --- band grouping / rescaling description (optional) ----------------
    if "dict_band_x" in train_yaml:
        self.dict_band_X = train_yaml["dict_band_x"]
        self.dict_band_label = train_yaml["dict_band_label"]
        self.dict_rescale_type = train_yaml["dict_rescale_type"]
    else:
        self.dict_band_X = None
        self.dict_band_label = None
        self.dict_rescale_type = None
    # NOTE(review): the source had lost its indentation; the two band lists
    # are assumed to be read unconditionally because load_data below always
    # uses them. Confirm against the original file.
    self.s1bands = train_yaml["s1bands"]
    self.s2bands = train_yaml["s2bands"]

    # --- output locations ------------------------------------------------
    self.model_name = model_yaml["model_name"]
    self.model_dir = train_yaml["training_dir"] + self.model_name + "/"
    run_dir = self.model_dir + "training_{}/".format(
        train_yaml["training_number"])
    self.this_training_dir = run_dir
    self.saving_image_path = run_dir + "saved_training_images/"
    self.saving_logs_path = run_dir + "logs/"
    self.checkpoint_dir = run_dir + "checkpoints/"
    self.previous_checkpoint = train_yaml["load_model"]

    # --- training settings -----------------------------------------------
    self.normalization = train_yaml["normalization"]
    self.epoch = train_yaml["epoch"]
    self.batch_size = train_yaml["batch_size"]
    self.learning_rate = train_yaml["lr"]
    self.fact_g_lr = train_yaml["fact_g_lr"]
    self.beta1 = train_yaml["beta1"]
    self.val_directory = train_yaml["val_directory"]
    self.fact_s2 = train_yaml["s2_scale"]
    self.fact_s1 = train_yaml["s1_scale"]

    # --- data ------------------------------------------------------------
    # Keyword arguments common to the training and validation loads.
    common_load_kwargs = dict(
        x_shape=model_yaml["input_shape"],
        label_shape=model_yaml["dim_gt_image"],
        normalization=self.normalization,
        dict_band_X=self.dict_band_X,
        dict_band_label=self.dict_band_label,
        dict_rescale_type=self.dict_rescale_type,
        fact_s2=self.fact_s2,
        fact_s1=self.fact_s1,
        s2_bands=self.s2bands,
        s1_bands=self.s1bands,
    )
    self.data_X, self.data_y, self.scale_dict_train = load_data(
        train_yaml["train_directory"],
        lim=train_yaml["lim_train_tile"],
        **common_load_kwargs)
    # The validation load receives the scale dictionary returned by the
    # training load.
    self.val_X, self.val_Y, _val_scale = load_data(
        self.val_directory,
        dict_scale=self.scale_dict_train,
        lim=train_yaml["lim_val_tile"],
        **common_load_kwargs)
    print("Loading the data done dataX {} dataY {}".format(
        self.data_X.shape, self.data_y.shape))

    self.gpu = train_yaml["n_gpu"]
    # Integer division: a trailing incomplete batch is not counted.
    self.num_batches = self.data_X.shape[0] // self.batch_size
    self.model_yaml = model_yaml
    self.im_saving_step = train_yaml["im_saving_step"]
    self.w_saving_step = train_yaml["weights_saving_step"]
    self.val_metric_step = train_yaml["metric_step"]

    # --- settings that reduce the discriminator performance --------------
    self.val_lambda = train_yaml["lambda"]
    self.real_label_smoothing = tuple(train_yaml["real_label_smoothing"])
    self.fake_label_smoothing = tuple(train_yaml["fake_label_smoothing"])
    self.sigma_init = train_yaml["sigma_init"]
    self.sigma_step = train_yaml['sigma_step']
    self.sigma_decay = train_yaml["sigma_decay"]
    self.ite_train_g = train_yaml["train_g_multiple_time"]
    self.max_im = 10

    # --- distribution strategy, optimizers and model ---------------------
    self.strategy = tf.distribute.MirroredStrategy()
    replica_count = self.strategy.num_replicas_in_sync
    print('Number of devices: {}'.format(replica_count))
    self.buffer_size = self.data_X.shape[0]
    self.global_batch_size = self.batch_size * replica_count
    with self.strategy.scope():
        self.d_optimizer = Adam(self.learning_rate, self.beta1)
        self.g_optimizer = Adam(self.learning_rate * self.fact_g_lr,
                                self.beta1)
        self.build_model()

    self.model_writer = tf.summary.create_file_writer(
        self.saving_logs_path)
import sys
# Make the parent directory importable so the ``utils`` package is found.
sys.path.append("..")
from utils.load_dataset import load_data
from utils.audio_process import get_max_audio_length
from utils.text_process import get_process_text_list, get_max_label_length, tokenize


if __name__ == "__main__":
    configs = get_config()

    dataset_name = configs["preprocess"]["dataset_name"]
    data_path = configs["train"]["data_path"]
    text_row_style = configs["preprocess"]["text_row_style"]
    num_examples = configs["train"]["num_examples"]

    # Get the list of all audio paths and the list of texts in the corpus
    audio_data_path_list, text_list = load_data(dataset_name, data_path, num_examples)

    # Split the texts according to the configured mode
    mode = configs["preprocess"]["text_process_mode"]
    process_text_list = get_process_text_list(text_list, mode)

    # Convert the texts into their corresponding token integer sequences
    text_int_sequences, tokenizer = tokenize(process_text_list)

    # Get the maximum audio and text lengths, used to pad the data
    audio_feature_type = configs["other"]["audio_feature_type"]
    max_input_length = get_max_audio_length(audio_data_path_list, audio_feature_type)
    max_label_length = get_max_label_length(text_int_sequences)

    # Write the dataset information into the dataset_information.json file
# 加载模型检查点 checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer) manager = tf.train.CheckpointManager( checkpoint, directory=configs["checkpoint"]['directory'], max_to_keep=configs["checkpoint"]['max_to_keep']) if manager.latest_checkpoint: checkpoint.restore(manager.latest_checkpoint) dataset_name = configs["preprocess"]["dataset_name"] test_data_path = configs["test"]["data_path"] num_examples = configs["test"]["num_examples"] # 加载测试集数据(audio_data_path_list, text_list) test_data = load_data(dataset_name, test_data_path, num_examples) batch_size = configs["test"]["batch_size"] batchs = ceil(len(test_data[0]) / batch_size) audio_feature_type = configs["other"]["audio_feature_type"] max_input_length = dataset_information["max_input_length"] # 构建测试数据生成器 test_data_generator = test_generator(test_data, batchs, batch_size, audio_feature_type, max_input_length) # 获取index_word index_word = dataset_information["index_word"] text_process_mode = configs["preprocess"]["text_process_mode"] # 计算指标并打印