def train(ae_type, latent_dim=2, epochs=100, lr=1e-4, batch_size=1000):
    """Train an autoencoder of the requested variant and return the model.

    Args:
        ae_type: one of "AE", "DAE", "VAE", "CVAE", "BetaVAE". "AE" and "DAE"
            share the same architecture; the variants differ in the dataset
            returned by `data.load_dataset` and in the loss computed by
            `compute_gradients`.
        latent_dim: dimensionality of the latent space.
        epochs: number of passes over the training set.
        lr: Adam learning rate.
        batch_size: mini-batch size used when loading the dataset.

    Returns:
        The trained model instance.

    Raises:
        ValueError: if `ae_type` is not one of the recognised model names.
    """
    if ae_type in ("AE", "DAE"):
        model = AE(latent_dim)
    elif ae_type == "VAE":
        model = VAE(latent_dim)
    elif ae_type == "CVAE":
        model = CVAE(latent_dim)
    elif ae_type == "BetaVAE":
        model = BetaVAE(latent_dim)
    else:
        # Fail loudly with the offending value instead of a bare ValueError.
        raise ValueError("Unknown ae_type: %r" % (ae_type,))

    # load train and test data (test_dataset is unused here but part of the
    # loader's return contract)
    train_dataset, test_dataset = data.load_dataset(ae_type,
                                                    batch_size=batch_size)

    # initialize Adam optimizer
    optimizer = tf.keras.optimizers.Adam(lr)

    for epoch in range(1, epochs + 1):
        # Track the loss of the final batch of the epoch for logging.
        last_loss = 0
        for train_x, train_y in train_dataset:
            gradients, loss = compute_gradients(model, train_x, train_y,
                                                ae_type)
            apply_gradients(optimizer, gradients, model.trainable_variables)
            last_loss = loss
        if epoch % 2 == 0:
            print('Epoch {}, Loss: {}'.format(epoch, last_loss))
    return model
def train_autoencoder(X_dir, Y_dir, batch_size, dim, X_channels, Y_channels, log_dir, shuffle, **kwargs):
    """Build and fit an image-to-label autoencoder, logging to TensorBoard.

    Pairs files from `X_dir`/`Y_dir`, splits them into train/validation
    generators, assembles an encoder->decoder Keras model mapping images of
    shape (*dim, X_channels) to volumes with Y_channels channels, compiles it
    with Adadelta + MSE and trains for 50 epochs.
    """
    # Build the train/validation generators from the (X, Y) file pairs.
    file_pairs = load_dataset(X_dir, Y_dir)
    split = partition_dataset(file_pairs)
    train_gen = DataGenerator(split['train'], batch_size, dim,
                              X_channels, Y_channels, shuffle)
    val_gen = DataGenerator(split['validation'], batch_size, dim,
                            X_channels, Y_channels, shuffle)

    # Assemble encoder -> latent -> decoder into one trainable model.
    image_in = Input(shape=(*dim, X_channels))
    enc = encoder(n_features=8)
    dec = decoder(n_output_features=Y_channels, n_features=8)
    code = enc(image_in)
    # TODO Put res_net here for image to label translation
    label_out = dec(code)
    net = Model(image_in, label_out)
    net.compile(optimizer='adadelta', loss='mean_squared_error')

    # Report the architecture before training starts.
    net.summary()
    print('Model contains a total of %d trainable layers.\n' % len(net.trainable_weights))

    # Fit with TensorBoard scalar + image callbacks.
    image_cb = TensorBoardImage(log_dir=log_dir, validation_data=val_gen)
    scalar_cb = TensorBoard(log_dir=log_dir)
    net.fit_generator(generator=train_gen,
                      validation_data=val_gen,
                      epochs=50,
                      callbacks=[scalar_cb, image_cb],
                      use_multiprocessing=True,
                      workers=2)
def train_model(max_items=-1):
    """Fit the classifier on up to `max_items` examples (-1 = everything).

    Returns the feature extractor together with the trained classifier.
    """
    extractor, train_split, test_split = data.load_dataset(
        "games.json", max_items=max_items)
    X_train, Y_train = train_split
    X_test, Y_test = test_split
    clf = classifiers.RandomForest()
    clf.train(X_train, Y_train)
    # Report held-out performance before handing the model back.
    test_model(clf, X_test, Y_test)
    return extractor, clf
def evaluate_file(sess, args, eval_model, vocab, files, batch_size=100, print_logs=False):
    """Run `eval_model` over `files` once and return the mean per-batch loss."""
    iterator = load_dataset(files, vocab, constants.EVAL,
                            batch_size=batch_size,
                            min_seq_len=args.min_seq_len,
                            max_seq_len=args.max_seq_len,
                            has_source=True)
    next_op = iterator.get_next()
    sess.run([iterator.initializer])

    losses = []
    n_batch = 0
    t0 = time.time()
    while True:
        try:
            batch = sess.run(next_op)  # get real data!!
        except tf.errors.OutOfRangeError:  # dataset exhausted -> next epoch
            if print_logs:
                print("Test---Total N batch:{}\tCost time:{}".format(
                    n_batch, time.time() - t0))
            break
        feed = {
            eval_model.x: batch['ids'],
            eval_model.y: batch['senti'],
            eval_model.sequence_length: batch['length'],
        }
        batch_loss = sess.run([eval_model.loss], feed_dict=feed)[0]
        losses.append(batch_loss)
        n_batch += 1

    del iterator
    del next_op
    return np.mean(losses)
def run_experiment(epochs, model_name, training_type, configs):
    """Run the basic experiment: seed RNGs, build the model, train it, and
    return the delta-tracking dict plus train/test accuracy arrays."""
    print(epochs, "CONFIGS: ", configs)

    # Seed every RNG source so runs are reproducible, including CuDNN.
    random.seed(configs.seed)
    np.random.seed(configs.seed)
    torch.manual_seed(configs.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Data, model, loss and optimizer.
    loaders = load_dataset(configs)
    model = load_model(model_name, training_type, configs)
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.Adam(model.parameters(), configs.lr)

    # Dictionaries used to track per-layer weight deltas during training.
    model_weights, layer_dict = setup_delta_tracking(model)

    # Train and collect metrics.
    rmae_delta_dict, train_acc_arr, test_acc_arr = training(
        epochs, loaders, model, optimizer, criterion,
        model_weights, layer_dict, configs)

    return rmae_delta_dict, train_acc_arr, test_acc_arr
with tf.device( "/cpu:0"): # Input pipeline should always be placed on the CPU. print("Use x'->y to update model f(x->y)") train_iterator = load_paired_dataset(args.tsf_train_data[B], args.train_data[B], src_vocab, tgt_vocab, batch_size=args.batch_size) dev_iterator = load_paired_dataset(args.tsf_dev_data[B], args.dev_data[B], src_vocab, tgt_vocab, batch_size=args.batch_size) src_test_iterator = load_dataset(args.test_data[A], src_vocab, mode=constants.INFER) train_next_op = train_iterator.get_next() dev_next_op = dev_iterator.get_next() src_test_next_op = src_test_iterator.get_next() # === Create session tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True tf_config.gpu_options.per_process_gpu_memory_fraction = 0.4 sess = tf.Session(config=tf_config) # === Train if args.mode == "train": # Prepare for model saver
def main(): # === Load arguments args = load_dual_arguments() dump_args_to_yaml(args, args.final_model_save_dir) cls_args = load_args_from_yaml(args.cls_model_save_dir) nmt_args = load_args_from_yaml(os.path.join(args.nmt_model_save_dir, '0-1')) nmt_args.learning_rate = args.learning_rate # a smaller learning rate for RL min_seq_len = min(int(max(re.findall("\d", cls_args.filter_sizes))), args.min_seq_len) # === Load global vocab word2id, word2id_size = load_vocab_dict(args.global_vocab_file) global_vocab, global_vocab_size = load_vocab(args.global_vocab_file) print("Global_vocab_size: %s" % global_vocab_size) global_vocab_rev = tf.contrib.lookup.index_to_string_table_from_file( args.global_vocab_file, vocab_size=global_vocab_size - constants.NUM_OOV_BUCKETS, default_value=constants.UNKNOWN_TOKEN) src_vocab = tgt_vocab = global_vocab src_vocab_size = tgt_vocab_size = global_vocab_size src_vocab_rev = tgt_vocab_rev = global_vocab_rev # === Create session tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True tf_config.gpu_options.per_process_gpu_memory_fraction = 0.3 sess = tf.Session(config=tf_config) # === Initial and build model cls = cls_create_model(sess, cls_args, global_vocab_size, mode=constants.EVAL, load_pretrained_model=True) nmts_train = [] nmts_random_infer = [] nmts_greedy_infer = [] train_data_next = [] dev_data_next = [] test_data_next = [] train_iterators = [] test_iterators = [] paired_train_iterators = [] paired_train_data_next = [] final_model_save_paths = [] # === Define nmt model for A, B in [(0, 1), (1, 0)]: with tf.device("/cpu:0" ): # Input pipeline should always be placed on the CPU. 
src_train_iterator = load_dataset(args.train_data[A], src_vocab, mode=constants.TRAIN, batch_size=args.batch_size, min_seq_len=min_seq_len) src_dev_iterator = load_dataset(args.dev_data[A], src_vocab, mode=constants.EVAL, batch_size=500) src_test_iterator = load_dataset(args.test_data[A], src_vocab, mode=constants.EVAL, batch_size=500) # Use (X', Y) to produce pseudo parallel data paired_src_train_iterator = load_paired_dataset( args.tsf_train_data[B], args.train_data[B], src_vocab, tgt_vocab, batch_size=args.batch_size, min_seq_len=min_seq_len) src_train_next_op = src_train_iterator.get_next( ) # To avoid frequent calls of `Iterator.get_next()` src_dev_next_op = src_dev_iterator.get_next() src_test_next_op = src_test_iterator.get_next() src_paired_train_next_op = paired_src_train_iterator.get_next() train_data_next.append(src_train_next_op) dev_data_next.append(src_dev_next_op) test_data_next.append(src_test_next_op) paired_train_data_next.append(src_paired_train_next_op) train_iterators.append(src_train_iterator) test_iterators.append(src_test_iterator) paired_train_iterators.append(paired_src_train_iterator) direction = "%s-%s" % (A, B) nmt_args.sampling_probability = 0.5 # == Define train model nmt_train = nmt_create_model(sess, nmt_args, src_vocab_size, tgt_vocab_size, src_vocab_rev, tgt_vocab_rev, mode=constants.TRAIN, direction=direction, load_pretrained_model=True) # == Define inference model decode_type_before = nmt_args.decode_type nmt_args.decode_type = constants.RANDOM nmt_random_infer = nmt_create_model(sess, nmt_args, src_vocab_size, tgt_vocab_size, src_vocab_rev, tgt_vocab_rev, mode=constants.INFER, direction=direction, reuse=True) nmt_args.decode_type = constants.GREEDY nmt_greedy_infer = nmt_create_model(sess, nmt_args, src_vocab_size, tgt_vocab_size, src_vocab_rev, tgt_vocab_rev, mode=constants.INFER, direction=direction, reuse=True) nmt_args.decode_type = decode_type_before # restore to previous setting nmts_train.append(nmt_train) 
nmts_random_infer.append(nmt_random_infer) nmts_greedy_infer.append(nmt_greedy_infer) # == Prepare for model saver print("Prepare for model saver") final_model_save_path = "%s/%s-%s/" % (args.final_model_save_dir, A, B) if not os.path.exists(final_model_save_path): os.makedirs(final_model_save_path) print("Model save path:", final_model_save_path) final_model_save_paths.append(final_model_save_path) # === Start train n_batch = -1 global_step = -1 A = 1 B = 0 G_scores = [] for i in range(args.n_epoch): print("Epoch:%s" % i) sess.run([train_iterators[A].initializer]) sess.run([train_iterators[B].initializer]) sess.run([paired_train_iterators[A].initializer]) sess.run([paired_train_iterators[B].initializer]) while True: n_batch += 1 global_step += 1 if n_batch % args.eval_step == 0: print( '===== Start (N_batch: %s, Steps: %s): Evaluate on test datasets ===== ' % (n_batch, global_step)) _, dst_f_A = inference(nmts_greedy_infer[A], sess=sess, args=nmt_args, A=A, B=B, src_test_iterator=test_iterators[A], src_test_next=test_data_next[A], src_vocab_rev=src_vocab_rev, result_dir=args.final_tsf_result_dir, step=global_step) _, dst_f_B = inference(nmts_greedy_infer[B], sess=sess, args=nmt_args, A=B, B=A, src_test_iterator=test_iterators[B], src_test_next=test_data_next[B], src_vocab_rev=src_vocab_rev, result_dir=args.final_tsf_result_dir, step=global_step) t0 = time.time() # calculate accuracy score senti_acc = cls_evaluate_file(sess, cls_args, word2id, cls, [dst_f_A, dst_f_B], index_list=[B, A]) # calculate bleu score bleu_score_A = bleu_evaluator.score(args.reference[A], dst_f_A) bleu_score_B = bleu_evaluator.score(args.reference[B], dst_f_B) bleu_score = (bleu_score_A + bleu_score_B) / 2 G_score = np.sqrt(senti_acc * bleu_score) H_score = 2 / (1 / senti_acc + 1 / bleu_score) G_scores.append(G_score) print( "%s-%s_Test(Batch:%d)\tSenti:%.3f\tBLEU(4ref):%.3f(A:%.3f+B:%.3f)" "\tG-score:%.3f\tH-score:%.3f\tCost time:%.2f" % (A, B, n_batch, senti_acc, bleu_score, bleu_score_A, 
bleu_score_B, G_score, H_score, time.time() - t0)) print( '===== End (N_batch: %s, Steps: %s): Evaluate on test datasets ====== ' % (n_batch, global_step)) if n_batch % args.save_per_step == 0: print("=== Save model at dir:", final_model_save_paths[A], final_model_save_paths[B]) nmts_train[A].saver.save(sess, final_model_save_paths[A], global_step=global_step) nmts_train[B].saver.save(sess, final_model_save_paths[B], global_step=global_step) if n_batch % args.change_per_step == 0: A, B = B, A print( "============= Change to train model {}-{} at {} steps ==============" .format(A, B, global_step)) try: t0 = time.time() src = sess.run(train_data_next[A]) # get real data!! batch_size = np.shape(src["ids"])[0] decode_width = nmt_args.decode_width tile_src_ids = np.repeat(src["ids"], decode_width, axis=0) # [batch_size*sample_size], tile_src_length = np.repeat(src['length'], decode_width, axis=0) tile_src_ids_in = np.repeat(src["ids_in"], decode_width, axis=0) tile_src_ids_out = np.repeat(src["ids_out"], decode_width, axis=0) tile_src_ids_in_out = np.repeat(src["ids_in_out"], decode_width, axis=0) random_predictions = sess.run( nmts_random_infer[A].predictions, feed_dict={ nmts_random_infer[A].input_ids: src['ids'], nmts_random_infer[A].input_length: src['length'] }) assert np.shape( random_predictions["ids"])[1] == nmt_args.decode_width mid_ids_log_prob = np.reshape(random_predictions["log_probs"], -1) mid_ids, mid_ids_in, mid_ids_out, mid_ids_in_out, mid_ids_length = \ process_mid_ids(random_predictions["ids"], random_predictions["length"], min_seq_len, global_vocab_size) greedy_predictions = sess.run( nmts_greedy_infer[A].predictions, feed_dict={ nmts_greedy_infer[A].input_ids: src['ids'], nmts_greedy_infer[A].input_length: src['length'] }) assert np.shape(greedy_predictions["ids"])[1] == 1 mid_ids_bs, mid_ids_in_bs, mid_ids_out_bs, mid_ids_in_out_bs, mid_ids_length_bs = \ process_mid_ids(greedy_predictions["ids"], greedy_predictions["length"], min_seq_len, 
global_vocab_size) # Get style reward from classifier cls_probs = sess.run(cls.probs, feed_dict={ cls.x: mid_ids, cls.dropout: 1 }) y_hat = [p > 0.5 for p in cls_probs] # 1 or 0 cls_acu = [p == B for p in y_hat ] # accuracy: count the number of style B style_reward = np.array(cls_acu, dtype=np.float32) # Get content reward from backward reconstruction feed_dict = { nmts_train[B].input_ids: mid_ids, nmts_train[B].input_length: mid_ids_length, nmts_train[B].target_ids_in: tile_src_ids_in, nmts_train[B].target_ids_out: tile_src_ids_out, nmts_train[B].target_length: tile_src_length } nmtB_loss = sess.run( nmts_train[B].loss_per_sequence, feed_dict=feed_dict) # nmtB_loss = -log(prob) nmtB_reward = nmtB_loss * ( -1) # reward = log(prob) ==> bigger is better # Get baseline reward from backward reconstruction feed_dict = { nmts_train[B].input_ids: mid_ids_bs, nmts_train[B].input_length: mid_ids_length_bs, nmts_train[B].target_ids_in: src["ids_in"], nmts_train[B].target_ids_out: src["ids_out"], nmts_train[B].target_length: src["length"] } nmtB_loss_bs = sess.run(nmts_train[B].loss_per_sequence, feed_dict=feed_dict) nmtB_reward_bs = nmtB_loss_bs * (-1) # nmt baseline reward def norm(x): x = np.array(x) x = (x - x.mean()) / (x.std() + safe_divide_constant) # x = x - x.min() # to make sure > 0 return x def sigmoid(x, x_trans=0.0, x_scale=1.0, max_y=1, do_norm=False): value = max_y / (1 + np.exp(-(x - x_trans) * x_scale)) if do_norm: value = norm(value) return value def norm_nmt_reward(x, baseline=None, scale=False): x = np.reshape(x, (batch_size, -1)) # x is in [-16, 0] dim1 = np.shape(x)[1] if baseline is not None: x_baseline = baseline # [batch_size] else: x_baseline = np.mean(x, axis=1) # [batch_size] x_baseline = np.repeat(x_baseline, dim1) # [batch_size*dim1] x_baseline = np.reshape(x_baseline, (batch_size, dim1)) x_norm = x - x_baseline if scale: # x_norm = sigmoid(x_norm, x_scale=0.5) # x_norm: [-12, 12] => [0, 1] x_norm = sigmoid( x_norm ) # Sharper normalization, 
x_norm: [-6, 6] => [0, 1] return x_norm.reshape(-1) if args.use_baseline: content_reward = norm_nmt_reward(nmtB_reward, baseline=nmtB_reward_bs, scale=True) else: content_reward = norm_nmt_reward(nmtB_reward, scale=True) # Calculate reward style_reward += safe_divide_constant content_reward += safe_divide_constant reward = (1 + 0.25) * style_reward * content_reward / ( style_reward + 0.25 * content_reward) if args.normalize_reward: reward = norm(reward) # == Update nmtA via policy gradient training feed_dict = { nmts_train[A].input_ids: tile_src_ids, nmts_train[A].input_length: tile_src_length, nmts_train[A].target_ids_in: mid_ids_in, nmts_train[A].target_ids_out: mid_ids_out, nmts_train[A].target_length: mid_ids_length, nmts_train[A].reward: reward } ops = [ nmts_train[A].lr_loss, nmts_train[A].loss, nmts_train[A].loss_per_sequence, nmts_train[A].retrain_op ] nmtA_loss_final, nmtA_loss_, loss_per_sequence_, _ = sess.run( ops, feed_dict=feed_dict) # == Update nmtA with pseudo data if args.MLE_anneal: gap = min( args.anneal_max_gap, int(args.anneal_initial_gap * np.power(args.anneal_rate, global_step / args.anneal_steps))) else: gap = args.anneal_initial_gap if n_batch % gap == 0: # Update nmtA using original pseudo data (used as pre-training) # This is not a ideal way since the quality of the pseudo-parallel data is not acceptable for # the later iterations of training. # We highly recommend you adopt back translation to generate the pseudo-parallel data on-the-fly if "pseudo" in args.teacher_forcing: data = sess.run( paired_train_data_next[A]) # get real data!! 
feed_dict = { nmts_train[A].input_ids: data["ids"], nmts_train[A].input_length: data["length"], nmts_train[A].target_ids_in: data["trans_ids_in"], nmts_train[A].target_ids_out: data["trans_ids_out"], nmts_train[A].target_length: data["trans_length"], } nmtA_pse_loss_, _ = sess.run( [nmts_train[A].loss, nmts_train[A].train_op], feed_dict=feed_dict) # Update nmtB using pseudo data generated via back_translation (on-the-fly) if "back_trans" in args.teacher_forcing: feed_dict = { nmts_train[B].input_ids: mid_ids_bs, nmts_train[B].input_length: mid_ids_length_bs, nmts_train[B].target_ids_in: src["ids_in"], nmts_train[B].target_ids_out: src["ids_out"], nmts_train[B].target_length: src["length"], } nmtB_loss_, _ = sess.run( [nmts_train[B].loss, nmts_train[B].train_op], feed_dict=feed_dict) except tf.errors.OutOfRangeError as e: # next epoch print("===== DualTrain: Total N batch:{}\tCost time:{} =====". format(n_batch, time.time() - t0)) n_batch = -1 break
global_data = np.load(GLOBAL_DATA_FILE_NAME).astype(np.float32) # graph loading max_neighborhood_size = 0 rag_file = open(RAG_FILE_NAME, 'rb') rag = pickle.load(rag_file) rag_file.close() nodes = rag.nodes() for id in nodes: max_neighborhood_size = max(len(list(rag.neighbors(id))), max_neighborhood_size) fold = int(sys.argv[1]) print("Fold numero: ", fold) x_train, x_validation, x_test, y_train, y_validation, y_test, id_train, id_validation, id_test = load_dataset( str(fold)) model = STARCANE(units=512, dropout_rate=0.4, n_classes=N_CLASSES) train_dims = get_neighborhood_sizes(id_train, rag) valid_dims = get_neighborhood_sizes(id_validation, rag) test_dims = get_neighborhood_sizes(id_test, rag) loss_object = tf.keras.losses.CategoricalCrossentropy() optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001) ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer, model=model) manager = tf.train.CheckpointManager(ckpt, OUTPUT_FOLDER + "_" + str(fold),
# Enable CuDNN optimization torch.backends.cudnn.benchmark=True # Handling cuda args.cuda = not args.device == 'cpu' and torch.cuda.is_available() args.device = torch.device(args.device if torch.cuda.is_available() else 'cpu') print('Optimization will be on ' + str(args.device) + '.') """ ################### Basic definitions ################### """ print('[Loading dataset]') ref_split = args.path + '/reference_split_' + args.dataset+ "_" +args.data + '.npz' if (args.train_type == 'random' or (not os.path.exists(ref_split))): train_loader, valid_loader, test_loader, args = load_dataset(args) # Take fixed batch fixed_data, fixed_params, fixed_meta, fixed_audio = next(iter(test_loader)) fixed_data, fixed_params, fixed_meta, fixed_audio = fixed_data.to(args.device), fixed_params.to(args.device), fixed_meta, fixed_audio fixed_batch = (fixed_data, fixed_params, fixed_meta, fixed_audio) if (args.train_type == 'fixed'): np.savez(ref_split, [train_loader, valid_loader, test_loader]) else: data = np.load(ref_split)['arr_0'] train_loader, valid_loader, test_loader = data[0], data[1], data[2] fixed_data, fixed_params, fixed_meta, fixed_audio = next(iter(test_loader)) fixed_data, fixed_params, fixed_meta, fixed_audio = fixed_data.to(args.device), fixed_params.to(args.device), fixed_meta, fixed_audio fixed_batch = (fixed_data, fixed_params, fixed_meta, fixed_audio) args.output_size = train_loader.dataset.output_size args.input_size = train_loader.dataset.input_size
return pred if __name__ == "__main__": # config blend_list = [['20200812-125739_bert.csv', 0.35], ['20200814-131828_bert.csv', 0.05], ['20200813-210634_nn_word2vec.csv', 0.2], ['20200826-205330_nn_glove.csv', 0.05], ['20200826-202548_nn_word2vec.csv', 0.05], ['20200810-203854_lgb.csv', 0.25], ['20200825-211210_lgb_tfidf.csv', 0.05]] # prepare submit _, test_df, _ = load_dataset() submit = pd.DataFrame([]) submit['id'] = test_df['id'] submit[1] = 0 submit[2] = 0 submit[3] = 0 submit[4] = 0 # combine for filename, weight in blend_list: filepath = os.path.join(SUBMITS_DIR, filename) sub = pd.read_csv(filepath, names=('id', 'pred')) sub = add_onehot(sub) for i in range(1, 5): submit[i] += sub[i] * weight
with tf.device( "/cpu:0"): # Input pipeline should always be place on the CPU. print("args.pseudo_data:", args.pseudo_data) if args.mode == "train": train_iterator = load_paired_dataset(args.pseudo_data, vocab, batch_size=args.batch_size, min_seq_len=args.min_seq_len, max_seq_len=args.max_seq_len) train_next_op = train_iterator.get_next() else: src_test_iterator = load_dataset(args.test_data, vocab, mode=constants.INFER, min_seq_len=args.min_seq_len, max_seq_len=args.max_seq_len) src_test_next_op = src_test_iterator.get_next() # Step 2: create session tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True tf_config.gpu_options.per_process_gpu_memory_fraction = 0.4 sess = tf.Session( config=tf_config ) # limit gpu memory; don"t pre-allocate memory; allocate as-needed # Step 3: train model if args.mode == "train": # Prepare for model saver
build_vocab_from_file(args.train_data, args.vocab_file) vocab, vocab_size = load_vocab(args.vocab_file) print('Vocabulary size:%s' % vocab_size) vocab_rev = tf.contrib.lookup.index_to_string_table_from_file( args.vocab_file, # target vocabulary file(each lines has a word) vocab_size=vocab_size - constants.NUM_OOV_BUCKETS, default_value=constants.UNKNOWN_TOKEN) with tf.device( "/cpu:0"): # Input pipeline should always be place on the CPU. if args.mode == constants.TRAIN: train_data_iterator = load_dataset(args.train_data, vocab, constants.TRAIN, batch_size=args.batch_size, min_seq_len=args.min_seq_len, max_seq_len=args.max_seq_len) train_data_next_op = train_data_iterator.get_next() dev_data_iterator = load_dataset(args.dev_data, vocab, constants.EVAL, batch_size=100, min_seq_len=args.min_seq_len, max_seq_len=args.max_seq_len) dev_data_next_op = dev_data_iterator.get_next() test_data_iterator = load_dataset(args.test_data, vocab, constants.TEST,
def main(): args = load_cycle_arguments() dump_args_to_yaml(args, args.final_model_save_dir) print(args) reg_args = load_args_from_yaml(args.reg_model_save_dir) s2ss_args = load_args_from_yaml(args.s2ss_model_save_dir) # s2ss_args.seq2seq_model_save_dir = args.seq2seq_model_save_dir s2ss_args.RL_learning_rate = args.RL_learning_rate # a smaller learning_rate for RL s2ss_args.MLE_learning_rate = args.MLE_learning_rate # a smaller learning_rate for MLE s2ss_args.batch_size = args.batch_size # a bigger batch_size for RL min_seq_len = args.min_seq_len max_seq_len = args.max_seq_len # === Load global vocab vocab, vocab_size = load_vocab(args.vocab_file) print("Vocabulary size: %s" % vocab_size) vocab_rev = tf.contrib.lookup.index_to_string_table_from_file( args.vocab_file, # target vocabulary file(each lines has a word) vocab_size=vocab_size - constants.NUM_OOV_BUCKETS, default_value=constants.UNKNOWN_TOKEN) bleu_evaluator = BLEUEvaluator() # === Create session tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True tf_config.gpu_options.per_process_gpu_memory_fraction = 0.4 sess = tf.Session( config=tf_config ) # limit gpu memory; don't pre-allocate memory; allocate as-needed # === Load dataset with tf.device( "/cpu:0"): # Input pipeline should always be place on the CPU. 
train_data_iterator = load_dataset(args.train_data, vocab, mode=constants.TRAIN, batch_size=args.batch_size, min_seq_len=min_seq_len, max_seq_len=max_seq_len) dev_data_iterator = load_dataset(args.dev_data, vocab, mode=constants.EVAL, batch_size=100, min_seq_len=min_seq_len, max_seq_len=max_seq_len) test_data_iterator = load_dataset(args.test_data, vocab, mode=constants.TEST, batch_size=100, min_seq_len=min_seq_len, max_seq_len=max_seq_len) paired_train_data_iterator = load_paired_dataset( args.pseudo_data, vocab, batch_size=args.batch_size, min_seq_len=min_seq_len, max_seq_len=max_seq_len) train_data_next = train_data_iterator.get_next( ) # to avoid high number of `Iterator.get_next()` calls dev_data_next = dev_data_iterator.get_next() test_data_next = test_data_iterator.get_next() paired_train_data_next = paired_train_data_iterator.get_next() # === Initialize and build Seq2SentiSeq model load_model = False if args.no_pretrain else True s2ss_train = s2ss_create_model(sess, s2ss_args, constants.TRAIN, vocab_size, load_pretrained_model=load_model) decode_type_before = s2ss_args.decode_type s2ss_args.decode_type = constants.GREEDY s2ss_greedy_infer = s2ss_create_model(sess, s2ss_args, constants.INFER, vocab_size, reuse=True) s2ss_args.decode_type = constants.RANDOM s2ss_random_infer = s2ss_create_model(sess, s2ss_args, constants.INFER, vocab_size, reuse=True) s2ss_args.decode_type = decode_type_before # === Load pre-trained sentiment regression model eval_reg = reg_create_model(sess, reg_args, vocab_size, mode=constants.EVAL, load_pretrained_model=True) print("Prepare for model saver") final_model_save_path = args.final_model_save_dir # === Start train n_batch = -1 global_step = -1 for i in range(args.n_epoch): print("Epoch:%s" % i) sess.run([train_data_iterator.initializer]) sess.run([paired_train_data_iterator.initializer]) senti_reward_all = { # reward to measure the sentiment transformation of generated sequence "upper": [], # reward of ground truth (existed 
sequence in train dataset) "lower": [], # reward of baseline: random generated sequence "real": [], # reward of real generated sequence } cont_reward_all = { # reward to measure the content preservation of generated sequence "upper": [], # reward of ground truth (existed sequence in train dataset) "lower": [], # reward of baseline: random generated sequence "real": [], # reward of real generated sequence } reward_all = [] reward_expect_all = [] # reward expectation: r*p(y_k|x) while True: n_batch += 1 global_step += 1 if n_batch % args.eval_step == 0: print( '\n================ N_batch / Global_step (%s / %s): Evaluate on test datasets ================\n' % (n_batch, global_step)) dst_fs = inference( s2ss_greedy_infer, sess=sess, args=s2ss_args, decoder_s=constants.SENT_LIST, src_test_iterator=test_data_iterator, src_test_next=test_data_next, vocab_rev=vocab_rev, result_dir=args.final_tsf_result_dir, step=global_step if args.save_each_step else global_step) t0 = time.time() bleu_scores = bleu_evaluator.score(args.reference, dst_fs[1], all_bleu=True) print( "Test(Batch:%d)\tBLEU-1:%.3f\tBLEU-2:%.3f\tBLEU:%.3f\tCost time:%.2f" % (n_batch, bleu_scores[1], bleu_scores[2], bleu_scores[0], time.time() - t0)) # improve the diversity of generated sentences dst_fs = inference( s2ss_random_infer, sess=sess, args=s2ss_args, decoder_s=constants.SENT_LIST, src_test_iterator=test_data_iterator, src_test_next=test_data_next, vocab_rev=vocab_rev, result_dir=args.final_tsf_result_dir + '-sample', step=global_step if args.save_each_step else global_step) t0 = time.time() bleu_scores = bleu_evaluator.score(args.reference, dst_fs[1], all_bleu=True) print( "Test(Batch:%d)\tBLEU-1:%.3f\tBLEU-2:%.3f\tBLEU:%.3f\tCost time:%.2f ===> Sampled results" % (n_batch, bleu_scores[1], bleu_scores[2], bleu_scores[0], time.time() - t0)) if n_batch % args.save_per_step == 0: print("Save model at dir:", final_model_save_path) s2ss_train.saver.save(sess, final_model_save_path, global_step=n_batch) try: 
t0 = time.time() src = sess.run(train_data_next) # get real data!! batch_size = np.shape(src["ids"])[0] decode_width = s2ss_args.decode_width t0 = time.time() tile_src_ids = np.repeat(src["ids"], decode_width, axis=0) # [batch_size*beam_size], tile_src_length = np.repeat(src['length'], decode_width, axis=0) tile_src_ids_in = np.repeat(src["ids_in"], decode_width, axis=0) tile_src_ids_out = np.repeat(src["ids_out"], decode_width, axis=0) tile_src_ids_in_out = np.repeat(src["ids_in_out"], decode_width, axis=0) tile_src_decoder_s = np.repeat(src["senti"], decode_width, axis=0) tile_tgt_decoder_s = get_tareget_sentiment(size=batch_size) tgt_decoder_s = get_tareget_sentiment(size=batch_size, random=True) t0 = time.time() # random random_predictions, log_probs = sess.run( [ s2ss_random_infer.predictions, s2ss_random_infer.log_probs ], feed_dict={ s2ss_random_infer.encoder_input: tile_src_ids, s2ss_random_infer.encoder_input_len: tile_src_length, s2ss_random_infer.decoder_s: tile_tgt_decoder_s }) mid_ids_log_prob = log_probs mid_ids, mid_ids_in, mid_ids_out, mid_ids_in_out, mid_ids_length = \ process_mid_ids(random_predictions, min_seq_len, max_seq_len, vocab_size) assert tile_src_length[0] == tile_src_length[decode_width - 1] # baseline greedy_predictions = sess.run( s2ss_greedy_infer.predictions, feed_dict={ s2ss_greedy_infer.encoder_input: src['ids'], s2ss_greedy_infer.encoder_input_len: src['length'], s2ss_greedy_infer.decoder_s: tgt_decoder_s }) mid_ids_bs, mid_ids_in_bs, mid_ids_out_bs, mid_ids_in_out_bs, mid_ids_length_bs = \ process_mid_ids(greedy_predictions, min_seq_len, max_seq_len, vocab_size) t0 = time.time() # == get reward from sentiment scorer/regressor def get_senti_reward(pred, gold): if args.scale_sentiment: gold = gold * 0.2 - 0.1 # todo: move this function to one file reward_ = 1 / (np.fabs(pred - gold) + 1.0) return reward_ # real sentiment reward pred_senti_score = sess.run(eval_reg.predict_score, feed_dict={ eval_reg.x: mid_ids, 
eval_reg.sequence_length: mid_ids_length }) senti_reward = get_senti_reward(pred_senti_score, tile_tgt_decoder_s) # upper bound of sentiment reward upper_pred_senti_score = sess.run(eval_reg.predict_score, feed_dict={ eval_reg.x: src["ids"], eval_reg.sequence_length: src["length"] }) upper_senti_reward = get_senti_reward(upper_pred_senti_score, src["senti"]) # lower bound of sentiment reward lower_pred_senti_score = sess.run( eval_reg.predict_score, feed_dict={ eval_reg.x: np.random.choice(vocab_size, np.shape(tile_src_ids)), eval_reg.sequence_length: tile_src_length }) lower_senti_reward = get_senti_reward(lower_pred_senti_score, tile_src_decoder_s) # == get reward from backward reconstruction feed_dict = { s2ss_train.encoder_input: mid_ids, s2ss_train.encoder_input_len: mid_ids_length, s2ss_train.decoder_input: tile_src_ids_in, s2ss_train.decoder_target: tile_src_ids_out, s2ss_train.decoder_target_len: tile_src_length + 1, s2ss_train.decoder_s: tile_src_decoder_s, } loss = sess.run(s2ss_train.loss_per_sequence, feed_dict=feed_dict) cont_reward = loss * (-1) # bigger is better t0 = time.time() # get baseline content reward feed_dict = { s2ss_train.encoder_input: mid_ids_bs, s2ss_train.encoder_input_len: mid_ids_length_bs, s2ss_train.decoder_input: src["ids_in"], s2ss_train.decoder_target: src["ids_out"], s2ss_train.decoder_target_len: src["length"] + 1, s2ss_train.decoder_s: src["senti"], } loss_bs = sess.run(s2ss_train.loss_per_sequence, feed_dict=feed_dict) cont_reward_bs = loss_bs * (-1) # baseline content reward # get lower bound of content reward feed_dict = { s2ss_train.encoder_input: np.random.choice(vocab_size, np.shape(mid_ids)), s2ss_train.encoder_input_len: mid_ids_length, s2ss_train.decoder_input: np.random.choice(vocab_size, np.shape(tile_src_ids_in)), s2ss_train.decoder_target: np.random.choice(vocab_size, np.shape(tile_src_ids_out)), s2ss_train.decoder_target_len: tile_src_length + 1, s2ss_train.decoder_s: tile_src_decoder_s, } lower_loss = 
sess.run(s2ss_train.loss_per_sequence, feed_dict=feed_dict) lower_cont_reward = lower_loss * (-1) # bigger is better def norm(x): x = np.array(x) x = (x - x.mean()) / (x.std() + 1e-6) # safe divide # x = x - x.min() # to make x > 0 return x def sigmoid(x, x_trans=0.0, x_scale=1.0, max_y=1, do_norm=False): value = max_y / (1 + np.exp(-(x - x_trans) * x_scale)) if do_norm: value = norm(value) return value def norm_s2ss_reward(x, baseline=None, scale=False, norm=False): x = np.reshape(x, (batch_size, -1)) # x in [-16, 0] dim1 = np.shape(x)[1] if baseline is not None: x_baseline = baseline # [batch_size] else: x_baseline = np.mean(x, axis=1) # [batch_size] x_baseline = np.repeat(x_baseline, dim1) # [batch_size*dim1] x_baseline = np.reshape(x_baseline, (batch_size, dim1)) x_norm = x - x_baseline if scale: x_norm = sigmoid(x_norm) if norm: x_norm = 2 * x_norm - 1 # new x_norm in [-1, 1] return x_norm.reshape(-1) if args.use_baseline: if global_step < 1: # only print at first 10 steps print('%%% use_baseline') cont_reward = norm_s2ss_reward(cont_reward, baseline=cont_reward_bs, scale=True) lower_cont_reward = norm_s2ss_reward( lower_cont_reward, baseline=cont_reward_bs, scale=True) elif args.scale_cont_reward: if global_step < 1: # only print at first 1 steps print('%%% scale_cont_reward') cont_reward = sigmoid( cont_reward, x_trans=-3) # [-6, -2] => [0.1, 0.78] lower_cont_reward = sigmoid(lower_cont_reward, x_trans=-3) if args.scale_senti_reward: if global_step < 1: # only print at first 1 steps print('%%% scale_senti_reward') senti_reward = sigmoid( senti_reward, x_trans=-0.8, x_scale=15) # [0.6, 1.0] => [0.04, 0.95] lower_senti_reward = sigmoid(lower_senti_reward, x_trans=-0.8, x_scale=15) upper_senti_reward = sigmoid(upper_senti_reward, x_trans=-0.8, x_scale=15) cont_reward_all["lower"].extend(lower_cont_reward) cont_reward_all["real"].extend(cont_reward) senti_reward_all["upper"].extend(upper_senti_reward) senti_reward_all["lower"].extend(lower_senti_reward) 
senti_reward_all["real"].extend(senti_reward) senti_reward += safe_divide_constant cont_reward += safe_divide_constant if args.increase_beta: beta = min(1, 0.1 * global_step / args.increase_step) else: beta = 1 reward_merge_type = 'H(sentiment, content), beta=%.2f' % beta # enlarge the influence of senti_reward reward = (1 + beta * beta) * senti_reward * cont_reward / ( beta * beta * senti_reward + cont_reward) reward_all.extend(reward) reward_expect_all.extend(reward * np.exp(mid_ids_log_prob)) # policy gradient training if not args.no_RL: feed_dict = { s2ss_train.encoder_input: tile_src_ids, s2ss_train.encoder_input_len: tile_src_length, s2ss_train.decoder_input: mid_ids_in, s2ss_train.decoder_target: mid_ids_out, s2ss_train.decoder_target_len: mid_ids_length + 1, s2ss_train.decoder_s: tile_tgt_decoder_s, s2ss_train.reward: reward } sess.run([s2ss_train.rl_loss, s2ss_train.retrain_op], feed_dict=feed_dict) # Teacher forcing data types: # 1. back translation data (greedy decode) # 2. back translation data (random decode) # 3. back translation noise data # 4. pseudo data # 5. same data (x->x) # 6. 
same_noise (x'->x) if "back_trans" in args.teacher_forcing: if args.MLE_decay: if args.MLE_decay_type == "linear": gap = min(10, 2 + global_step / args.MLE_decay_steps) # 10 after 1 epoch else: gap = min( 5, int(1 / np.power( args.MLE_decay_rate, global_step / args.MLE_decay_steps))) else: gap = 1 if n_batch % gap == 0: if global_step < 1: print( '$$$Update B use back-translated data (Update gap:%s)' % gap) # Update Seq2SentiSeq with previous model generated data # senti-, bleu+ feed_dict = { s2ss_train.encoder_input: mid_ids_bs, s2ss_train.encoder_input_len: mid_ids_length_bs, s2ss_train.decoder_input: src["ids_in"], s2ss_train.decoder_target: src["ids_out"], s2ss_train.decoder_target_len: src["length"] + 1, s2ss_train.decoder_s: src["senti"], } sess.run([s2ss_train.loss, s2ss_train.train_op], feed_dict=feed_dict) if "back_trans_random" in args.teacher_forcing: if args.MLE_decay: if args.MLE_decay_type == "linear": gap = min(10, 2 + global_step / args.MLE_decay_steps) # 10 after 1 epoch else: gap = min( 5, int(1 / np.power( args.MLE_decay_rate, global_step / args.MLE_decay_steps))) else: gap = 1 if n_batch % gap == 0: if global_step < 1: print( '$$$Update B use back_trans_random data (Update gap:%s)' % gap) # Update Seq2SentiSeq with previous model generated data with noise feed_dict = { s2ss_train.encoder_input: mid_ids, s2ss_train.encoder_input_len: mid_ids_length, s2ss_train.decoder_input: tile_src_ids_in, s2ss_train.decoder_target: tile_src_ids_out, s2ss_train.decoder_target_len: tile_src_length + 1, s2ss_train.decoder_s: tile_src_decoder_s, } sess.run([s2ss_train.loss, s2ss_train.train_op], feed_dict=feed_dict) if "back_trans_noise" in args.teacher_forcing: if args.MLE_decay: if args.MLE_decay_type == "linear": gap = min(10, 2 + global_step / args.MLE_decay_steps) # 10 after 1 epoch else: gap = min( 5, int(1 / np.power( args.MLE_decay_rate, global_step / args.MLE_decay_steps))) else: gap = 1 if n_batch % gap == 0: if global_step < 1: print( '$$$Update B use 
back_trans_noise data (Update gap:%s)' % gap) # Update Seq2SentiSeq with previous model generated data with noise noise_ids, noise_ids_length = add_noise( mid_ids_bs, mid_ids_length_bs) feed_dict = { s2ss_train.encoder_input: noise_ids, s2ss_train.encoder_input_len: noise_ids_length, s2ss_train.decoder_input: src["ids_in"], s2ss_train.decoder_target: src["ids_out"], s2ss_train.decoder_target_len: src["length"] + 1, s2ss_train.decoder_s: src["senti"], } sess.run([s2ss_train.loss, s2ss_train.train_op], feed_dict=feed_dict) if "pseudo_data" in args.teacher_forcing: # balance if args.MLE_decay: if args.MLE_decay_type == "linear": gap = min(10, 3 + global_step / args.MLE_decay_steps) # 10 after 1 epoch else: gap = min( 100, int(3 / np.power( args.MLE_decay_rate, global_step / args.MLE_decay_steps))) else: gap = 3 if n_batch % gap == 0: if global_step < 1: print('$$$Update use pseudo data (Update gap:%s)' % gap) data = sess.run( paired_train_data_next) # get real data!! feed_dict = { s2ss_train.encoder_input: data["source_ids"], s2ss_train.encoder_input_len: data["source_length"], s2ss_train.decoder_input: data["target_ids_in"], s2ss_train.decoder_target: data["target_ids_out"], s2ss_train.decoder_target_len: data["target_length"] + 1, s2ss_train.decoder_s: data["target_senti"] } sess.run([s2ss_train.loss, s2ss_train.train_op], feed_dict=feed_dict) if "same" in args.teacher_forcing: if args.same_decay: if args.same_decay_type == "linear": gap = min( 8, 2 + global_step / args.same_decay_steps) # 10 after 1 epoch else: gap = min( 10, int(2 / np.power( args.same_decay_rate, global_step / args.same_decay_rate))) else: gap = 2 if n_batch % gap == 0: print('$$$Update use same data (Update gap:%s)' % gap) # Update Seq2SentiSeq with target output # senti-, bleu+ feed_dict = { s2ss_train.encoder_input: src["ids"], s2ss_train.encoder_input_len: src["length"], s2ss_train.decoder_input: src["ids_in"], s2ss_train.decoder_target: src["ids_out"], s2ss_train.decoder_target_len: 
src["length"] + 1, s2ss_train.decoder_s: src["senti"] } sess.run([s2ss_train.loss, s2ss_train.train_op], feed_dict=feed_dict) if "same_noise" in args.teacher_forcing: if args.same_decay: if args.same_decay_type == "linear": gap = min( 8, 2 + global_step / args.same_decay_steps) # 10 after 1 epoch else: gap = min( 10, int(2 / np.power( args.same_decay_rate, global_step / args.same_decay_rate))) else: gap = 2 if n_batch % gap == 0: print('$$$Update use same_noise data (Update gap:%s)' % gap) noise_ids, noise_ids_length = add_noise( src["ids"], src["length"]) feed_dict = { s2ss_train.encoder_input: noise_ids, s2ss_train.encoder_input_len: noise_ids_length, s2ss_train.decoder_input: src["ids_in"], s2ss_train.decoder_target: src["ids_out"], s2ss_train.decoder_target_len: src["length"] + 1, s2ss_train.decoder_s: src["senti"] } sess.run([s2ss_train.loss, s2ss_train.train_op], feed_dict=feed_dict) except tf.errors.OutOfRangeError: # next epoch print("Train---Total N batch:{}\tCost time:{}".format( n_batch, time.time() - t0)) n_batch = -1 break
# Train a simple CNN classifier on the UC Merced land-use dataset.
from models.classification.simple_net import simple_model
from utils.data import load_dataset

# Dataset constants: UC Merced ships 2100 images across 21 classes.
UC_MERCED_SIZE = 2100
TEST_SIZE = 10                 # percent of the data held out for testing
TRAIN_SIZE = 100 - TEST_SIZE   # remaining percent used for training

# Training hyper-parameters.
batch_size = 128
epochs = 100

# Input image geometry.
IMG_HEIGHT = 256
IMG_WIDTH = 256
N_CHANNELS = 3
NUM_CLASSES = 21

ds_test, ds_test_size, ds_train, ds_train_size = load_dataset(
    dataset='uc_merced',
    batch_size=batch_size,
    dataset_size=UC_MERCED_SIZE,
    train_size=TRAIN_SIZE,
    test_size=TEST_SIZE,
    # NOTE(review): img_shape is (width, height) here while the model input
    # below is (height, width, channels). Harmless while both are 256 --
    # confirm the expected ordering if the images ever become non-square.
    img_shape=(IMG_WIDTH, IMG_HEIGHT))

model = simple_model((IMG_HEIGHT, IMG_WIDTH, N_CHANNELS), logits=NUM_CLASSES)

# Fit with per-epoch validation on the held-out split; `hist` keeps the
# History object (per-epoch losses/metrics) returned by Keras.
hist = model.fit(ds_train,
                 validation_data=ds_test,
                 steps_per_epoch=ds_train_size // batch_size,
                 validation_steps=ds_test_size // batch_size,
                 epochs=epochs)
if __name__ == '__main__':
    from config import Config

    # CLI: which dataset to load, which encoder/decoder flavour to use
    # ('le' = label encoding, 'ohe' = one-hot encoding), and whether to
    # work on the reduced "subset" version of the cleaned data.
    parser = setup_argparse()
    parser.add_argument('-d', '--dataset_name', default='adult')
    parser.add_argument('-e', '--enc_dec', default='le')
    # BUGFIX: `type=bool` is broken with argparse -- bool('False') is True,
    # so ANY non-empty value (including "-s False") enabled the subset.
    # Parse the string explicitly instead; the default remains False and
    # the "-s VALUE" calling convention is preserved.
    parser.add_argument('-s', '--subset',
                        type=lambda v: str(v).lower() in ('1', 'true', 'yes'),
                        default=False)
    args = get_parser_args(parser)

    # Pick the encoder/decoder implementation; label encoding is the default.
    EncoderDecoder = LabelEncoderDecoder
    if args.enc_dec == 'ohe':
        EncoderDecoder = OneHotEncoderDecoder

    sample_config = Config(args.dataset_name, 'default')

    data_version = 'clean'
    if args.subset:
        # The 'adult' dataset has no subset version.
        assert args.dataset_name != 'adult'
        data_version = 'clean_subset'

    clean_data = load_dataset(args.dataset_name, data_version)
    enc_dec_object = EncoderDecoder(sample_config)
    encoded_data = enc_dec_object.encode(clean_data)
    pdb.set_trace()  # NOTE(review): debugger breakpoint left in for manual inspection

    # The one-hot decoder needs the original frame to recover columns;
    # the label decoder works from the encoded data alone.
    if args.enc_dec == 'ohe':
        decoded_data = enc_dec_object.decode(clean_data, encoded_data)
    elif args.enc_dec == 'le':
        decoded_data = enc_dec_object.decode(encoded_data)
    pdb.set_trace()  # NOTE(review): second inspection breakpoint
def main():
    """Train, K-Fold-evaluate, and sample predictions for the model.

    Usage: script.py <filename>, where <filename>_X.npy / <filename>_Y.npy
    hold the input and target arrays.  (Python 2 code: print statements,
    xrange, and the pre-0.18 sklearn KFold(n, n_folds=...) API.)
    """
    filename = sys.argv[1]
    X = data.load_dataset('{}_X.npy'.format(filename))
    Y = data.load_dataset('{}_Y.npy'.format(filename))
    model = network.build_model()
    # vizualize the model
    network.vizualize_model(model, filename)
    # 80:20
    # print network.train_model(model, (X, Y))
    # score = model.evaluate(X, Y, verbose=0)
    # print 'Test score:', score[0]
    # K-Fold
    val_error = []
    losses = []
    kf = KFold(Y.shape[0], n_folds=FOLDS, shuffle=True, random_state=None)
    for train_index, val_index in kf:
        # Generate the dataset for this fold
        X_train, X_val = X[train_index], X[val_index]
        Y_train, Y_val = Y[train_index], Y[val_index]
        print X_train.shape, X_val.shape
        print Y_train.shape, Y_val.shape
        # Train the model on this dataset
        train_history, loss_history = network.train_model(
            model, (X_train, Y_train), (X_val, Y_val))
        # TODO: save the losses to a file.
        losses.append(loss_history.losses)
        # Evaluate the model.
        # NOTE(review): this rebinds `val_error` from the empty list above
        # to whatever model.evaluate returns; the .mean()/.std() calls
        # below only work if that return is a numpy array -- confirm.
        val_error = model.evaluate(X_val, Y_val, verbose=0)
        print 'Validation error:', val_error
        # NOTE: hack to run only one split
        break
    # Print final K-Fold error
    print "K-Fold Error: %0.2f (+/- %0.2f)" % (val_error.mean(),
                                               val_error.std() * 2)
    # Predict some labels
    # TODO: modify this to suit our image needs.
    counter = 0
    while counter < 1:
        # Pick one random sample and compare prediction against the target.
        idx = random.choice(xrange(Y.shape[0]))
        prediction = network.predict_model(model,
                                           np.expand_dims(X[idx, :], axis=0))
        print 'Testing: sample={}, prediction={}, actual={}'.format(
            idx, prediction, Y[idx, :])
        # save this file
        data.generate_image(prediction)
        counter += 1
    # dump the model to the file
    network.save_model(model, filename)
# NOTE(review): runs at import time, before the __main__ guard.
args = parse_args()

if __name__ == "__main__":
    # hyper params
    seed = 1
    fix_seed(seed)
    n_folds = 5
    epochs = 200
    batch_size = 512

    # data: GloVe-embedded text features plus one-hot 'jobflag' targets.
    train_df, test_df, sample_submit_df = load_dataset()
    X, X_test = glove(train_df, test_df)
    X = X.values.astype('float32')
    X_test = X_test.values.astype('float32')
    y = pd.get_dummies(train_df['jobflag']).values.astype('float32')

    trainset = JobInfoDataset(X, y, jobflag=train_df['jobflag'].values)
    testset = JobInfoDataset(X_test)

    # weight: per-class weights to counter label imbalance.
    weight = get_weight(train_df)

    # ---------- Kfold ---------- #
    # Accumulator for the 4-class prediction of every test row.
    preds_for_test = [[0 for _ in range(4)] for _ in range(len(X_test))]
    # BUGFIX: passing random_state together with shuffle=False raises
    # ValueError in scikit-learn >= 0.24 ("Setting a random_state has no
    # effect since shuffle is False") and was ignored before that.  The
    # unshuffled fold assignment is deterministic, so drop the argument;
    # split behavior is unchanged.
    cv = StratifiedKFold(n_splits=n_folds, shuffle=False)
        # (tail of Preprocessor.scale) Min-max scale the standardised data
        # and persist the fitted scaler so inverse_scale can undo it later.
        scaler2 = MinMaxScaler()
        data_scaled = scaler2.fit_transform(data)
        joblib.dump(scaler2, self.minmax_scaler_path)
        return data_scaled  # returns numpy array

    def inverse_scale(self, data_scaled):
        """Undo scale(): invert min-max scaling, then standardisation.

        Loads the scalers that the forward pass persisted at
        self.minmax_scaler_path and self.std_scaler_path.
        """
        scaler2 = joblib.load(self.minmax_scaler_path)
        data_unscaled = scaler2.inverse_transform(data_scaled)
        scaler = joblib.load(self.std_scaler_path)
        data = scaler.inverse_transform(data_unscaled)
        return data  # returns numpy array


if __name__ == '__main__':
    # Smoke test: encode the cleaned 'lacity' dataset, scale it, then
    # invert the scaling (with pdb breakpoints for manual inspection).
    from config import Config
    sample_config = Config('lacity', 'default')
    clean_data = load_dataset('lacity', 'clean')
    enc_dec_object = LabelEncoderDecoder(sample_config)
    encoded_data = enc_dec_object.encode(clean_data)
    preprocess_obj = Preprocessor(sample_config)
    pdb.set_trace()  # NOTE(review): debugger breakpoint left in
    scaled = preprocess_obj.scale(encoded_data)
    inv_scaled = preprocess_obj.inverse_scale(scaled)