def main(args):
    # Init
    set_seed(args.seed)
    processor = glue_processor[args.task_name.lower()]
    tokenizer = BertTokenizer.from_pretrained(args.model_path, do_lower_case=True)
    tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})

    # Data
    dev_examples = processor.get_dev_examples(args.data_dir)
    test_examples = processor.get_test_examples(args.data_dir)
    labels = processor.get_labels(args.data_dir)
    dev_data_raw = prepare_data(dev_examples, args.max_seq_len, tokenizer, labels)
    test_data_raw = prepare_data(test_examples, args.max_seq_len, tokenizer, labels)

    # Model
    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.num_labels = len(labels)
    model = Model(model_config)
    ckpt = torch.load(args.model_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt, strict=False)
    model.to(device)

    # Evaluation
    evaluate(model, dev_data_raw, 'dev')
    evaluate(model, test_data_raw, 'test')
def scatter_tsne(dataset_name):
    data_dict = prepare_data(dataset_name)
    x_train, y_train, _, _ = data_dict.values()
    num_label = len(np.unique(y_train))
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    x_train_ = tsne.fit_transform(x_train)
    df = pd.DataFrame(np.concatenate([x_train_, y_train[:, None]], axis=1),
                      columns=["tsne-one", "tsne-two", "y"])
    plt.figure(figsize=(8, 5))
    sns.scatterplot(x="tsne-one", y="tsne-two", hue="y",
                    palette=sns.color_palette("hls", num_label),
                    data=df, legend="full", alpha=0.7)
    plt.title(f"TSNE Visualization of Dataset: {dataset_name}")
    plt.show()
def scatter_pca(dataset_name):
    data_dict = prepare_data(dataset_name)
    x_train, y_train, _, _ = data_dict.values()
    num_label = len(np.unique(y_train))
    pca = decomposition.PCA(n_components=2)
    x_train_ = pca.fit_transform(x_train)
    df = pd.DataFrame(np.concatenate([x_train_, y_train[:, None]], axis=1),
                      columns=["pca-one", "pca-two", "y"])
    plt.figure(figsize=(8, 5))
    sns.scatterplot(x="pca-one", y="pca-two", hue="y",
                    palette=sns.color_palette("hls", num_label),
                    data=df, legend="full", alpha=0.7)
    plt.title(f"PCA Visualization of Dataset: {dataset_name}")
    plt.savefig("pca_" + dataset_name)
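# Usage sketch (illustrative, not from the original source). It assumes
# `prepare_data(name)` returns a dict whose values unpack to
# (x_train, y_train, x_test, y_test), as the two helpers above expect;
# the dataset names passed in are hypothetical placeholders.
def plot_projections(dataset_names=("dataset_a", "dataset_b")):
    for name in dataset_names:
        scatter_pca(name)   # writes "pca_<name>" via plt.savefig
        scatter_tsne(name)  # opens an interactive t-SNE scatter via plt.show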
def train(config):
    """Train a model using data."""
    # Prepare data.
    print("Preparing data in %s" % config.data_dir)
    train_ids, dev_ids, _ = data_utils.prepare_data(config.data_dir, config.vocab_size)

    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:
        if not os.path.exists(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)

        # Create model.
        print("Creating %d layers of %d units." % (config.num_layers, config.size))
        model = create_model(sess, config, False)
        if not config.probabilistic:
            model.kl_rate_update(0.0)

        train_writer = tf.summary.FileWriter(os.path.join(FLAGS.model_dir, "train"),
                                             graph=sess.graph)
        dev_writer = tf.summary.FileWriter(os.path.join(FLAGS.model_dir, "test"),
                                           graph=sess.graph)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)." % config.max_train_data_size)
        dev_set = read_data(dev_ids, config)
        train_set = read_data(train_ids, config, config.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(config.buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        KL_loss = 0.0
        current_step = model.global_step.eval()
        step_loss_summaries = []
        step_KL_loss_summaries = []
        overall_start_time = time.time()
        print('Start training')

        while True:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in xrange(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(train_set, bucket_id)
            _, step_loss, step_KL_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                       target_weights, bucket_id, False,
                                                       config.probabilistic)

            # Anneal the KL rate once past the rise time, until it reaches 1.
            if config.anneal and model.global_step.eval() > config.kl_rate_rise_time and model.kl_rate < 1:
                new_kl_rate = model.kl_rate.eval() + config.kl_rate_rise_factor
                sess.run(model.kl_rate_update, feed_dict={'new_kl_rate': new_kl_rate})

            step_time += (time.time() - start_time) / config.steps_per_checkpoint
            step_loss_summaries.append(
                tf.Summary(value=[tf.Summary.Value(tag="step loss",
                                                   simple_value=float(step_loss))]))
            step_KL_loss_summaries.append(
                tf.Summary(value=[tf.Summary.Value(tag="KL step loss",
                                                   simple_value=float(step_KL_loss))]))
            loss += step_loss / config.steps_per_checkpoint
            KL_loss += step_KL_loss / config.steps_per_checkpoint
            current_step = model.global_step.eval()

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % config.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                print("global step %d learning rate %.4f step-time %.2f perplexity %.2f"
                      % (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity))
                print("global step %d learning rate %.4f step-time %.2f KL divergence %.2f"
                      % (model.global_step.eval(), model.learning_rate.eval(), step_time, KL_loss))
                wall_time = time.time() - overall_start_time
                print("time passed: {0}".format(wall_time))

                # Add perplexity, KL divergence to summary and stats.
                perp_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="train perplexity", simple_value=perplexity)
                ])
                train_writer.add_summary(perp_summary, current_step)
                KL_loss_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="KL divergence", simple_value=KL_loss)
                ])
                train_writer.add_summary(KL_loss_summary, current_step)

                for i, summary in enumerate(step_loss_summaries):
                    train_writer.add_summary(summary, current_step - 200 + i)
                step_loss_summaries = []
                for i, summary in enumerate(step_KL_loss_summaries):
                    train_writer.add_summary(summary, current_step - 200 + i)
                step_KL_loss_summaries = []

                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.model_dir, FLAGS.model_name + ".ckpt")
                model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                step_time, loss, KL_loss = 0.0, 0.0, 0.0

                # Run evals on development set and print their perplexity.
                eval_losses = []
                eval_KL_losses = []
                eval_bucket_num = 0
                for bucket_id in xrange(len(config.buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        print(" eval: empty bucket %d" % (bucket_id))
                        continue
                    eval_bucket_num += 1
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(dev_set, bucket_id)
                    _, eval_loss, eval_KL_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                               target_weights, bucket_id, True,
                                                               config.probabilistic)
                    eval_losses.append(float(eval_loss))
                    eval_KL_losses.append(float(eval_KL_loss))
                    eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float("inf")
                    print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))

                    eval_perp_summary = tf.Summary(value=[
                        tf.Summary.Value(tag="eval perplexity for bucket {0}".format(bucket_id),
                                         simple_value=eval_ppx)
                    ])
                    dev_writer.add_summary(eval_perp_summary, current_step)

                mean_eval_loss = sum(eval_losses) / float(eval_bucket_num)
                mean_eval_KL_loss = sum(eval_KL_losses) / float(eval_bucket_num)
                mean_eval_ppx = math.exp(float(mean_eval_loss))
                print(" eval: mean perplexity {0}".format(mean_eval_ppx))

                eval_loss_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="mean eval loss", simple_value=float(mean_eval_ppx))
                ])
                dev_writer.add_summary(eval_loss_summary, current_step)
                eval_KL_loss_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="mean eval KL loss", simple_value=float(mean_eval_KL_loss))
                ])
                dev_writer.add_summary(eval_KL_loss_summary, current_step)
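# Standalone sketch (illustrative, not from the original source) of the bucket
# sampling scheme used in the training loop above: `scale` holds the cumulative
# fraction of examples up to each bucket, so drawing a uniform number in [0, 1]
# and taking the first bucket whose cumulative fraction exceeds it selects
# bucket i with probability proportional to its size.
import numpy as np

def sample_bucket(bucket_sizes, rng=np.random):
    total = float(sum(bucket_sizes))
    scale = [sum(bucket_sizes[:i + 1]) / total for i in range(len(bucket_sizes))]
    r = rng.random_sample()
    return min(i for i in range(len(scale)) if scale[i] > r)

# e.g. sample_bucket([100, 300, 600]) returns bucket 2 about 60% of the time.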
def train():
    """Train a query2vec model."""
    # Prepare train data.
    print("Preparing Seq2seq Model in %s" % FLAGS.train_dir)
    train_data, test_data, _ = data_utils.prepare_data(FLAGS.train_dir, FLAGS.vocab_size)
    checkpoint_path = os.path.join(FLAGS.train_dir, FLAGS.seq2seq_model)
    print("Loading training data from %s" % train_data)
    print("Loading development data from %s" % test_data)

    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=gpu_options,
                                          intra_op_parallelism_threads=20)) as sess:
        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        with tf.device("/gpu:0"):
            model = model_helper.create_model(sess, False)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)." % FLAGS.max_train_data_size)
        test_set = data_utils.read_data(test_data)
        train_set = data_utils.read_data(train_data, max_size=FLAGS.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                               for i in xrange(len(train_bucket_sizes))]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        prev_loss = [1000000] * len(_buckets)
        train_writer = tf.summary.FileWriter(os.path.join("summary/train"), sess.graph)
        test_writer = tf.summary.FileWriter(os.path.join("summary/test"), sess.graph)

        while True:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in xrange(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number_01])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set[bucket_id], bucket_id)
            summaries, _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                    target_weights, bucket_id, False)
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            if current_step % FLAGS.steps_per_summary == 0:
                train_writer.add_summary(summaries, current_step)
                train_writer.flush()
                print('Step: %s' % current_step)

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print("global step %d learning rate %.4f step-time %.2f perplexity %.2f"
                      % (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity))

                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)

                # Save checkpoint and zero timer and loss.
                step_time, loss = 0.0, 0.0

                # Run evals on development set and print their perplexity.
                count = 0
                for bucket_id in xrange(len(_buckets)):
                    if len(test_set[bucket_id]) == 0:
                        print(" eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        test_set[bucket_id], bucket_id)
                    summaries, _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                            target_weights, bucket_id, True)
                    test_writer.add_summary(summaries, current_step)
                    eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                    if eval_ppx < prev_loss[bucket_id]:
                        prev_loss[bucket_id] = eval_ppx
                        count += 1
                    print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))

                # Save only when more than a third of the buckets improved on their best perplexity.
                if count > len(_buckets) / 3:
                    print("saving model...")
                    model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                sys.stdout.flush()
                test_writer.flush()
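# Standalone sketch (illustrative, not from the original source) of the
# learning-rate plateau heuristic used in the train() loop above: the rate is
# decayed whenever the latest checkpoint loss is worse than every loss from
# the last `window` checkpoints.
def should_decay_lr(previous_losses, current_loss, window=3):
    return len(previous_losses) >= window and current_loss > max(previous_losses[-window:])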
def train():
    # Data preparation.
    config = Config.ModelConfig()
    data_path, vocab_path = {}, {}
    data_path['train'] = FLAGS.data_train_path
    data_path['val'] = FLAGS.data_val_path
    data_path['test'] = FLAGS.data_test_path
    vocab_path['input'] = FLAGS.vocab_input_file
    vocab_path['output'] = FLAGS.vocab_output_file
    # inputs = (data_path, vocab_path, data_mode, config.num_input_symbols, None, None)
    # inputs = (data_path, vocab_path, None, None, None)
    inputs = (data_path, vocab_path)
    (vocab_input, rev_vocab_input), (vocab_output, rev_vocab_output), data = \
        data_utils.prepare_data('PTB_data', inputs)
    data_train, data_val = data['train'], data['val']

    config.max_training_steps = int(config.steps_per_checkpoint * round(
        len(data_train) * float(FLAGS.max_epochs) / config.batch_size / config.steps_per_checkpoint))
    config.num_input_symbols = len(vocab_input)
    config.num_output_symbols = len(vocab_output)
    print("maximum training steps: " + str(config.max_training_steps))

    step_time, losses = 0.0, 0.0
    f_losses, b_losses = 0.0, 0.0
    current_step = 0
    previous_losses = []

    with tf.Session() as sess:
        model = create_model(sess, config, 'train', False, cell_mode=FLAGS.cell_mode)
        for i in xrange(config.max_training_steps):
            start_time = time.time()
            inputs, targets, target_weights = model.get_batch(data_train)
            b_loss, loss, f_loss = model.step(sess, inputs, targets, target_weights, False)

            # Once in a while, we save checkpoint, print statistics, and run evals.
            step_time += (time.time() - start_time) / config.steps_per_checkpoint
            losses += loss / config.steps_per_checkpoint
            if isinstance(f_loss, np.float32):
                f_losses += f_loss / config.steps_per_checkpoint
            if isinstance(b_loss, np.float32):
                b_losses += b_loss / config.steps_per_checkpoint
            current_step += 1

            if current_step % config.steps_per_checkpoint == 0:
                perplexity = math.exp(losses) if losses < 300 else float('inf')
                print("global step %d learning rate %.4f step-time %.2f perplexity %.2f"
                      % (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity))
                if f_losses:
                    print(' ' * 20 + "training forward perplexity %.2f"
                          % (math.exp(f_losses) if f_losses < 300 else float('inf')))
                if b_losses:
                    print(' ' * 20 + "training backward perplexity %.2f"
                          % (math.exp(b_losses) if b_losses < 300 else float('inf')))

                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and losses > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(losses)

                # Save checkpoint and zero timer and loss.
                model.saver.save(sess, FLAGS.checkpoint_path, global_step=model.global_step)
                step_time, losses = 0.0, 0.0
                f_losses, b_losses = 0.0, 0.0

                # Run evals on development set and print their perplexity.
                val_inputs, val_targets, val_target_weights = model.get_batch(data_val)
                _, val_loss, val_loss_fb = model.step(sess, val_inputs, val_targets,
                                                      val_target_weights, True)
                val_ppx = math.exp(val_loss) if val_loss < 300 else float('inf')
                print(" val: perplexity %.2f" % val_ppx)

                if isinstance(val_loss_fb, list):
                    if len(val_loss_fb) == 2:
                        f_val_loss, b_val_loss = val_loss_fb
                    else:
                        f_val_loss, b_val_loss = val_loss_fb, None
                    if isinstance(f_val_loss, np.float32):
                        print("val: forward perplexity %.2f"
                              % (math.exp(f_val_loss) if f_val_loss < 300 else float('inf')))
                    if isinstance(b_val_loss, np.float32):
                        print("val: backward perplexity %.2f"
                              % (math.exp(b_val_loss) if b_val_loss < 300 else float('inf')))
                sys.stdout.flush()
def main(args):
    # Init
    set_seed(args.seed)
    processor = glue_processor[args.task_name.lower()]
    tokenizer = BertTokenizer.from_pretrained(args.model_path, do_lower_case=True)
    tokenizer.add_special_tokens({"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})

    # Data
    train_examples = processor.get_train_examples(args.data_dir)
    dev_examples = processor.get_dev_examples(args.data_dir)
    test_examples = processor.get_test_examples(args.data_dir)
    labels = processor.get_labels(args.data_dir)
    train_data_raw = prepare_data(train_examples, args.max_seq_len, tokenizer, labels)
    dev_data_raw = prepare_data(dev_examples, args.max_seq_len, tokenizer, labels)
    test_data_raw = prepare_data(test_examples, args.max_seq_len, tokenizer, labels)
    print("# train examples %d" % len(train_data_raw))
    print("# dev examples %d" % len(dev_data_raw))
    print("# test examples %d" % len(test_data_raw))

    train_data = ClassificationDataset(train_data_raw)
    train_dataloader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  sampler=RandomSampler(train_data))

    # Model
    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.num_labels = len(labels)
    if not os.path.exists(args.bert_ckpt_path):
        args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets')
    model = Model.from_pretrained(config=model_config,
                                  pretrained_model_name_or_path=args.bert_ckpt_path)
    model.to(device)

    # Optimizer
    num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs)
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }, {
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, 0, num_train_steps)
    # scheduler = get_linear_schedule_with_warmup(optimizer, int(num_train_steps * warmup), num_train_steps)
    loss_fnc = nn.CrossEntropyLoss()

    # Training
    best_epoch = 0
    best_acc = 0.0
    train_pbar = trange(0, args.n_epochs, desc="Epoch")
    for epoch in range(args.n_epochs):
        batch_loss = []
        epoch_pbar = tqdm(train_dataloader, desc="Iteration", disable=False)
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = [b.to(device) if not isinstance(b, int) else b for b in batch]
            input_ids, segment_ids, input_mask, label_ids, e1_mask, e2_mask = batch
            output = model(input_ids, segment_ids, input_mask, e1_mask, e2_mask)
            loss = loss_fnc(output, label_ids)
            batch_loss.append(loss.item())
            loss.backward()
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            epoch_pbar.update(1)
            if (step + 1) % 100 == 0:
                print("[%d/%d] [%d/%d] mean_loss : %.3f"
                      % (epoch + 1, args.n_epochs, step + 1, len(train_dataloader), np.mean(batch_loss)))
        epoch_pbar.close()
        print('Epoch %d mean loss: %.3f' % (epoch + 1, np.mean(batch_loss)))

        # Keep the checkpoint with the best dev accuracy.
        acc = evaluate(model, dev_data_raw)
        if acc > best_acc:
            best_acc = acc
            best_epoch = epoch + 1
            save_path = os.path.join(args.save_dir, 'model_best.bin')
            torch.save(model.state_dict(), save_path)
        print("Best Score : ", best_acc, ' in epoch ', best_epoch, '.')
        train_pbar.update(1)
    train_pbar.close()

    # Reload the best checkpoint and evaluate on the test set.
    ckpt = torch.load(os.path.join(args.save_dir, 'model_best.bin'))
    model.load_state_dict(ckpt)
    evaluate(model, test_data_raw, mode="test")
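# Side note (illustrative, not from the original source): the optimizer grouping
# in main() above assigns weight_decay 0.0 to both groups, which makes the
# no_decay split a no-op. The conventional grouping, shown here with a
# hypothetical decay value, applies decay only to non-bias/non-LayerNorm
# parameters.
def grouped_parameters(model, weight_decay=0.01):
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    return [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]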
for dataset_name in args.dataset_list:
    model_path = os.path.join("output", "models", args.model_name + "_" + dataset_name + ".pkl")
    train_results_list = []
    val_results_list = []
    test_results_list = []
    for rep in range(args.num_repeat):
        try:
            print("********************")
            print(f"Dataset: {dataset_name}, Repeat index: {rep+1}")
            data_dict = prepare_data(dataset_name, overwrite_flag=True)
            data_split = data_dict.values()
            x_train, y_train, x_test, y_test = data_split
            x_train_simulated, y_train_simulated = \
                simulate_missing_labels(x_train, y_train, config_obj.simulation_params)
            # Either build a fresh model from the dispatcher or load a pickled one.
            if not args.load:
                model_params = params_dispatcher[args.model_name]
                model = model_dispatcher[args.model_name](model_params)
            else:
                with open(args.model_path, "rb") as f:
                    model = pickle.load(f)
            if "train" in args.mode:
                model.fit(x_train_simulated, y_train_simulated)