def main(args):
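    """Load a fine-tuned checkpoint and evaluate it on the dev and test splits."""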
    # Init
    set_seed(args.seed)
    processor = glue_processor[args.task_name.lower()]
    tokenizer = BertTokenizer.from_pretrained(args.model_path,
                                              do_lower_case=True)
    tokenizer.add_special_tokens(
        {"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})

    # Data
    dev_examples = processor.get_dev_examples(args.data_dir)
    test_examples = processor.get_test_examples(args.data_dir)
    labels = processor.get_labels(args.data_dir)
    dev_data_raw = prepare_data(dev_examples, args.max_seq_len, tokenizer,
                                labels)
    test_data_raw = prepare_data(test_examples, args.max_seq_len, tokenizer,
                                 labels)

    # Model
    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.num_labels = len(labels)
    model = Model(model_config)
    ckpt = torch.load(args.model_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt, strict=False)
    model.to(device)
    evaluate(model, dev_data_raw, 'dev')
    evaluate(model, test_data_raw, 'test')
def scatter_tsne(dataset_name):
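    """Embed the training features in 2-D with t-SNE and scatter-plot them, coloured by label."""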
    data_dict = prepare_data(dataset_name)
    x_train, y_train, _, _ = data_dict.values()

    num_label = len(np.unique(y_train))

    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    x_train_ = tsne.fit_transform(x_train)

    df = pd.DataFrame(np.concatenate([x_train_, y_train[:, None]], axis=1),
                      columns=["tsne-one", "tsne-two", "y"])

    plt.figure(figsize=(8, 5))
    sns.scatterplot(x="tsne-one",
                    y="tsne-two",
                    hue="y",
                    palette=sns.color_palette("hls", num_label),
                    data=df,
                    legend="full",
                    alpha=0.7)
    plt.title(f"TSNE Visualization of Dataset: {dataset_name}")
    plt.show()
def scatter_pca(dataset_name):
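    """Embed the training features in 2-D with PCA, scatter-plot them by label, and save the figure."""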
    data_dict = prepare_data(dataset_name)
    x_train, y_train, _, _ = data_dict.values()

    num_label = len(np.unique(y_train))

    pca = decomposition.PCA(n_components=2)
    x_train_ = pca.fit_transform(x_train)

    df = pd.DataFrame(np.concatenate([x_train_, y_train[:, None]], axis=1),
                      columns=["pca-one", "pca-two", "y"])

    plt.figure(figsize=(8, 5))
    sns.scatterplot(x="pca-one",
                    y="pca-two",
                    hue="y",
                    palette=sns.color_palette("hls", num_label),
                    data=df,
                    legend="full",
                    alpha=0.7)
    plt.title(f"PCA Visualization of Dataset: {dataset_name}")
    plt.savefig("pca_" + dataset_name + ".png")
def train(config):
    """Train a model using data."""
    # Prepare data.
    print("Preparing data in %s" % config.data_dir)
    train_ids, dev_ids, _ = data_utils.prepare_data(config.data_dir,
                                                    config.vocab_size)

    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        if not os.path.exists(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)

        # Create model.
        print("Creating %d layers of %d units." %
              (config.num_layers, config.size))
        model = create_model(sess, config, False)

        if not config.probabilistic:
            # Deterministic model: keep the KL-rate weight at zero.
            model.kl_rate_update(0.0)

        train_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.model_dir, "train"), graph=sess.graph)
        dev_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.model_dir, "test"), graph=sess.graph)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)." %
              config.max_train_data_size)

        dev_set = read_data(dev_ids, config)
        train_set = read_data(train_ids, config, config.max_train_data_size)
        train_bucket_sizes = [
            len(train_set[b]) for b in xrange(len(config.buckets))
        ]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. The length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        KL_loss = 0.0
        current_step = model.global_step.eval()
        step_loss_summaries = []
        step_KL_loss_summaries = []
        overall_start_time = time.time()
        print('Start training')
        while True:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in xrange(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, step_KL_loss, _ = model.step(sess, encoder_inputs,
                                                       decoder_inputs,
                                                       target_weights,
                                                       bucket_id, False,
                                                       config.probabilistic)

            if (config.anneal
                    and model.global_step.eval() > config.kl_rate_rise_time
                    and model.kl_rate.eval() < 1):
                new_kl_rate = model.kl_rate.eval() + config.kl_rate_rise_factor
                sess.run(model.kl_rate_update,
                         feed_dict={'new_kl_rate': new_kl_rate})

            step_time += (time.time() -
                          start_time) / config.steps_per_checkpoint
            step_loss_summaries.append(
                tf.Summary(value=[
                    tf.Summary.Value(tag="step loss",
                                     simple_value=float(step_loss))
                ]))
            step_KL_loss_summaries.append(
                tf.Summary(value=[
                    tf.Summary.Value(tag="KL step loss",
                                     simple_value=float(step_KL_loss))
                ]))
            loss += step_loss / config.steps_per_checkpoint
            KL_loss += step_KL_loss / config.steps_per_checkpoint
            current_step = model.global_step.eval()

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % config.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(
                    float(loss)) if loss < 300 else float("inf")
                print(
                    "global step %d learning rate %.4f step-time %.2f perplexity "
                    "%.2f" %
                    (model.global_step.eval(), model.learning_rate.eval(),
                     step_time, perplexity))

                print(
                    "global step %d learning rate %.4f step-time %.2f KL divergence "
                    "%.2f" % (model.global_step.eval(),
                              model.learning_rate.eval(), step_time, KL_loss))
                wall_time = time.time() - overall_start_time
                print("time passed: {0}".format(wall_time))

                # Add perplexity, KL divergence to summary and stats.
                perp_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="train perplexity",
                                     simple_value=perplexity)
                ])
                train_writer.add_summary(perp_summary, current_step)
                KL_loss_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="KL divergence", simple_value=KL_loss)
                ])
                train_writer.add_summary(KL_loss_summary, current_step)
                # Back-fill the per-step summaries collected since the last checkpoint.
                for i, summary in enumerate(step_loss_summaries):
                    train_writer.add_summary(
                        summary, current_step - len(step_loss_summaries) + i)
                step_loss_summaries = []
                for i, summary in enumerate(step_KL_loss_summaries):
                    train_writer.add_summary(
                        summary, current_step - len(step_KL_loss_summaries) + i)
                step_KL_loss_summaries = []

                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.model_dir,
                                               FLAGS.model_name + ".ckpt")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss, KL_loss = 0.0, 0.0, 0.0

                # Run evals on development set and print their perplexity.
                eval_losses = []
                eval_KL_losses = []
                eval_bucket_num = 0
                for bucket_id in xrange(len(config.buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        print("  eval: empty bucket %d" % (bucket_id))
                        continue
                    eval_bucket_num += 1
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        dev_set, bucket_id)
                    _, eval_loss, eval_KL_loss, _ = model.step(
                        sess, encoder_inputs, decoder_inputs, target_weights,
                        bucket_id, True, config.probabilistic)
                    eval_losses.append(float(eval_loss))
                    eval_KL_losses.append(float(eval_KL_loss))
                    eval_ppx = math.exp(
                        float(eval_loss)) if eval_loss < 300 else float("inf")
                    print("  eval: bucket %d perplexity %.2f" %
                          (bucket_id, eval_ppx))

                    eval_perp_summary = tf.Summary(value=[
                        tf.Summary.Value(tag="eval perplexity for bucket {0}".
                                         format(bucket_id),
                                         simple_value=eval_ppx)
                    ])
                    dev_writer.add_summary(eval_perp_summary, current_step)

                mean_eval_loss = sum(eval_losses) / float(eval_bucket_num)
                mean_eval_KL_loss = sum(eval_KL_losses) / float(
                    eval_bucket_num)
                mean_eval_ppx = math.exp(float(mean_eval_loss))
                print("  eval: mean perplexity {0}".format(mean_eval_ppx))

                eval_loss_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="mean eval loss",
                                     simple_value=float(mean_eval_ppx))
                ])
                dev_writer.add_summary(eval_loss_summary, current_step)
                eval_KL_loss_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="mean eval loss",
                                     simple_value=float(mean_eval_KL_loss))
                ])
                dev_writer.add_summary(eval_KL_loss_summary, current_step)
  def train():
    """Train a query2vec model"""
    # Prepare train data.
    print("Preparing Seq2seq Model in %s" % FLAGS.train_dir)
    train_data, test_data, _ = data_utils.prepare_data(FLAGS.train_dir, FLAGS.vocab_size)
    checkpoint_path = os.path.join(FLAGS.train_dir, FLAGS.seq2seq_model)

    print("Loading training data from %s" % train_data)
    print("Loading development data from %s" % test_data)

    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options,
                                          intra_op_parallelism_threads=20)) as sess:
      # Create model.
      print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
      with tf.device("/gpu:0"):
        model = model_helper.create_model(sess, False)

      # Read data into buckets and compute their sizes.
      print("Reading development and training data (limit: %d)."
            % FLAGS.max_train_data_size)
      test_set = data_utils.read_data(test_data)
      train_set = data_utils.read_data(train_data, max_size=FLAGS.max_train_data_size)
      train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
      train_total_size = float(sum(train_bucket_sizes))

      # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
      # to select a bucket. The length of [scale[i], scale[i+1]] is proportional to
      # the size of the i-th training bucket, as used later.
      train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                             for i in xrange(len(train_bucket_sizes))]

      # This is the training loop.
      step_time, loss = 0.0, 0.0
      current_step = 0
      previous_losses = []
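      # Best (lowest) eval perplexity seen so far for each bucket.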
      prev_loss = [1000000] * len(_buckets)

      train_writer = tf.summary.FileWriter(os.path.join("summary/train"), sess.graph)
      test_writer = tf.summary.FileWriter(os.path.join("summary/test"), sess.graph)
      while True:
        # Choose a bucket according to data distribution. We pick a random number
        # in [0, 1] and use the corresponding interval in train_buckets_scale.
        random_number_01 = np.random.random_sample()
        bucket_id = min([i for i in xrange(len(train_buckets_scale))
                         if train_buckets_scale[i] > random_number_01])

        # Get a batch and make a step.
        start_time = time.time()
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set[bucket_id], bucket_id)
        summaries, _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                target_weights, bucket_id, False)
        step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
        loss += step_loss / FLAGS.steps_per_checkpoint
        current_step += 1
        if current_step % FLAGS.steps_per_summary == 0:
          train_writer.add_summary(summaries, current_step)
          train_writer.flush()
          print('Step: %s' % current_step)
        # Once in a while, we save checkpoint, print statistics, and run evals.
        if current_step % FLAGS.steps_per_checkpoint == 0:
          # Print statistics for the previous epoch.
          perplexity = math.exp(loss) if loss < 300 else float('inf')
          print("global step %d learning rate %.4f step-time %.2f perplexity "
                "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                          step_time, perplexity))
          # Decrease learning rate if no improvement was seen over last 3 times.
          if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
            sess.run(model.learning_rate_decay_op)
          previous_losses.append(loss)
          # Save checkpoint and zero timer and loss.
          step_time, loss = 0.0, 0.0
          # Run evals on development set and print their perplexity.
          count = 0
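          # Count buckets whose eval perplexity improved; the model is saved only if more than a third do.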
          for bucket_id in xrange(len(_buckets)):
            if len(test_set[bucket_id]) == 0:
              print("  eval: empty bucket %d" % (bucket_id))
              continue
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              test_set[bucket_id], bucket_id)
            summaries, _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                    target_weights, bucket_id, True)
            test_writer.add_summary(summaries, current_step)
            eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
            if eval_ppx < prev_loss[bucket_id]:
              prev_loss[bucket_id] = eval_ppx
              count += 1
            print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))

          if count > len(_buckets) / 3:
            print("saving model...")
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)
          sys.stdout.flush()
          test_writer.flush()
def train():
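    """Train on the prepared PTB data, checkpointing and evaluating on the
    validation set every config.steps_per_checkpoint steps."""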
    # data preparation
    config = Config.ModelConfig()
    data_path, vocab_path = {}, {}
    data_path['train'] = FLAGS.data_train_path
    data_path['val'] = FLAGS.data_val_path
    data_path['test'] = FLAGS.data_test_path
    vocab_path['input'] = FLAGS.vocab_input_file
    vocab_path['output'] = FLAGS.vocab_output_file
    #inputs = (data_path, vocab_path, data_mode, config.num_input_symbols, None, None)
    #inputs = (data_path, vocab_path, None, None, None)
    inputs = (data_path, vocab_path)
    (vocab_input,rev_vocab_input), (vocab_output,rev_vocab_output), data = \
     data_utils.prepare_data('PTB_data', inputs)
    data_train, data_val = data['train'], data['val']
    config.max_training_steps = int(config.steps_per_checkpoint * round(
        len(data_train) * float(FLAGS.max_epochs) / config.batch_size /
        config.steps_per_checkpoint))
    config.num_input_symbols = len(vocab_input)
    config.num_output_symbols = len(vocab_output)
    print("maximum training steps: " + str(config.max_training_steps))

    step_time, losses = 0.0, 0.0
    f_losses, b_losses = 0.0, 0.0
    current_step = 0
    previous_losses = []
    with tf.Session() as sess:
        model = create_model(sess,
                             config,
                             'train',
                             False,
                             cell_mode=FLAGS.cell_mode)
        for i in xrange(config.max_training_steps):
            start_time = time.time()

            inputs, targets, target_weights = model.get_batch(data_train)
            b_loss, loss, f_loss = model.step(sess, inputs, targets,
                                              target_weights, False)

            # Once in a while, we save checkpoint, print statistics, and run evals.
            step_time += (time.time() -
                          start_time) / config.steps_per_checkpoint
            losses += loss / config.steps_per_checkpoint
            if isinstance(f_loss, np.float32):
                f_losses += f_loss / config.steps_per_checkpoint
            if isinstance(b_loss, np.float32):
                b_losses += b_loss / config.steps_per_checkpoint
            current_step += 1
            if current_step % config.steps_per_checkpoint == 0:
                perplexity = math.exp(losses) if losses < 300 else float('inf')
                print(
                    "global step %d learning rate %.4f step-time %.2f perplexity "
                    "%.2f" %
                    (model.global_step.eval(), model.learning_rate.eval(),
                     step_time, perplexity))
                if f_losses:
                    print(' ' * 20 + "training forward perplexity %.2f" %
                          (math.exp(f_losses
                                    ) if f_losses < 300 else float('inf')))

                if b_losses:
                    print(' ' * 20 + "training backward perplexity %.2f" %
                          (math.exp(b_losses
                                    ) if b_losses < 300 else float('inf')))
                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and losses > max(
                        previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(losses)
                # Save checkpoint and zero timer and loss.
                model.saver.save(sess,
                                 FLAGS.checkpoint_path,
                                 global_step=model.global_step)
                step_time, losses = 0.0, 0.0
                f_losses, b_losses = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                val_inputs, val_targets, val_target_weights = model.get_batch(
                    data_val)
                _, val_loss, val_loss_fb = model.step(sess, val_inputs,
                                                      val_targets,
                                                      val_target_weights, True)
                val_ppx = math.exp(val_loss) if val_loss < 300 else float(
                    'inf')
                print("  val: perplexity %.2f" % val_ppx)
                if isinstance(val_loss_fb, list):
                    if len(val_loss_fb) == 2:
                        f_val_loss, b_val_loss = val_loss_fb
                    else:
                        f_val_loss, b_val_loss = val_loss_fb, None
                    if isinstance(f_val_loss, np.float32):
                        print("val: forward perplexity %.2f" %
                              (math.exp(f_val_loss)
                               if f_val_loss < 300 else float('inf')))
                    if isinstance(b_val_loss, np.float32):
                        print("val: backward perplexity %.2f" %
                              (math.exp(b_val_loss)
                               if b_val_loss < 300 else float('inf')))
                sys.stdout.flush()
def main(args):
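    """Fine-tune on the training split, keep the best checkpoint by dev accuracy, and evaluate it on the test split."""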
    # Init
    set_seed(args.seed)
    processor = glue_processor[args.task_name.lower()]
    tokenizer = BertTokenizer.from_pretrained(args.model_path,
                                              do_lower_case=True)
    tokenizer.add_special_tokens(
        {"additional_special_tokens": ADDITIONAL_SPECIAL_TOKENS})

    # Data
    train_examples = processor.get_train_examples(args.data_dir)
    dev_examples = processor.get_dev_examples(args.data_dir)
    test_examples = processor.get_test_examples(args.data_dir)
    labels = processor.get_labels(args.data_dir)
    train_data_raw = prepare_data(train_examples, args.max_seq_len, tokenizer,
                                  labels)
    dev_data_raw = prepare_data(dev_examples, args.max_seq_len, tokenizer,
                                labels)
    test_data_raw = prepare_data(test_examples, args.max_seq_len, tokenizer,
                                 labels)
    print("# train examples %d" % len(train_data_raw))
    print("# dev examples %d" % len(dev_data_raw))
    print("# test examples %d" % len(test_data_raw))
    train_data = ClassificationDataset(train_data_raw)
    train_dataloader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  sampler=RandomSampler(train_data))

    # Model
    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.num_labels = len(labels)

    if not os.path.exists(args.bert_ckpt_path):
        args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path,
                                            args.bert_config_path, 'assets')
    model = Model.from_pretrained(
        config=model_config, pretrained_model_name_or_path=args.bert_ckpt_path)
    model.to(device)

    # Optimizer
    num_train_steps = int(
        len(train_data_raw) / args.batch_size * args.n_epochs)
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # Split parameters into decay / no-decay groups (both currently use weight decay 0.0).
    optimizer_grouped_parameters = [
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, 0, num_train_steps)
    # scheduler = get_linear_schedule_with_warmup(optimizer, int(num_train_steps * warmup),num_train_steps)
    loss_fnc = nn.CrossEntropyLoss()

    # Training
    best_epoch = 0
    best_acc = 0.0
    train_pbar = trange(0, args.n_epochs, desc="Epoch")
    for epoch in range(args.n_epochs):
        batch_loss = []
        epoch_pbar = tqdm(train_dataloader, desc="Iteration", disable=False)
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = [
                b.to(device) if not isinstance(b, int) else b for b in batch
            ]
            input_ids, segment_ids, input_mask, label_ids, e1_mask, e2_mask = batch
            output = model(input_ids, segment_ids, input_mask, e1_mask,
                           e2_mask)
            loss = loss_fnc(output, label_ids)
            batch_loss.append(loss.item())
            loss.backward()
            optimizer.step()
            scheduler.step()

            model.zero_grad()
            epoch_pbar.update(1)
            if (step + 1) % 100 == 0:
                print("[%d/%d] [%d/%d] mean_loss : %.3f" \
                      % (epoch + 1, args.n_epochs, step + 1,
                         len(train_dataloader), np.mean(batch_loss)))
        epoch_pbar.close()
        print('Epoch %d mean loss: %.3f' % (epoch + 1, np.mean(batch_loss)))
        acc = evaluate(model, dev_data_raw)
        if acc > best_acc:
            best_acc = acc
            best_epoch = epoch + 1
            save_path = os.path.join(args.save_dir, 'model_best.bin')
            torch.save(model.state_dict(), save_path)
        print("Best Score : ", best_acc, ' in epoch ', best_epoch, '.')
        train_pbar.update(1)
    train_pbar.close()
    ckpt = torch.load(os.path.join(args.save_dir, 'model_best.bin'))
    model.load_state_dict(ckpt)
    evaluate(model, test_data_raw, mode="test")
    for dataset_name in args.dataset_list:
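        # For each dataset, run args.num_repeat independent repetitions and collect train/val/test results.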

        model_path = os.path.join(
            "output", "models", args.model_name + "_" + dataset_name + ".pkl")

        train_results_list = []
        val_results_list = []
        test_results_list = []

        for rep in range(args.num_repeat):

            try:
                print("********************")
                print(f"Dataset: {dataset_name}, Repeat index: {rep+1}")

                data_dict = prepare_data(dataset_name, overwrite_flag=True)

                data_split = data_dict.values()
                x_train, y_train, x_test, y_test = data_split
                x_train_simulated, y_train_simulated = \
                    simulate_missing_labels(x_train, y_train, config_obj.simulation_params)

                if not args.load:
                    model_params = params_dispatcher[args.model_name]
                    model = model_dispatcher[args.model_name](model_params)
                else:
                    with open(args.model_path, "rb") as f:
                        model = pickle.load(f)

                if "train" in args.mode:
                    model.fit(x_train_simulated, y_train_simulated)