Example #1
def make_dataset(tf_record_files: str,
                 num_treatments: int,
                 is_training: bool,
                 is_eval=False,
                 missing_outcomes=False,
                 do_masking=False,
                 input_pipeline_context=None):
    df_file = FLAGS.label_df_file
    dataset = load_basic_bert_data(
        tf_record_files,
        FLAGS.max_seq_length,
        is_training=is_training,
        input_pipeline_context=input_pipeline_context)

    label_df = pd.read_feather(df_file)
    dataset = dataset_labels_from_pandas(dataset, label_df)

    # todo: hardcoded for demo, but not the smartest way to do this
    def _standardize_label_naming(f, l):
        l['outcome'] = l.pop('accepted')
        l['treatment'] = l.pop('year')
        if missing_outcomes:
            l['outcome_observed'] = tf.not_equal(l['outcome'], -1)
            # placeholder so that passed-in labels are non-negative
            l['outcome'] = tf.where(l['outcome_observed'], l['outcome'],
                                    tf.zeros_like(l['outcome']))

        return f, l

    dataset = dataset.map(_standardize_label_naming,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = make_test_train_splits(dataset,
                                     num_splits=FLAGS.num_splits,
                                     dev_splits=FLAGS.dev_splits,
                                     test_splits=FLAGS.test_splits)

    if do_masking:
        tokenizer = tokenization.FullTokenizer(
            vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
        dataset = add_masking(dataset, tokenizer=tokenizer)

    if is_training:
        dataset = filter_training(dataset, is_training=not is_eval)

        # batching needs to happen before sample weights are created
        dataset = dataset.shuffle(25000)
        dataset = dataset.batch(FLAGS.train_batch_size, drop_remainder=True)

        # create sample weights and label outputs in the manner expected by keras
        hydra_keras_format = make_hydra_keras_format(
            num_treatments, missing_outcomes=missing_outcomes)
        dataset = dataset.map(hydra_keras_format,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return dataset

    else:
        return dataset.batch(FLAGS.eval_batch_size)
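# A minimal usage sketch for make_dataset above, assuming the absl FLAGS the
# function reads (label_df_file, max_seq_length, num_splits, dev_splits,
# test_splits, vocab_file, do_lower_case, train_batch_size, eval_batch_size)
# have already been parsed and that the PeerRead tf_record used in the later
# examples exists; the num_treatments value is illustrative.
train_dataset = make_dataset('../dat/PeerRead/proc/arxiv-all.tf_record',
                             num_treatments=2,
                             is_training=True,
                             missing_outcomes=False,
                             do_masking=True)
print(train_dataset.element_spec)  # inspect the batched feature/label structure
first_batch = next(iter(train_dataset))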
def main(_):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.io.gfile.glob(input_pattern))

    logging.info("*** Reading from input files ***")
    for input_file in input_files:
        logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
        FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
        rng)

    output_files = FLAGS.output_file.split(",")
    logging.info("*** Writing to output files ***")
    for output_file in output_files:
        logging.info("  %s", output_file)

    write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                    FLAGS.max_predictions_per_seq, output_files)
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--shuffle_buffer_size', type=int, default=100)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--max_abs_len', type=int, default=128)

    args = parser.parse_args()

    # for easy debugging
    filename = '../dat/reddit/proc.tf_record'

    # bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    #                             trainable=True)
    # vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    vocab_file = '../pre-trained/uncased_L-12_H-768_A-12/vocab.txt'

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)

    num_splits = 10
    # dev_splits = [0]
    # test_splits = [0]
    dev_splits = []
    test_splits = [1]

    labeler = make_real_labeler('gender', 'log_score')

    input_dataset_from_filenames = make_input_fn_from_file(
        filename,
        args.max_abs_len,
        num_splits,
        dev_splits,
        test_splits,
        tokenizer,
        do_masking=True,
        is_training=True,
        filter_test=False,
        filter_train=True,
        shuffle_buffer_size=1000,
        labeler=labeler,
        seed=0,
        subreddits=[13])
    params = {'batch_size': 64}
    dataset = input_dataset_from_filenames(params)
    # dataset = filter_training(dataset)

    sample = next(iter(dataset))
    print(sample)

    print("start")
    for val in dataset.take(100):
        print("hit")
        sample = val
    print("end")

    sample = next(iter(dataset))
    print(sample)
def generate_tf_record_from_data_file(processor,
                                      data_dir,
                                      vocab_file,
                                      train_data_output_path=None,
                                      eval_data_output_path=None,
                                      max_seq_length=128,
                                      do_lower_case=True):
  """Generates and saves training data into a tf record file.

  Arguments:
      processor: Input processor object to be used for generating data. Subclass
        of `DataProcessor`.
      data_dir: Directory that contains train/eval data to process. Data files
        should be named "dev.tsv", "test.tsv", or "train.tsv".
      vocab_file: Text file with words to be used for training/evaluation.
      train_data_output_path: Output to which processed tf record for training
        will be saved.
      eval_data_output_path: Output to which processed tf record for evaluation
        will be saved.
      max_seq_length: Maximum sequence length of the training/eval data to be
        generated.
      do_lower_case: Whether to lower case input text.

  Returns:
      A dictionary containing input meta data.
  """
  assert train_data_output_path or eval_data_output_path

  label_list = processor.get_labels()
  tokenizer = tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)
  assert train_data_output_path
  train_input_data_examples = processor.get_train_examples(data_dir)
  file_based_convert_examples_to_features(train_input_data_examples, label_list,
                                          max_seq_length, tokenizer,
                                          train_data_output_path)
  num_training_data = len(train_input_data_examples)

  if eval_data_output_path:
    eval_input_data_examples = processor.get_dev_examples(data_dir)
    file_based_convert_examples_to_features(eval_input_data_examples,
                                            label_list, max_seq_length,
                                            tokenizer, eval_data_output_path)

  meta_data = {
      "task_type": "bert_classification",
      "processor_type": processor.get_processor_name(),
      "num_labels": len(processor.get_labels()),
      "train_data_size": num_training_data,
      "max_seq_length": max_seq_length,
  }

  if eval_data_output_path:
    meta_data["eval_data_size"] = len(eval_input_data_examples)

  return meta_data
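# A minimal usage sketch for generate_tf_record_from_data_file above, assuming
# a hypothetical DataProcessor subclass (MyTaskProcessor) and illustrative
# paths; data_dir is expected to contain train.tsv and dev.tsv.
processor = MyTaskProcessor()  # hypothetical: any DataProcessor subclass works
input_meta_data = generate_tf_record_from_data_file(
    processor,
    data_dir='data/my_task',
    vocab_file='uncased_L-12_H-768_A-12/vocab.txt',
    train_data_output_path='data/my_task/train.tf_record',
    eval_data_output_path='data/my_task/eval.tf_record',
    max_seq_length=128,
    do_lower_case=True)
print(input_meta_data['train_data_size'], input_meta_data['num_labels'])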
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--shuffle_buffer_size', type=int, default=100)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--max_abs_len', type=int, default=250)

    args = parser.parse_args()

    # for easy debugging
    # tsv_file = "../../dat/PeerRead/proc/acl_2017.tf_record"
    # tsv_file = glob.glob('/home/victor/Documents/causal-spe-embeddings/dat/PeerRead/proc/*.tf_record')
    filename = '../dat/PeerRead/proc/arxiv-all.tf_record'

    # bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    #                             trainable=True)
    # vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    vocab_file = '../pre-trained/uncased_L-12_H-768_A-12/vocab.txt'

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)

    num_splits = 10
    # dev_splits = [0]
    # test_splits = [0]
    dev_splits = []
    test_splits = [1, 2]

    # labeler = make_buzzy_based_simulated_labeler(0.5, 5.0, 0.0, 'simple',
    #                                              seed=0)

    labeler = make_real_labeler('venue', 'accepted')

    input_dataset_from_filenames = make_dataset_fn_from_file(
        filename,
        250,
        num_splits,
        dev_splits,
        test_splits,
        tokenizer,
        do_masking=False,
        is_training=True,
        filter_test=False,
        shuffle_buffer_size=25000,
        labeler=labeler,
        seed=0)
    params = {'batch_size': 10000}
    dataset = input_dataset_from_filenames(params)

    print(dataset.element_spec)

    for val in dataset.take(1):
        sample = val

    sample = next(iter(dataset))
    print(tf.unique(sample[1]['treatment']))
def make_dataset(is_training: bool, do_masking=False):
    if FLAGS.simulated == 'real':
        labeler = make_real_labeler(FLAGS.treatment, 'log_score')

    elif FLAGS.simulated == 'attribute':
        labeler = make_subreddit_based_simulated_labeler(FLAGS.beta0,
                                                         FLAGS.beta1,
                                                         FLAGS.gamma,
                                                         FLAGS.simulation_mode,
                                                         seed=0)
    else:
        Exception("simulated flag not recognized")

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    dev_splits = [int(s) for s in FLAGS.dev_splits.split()]
    test_splits = [int(s) for s in FLAGS.test_splits.split()]

    if FLAGS.subreddits == '':
        subreddits = None
    else:
        subreddits = [int(s) for s in FLAGS.subreddits.split(',')]

    train_input_fn = make_input_fn_from_file(
        input_files_or_glob=FLAGS.input_files,
        seq_length=FLAGS.max_seq_length,
        num_splits=FLAGS.num_splits,
        dev_splits=dev_splits,
        test_splits=test_splits,
        tokenizer=tokenizer,
        do_masking=do_masking,
        subreddits=subreddits,
        is_training=is_training,
        shuffle_buffer_size=25000,  # note: bert hardcoded this, and I'm following suit
        seed=FLAGS.seed,
        labeler=labeler,
        filter_train=is_training)

    batch_size = FLAGS.train_batch_size if is_training else FLAGS.eval_batch_size

    dataset = train_input_fn(params={'batch_size': batch_size})

    # format expected by Keras for training
    if is_training:
        # dataset = filter_training(dataset)
        dataset = dataset.map(_keras_format,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.prefetch(4)

    return dataset
    def test_full_tokenizer(self):
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing", ","
        ]
        with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
            if six.PY2:
                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
            else:
                vocab_writer.write("".join([x + "\n" for x in vocab_tokens
                                            ]).encode("utf-8"))

            vocab_file = vocab_writer.name

        tokenizer = tokenization.FullTokenizer(vocab_file)
        os.unlink(vocab_file)

        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
        self.assertAllEqual(tokens,
                            ["un", "##want", "##ed", ",", "runn", "##ing"])

        self.assertAllEqual(tokenizer.convert_tokens_to_ids(tokens),
                            [7, 4, 5, 10, 8, 9])
Example #8
def generate_tf_record_from_json_file(input_file_path,
                                      vocab_file_path,
                                      output_path,
                                      max_seq_length=384,
                                      do_lower_case=True,
                                      max_query_length=64,
                                      doc_stride=128,
                                      version_2_with_negative=False):
    """Generates and saves training data into a tf record file."""
    train_examples = read_squad_examples(
        input_file=input_file_path,
        is_training=True,
        version_2_with_negative=version_2_with_negative)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path,
                                           do_lower_case=do_lower_case)
    train_writer = FeatureWriter(filename=output_path, is_training=True)
    number_of_examples = convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=True,
        output_fn=train_writer.process_feature)
    train_writer.close()

    meta_data = {
        "task_type": "bert_squad",
        "train_data_size": number_of_examples,
        "max_seq_length": max_seq_length,
        "max_query_length": max_query_length,
        "doc_stride": doc_stride,
        "version_2_with_negative": version_2_with_negative,
    }

    return meta_data
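# A minimal usage sketch for generate_tf_record_from_json_file above; the
# SQuAD v1.1 file paths are illustrative assumptions. For SQuAD v2.0 data,
# pass version_2_with_negative=True so unanswerable questions are handled.
squad_meta_data = generate_tf_record_from_json_file(
    input_file_path='squad/train-v1.1.json',
    vocab_file_path='uncased_L-12_H-768_A-12/vocab.txt',
    output_path='squad/train.tf_record',
    max_seq_length=384,
    do_lower_case=True)
print(squad_meta_data['train_data_size'])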
def make_dataset(is_training: bool, do_masking=False):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    dev_splits = [int(s) for s in FLAGS.dev_splits.split()]
    test_splits = [int(s) for s in FLAGS.test_splits.split()]

    train_input_fn = make_input_fn_from_file(
        input_files_or_glob=FLAGS.input_files,
        seq_length=FLAGS.max_seq_length,
        num_splits=FLAGS.num_splits,
        dev_splits=dev_splits,
        test_splits=test_splits,
        tokenizer=tokenizer,
        do_masking=do_masking,
        is_training=is_training,
        shuffle_buffer_size=25000,  # note: bert hardcoded this, and I'm following suit
        seed=FLAGS.seed)

    batch_size = FLAGS.train_batch_size if is_training else FLAGS.eval_batch_size

    dataset = train_input_fn(params={'batch_size': batch_size})
    dataset = dataset.map(_make_length_labels)
    return dataset
Example #10
def make_dataset(is_training: bool, do_masking=False):
    if FLAGS.simulated == 'real':
        labeler = make_real_labeler(FLAGS.treatment, 'accepted')

    elif FLAGS.simulated == 'attribute':
        labeler = make_buzzy_based_simulated_labeler(FLAGS.beta0,
                                                     FLAGS.beta1,
                                                     FLAGS.gamma,
                                                     FLAGS.simulation_mode,
                                                     seed=0)
    elif FLAGS.simulated == 'propensity':
        model_predictions = pd.read_csv(FLAGS.base_propensities_path, sep='\t')

        base_propensity_scores = model_predictions['g']
        example_indices = model_predictions['index']

        labeler = make_propensity_based_simulated_labeler(
            treat_strength=FLAGS.beta0,
            con_strength=FLAGS.beta1,
            noise_level=FLAGS.gamma,
            base_propensity_scores=base_propensity_scores,
            example_indices=example_indices,
            exogeneous_con=FLAGS.exogenous_confounding,
            setting=FLAGS.simulation_mode,
            seed=0)

    else:
        Exception("simulated flag not recognized")

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    dev_splits = [int(s) for s in FLAGS.dev_splits.split()]
    test_splits = [int(s) for s in FLAGS.test_splits.split()]

    train_input_fn = make_dataset_fn_from_file(
        input_files_or_glob=FLAGS.input_files,
        seq_length=FLAGS.max_seq_length,
        num_splits=FLAGS.num_splits,
        dev_splits=dev_splits,
        test_splits=test_splits,
        tokenizer=tokenizer,
        do_masking=do_masking,
        is_training=is_training,
        shuffle_buffer_size=25000,  # note: bert hardcoded this, and I'm following suit
        seed=FLAGS.seed,
        labeler=labeler)

    batch_size = FLAGS.train_batch_size if is_training else FLAGS.eval_batch_size

    dataset = train_input_fn(params={'batch_size': batch_size})

    # format expected by Keras for training
    if is_training:
        dataset = filter_training(dataset)
        dataset = dataset.map(_keras_format,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
def predict_squad(strategy, input_meta_data):
    """Makes predictions for a squad dataset_."""
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    doc_stride = input_meta_data['doc_stride']
    max_query_length = input_meta_data['max_query_length']
    # Whether data should be in Ver 2.0 format.
    version_2_with_negative = input_meta_data.get('version_2_with_negative',
                                                  False)
    eval_examples = squad_lib.read_squad_examples(
        input_file=FLAGS.predict_file,
        is_training=False,
        version_2_with_negative=version_2_with_negative)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    eval_writer = squad_lib.FeatureWriter(
        filename=os.path.join(FLAGS.model_dir, 'eval.tf_record'),
        is_training=False)
    eval_features = []

    def _append_feature(feature, is_padding):
        if not is_padding:
            eval_features.append(feature)
        eval_writer.process_feature(feature)

    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples which are ignored
    # later on.
    dataset_size = squad_lib.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=input_meta_data['max_seq_length'],
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        output_fn=_append_feature,
        batch_size=FLAGS.predict_batch_size)
    eval_writer.close()

    logging.info('***** Running predictions *****')
    logging.info('  Num orig examples = %d', len(eval_examples))
    logging.info('  Num split examples = %d', len(eval_features))
    logging.info('  Batch size = %d', FLAGS.predict_batch_size)

    num_steps = int(dataset_size / FLAGS.predict_batch_size)
    all_results = predict_squad_customized(strategy, input_meta_data,
                                           bert_config, eval_writer.filename,
                                           num_steps)

    output_prediction_file = os.path.join(FLAGS.model_dir, 'predictions.json')
    output_nbest_file = os.path.join(FLAGS.model_dir, 'nbest_predictions.json')
    output_null_log_odds_file = os.path.join(FLAGS.model_dir, 'null_odds.json')

    squad_lib.write_predictions(eval_examples,
                                eval_features,
                                all_results,
                                FLAGS.n_best_size,
                                FLAGS.max_answer_length,
                                FLAGS.do_lower_case,
                                output_prediction_file,
                                output_nbest_file,
                                output_null_log_odds_file,
                                verbose=FLAGS.verbose_logging)
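# A small sketch of the padding arithmetic described inside predict_squad
# above: on TPU every batch must be full, so the converter pads the eval
# features up to a multiple of the predict batch size with fake examples,
# which _append_feature leaves out of eval_features so they are ignored when
# predictions are written. The counts below are illustrative assumptions.
num_real_features = 10570               # split examples produced by the converter
predict_batch_size = 32
num_padding = (-num_real_features) % predict_batch_size
dataset_size = num_real_features + num_padding
num_steps = dataset_size // predict_batch_size
print(num_padding, num_steps)           # 22 fake features, 331 prediction steps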
Example #12
def run_customized_training(strategy, bert_config, max_seq_length,
                            max_predictions_per_seq, model_dir,
                            steps_per_epoch, steps_per_loop, epochs,
                            initial_lr, warmup_steps, input_files,
                            train_batch_size):
    """Run BERT pretrain model training using low-level API."""

    # train_input_fn = functools.partial(get_pretrain_input_data, input_files,
    #                                    max_seq_length, max_predictions_per_seq,
    #                                    train_batch_size, strategy)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    def train_input_fn():
        train_input_fn = make_input_fn_from_file(
            input_files_or_glob=FLAGS.input_files,
            seq_length=FLAGS.max_seq_length,
            num_splits=1,
            dev_splits=[2],
            test_splits=[2],
            tokenizer=tokenizer,
            is_training=True,
            shuffle_buffer_size=25000,  # note: bert hardcoded this, and I'm following suit
            seed=FLAGS.seed,
            labeler=None)
        return train_input_fn(params={'batch_size': train_batch_size})

    def _get_pretrain_model():
        """Gets a pretraining model."""
        pretrain_model, core_model = bert_models.pretrain_model(
            bert_config, max_seq_length, max_predictions_per_seq)
        pretrain_model.optimizer = optimization.create_optimizer(
            initial_lr, steps_per_epoch * epochs, warmup_steps)
        if FLAGS.fp16_implementation == 'graph_rewrite':
            # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
            # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
            # which will ensure tf.compat.v2.keras.mixed_precision and
            # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
            # up.
            pretrain_model.optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
                pretrain_model.optimizer)
        return pretrain_model, core_model

    trained_model = model_training_utils.run_customized_training_loop(
        strategy=strategy,
        model_fn=_get_pretrain_model,
        loss_fn=get_loss_fn(
            loss_factor=1.0 / strategy.num_replicas_in_sync
            if FLAGS.scale_loss else 1.0),
        model_dir=model_dir,
        train_input_fn=train_input_fn,
        steps_per_epoch=steps_per_epoch,
        steps_per_loop=steps_per_loop,
        epochs=epochs,
        init_checkpoint=FLAGS.init_checkpoint,
    )

    # Creates the BERT core model outside distribution strategy scope.
    _, core_model = bert_models.pretrain_model(bert_config, max_seq_length,
                                               max_predictions_per_seq)

    # Restores the core model from model checkpoints and gets a new checkpoint
    # that only contains the core model.
    model_saving_utils.export_pretraining_checkpoint(checkpoint_dir=model_dir,
                                                     model=core_model)
    return trained_model
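# A minimal usage sketch for run_customized_training above; the strategy,
# config path, model_dir, and schedule numbers are illustrative assumptions,
# and the FLAGS the function reads (vocab_file, do_lower_case, input_files,
# max_seq_length, seed, init_checkpoint, scale_loss, fp16_implementation)
# are assumed to be defined and parsed already.
strategy = tf.distribute.MirroredStrategy()
bert_config = modeling.BertConfig.from_json_file(
    'pre-trained/uncased_L-12_H-768_A-12/bert_config.json')
trained = run_customized_training(
    strategy,
    bert_config,
    max_seq_length=128,
    max_predictions_per_seq=20,
    model_dir='output/pretrain',
    steps_per_epoch=1000,
    steps_per_loop=100,
    epochs=3,
    initial_lr=2e-5,
    warmup_steps=100,
    input_files=None,  # unused here: the inner train_input_fn reads FLAGS.input_files
    train_batch_size=32)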