Example #1
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    news_config = GroverConfig.from_json_file(FLAGS.config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(
        news_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
    )

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.train_batch_size,
        params={'model_dir': FLAGS.output_dir})

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(input_files=input_files,
                                      seq_length=FLAGS.max_seq_length,
                                      is_training=True)

    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
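
# ----------------------------------------------------------------------
# Context sketch (an assumption, not part of the example): main(_) above
# is a standard TF1 tf.app.run entry point. The flag names come from the
# FLAGS references in this example (only a few are shown); defaults and
# help strings are illustrative.
# ----------------------------------------------------------------------
import tensorflow as tf

flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_string("config_file", None, "JSON file with the Grover architecture.")
flags.DEFINE_string("input_file", None, "Comma-separated TFRecord glob patterns.")
flags.DEFINE_string("output_dir", None, "Where to write checkpoints.")
flags.DEFINE_bool("use_tpu", False, "Whether to train on a TPU.")

if __name__ == "__main__":
    tf.app.run()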
Example #2
parser.add_argument(
    '-top_p',
    dest='top_p',
    default=0.95,
    type=float,
    help='p to use for top-p sampling. If this isn\'t None, use it for everything',
)
parser.add_argument(
    '-samples',
    dest='samples',
    default=1,
    type=int,
    help='num_samples',
)

args = parser.parse_args()

encoder = get_encoder()
news_config = GroverConfig.from_json_file(args.model_config_fn)

# We might have to split the batch into multiple chunks if the batch size is too large
default_mbs = {12: 32, 24: 16, 48: 3}
max_batch_size = (args.max_batch_size if args.max_batch_size is not None
                  else default_mbs[news_config.num_hidden_layers])

# factorize args.batch_size = (num_chunks * batch_size_per_chunk) s.t. batch_size_per_chunk < max_batch_size
num_chunks = int(np.ceil(args.batch_size / max_batch_size))
batch_size_per_chunk = int(np.ceil(args.batch_size / num_chunks))
print("\n~~\nbatch size={}, max batch size={}, num chunks={}, batch size per chunk={}\n~~\n".format(
    args.batch_size, max_batch_size, num_chunks, batch_size_per_chunk), flush=True)
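# Worked example with illustrative numbers: batch_size=20 with a 24-layer
# config gives max_batch_size=16, so num_chunks = ceil(20 / 16) = 2 and
# batch_size_per_chunk = ceil(20 / 2) = 10 -- two chunks of 10 <= 16 each.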

# This controls the top p for each generation.
top_p = np.ones((num_chunks, batch_size_per_chunk), dtype=np.float32) * args.top_p

# with open(args.metadata_fn, 'r') as f:
def main(_):
    LABEL_LIST = ['machine', 'human']
    LABEL_INV_MAP = {label: i for i, label in enumerate(LABEL_LIST)}

    tf.logging.set_verbosity(tf.logging.INFO)

    # Check whether we've already saved something into the output directory.
    if FLAGS.ingore_model_folder_check:
        pass
    elif tf.gfile.Exists(FLAGS.output_dir):
        print(f"The output directory {FLAGS.output_dir} exists!")
        if FLAGS.do_train:
            print("EXITING BECAUSE DO_TRAIN is true", flush=True)
            return
        for split in ['val', 'test']:
            if tf.gfile.Exists(
                    os.path.join(FLAGS.output_dir,
                                 f'{split}-probs.npy')) and getattr(
                                     FLAGS, f'predict_{split}'):
                print(f"EXITING BECAUSE {split}-probs.npy exists", flush=True)
                return
        # Double check to see if it has trained!
        if not tf.gfile.Exists(os.path.join(FLAGS.output_dir, 'checkpoint')):
            print("EXITING BECAUSE NO CHECKPOINT.", flush=True)
            return
        stuff = {}
        with tf.gfile.Open(os.path.join(FLAGS.output_dir, 'checkpoint'),
                           'r') as f:
            # model_checkpoint_path: "model.ckpt-0"
            # all_model_checkpoint_paths: "model.ckpt-0"
            for l in f:
                key, val = l.strip().split(': ', 1)
                stuff[key] = val.strip('"')
        if stuff['model_checkpoint_path'] == 'model.ckpt-0':
            print("EXITING BECAUSE IT LOOKS LIKE NOTHING TRAINED", flush=True)
            return
    elif not FLAGS.do_train:
        print("EXITING BECAUSE DO_TRAIN IS FALSE AND PATH DOESNT EXIST")
        return
    else:
        tf.gfile.MakeDirs(FLAGS.output_dir)

    news_config = GroverConfig.from_json_file(FLAGS.config_file)

    # TODO might have to change this
    encoder = get_encoder()
    examples = {'train': [], 'val': [], 'test': []}
    np.random.seed(123456)
    tf.logging.info("*** Parsing files ***")
    with tf.gfile.Open(FLAGS.input_data, "r") as f:
        for l in f:
            item = json.loads(l)

            # This little hack is because we don't want to tokenize the article twice
            context_ids = _flatten_and_tokenize_metadata(encoder=encoder,
                                                         item=item)
            examples[item['split']].append({
                'info': item,
                'ids': context_ids,
                'label': item['label'],
            })
            assert item['label'] in LABEL_INV_MAP

    additional_data = {'machine': [], 'human': []}
    if FLAGS.additional_data is not None:
        print("NOW WERE LOOKING AT ADDITIONAL INPUT DATA", flush=True)
        with tf.gfile.Open(FLAGS.additional_data, "r") as f:
            for l in f:
                item = json.loads(l)
                # This little hack is because we don't want to tokenize the article twice
                context_ids = _flatten_and_tokenize_metadata(encoder=encoder,
                                                             item=item)
                additional_data[item['label']].append({
                    'info': item,
                    'ids': context_ids,
                    'label': item['label'],
                })

    tf.logging.info("*** Done parsing files ***")
    print("LETS GO", flush=True)
    if FLAGS.max_training_examples > 0:

        examples_by_label = {'human': [], 'machine': []}
        for x in examples['train']:
            examples_by_label[x['label']].append(x)

        new_examples = []
        print("Unique machine examples: {} -> {}".format(
            len(examples_by_label['machine']), FLAGS.max_training_examples),
              flush=True)
        machine_ex_to_keep = examples_by_label[
            'machine'][:FLAGS.max_training_examples]

        # So we just cut down on the TRUE machine examples. Now let's try adding in additional examples.
        # examples_by_label['human'].extend(additional_data['human'])

        if len(additional_data['machine']) > 0:
            amount_to_add = len(
                examples_by_label['human']) - len(machine_ex_to_keep)
            if amount_to_add > 0:
                machine_ex_to_keep.extend(
                    additional_data['machine'][:amount_to_add])

        for i, human_ex in enumerate(examples_by_label['human']):
            new_examples.append(human_ex)
            new_examples.append(machine_ex_to_keep[i %
                                                   len(machine_ex_to_keep)])

        print("Length of examples: {} -> {}".format(len(examples['train']),
                                                    len(new_examples)),
              flush=True)
        examples['train'] = new_examples
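
        # Illustrative trace: with 4 human examples and machine_ex_to_keep of
        # length 2, the loop above yields [h0, m0, h1, m1, h2, m0, h3, m1];
        # the modulo recycles machine examples so the classes stay 1:1.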

    # =============== SETUP TRAINING ===============
    if FLAGS.do_train:
        num_train_steps = int((len(examples['train']) / FLAGS.batch_size) *
                              FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
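        # e.g. 10,000 training examples, batch_size=64, num_train_epochs=3:
        # num_train_steps = int((10000 / 64) * 3) = 468 and, with
        # warmup_proportion=0.1, num_warmup_steps = int(468 * 0.1) = 46.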
        assert num_train_steps > 0
    else:
        num_train_steps = None
        num_warmup_steps = None

    # =============== TRAINING BOILERPLATE ===============
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.iterations_per_loop,
        keep_checkpoint_max=None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = classification_model_fn_builder(
        news_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        num_labels=len(LABEL_LIST),
        pool_token_id=encoder.begin_summary,
        adafactor=FLAGS.adafactor)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        predict_batch_size=FLAGS.batch_size,
        params={'model_dir': FLAGS.output_dir})
    # =============== END TRAINING BOILERPLATE ===============

    # =============== TRAINING ===============
    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")

        tf.logging.info(
            f"***** Recreating training file at {train_file} *****")
        classification_convert_examples_to_features(
            examples['train'],
            batch_size=FLAGS.batch_size,
            max_seq_length=FLAGS.max_seq_length,
            encoder=encoder,
            output_file=train_file,
            labels=LABEL_LIST,
            chop_from_front_if_needed=False)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(examples['train']))
        tf.logging.info("  Num epochs = %d", FLAGS.num_train_epochs)
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        train_input_fn = classification_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
        )
        estimator.train(input_fn=train_input_fn, steps=num_train_steps)
    # =============== END TRAINING ===============

    # =============== PREDICTION ===============
    splits_to_predict = [
        x for x in ['val', 'test'] if getattr(FLAGS, f'predict_{x}')
    ]
    for split in splits_to_predict:
        num_actual_examples = len(examples[split])

        predict_file = os.path.join(FLAGS.output_dir, f'{split}.tf_record')
        tf.logging.info(f"***** Recreating {split} file {predict_file} *****")
        classification_convert_examples_to_features(
            examples[split],
            batch_size=FLAGS.batch_size,
            max_seq_length=FLAGS.max_seq_length,
            encoder=encoder,
            output_file=predict_file,
            labels=LABEL_LIST,
            pad_extra_examples=True,
            chop_from_front_if_needed=False)

        val_input_fn = classification_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True,
        )
        # PREDICT
        probs = np.zeros((num_actual_examples, 2), dtype=np.float32)
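        # pad_extra_examples=True above padded the split out to a full final
        # batch, so only the first num_actual_examples predictions are real.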
        for i, res in enumerate(
                estimator.predict(input_fn=val_input_fn,
                                  yield_single_examples=True)):
            if i < num_actual_examples:
                probs[i] = res['probs']

        _save_np(os.path.join(FLAGS.output_dir, f'{split}-probs.npy'), probs)

        preds = np.argmax(probs, 1)
        labels = np.array([
            LABEL_INV_MAP[x['label']]
            for x in examples[split][:num_actual_examples]
        ])
        print('{} ACCURACY IS {:.3f}'.format(split, np.mean(labels == preds)),
              flush=True)
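
# ----------------------------------------------------------------------
# Post-hoc sketch (the file path and layout are assumptions): the probs
# saved above can be re-scored offline without rebuilding the estimator.
# ----------------------------------------------------------------------
import numpy as np

probs = np.load('val-probs.npy')  # shape (num_examples, 2); columns follow LABEL_LIST
preds = probs.argmax(axis=1)      # index 0 = 'machine', 1 = 'human'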
Example #4
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    news_config = GroverConfig.from_json_file(FLAGS.config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.iterations_per_loop,
        keep_checkpoint_max=None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(news_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=1e-4,
                                num_train_steps=0,
                                num_warmup_steps=0,
                                use_tpu=FLAGS.use_tpu,
                                )

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        predict_batch_size=FLAGS.batch_size,
        params={'model_dir': FLAGS.output_dir}
    )

    eval_input_fn = input_fn_builder(
        input_files=input_files,
        seq_length=FLAGS.max_seq_length,
        evaluate_for_fixed_number_of_steps=False,
        num_cpu_threads=1,
        is_training=False)
    result = list(estimator.predict(input_fn=eval_input_fn, yield_single_examples=True))
    cats = sorted(result[0].keys())
    result_stack = {cat: np.stack([x[cat] for x in result]) for cat in cats}

    with gcloudwriter(os.path.join(FLAGS.output_dir, FLAGS.validation_name)) as tempfile_name:
        with h5py.File(tempfile_name, 'w') as h5:
            for cat, data in result_stack.items():
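                # Float outputs (log-probs, top-p thresholds) are stored as
                # float16; token ids fit in uint16 (the markers used below are
                # 50265/50266, well under 2**16).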
                dtype2use = np.float16 if cat.endswith(('logprobs', 'top_p_required')) else np.uint16
                h5.create_dataset(cat, data=data.astype(dtype2use))
            h5.create_dataset('model', data=FLAGS.config_file)
            h5.create_dataset('ckpt', data=FLAGS.init_checkpoint)
            h5.create_dataset('input_file', data=FLAGS.input_file)

    # This gives the perplexity of the entire article. If you want to replicate the results of the
    # paper, you might need to do something different to extract the ppl of just the body.
    ppl_ex = []
    for logprobs_i, ids_i in zip(result_stack['gt_logprobs'], result_stack['labels']):
        # Omit the first token. Keep in mind input_ids is shifted by 1
        start_ind = ind_where(ids_i, target=50265, default_value=0)
        end_ind = ind_where(ids_i, target=50266, default_value=ids_i.shape[0] - 1)
        ppl_ex.append(logprobs_i[start_ind:end_ind])
    ppl_ex = np.concatenate(ppl_ex, 0)
    print("Article perplexity is {:.3f}".format(np.exp(-np.mean(ppl_ex))), flush=True)
Example #5
def _flatten_and_tokenize_metadata(encoder, item):
    """Tokenize an article's metadata fields, wrapping each in begin/end marker tokens.

    :param item: Contains the things that need to be tokenized;
        fields are ['domain', 'date', 'authors', 'title', 'article', 'summary']
    :return: flat list of token ids
    """
    metadata = []
    for key in ['domain', 'date', 'authors', 'title', 'article']:
        val = item.get(key, None)
        if val is not None:
            metadata.append(encoder.__dict__[f'begin_{key}'])
            metadata.extend(encoder.encode(val))
            metadata.append(encoder.__dict__[f'end_{key}'])
    return metadata
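
# Usage sketch (hypothetical field values): only the fields present in
# `item` get wrapped in begin_/end_ marker tokens, in the fixed order
# domain, date, authors, title, article.
#   item = {'title': 'Some headline', 'article': 'Body text ...'}
#   ids = _flatten_and_tokenize_metadata(encoder=encoder, item=item)
#   # -> [begin_title, *title_tokens, end_title,
#   #     begin_article, *article_tokens, end_article]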


encoder = get_encoder()
news_config = GroverConfig.from_json_file(FLAGS.config_file)

model_fn = classification_model_fn_builder(
    news_config,
    init_checkpoint=FLAGS.init_checkpoint,
    learning_rate=FLAGS.learning_rate,
    num_train_steps=None,
    num_warmup_steps=None,
    use_tpu=FLAGS.use_tpu,
    num_labels=len(LABEL_LIST),
    pool_token_id=encoder.begin_summary,
    adafactor=FLAGS.adafactor)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
    cluster=None,
    # The remaining arguments are assumed to mirror the RunConfig pattern used
    # in the earlier examples.
    master=FLAGS.master,
    model_dir=FLAGS.output_dir,
    save_checkpoints_steps=FLAGS.iterations_per_loop,
    keep_checkpoint_max=None,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=is_per_host))