def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    news_config = GroverConfig.from_json_file(FLAGS.config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(news_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_tpu=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.train_batch_size,
        params={'model_dir': FLAGS.output_dir})

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(input_files=input_files,
                                      seq_length=FLAGS.max_seq_length,
                                      is_training=True)
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
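
# A minimal entry-point sketch (the original file's closing stanza may differ,
# e.g. it may also mark flags as required): in TF1, tf.app.run() parses the
# command-line flags and then calls main(_) above.
if __name__ == "__main__":
    tf.app.run()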
parser.add_argument(
    '-top_p',
    dest='top_p',
    default=0.95,
    type=float,
    help='p to use for top-p sampling. If this isn\'t None, use this for everything',
)
parser.add_argument(
    '-samples',
    dest='samples',
    default=1,
    type=int,
    help='num_samples',
)
args = parser.parse_args()

encoder = get_encoder()
news_config = GroverConfig.from_json_file(args.model_config_fn)

# We might have to split the batch into multiple chunks if the batch size is too large.
default_mbs = {12: 32, 24: 16, 48: 3}
max_batch_size = (args.max_batch_size if args.max_batch_size is not None
                  else default_mbs[news_config.num_hidden_layers])

# Factorize args.batch_size = (num_chunks * batch_size_per_chunk) such that
# batch_size_per_chunk <= max_batch_size.
num_chunks = int(np.ceil(args.batch_size / max_batch_size))
batch_size_per_chunk = int(np.ceil(args.batch_size / num_chunks))
print("\n~~\nbatch size={}, max batch size={}, num chunks={}, batch size per chunk={}\n~~\n".format(
    args.batch_size, max_batch_size, num_chunks, batch_size_per_chunk), flush=True)

# This controls the top p for each generation.
top_p = np.ones((num_chunks, batch_size_per_chunk), dtype=np.float32) * args.top_p

# with open(args.metadata_fn, 'r') as f:
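
# A small illustrative helper (not part of the original script) mirroring the
# factorization above: ceil-divide the requested batch into chunks, then
# ceil-divide the batch across those chunks. The per-chunk size can never
# exceed the cap, since num_chunks >= batch_size / max_batch_size.
def _chunk_batch(batch_size, max_batch_size):
    num_chunks = int(np.ceil(batch_size / max_batch_size))
    batch_size_per_chunk = int(np.ceil(batch_size / num_chunks))
    assert batch_size_per_chunk <= max_batch_size
    return num_chunks, batch_size_per_chunk

# e.g. _chunk_batch(50, 16) -> (4, 13): four chunks of 13 cover all 50 samples.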
def main(_):
    LABEL_LIST = ['machine', 'human']
    LABEL_INV_MAP = {label: i for i, label in enumerate(LABEL_LIST)}

    tf.logging.set_verbosity(tf.logging.INFO)

    # These lines of code are just to check if we've already saved something into the directory.
    # NOTE: the flag name 'ingore_model_folder_check' carries a typo from its definition.
    if FLAGS.ingore_model_folder_check:
        pass
    elif tf.gfile.Exists(FLAGS.output_dir):
        print(f"The output directory {FLAGS.output_dir} exists!")
        if FLAGS.do_train:
            print("EXITING BECAUSE DO_TRAIN is true", flush=True)
            return

        for split in ['val', 'test']:
            if tf.gfile.Exists(os.path.join(FLAGS.output_dir, f'{split}-probs.npy')) \
                    and getattr(FLAGS, f'predict_{split}'):
                print(f"EXITING BECAUSE {split}-probs.npy exists", flush=True)
                return

        # Double check to see if it has trained!
        if not tf.gfile.Exists(os.path.join(FLAGS.output_dir, 'checkpoint')):
            print("EXITING BECAUSE NO CHECKPOINT.", flush=True)
            return

        stuff = {}
        with tf.gfile.Open(os.path.join(FLAGS.output_dir, 'checkpoint'), 'r') as f:
            # model_checkpoint_path: "model.ckpt-0"
            # all_model_checkpoint_paths: "model.ckpt-0"
            for l in f:
                key, val = l.strip().split(': ', 1)
                stuff[key] = val.strip('"')
        if stuff['model_checkpoint_path'] == 'model.ckpt-0':
            print("EXITING BECAUSE IT LOOKS LIKE NOTHING TRAINED", flush=True)
            return
    elif not FLAGS.do_train:
        print("EXITING BECAUSE DO_TRAIN IS FALSE AND PATH DOESN'T EXIST")
        return
    else:
        tf.gfile.MakeDirs(FLAGS.output_dir)

    news_config = GroverConfig.from_json_file(FLAGS.config_file)

    # TODO might have to change this
    encoder = get_encoder()
    examples = {'train': [], 'val': [], 'test': []}
    np.random.seed(123456)
    tf.logging.info("*** Parsing files ***")
    with tf.gfile.Open(FLAGS.input_data, "r") as f:
        for l in f:
            item = json.loads(l)

            # This little hack is because we don't want to tokenize the article twice
            context_ids = _flatten_and_tokenize_metadata(encoder=encoder, item=item)
            examples[item['split']].append({
                'info': item,
                'ids': context_ids,
                'label': item['label'],
            })
            assert item['label'] in LABEL_INV_MAP

    additional_data = {'machine': [], 'human': []}
    if FLAGS.additional_data is not None:
        print("NOW WE'RE LOOKING AT ADDITIONAL INPUT DATA", flush=True)
        with tf.gfile.Open(FLAGS.additional_data, "r") as f:
            for l in f:
                item = json.loads(l)

                # This little hack is because we don't want to tokenize the article twice
                context_ids = _flatten_and_tokenize_metadata(encoder=encoder, item=item)
                additional_data[item['label']].append({
                    'info': item,
                    'ids': context_ids,
                    'label': item['label'],
                })
    tf.logging.info("*** Done parsing files ***")

    print("LETS GO", flush=True)
    if FLAGS.max_training_examples > 0:
        examples_by_label = {'human': [], 'machine': []}
        for x in examples['train']:
            examples_by_label[x['label']].append(x)

        new_examples = []
        print("Unique machine examples: {} -> {}".format(len(examples_by_label['machine']),
                                                         FLAGS.max_training_examples), flush=True)
        machine_ex_to_keep = examples_by_label['machine'][:FLAGS.max_training_examples]
        # So we just cut down on the TRUE machine examples.
        # Now let's try adding in additional examples.
        # examples_by_label['human'].extend(additional_data['human'])
        if len(additional_data['machine']) > 0:
            amount_to_add = len(examples_by_label['human']) - len(machine_ex_to_keep)
            if amount_to_add > 0:
                machine_ex_to_keep.extend(additional_data['machine'][:amount_to_add])

        for i, human_ex in enumerate(examples_by_label['human']):
            new_examples.append(human_ex)
            new_examples.append(machine_ex_to_keep[i % len(machine_ex_to_keep)])
        print("Length of examples: {} -> {}".format(len(examples['train']), len(new_examples)),
              flush=True)
        examples['train'] = new_examples

    # =============== SETUP TRAINING ===============
    if FLAGS.do_train:
        num_train_steps = int((len(examples['train']) / FLAGS.batch_size) * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
        assert num_train_steps > 0
    else:
        num_train_steps = None
        num_warmup_steps = None

    # =============== TRAINING BOILERPLATE ===============
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.iterations_per_loop,
        keep_checkpoint_max=None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = classification_model_fn_builder(
        news_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        num_labels=len(LABEL_LIST),
        pool_token_id=encoder.begin_summary,
        adafactor=FLAGS.adafactor)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        predict_batch_size=FLAGS.batch_size,
        params={'model_dir': FLAGS.output_dir})
    # =============== END TRAINING BOILERPLATE ===============

    # =============== TRAINING ===============
    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        tf.logging.info(f"***** Recreating training file at {train_file} *****")
        classification_convert_examples_to_features(
            examples['train'],
            batch_size=FLAGS.batch_size,
            max_seq_length=FLAGS.max_seq_length,
            encoder=encoder,
            output_file=train_file,
            labels=LABEL_LIST,
            chop_from_front_if_needed=False)

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(examples['train']))
        tf.logging.info("  Num epochs = %d", FLAGS.num_train_epochs)
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        train_input_fn = classification_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
        )
        estimator.train(input_fn=train_input_fn, steps=num_train_steps)
    # =============== END TRAINING ===============

    # =============== PREDICTION ===============
    splits_to_predict = [x for x in ['val', 'test'] if getattr(FLAGS, f'predict_{x}')]
    for split in splits_to_predict:
        num_actual_examples = len(examples[split])

        predict_file = os.path.join(FLAGS.output_dir, f'{split}.tf_record')
        tf.logging.info(f"***** Recreating {split} file {predict_file} *****")
        classification_convert_examples_to_features(
            examples[split],
            batch_size=FLAGS.batch_size,
            max_seq_length=FLAGS.max_seq_length,
            encoder=encoder,
            output_file=predict_file,
            labels=LABEL_LIST,
            pad_extra_examples=True,
            chop_from_front_if_needed=False)

        val_input_fn = classification_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True,
        )

        # PREDICT
        probs = np.zeros((num_actual_examples, 2), dtype=np.float32)
        for i, res in enumerate(estimator.predict(input_fn=val_input_fn,
                                                  yield_single_examples=True)):
            if i < num_actual_examples:
                probs[i] = res['probs']

        _save_np(os.path.join(FLAGS.output_dir, f'{split}-probs.npy'), probs)

        preds = np.argmax(probs, 1)
        labels = np.array([LABEL_INV_MAP[x['label']]
                           for x in examples[split][:num_actual_examples]])
        print('{} ACCURACY IS {:.3f}'.format(split, np.mean(labels == preds)), flush=True)
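
# A minimal offline sketch (not part of the original script) showing how the
# probabilities saved above can be reloaded and re-scored later. `probs_path`
# (assumed to be a local copy of a saved `{split}-probs.npy`) and `gold_labels`
# are hypothetical inputs.
def _accuracy_from_saved_probs(probs_path, gold_labels):
    """Recompute accuracy from a saved probs file.

    gold_labels: ints in example order, using the LABEL_INV_MAP convention
    above (0 = machine, 1 = human).
    """
    probs = np.load(probs_path)   # shape [num_examples, 2]
    preds = np.argmax(probs, 1)   # argmax over the two classes
    return float(np.mean(np.array(gold_labels) == preds))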
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    news_config = GroverConfig.from_json_file(FLAGS.config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.iterations_per_loop,
        keep_checkpoint_max=None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(news_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=1e-4,
                                num_train_steps=0,
                                num_warmup_steps=0,
                                use_tpu=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        predict_batch_size=FLAGS.batch_size,
        params={'model_dir': FLAGS.output_dir})

    eval_input_fn = input_fn_builder(
        input_files=input_files,
        seq_length=FLAGS.max_seq_length,
        evaluate_for_fixed_number_of_steps=False,
        num_cpu_threads=1,
        is_training=False)

    result = [x for x in estimator.predict(input_fn=eval_input_fn, yield_single_examples=True)]
    cats = sorted(result[0].keys())
    result_stack = {cat: np.stack([x[cat] for x in result]) for cat in cats}

    with gcloudwriter(os.path.join(FLAGS.output_dir, FLAGS.validation_name)) as tempfile_name:
        with h5py.File(tempfile_name, 'w') as h5:
            for cat, data in result_stack.items():
                dtype2use = np.float16 if cat.endswith(('logprobs', 'top_p_required')) else np.uint16
                h5.create_dataset(cat, data=data.astype(dtype2use))
            h5.create_dataset('model', data=FLAGS.config_file)
            h5.create_dataset('ckpt', data=FLAGS.init_checkpoint)
            h5.create_dataset('input_file', data=FLAGS.input_file)

    # This gives the perplexity of the entire article. If you want to replicate the
    # results of the paper you might need to do something different to extract the
    # ppl of just the body in particular.
    ppl_ex = []
    for logprobs_i, ids_i in zip(result_stack['gt_logprobs'], result_stack['labels']):
        # Omit the first token. Keep in mind input_ids is shifted by 1.
        start_ind = ind_where(ids_i, target=50265, default_value=0)
        end_ind = ind_where(ids_i, target=50266, default_value=ids_i.shape[0] - 1)
        ppl_ex.append(logprobs_i[start_ind:end_ind])
    ppl_ex = np.concatenate(ppl_ex, 0)
    print("Article perplexity is {:.3f}".format(np.exp(-np.mean(ppl_ex))), flush=True)
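
# `ind_where` is defined elsewhere in the repo; a minimal sketch of its assumed
# semantics (first index where the array equals `target`, else `default_value`).
# It is used above to slice between tokens 50265 and 50266, which presumably
# delimit the article body.
def _ind_where_sketch(array, target, default_value):
    matches = np.where(array == target)[0]
    return int(matches[0]) if matches.size > 0 else default_value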
def _flatten_and_tokenize_metadata(encoder, item):
    """Turn an article item into a flat list of token ids.

    :param item: Contains things that need to be tokenized. Fields are
        ['domain', 'date', 'authors', 'title', 'article', 'summary'].
    :return: list of token ids
    """
    metadata = []
    for key in ['domain', 'date', 'authors', 'title', 'article']:
        val = item.get(key, None)
        if val is not None:
            metadata.append(encoder.__dict__[f'begin_{key}'])
            metadata.extend(encoder.encode(val))
            metadata.append(encoder.__dict__[f'end_{key}'])
    return metadata


encoder = get_encoder()
news_config = GroverConfig.from_json_file(FLAGS.config_file)

model_fn = classification_model_fn_builder(
    news_config,
    init_checkpoint=FLAGS.init_checkpoint,
    learning_rate=FLAGS.learning_rate,
    num_train_steps=None,
    num_warmup_steps=None,
    use_tpu=FLAGS.use_tpu,
    num_labels=len(LABEL_LIST),
    pool_token_id=encoder.begin_summary,
    adafactor=FLAGS.adafactor)

is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
    cluster=None,