def main(unused_argv):
  """Convert the corpus named by FLAGS.dataset into TFRecord shards.

  In test mode (FLAGS.per_host_test_bsz > 0) only the test split is
  converted; otherwise the train and valid splits are converted, skipping
  any split whose per-host batch size is <= 0.
  """
  del unused_argv  # Unused

  corpus = get_lm_corpus(FLAGS.data_dir, FLAGS.dataset)

  # All shards go under <data_dir>/tfrecords; create it on first run.
  save_dir = os.path.join(FLAGS.data_dir, "tfrecords")
  if not exists(save_dir):
    makedirs(save_dir)

  # Test mode: convert the test split only, then stop.
  if FLAGS.per_host_test_bsz > 0:
    corpus.convert_to_tfrecords("test", save_dir, FLAGS.per_host_test_bsz,
                                FLAGS.tgt_len, FLAGS.num_core_per_host,
                                FLAGS=FLAGS)
    return

  split_sizes = (("train", FLAGS.per_host_train_bsz),
                 ("valid", FLAGS.per_host_valid_bsz))
  for split_name, bsz in split_sizes:
    if bsz <= 0:
      # A non-positive batch size disables conversion for that split.
      continue
    print("Converting {} set...".format(split_name))
    corpus.convert_to_tfrecords(split_name, save_dir, bsz, FLAGS.tgt_len,
                                FLAGS.num_core_per_host, FLAGS=FLAGS)
def main(unused_argv):
  """Convert the corpus named by FLAGS.dataset into TFRecord shards.

  In test mode (FLAGS.per_host_test_bsz > 0) only the test split is
  converted; otherwise the train and valid splits are converted (splits
  with a non-positive per-host batch size are skipped) and the corpus
  object is then pickled to <data_dir>/cache.pkl.
  """
  del unused_argv  # Unused

  corpus = get_lm_corpus(FLAGS.data_dir, FLAGS.dataset)

  # All shards go under <data_dir>/tfrecords; create it on first run.
  save_dir = os.path.join(FLAGS.data_dir, "tfrecords")
  if not exists(save_dir):
    makedirs(save_dir)

  # Test mode: convert the test split only, then stop (no cache is written).
  if FLAGS.per_host_test_bsz > 0:
    corpus.convert_to_tfrecords("test", save_dir, FLAGS.per_host_test_bsz,
                                FLAGS.tgt_len, FLAGS.num_core_per_host,
                                FLAGS=FLAGS)
    return

  split_sizes = (("train", FLAGS.per_host_train_bsz),
                 ("valid", FLAGS.per_host_valid_bsz))
  for split_name, bsz in split_sizes:
    if bsz <= 0:
      # A non-positive batch size disables conversion for that split.
      continue
    print("Converting {} set...".format(split_name))
    corpus.convert_to_tfrecords(split_name, save_dir, bsz, FLAGS.tgt_len,
                                FLAGS.num_core_per_host, FLAGS=FLAGS)

  # NOTE(review): the collapsed source made this block's indentation
  # ambiguous; placed after the loop so the cache is saved once — confirm.
  # Protocol 2 keeps the pickle loadable from Python 2.
  fn = os.path.join(FLAGS.data_dir, "cache.pkl")
  print("Saving dataset...")
  with open(fn, "wb") as fp:
    pickle.dump(corpus, fp, protocol=2)