Example #1
import os
import random
import tarfile
import time

import tensorflow as tf

import build_pretraining_dataset  # from the ELECTRA repository
import utils                      # from the ELECTRA repository

# MAX_DATA_ROW caps how many archives each job processes; its value is
# defined elsewhere in the original script and is not shown here.


def write_examples(job_id, args):
    """A single process creating and writing out pre-processed examples."""
    job_tmp_dir = os.path.join(args.data_dir, "tmp", "job_" + str(job_id))
    owt_dir = os.path.join(args.data_dir, "openwebtext")

    def log(*args):
        msg = " ".join(map(str, args))
        print("Job {}:".format(job_id), msg)

    log("Creating example writer")
    example_writer = build_pretraining_dataset.ExampleWriter(
        job_id=job_id,
        vocab_file=os.path.join(args.data_dir, "vocab.txt"),
        output_dir=os.path.join(args.data_dir, "pretrain_small_tfrecords"),
        max_seq_length=args.max_seq_length,
        num_jobs=args.num_processes,
        blanks_separate_docs=False,
        do_lower_case=args.do_lower_case)
    log("Writing tf examples")
    fnames = sorted(tf.io.gfile.listdir(owt_dir))
    # Shard the file list: this job takes every num_processes-th file.
    fnames = [
        f for (i, f) in enumerate(fnames) if i % args.num_processes == job_id
    ]
    random.shuffle(fnames)
    start_time = time.time()
    count = 0  # number of archive files processed so far
    for file_no, fname in enumerate(fnames):
        if count >= MAX_DATA_ROW:  # stop once the per-job archive cap is reached
            break
        count += 1
        if file_no > 0 and file_no % 10 == 0:
            elapsed = time.time() - start_time
            log("processed {:}/{:} files ({:.1f}%), ELAPSED: {:}s, ETA: {:}s, "
                "{:} examples written".format(
                    file_no, len(fnames), 100.0 * file_no / len(fnames),
                    int(elapsed),
                    int((len(fnames) - file_no) / (file_no / elapsed)),
                    example_writer.n_written))
        utils.rmkdir(job_tmp_dir)  # wipe and recreate the per-job temp dir
        with tarfile.open(os.path.join(owt_dir, fname)) as f:
            f.extractall(job_tmp_dir)  # each archive holds plain-text documents
        extracted_files = tf.io.gfile.listdir(job_tmp_dir)
        random.shuffle(extracted_files)
        for txt_fname in extracted_files:
            example_writer.write_examples(os.path.join(job_tmp_dir, txt_fname))
    example_writer.finish()
    log("Done!")

Example #2

# This variant reuses the imports above and additionally needs copy_tree.
from distutils.dir_util import copy_tree


def write_examples(job_id, args):
    """A single process creating and writing out pre-processed examples."""
    job_tmp_dir = os.path.join(args.data_dir, "tmp", "job_" + str(job_id))
    owt_dir = os.path.join(args.data_dir, "wiki")  # Wikipedia extracts; the "owt" name is kept from the OpenWebText version

    def log(*args):
        msg = " ".join(map(str, args))
        print("Job {}:".format(job_id), msg)

    log("Creating example writer")
    example_writer = build_pretraining_dataset.ExampleWriter(
        job_id=job_id,
        model_file=os.path.join(args.model_dir, "wiki-ja.model"),
        vocab_file=os.path.join(args.model_dir, "wiki-ja.vocab"),
        output_dir=os.path.join(args.model_dir, "pretrain_tfrecords"),
        max_seq_length=args.max_seq_length,
        num_jobs=args.num_processes,
        blanks_separate_docs=False,
        do_lower_case=args.do_lower_case)
    log("Writing tf examples")
    # Keep only entries without a file extension (the extract directories).
    fnames = sorted(f for f in tf.io.gfile.listdir(owt_dir) if '.' not in f)

    fnames = [
        f for (i, f) in enumerate(fnames) if i % args.num_processes == job_id
    ]
    random.shuffle(fnames)
    for file_no, fname in enumerate(fnames):
        log("processing file {} of {}".format(file_no, len(fnames)))
        utils.rmkdir(job_tmp_dir)
        copy_tree(os.path.join(owt_dir, fname), job_tmp_dir)  # copy one extract directory into the temp dir
        list_files = tf.io.gfile.listdir(job_tmp_dir)
        list_files = [fi for fi in list_files if fi != 'all.txt']  # skip the aggregate all.txt
        for file_name in list_files:
            example_writer.write_examples(os.path.join(job_tmp_dir, file_name))
    example_writer.finish()
    log("Done!")