Example #1
class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        self.vocab.count_file(os.path.join(path, "train.txt"))
        self.vocab.build_vocab()

        self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
        self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)

        vocab_len = len(self.vocab)
        self.cutoffs = [0, int(vocab_len * 0.1), int(vocab_len * 0.2), int(vocab_len * 0.4)] + [vocab_len]
        # self.cutoffs = []

    def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len, num_core_per_host, **kwargs):
        file_names = []

        record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(split, bsz, tgt_len)

        record_info_path = os.path.join(save_dir, record_name)

        data = getattr(self, split)

        file_name, num_batch = create_ordered_tfrecords(save_dir, split, data, bsz, tgt_len)
        file_names.append(file_name)

        with open(record_info_path, "w") as fp:
            record_info = {
                "filenames": file_names,
                "num_batch": num_batch
            }
            json.dump(record_info, fp)
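For orientation, here is a minimal usage sketch for the class above. It is an illustration under assumptions, not code from the original repository: it presumes Vocab and create_ordered_tfrecords are importable from the surrounding data_utils module and that path points at a directory containing train.txt, valid.txt and test.txt; the directory names and batch settings are placeholders.

import os

# Hypothetical driver; paths, bsz, tgt_len and num_core_per_host are illustrative.
corpus = Corpus(path="data/my_dataset", dataset="my_dataset")
save_dir = "data/my_dataset/tfrecords"
os.makedirs(save_dir, exist_ok=True)
for split in ("train", "valid", "test"):
    corpus.convert_to_tfrecords(split, save_dir, bsz=32, tgt_len=128,
                                num_core_per_host=1)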
Example #2
class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):

        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        self.vocab.count_file(os.path.join(
            path, "train.txt"))  # update the vocab's counter (how many times each distinct token appears)
        self.vocab.count_file(os.path.join(path, "valid.txt"))  # same as above, for the validation set

        self.vocab.build_vocab()  # builds idx2sym and sym2idx: maps each token to an index and each index back to a token

        self.train = self.vocab.encode_file(os.path.join(path, "train.txt"),
                                            ordered=True)
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"),
                                            ordered=True)

        # self.cutoffs = []  # completely redundant: from the first day of reading this code I suspected cutoff was unnecessary, and after losing a whole day to it I can now confirm that, without a TPU, all the cutoff-related code is redundant

    def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len, **kwargs):
        file_names = []

        record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
            split, bsz, tgt_len)

        record_info_path = os.path.join(save_dir, record_name)
        bin_sizes = None

        file_name, num_batch = create_ordered_tfrecords(
            save_dir, split, getattr(self, split), bsz, tgt_len)

        file_names.append(file_name)

        with open(record_info_path, "w") as fp:
            record_info = {
                "filenames": file_names,
                "bin_sizes": bin_sizes,
                "num_batch": num_batch
            }
            json.dump(record_info, fp)
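The record_info JSON written above is presumably what a downstream training script reads back to learn the shard file names and batch count. A minimal sketch of loading it, with an illustrative file name (the real name depends on split, bsz and tgt_len):

import json
import os

# Illustrative only: adjust the name to the split/bsz/tgt_len you converted.
record_info_path = os.path.join("save_dir", "record_info-train.bsz-32.tlen-128.json")
with open(record_info_path) as fp:
    record_info = json.load(fp)
# Keys match what the method writes: "filenames", "bin_sizes" (None here) and "num_batch".
print(record_info["num_batch"], record_info["filenames"])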
Example #3
class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        if self.dataset == "generic_dataset":
            encode_kwargs = dict(
                add_eos=kwargs.pop('add_eos', False),
                add_double_eos=kwargs.pop('add_double_eos', False),
                ordered=True,
                verbose=True,
            )
            if kwargs.get('vocab_file') is not None:
                kwargs['vocab_file'] = os.path.join(path, kwargs['vocab_file'])

        print(self.dataset, 'vocab params', kwargs)
        self.vocab = Vocab(*args, **kwargs)

        if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset == "generic_dataset" and not self.vocab.vocab_file:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset == "wt103":
            self.vocab.count_file(os.path.join(path, "train.txt"))
        elif self.dataset == "lm1b":
            train_path_pattern = os.path.join(
                path, "1-billion-word-language-modeling-benchmark-r13output",
                "training-monolingual.tokenized.shuffled", "news.en-*")
            train_paths = glob(train_path_pattern)

            # the vocab will load from file when build_vocab() is called
            # for train_path in sorted(train_paths):
            #   self.vocab.count_file(train_path, verbose=True)

        self.vocab.build_vocab()

        if self.dataset in ["ptb", "wt2", "wt103"]:
            self.train = self.vocab.encode_file(os.path.join(
                path, "train.txt"),
                                                ordered=True)
            self.valid = self.vocab.encode_file(os.path.join(
                path, "valid.txt"),
                                                ordered=True)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"),
                                               ordered=True)
        elif self.dataset == "generic_dataset":
            self.train = self.vocab.encode_file(
                os.path.join(path, "train.txt"), **encode_kwargs)
            self.valid = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), **encode_kwargs)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"),
                                               **encode_kwargs)
        elif self.dataset in ["enwik8", "text8"]:
            self.train = self.vocab.encode_file(os.path.join(
                path, "train.txt"),
                                                ordered=True,
                                                add_eos=False)
            self.valid = self.vocab.encode_file(os.path.join(
                path, "valid.txt"),
                                                ordered=True,
                                                add_eos=False)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"),
                                               ordered=True,
                                               add_eos=False)
        elif self.dataset == "lm1b":
            self.train = train_paths
            valid_path = os.path.join(path, "valid.txt")
            test_path = valid_path
            self.valid = self.vocab.encode_file(valid_path,
                                                ordered=True,
                                                add_double_eos=True)
            self.test = self.vocab.encode_file(test_path,
                                               ordered=True,
                                               add_double_eos=True)

        if self.dataset == "wt103":
            self.cutoffs = [0, 20000, 40000, 200000] + [len(self.vocab)]
        elif self.dataset == "generic_dataset":
            with open(os.path.join(path, "cutoffs.json")) as f:
                self.cutoffs = json.load(f)
        elif self.dataset == "lm1b":
            self.cutoffs = [0, 60000, 100000, 640000] + [len(self.vocab)]
        else:
            self.cutoffs = []

    def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len,
                             num_core_per_host, **kwargs):
        FLAGS = kwargs.get('FLAGS')

        file_names = []
        use_tpu = FLAGS.use_tpu and not (split == "test"
                                         and num_core_per_host == 1)

        if use_tpu:
            record_name = "record_info-{}.bsz-{}.tlen-{}.core-{}.json".format(
                split, bsz, tgt_len, num_core_per_host)
        else:
            record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
                split, bsz, tgt_len)

        record_info_path = os.path.join(save_dir, record_name)

        if self.dataset in [
                "ptb", "wt2", "wt103", "enwik8", "text8", "generic_dataset"
        ]:
            data = getattr(self, split)
            bin_sizes = get_bin_sizes(data, bsz // num_core_per_host, tgt_len,
                                      self.cutoffs)
            file_name, num_batch = create_ordered_tfrecords(
                save_dir,
                split,
                data,
                bsz,
                tgt_len,
                num_core_per_host,
                self.cutoffs,
                bin_sizes,
                num_passes=FLAGS.num_passes
                if split == 'train' and use_tpu else 1,
                use_tpu=use_tpu)
            file_names.append(file_name)
        elif self.dataset == "lm1b":
            bin_sizes = get_bin_sizes(self.valid, bsz // num_core_per_host,
                                      tgt_len, self.cutoffs)
            if split == "train":
                np.random.seed(123456)
                num_batch = 0

                if FLAGS.num_procs > 1:
                    _preprocess_wrapper = partial(
                        _preprocess,
                        train=self.train,
                        vocab=self.vocab,
                        save_dir=save_dir,
                        cutoffs=self.cutoffs,
                        bin_sizes=bin_sizes,
                        bsz=bsz,
                        tgt_len=tgt_len,
                        num_core_per_host=num_core_per_host,
                        use_tpu=use_tpu,
                        num_shuffle=FLAGS.num_shuffle)

                    pool = mp.Pool(processes=FLAGS.num_procs)
                    results = pool.map(_preprocess_wrapper,
                                       range(len(self.train)))
                    for res in results:
                        file_names.extend(res[0])
                        num_batch += res[1]
                else:
                    for shard, path in enumerate(self.train):
                        data_shard = self.vocab.encode_file(
                            path, ordered=False, add_double_eos=True)

                        num_shuffle = FLAGS.num_shuffle

                        for shuffle in range(num_shuffle):
                            print("Processing shard {} shuffle {}".format(
                                shard, shuffle))
                            basename = "train-{:03d}-{:02d}".format(
                                shard, shuffle)
                            np.random.shuffle(data_shard)
                            file_name, num_batch_ = create_ordered_tfrecords(
                                save_dir,
                                basename,
                                np.concatenate(data_shard),
                                bsz,
                                tgt_len,
                                num_core_per_host,
                                self.cutoffs,
                                bin_sizes,
                                use_tpu=use_tpu)
                            file_names.append(file_name)
                            num_batch += num_batch_

            else:
                file_name, num_batch = create_ordered_tfrecords(
                    save_dir,
                    split,
                    getattr(self, split),
                    bsz,
                    tgt_len,
                    num_core_per_host,
                    self.cutoffs,
                    bin_sizes,
                    use_tpu=use_tpu)
                file_names.append(file_name)

        with open(record_info_path, "w") as fp:
            record_info = {
                "filenames": file_names,
                "bin_sizes": bin_sizes,
                "num_batch": num_batch
            }
            json.dump(record_info, fp)
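The "generic_dataset" branch above expects a cutoffs.json file next to the text files. A hypothetical one-off script for producing it, with boundary values that merely mirror the pattern of the hard-coded wt103/lm1b cutoffs (ascending indices ending at the vocabulary size); the numbers are placeholders:

import json

vocab_len = 50000  # placeholder: replace with the real vocabulary size
cutoffs = [0, int(vocab_len * 0.1), int(vocab_len * 0.2), vocab_len]
with open("data/my_dataset/cutoffs.json", "w") as f:
    json.dump(cutoffs, f)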
Example #4
class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)
        if self.vocab.vocab_file is None:
            self.vocab.count_file(os.path.join(path, "train.txt"))
        self.vocab.build_vocab()

        self.train = self.vocab.encode_file(os.path.join(path, "train.txt"),
                                            add_eos=True,
                                            ordered=True)
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"),
                                            add_eos=True,
                                            ordered=True)
        self.test = self.vocab.encode_file(os.path.join(path, "test.txt"),
                                           add_eos=True,
                                           ordered=True)

        self.cutoffs = []

    def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len,
                             num_core_per_host, **kwargs):
        FLAGS = kwargs.get('FLAGS')

        file_names = []
        use_tpu = FLAGS.use_tpu and not (split == "test"
                                         and num_core_per_host == 1)

        if use_tpu:
            record_name = "record_info-{}.bsz-{}.tlen-{}.core-{}.json".format(
                split, bsz, tgt_len, num_core_per_host)
        else:
            record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
                split, bsz, tgt_len)

        record_info_path = os.path.join(save_dir, record_name)

        data = getattr(self, split)

        bin_sizes = get_bin_sizes(data, bsz // num_core_per_host, tgt_len,
                                  self.cutoffs)
        file_name, num_batch = create_ordered_tfrecords(
            save_dir,
            split,
            data,
            bsz,
            tgt_len,
            num_core_per_host,
            self.cutoffs,
            bin_sizes,
            num_passes=FLAGS.num_passes if split == 'train' and use_tpu else 1,
            use_tpu=use_tpu)
        file_names.append(file_name)

        with open(record_info_path, "w") as fp:
            record_info = {
                "filenames": file_names,
                "bin_sizes": bin_sizes,
                "num_batch": num_batch
            }
            json.dump(record_info, fp)
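For reference, the two naming schemes above produce file names like the following (values chosen purely for illustration):

# Quick check of the two record_name formats with illustrative values.
split, bsz, tgt_len, num_core_per_host = "train", 60, 128, 8
print("record_info-{}.bsz-{}.tlen-{}.core-{}.json".format(
    split, bsz, tgt_len, num_core_per_host))  # record_info-train.bsz-60.tlen-128.core-8.json
print("record_info-{}.bsz-{}.tlen-{}.json".format(
    split, bsz, tgt_len))                     # record_info-train.bsz-60.tlen-128.json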
Example #5
class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        train_path = os.path.join(path, "train.txt")
        valid_path = os.path.join(path, "valid.txt")
        # test_path = os.path.join(path, "test.txt")

        # self.vocab.count_file(train_path)
        # self.vocab.count_file(valid_path)
        # self.vocab.count_file(test_path)
        self.vocab.build_vocab(add_bytes=True)

        self.train = train_path
        self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"),
                                            ordered=True,
                                            add_eos=False)
        # self.test  = self.vocab.encode_file(
        #     os.path.join(path, "test.txt"), ordered=True, add_eos=False)
        self.cutoffs = []

    def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len,
                             num_core_per_host, **kwargs):
        FLAGS = kwargs.get('FLAGS')

        file_names = []
        use_tpu = FLAGS.use_tpu and not (split == "test"
                                         and num_core_per_host == 1)

        record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
            split, bsz, tgt_len)
        record_info_path = os.path.join(save_dir, record_name)

        # pretty sure this is a tpu only thing
        bin_sizes = []

        if split == "train":
            np.random.seed(123456)
            num_batch = 0

            for shard, shard_c in self.file_sharder(self.train,
                                                    FLAGS.train_shard_size):

                print("Processing shard {}".format(shard_c))
                basename = "train-{:03d}".format(shard_c)
                file_name, num_batch_ = create_ordered_tfrecords(
                    save_dir,
                    basename,
                    shard,
                    bsz,
                    tgt_len,
                    num_core_per_host,
                    self.cutoffs,
                    bin_sizes,
                    use_tpu=use_tpu)
                file_names.append(file_name)
                num_batch += num_batch_

        else:
            file_name, num_batch = create_ordered_tfrecords(
                save_dir,
                split,
                getattr(self, split),
                bsz,
                tgt_len,
                num_core_per_host,
                self.cutoffs,
                bin_sizes,
                use_tpu=use_tpu)
            file_names.append(file_name)

        with open(record_info_path, "w") as fp:
            record_info = {
                "filenames": file_names,
                "bin_sizes": bin_sizes,
                "num_batch": num_batch
            }
            json.dump(record_info, fp)

    def file_sharder(self, file_name, shard_size):
        """ Shard a file into manageable sizes. """
        cur_shard_size = 0
        cur_shard = []

        count = 0
        with open(file_name, 'r') as f:
            for line in f:
                toks = self.vocab.tokenize(line)
                cur_shard.append(self.vocab.convert_to_nparray(toks))
                cur_shard_size += len(toks)

                if cur_shard_size >= shard_size:
                    cur_shard = np.concatenate(cur_shard)
                    print("Compiled shard of size {}".format(cur_shard_size))
                    yield cur_shard, count

                    cur_shard = []
                    cur_shard_size = 0
                    count += 1

            # after the loop: flush the final partial shard only if it holds at least 50M tokens
            if cur_shard_size >= 50000000:
                cur_shard = np.concatenate(cur_shard)
                yield cur_shard, count
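Since file_sharder is a generator that yields (token array, shard index) pairs, it can be driven directly for a quick sanity check. A sketch under the assumption that a Corpus instance named corpus exists; the path and shard size below are examples only.

# Illustrative only: "corpus", the path and the shard size are assumptions.
for shard_array, shard_idx in corpus.file_sharder("data/train.txt", shard_size=50000000):
    print("shard {}: {} tokens".format(shard_idx, len(shard_array)))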
Example #6
class Corpus(object):
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        if self.dataset in ["ptb", "wt2", "enwik8", "text8", "sb2", "sb92"]:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset in ["wt103", "wt103small"]:
            self.vocab.count_file(os.path.join(path, "train.txt"))
        elif self.dataset == "lm1b":
            train_path_pattern = os.path.join(
                path, "1-billion-word-language-modeling-benchmark-r13output",
                "training-monolingual.tokenized.shuffled", "news.en-*")
            train_paths = glob(train_path_pattern)

            # the vocab will load from file when build_vocab() is called
            # for train_path in sorted(train_paths):
            #   self.vocab.count_file(train_path, verbose=True)

        self.vocab.build_vocab()

        if self.dataset in ["ptb", "sb2", "sb92"]:
            self.train = self.vocab.encode_file(os.path.join(
                path, "train.txt"),
                                                ordered=True)
            self.valid = self.vocab.encode_file(os.path.join(
                path, "valid.txt"),
                                                ordered=True)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"),
                                               ordered=True)
        elif self.dataset in ["wt2", "wt103", "wt103small"]:
            self.train, self.train_boundary = self.vocab.encode_file(
                os.path.join(path, "train.txt"),
                ordered=True,
                ret_doc_boundary=True,
                pattern="\=[^=]+\=")
            self.valid, self.valid_boundary = self.vocab.encode_file(
                os.path.join(path, "valid.txt"),
                ordered=True,
                ret_doc_boundary=True,
                pattern="\=[^=]+\=")
            self.test, self.test_boundary = self.vocab.encode_file(
                os.path.join(path, "test.txt"),
                ordered=True,
                ret_doc_boundary=True,
                pattern="\=[^=]+\=")
        elif self.dataset in ["enwik8", "text8"]:
            self.train = self.vocab.encode_file(os.path.join(
                path, "train.txt"),
                                                ordered=True,
                                                add_eos=False)
            self.valid = self.vocab.encode_file(os.path.join(
                path, "valid.txt"),
                                                ordered=True,
                                                add_eos=False)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"),
                                               ordered=True,
                                               add_eos=False)
        elif self.dataset == "lm1b":
            self.train = train_paths
            valid_path = os.path.join(path, "valid.txt")
            test_path = valid_path
            self.valid = self.vocab.encode_file(valid_path,
                                                ordered=True,
                                                add_double_eos=True)
            self.test = self.vocab.encode_file(test_path,
                                               ordered=True,
                                               add_double_eos=True)

        if self.dataset == "sb92":
            self.cutoffs = [0, 10000, 20000] + [len(self.vocab)]
        elif self.dataset == "wt103small":
            self.cutoffs = [0, 20000, 40000] + [len(self.vocab)]
        elif self.dataset == "wt103":
            self.cutoffs = [0, 20000, 40000, 200000] + [len(self.vocab)]
        elif self.dataset == "lm1b":
            self.cutoffs = [0, 60000, 100000, 640000] + [len(self.vocab)]
        else:
            self.cutoffs = []

    def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len, **kwargs):
        FLAGS = kwargs.get('FLAGS')

        file_names = []

        record_name = "record_info-{}.bsz-{}.tlen-{}.json".format(
            split, bsz, tgt_len)

        record_info_path = os.path.join(save_dir, record_name)

        if self.dataset in ["ptb", "enwik8", "text8", "sb2", "sb92"]:
            data = getattr(self, split)
            file_name, num_batch = create_ordered_tfrecords(
                save_dir,
                split,
                data,
                bsz,
                tgt_len,
                num_passes=FLAGS.num_passes)
            file_names.append(file_name)
        if self.dataset in ["wt2", "wt103", "wt103small"]:
            data = getattr(self, split)
            boundary = getattr(self, split + "_boundary")
            file_name, num_batch = create_ordered_tfrecords(
                save_dir,
                split,
                data,
                bsz,
                tgt_len,
                num_passes=FLAGS.num_passes,
                boundary=boundary)
            file_names.append(file_name)
        elif self.dataset == "lm1b":
            if split == "train":
                np.random.seed(123456)
                num_batch = 0

                if FLAGS.num_procs > 1:
                    _preprocess_wrapper = partial(
                        _preprocess,
                        train=self.train,
                        vocab=self.vocab,
                        save_dir=save_dir,
                        bsz=bsz,
                        tgt_len=tgt_len,
                        num_shuffle=FLAGS.num_shuffle)

                    pool = mp.Pool(processes=FLAGS.num_procs)
                    results = pool.map(_preprocess_wrapper,
                                       range(len(self.train)))
                    for res in results:
                        file_names.extend(res[0])
                        num_batch += res[1]
                else:
                    for shard, path in enumerate(self.train):
                        data_shard = self.vocab.encode_file(
                            path, ordered=False, add_double_eos=True)

                        num_shuffle = FLAGS.num_shuffle

                        for shuffle in range(num_shuffle):
                            print("Processing shard {} shuffle {}".format(
                                shard, shuffle))
                            basename = "train-{:03d}-{:02d}".format(
                                shard, shuffle)
                            np.random.shuffle(data_shard)
                            file_name, num_batch_ = create_ordered_tfrecords(
                                save_dir, basename, np.concatenate(data_shard),
                                bsz, tgt_len)
                            file_names.append(file_name)
                            num_batch += num_batch_

            else:
                file_name, num_batch = create_ordered_tfrecords(
                    save_dir, split, getattr(self, split), bsz, tgt_len)
                file_names.append(file_name)

        with open(record_info_path, "w") as fp:
            record_info = {"filenames": file_names, "num_batch": num_batch}
            if self.dataset in ["wt2", "wt103", "wt103small"]:
                record_info["boundary"] = True
            else:
                record_info["boundary"] = False
            json.dump(record_info, fp)
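In the wt2/wt103 branches above, encode_file is passed pattern=r"\=[^=]+\=" to recover document boundaries; that regex matches WikiText-style section headings of the form "= Title =". A standalone illustration of the regex itself (not code from the repository):

import re

heading_re = re.compile(r"\=[^=]+\=")
print(bool(heading_re.search("= Valkyria Chronicles III =")))    # True: a heading line
print(bool(heading_re.search("The game was released in 2011."))) # False: body text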