Example #1
def build_vocab(data: tp.List[tp.List[str]]) -> Dictionary:
    d = Dictionary()
    for s in data:
        for token in s:
            d.add_symbol(token)
    d.finalize()
    return d
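A minimal usage sketch for the helper above, assuming fairseq's Dictionary and an invented two-sentence corpus:

from fairseq.data import Dictionary  # assumed import; the snippet's own imports are not shown

corpus = [["the", "cat", "sat"], ["the", "dog", "ran"]]  # made-up tokenized data
vocab = build_vocab(corpus)
# finalize() re-sorts regular symbols by frequency, so the most frequent
# token ("the") receives the first non-special id.
print(vocab.index("the"))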
Example #2
    def test_finalize(self):
        txt = [
            "A B C D",
            "B C D",
            "C D",
            "D",
        ]
        ref_ids1 = list(
            map(
                torch.IntTensor,
                [
                    [4, 5, 6, 7, 2],
                    [5, 6, 7, 2],
                    [6, 7, 2],
                    [7, 2],
                ],
            ))
        ref_ids2 = list(
            map(
                torch.IntTensor,
                [
                    [7, 6, 5, 4, 2],
                    [6, 5, 4, 2],
                    [5, 4, 2],
                    [4, 2],
                ],
            ))

        # build dictionary
        d = Dictionary()
        for line in txt:
            d.encode_line(line, add_if_not_exist=True)

        def get_ids(dictionary):
            ids = []
            for line in txt:
                ids.append(dictionary.encode_line(line,
                                                  add_if_not_exist=False))
            return ids

        def assertMatch(ids, ref_ids):
            for toks, ref_toks in zip(ids, ref_ids):
                self.assertEqual(toks.size(), ref_toks.size())
                self.assertEqual(0, (toks != ref_toks).sum().item())

        ids = get_ids(d)
        assertMatch(ids, ref_ids1)

        # check finalized dictionary
        d.finalize()
        finalized_ids = get_ids(d)
        assertMatch(finalized_ids, ref_ids2)

        # write to disk and reload
        with tempfile.NamedTemporaryFile(mode="w") as tmp_dict:
            d.save(tmp_dict.name)
            d = Dictionary.load(tmp_dict.name)
            reload_ids = get_ids(d)
            assertMatch(reload_ids, ref_ids2)
            assertMatch(finalized_ids, reload_ids)
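The two reference lists encode the point of the test: while the dictionary is being built, tokens keep insertion order (A=4 ... D=7, with 2 as the appended eos id); after finalize() they are re-sorted by descending count, so D (seen four times) takes id 4 while A (seen once) ends up at id 7. The same effect in isolation, assuming fairseq's Dictionary:

d = Dictionary()
for line in ["A B C D", "B C D", "C D", "D"]:
    d.encode_line(line, add_if_not_exist=True)
print(d.index("D"))  # 7 while building: ids follow insertion order
d.finalize()
print(d.index("D"))  # 4 after finalize(): the most frequent token gets the first regular id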
Example #3
def dummy_dictionary(vocab_size, prefix="token_"):
    d = Dictionary()
    for i in range(vocab_size):
        token = prefix + str(i)
        d.add_symbol(token)
    d.finalize(padding_factor=1)  # don't add extra padding symbols
    return d
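With padding_factor=1 no filler symbols are appended, so the dictionary size is exactly the requested vocabulary plus Dictionary's default special symbols, which keeps test assertions simple. A small sketch (the size arithmetic assumes the default four special symbols):

d = dummy_dictionary(10)
# 4 default special symbols (<s>, <pad>, </s>, <unk>) + 10 generated tokens
assert len(d) == 14
assert d.index("token_3") >= d.nspecial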
Example #4
    def test_huffman_compresses(self):
        data = make_data()
        builder = make_code_builder(data)
        coder = builder.build_code()

        with TemporaryDirectory() as dirname:
            prefix = os.path.join(dirname, "huffman")
            build_dataset(prefix, data, coder)

            prefix_mmap = os.path.join(dirname, "mmap")
            mmap_builder = indexed_dataset.make_builder(
                indexed_dataset.data_file_path(prefix_mmap),
                "mmap",
                vocab_size=len(POPULATION),
            )
            dictionary = Dictionary()
            for c in POPULATION:
                dictionary.add_symbol(c)
            dictionary.finalize()
            for sentence in data:
                mmap_builder.add_item(dictionary.encode_line(" ".join(sentence)))
            mmap_builder.finalize(indexed_dataset.index_file_path(prefix_mmap))

            huff_size = os.stat(indexed_dataset.data_file_path(prefix)).st_size
            mmap_size = os.stat(indexed_dataset.data_file_path(prefix_mmap)).st_size
            self.assertLess(huff_size, mmap_size)
Example #5
    def build_dictionary(cls,
                         filenames,
                         workers=1,
                         threshold=-1,
                         nwords=-1,
                         padding_factor=8):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(filename, d,
                                              tokenizer.tokenize_line, workers)
        d.finalize(threshold=threshold,
                   nwords=nwords,
                   padding_factor=padding_factor)
        return d
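To see what the padding_factor in the docstring does in practice, the sketch below builds a tiny dictionary by hand; fairseq fills the gap with throwaway "madeupword" symbols (that naming is an observation about the current implementation, not something the snippet guarantees):

d = Dictionary()
for tok in ["a", "b", "c"]:
    d.add_symbol(tok)
d.finalize(padding_factor=8)
# 4 special symbols + 3 tokens = 7 entries, padded up to the next multiple of 8
assert len(d) % 8 == 0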
Example #6
def dummy_dictionary(vocab_size, prefix='token_'):
    d = Dictionary()
    for i in range(vocab_size):
        token = prefix + str(i)
        d.add_symbol(token)
    d.finalize(padding_factor=1)  # don't add extra padding symbols
    return d
Example #7
def main() -> None:
    parser = argparse.ArgumentParser(
        description="Build vocabulary from corpus data.")
    parser.add_argument(
        "--corpus-data",
        type=str,
        required=True,
        help="The path pattern (glob) to all tokenized corpus files (train, test, val).",
    )
    parser.add_argument("--langs",
                        type=str,
                        required=True,
                        help="The pre-trained model languages.")
    parser.add_argument("--output",
                        type=str,
                        required=True,
                        help="The vocabulary file.")
    args = parser.parse_args()

    langs = args.langs.split(",")
    ft_dict = Dictionary()
    for data_path in glob(args.corpus_data):
        Dictionary.add_file_to_dictionary(data_path, ft_dict, tokenize_line, 4)
    ft_dict.finalize(padding_factor=0)
    pad_dict(ft_dict, len(langs) + 1)
    ft_dict.save(args.output)
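pad_dict is a helper defined elsewhere in that script, and finalize(padding_factor=0) deliberately skips fairseq's own padding, so the helper presumably rounds the size up itself while leaving room for the len(langs) + 1 language/mask symbols a pretrained multilingual model appends. A plausible sketch of such a helper, offered only as an assumption about its intent:

def pad_dict(d: Dictionary, num_extra_symbols: int, padding_factor: int = 8) -> None:
    # Hypothetical: pad so that (current size + the extra language/mask symbols)
    # lands on a multiple of padding_factor.
    i = 0
    while (len(d) + num_extra_symbols) % padding_factor != 0:
        d.add_symbol(f"madeupword{i:04d}")
        i += 1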
Example #8
def build_word_dict(word_embed_path):
    word_dict = Dictionary()
    with open(word_embed_path, 'r') as f:
        for line in f:
            word = line.split(' ', 1)[0]
            word_dict.add_symbol(word)
    word_dict.finalize(padding_factor=1)
    return word_dict
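Only the first whitespace-separated field of each line is kept, so this works on plain-text embedding files where a word is followed by its vector values. A small illustration with a made-up file:

# Hypothetical input: each line is "<word> <v1> <v2> ...".
with open("toy.vec", "w") as f:
    f.write("hello 0.1 0.2 0.3\n")
    f.write("world 0.4 0.5 0.6\n")
word_dict = build_word_dict("toy.vec")
assert word_dict.index("hello") != word_dict.unk()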
Example #9
    def build_dict(cls, filenames, word_level=False, workers=1, threshold=-1, nwords=-1, padding_factor=8):
        d = Dictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(filename,
                                              d,
                                              tokenize_line_word if word_level else tokenize_line_char,
                                              workers)
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d
Example #10
def build_sememe_dict(datapath):
    sememe_dict = Dictionary()
    with open(os.path.join(datapath, 'HowNet.edge'), 'r') as f:
        for line in f:
            sememes = line.strip().split('\t')[1]
            for s in sememes.split():
                sememe_dict.add_symbol(s)
    sememe_dict.finalize(threshold=5, padding_factor=1)
    return sememe_dict
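Each line of HowNet.edge is assumed to carry at least two tab-separated fields, the second being a space-separated list of sememes; finalize(threshold=5) then discards sememes seen fewer than five times. A toy line (contents invented for illustration) parsed the same way:

line = "apple\tfruit food PlantPart\n"   # hypothetical HowNet.edge line
sememes = line.strip().split('\t')[1]
print(sememes.split())                   # ['fruit', 'food', 'PlantPart']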
Example #11
    def to_dictionary(self) -> Dictionary:
        dictionary = Dictionary(bos=self.bos,
                                unk=self.unk,
                                pad=self.pad,
                                eos=self.eos)
        for n in self:
            dictionary.add_symbol(n.symbol, n=n.count)
        dictionary.finalize()
        return dictionary
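The n= keyword passes an explicit count for each symbol, so the finalized Dictionary keeps the frequency ordering of the source object. A minimal illustration of that behaviour with fairseq's add_symbol(word, n=...):

d = Dictionary()
d.add_symbol("rare", n=1)
d.add_symbol("common", n=50)
d.finalize()
# After finalize(), higher-count symbols get lower ids.
assert d.index("common") < d.index("rare")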
Example #12
    def get_bnids_dictionary(cls) -> Dictionary:
        if cls._bnids_dictionary is None:
            src_dictionary = cls.get_offsets_dictionary()
            tgt_dictionary = Dictionary()
            string_map = cls.get_offset_to_bnids_map()
            for idx, wn in enumerate(src_dictionary.symbols):
                if wn.startswith('wn:'):
                    tgt_dictionary.add_symbol(string_map[wn])
            tgt_dictionary.finalize()
            cls._bnids_dictionary = tgt_dictionary
        return cls._bnids_dictionary
Example #13
    def test_finalize(self):
        txt = [
            'A B C D',
            'B C D',
            'C D',
            'D',
        ]
        ref_ids1 = list(map(torch.IntTensor, [
            [4, 5, 6, 7, 2],
            [5, 6, 7, 2],
            [6, 7, 2],
            [7, 2],
        ]))
        ref_ids2 = list(map(torch.IntTensor, [
            [7, 6, 5, 4, 2],
            [6, 5, 4, 2],
            [5, 4, 2],
            [4, 2],
        ]))

        # build dictionary
        d = Dictionary()
        for line in txt:
            Tokenizer.tokenize(line, d, add_if_not_exist=True)

        def get_ids(dictionary):
            ids = []
            for line in txt:
                ids.append(Tokenizer.tokenize(line, dictionary, add_if_not_exist=False))
            return ids

        def assertMatch(ids, ref_ids):
            for toks, ref_toks in zip(ids, ref_ids):
                self.assertEqual(toks.size(), ref_toks.size())
                self.assertEqual(0, (toks != ref_toks).sum().item())

        ids = get_ids(d)
        assertMatch(ids, ref_ids1)

        # check finalized dictionary
        d.finalize()
        finalized_ids = get_ids(d)
        assertMatch(finalized_ids, ref_ids2)

        # write to disk and reload
        with tempfile.NamedTemporaryFile(mode='w') as tmp_dict:
            d.save(tmp_dict.name)
            d = Dictionary.load(tmp_dict.name)
            reload_ids = get_ids(d)
            assertMatch(reload_ids, ref_ids2)
            assertMatch(finalized_ids, reload_ids)
Example #14
    def build_dictionary(filenames, src=False, tgt=False):
        assert src ^ tgt
        workers = args.workers
        threshold = args.thresholdsrc if src else args.thresholdtgt
        nwords = args.nwordssrc if src else args.nwordstgt
        padding_factor = args.padding_factor
        d = Dictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(filename, d,
                                              tokenizer.tokenize_line, workers,
                                              args.L)
        d.finalize(threshold=threshold,
                   nwords=nwords,
                   padding_factor=padding_factor)
        return d
Example #15
    def build_dictionary(cls,
                         filenames,
                         workers=1,
                         threshold=-1,
                         nwords=-1,
                         padding_factor=8):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent worker threads
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(filename, d,
                                              tokenizer.tokenize_line, workers)
        d.finalize(threshold=threshold,
                   nwords=nwords,
                   padding_factor=padding_factor)
        return d
Example #16
    def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1,
                         padding_factor=8):
        """Build the dictionary from edit-labeled raw text inputs.

        Each file contains tokenized sentences along with their token labels:
        ```text
        My teacher is going to move to change his job .
        0 0 0 0 0 0 0 0 0 0 0
        And he took in my favorite subject like soccer .
        0 0 0 0 0 0 1 0 0 0
        ...
        ```
        A dictionary is built using only the tokens and not token labels.

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()
        for filename in filenames:
            # Write only tokens to a separate file.
            with open(filename) as f_in, \
                    open(f"{filename}.tokens", "w") as f_out:
                f_out.writelines(line for i, line in enumerate(f_in)
                                 if i % 2 == 0)
            # Add tokens to dictionary with multiprocessing.
            Dictionary.add_file_to_dictionary(f"{filename}.tokens", d,
                                              tokenizer.tokenize_line, workers)
        d.finalize(threshold=threshold, nwords=nwords,
                   padding_factor=padding_factor)
        return d
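Because the labels sit on every other line, writing only the sentence lines (every other line, starting with the first) to a sidecar *.tokens file is enough to keep the 0/1 labels out of the vocabulary. A made-up input file in that layout:

# Hypothetical edit-labeled file: a sentence line followed by its label line.
with open("train.edits", "w") as f:
    f.write("My teacher is going to move .\n")
    f.write("0 0 0 0 0 0 0\n")
# build_dictionary() above would copy only the first line into train.edits.tokens
# and count tokens from there.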
Example #17
    def test_add_file_to_dict(self):
        counts = {}
        num_lines = 100
        per_line = 10
        with tempfile.TemporaryDirectory("test_sampling") as data_dir:
            filename = os.path.join(data_dir, "dummy.txt")
            with open(filename, "w", encoding="utf-8") as data:
                for c in string.ascii_letters:
                    line = f"{c} " * per_line
                    for _ in range(num_lines):
                        data.write(f"{line}\n")
                    counts[c] = per_line * num_lines
                    per_line += 5

            dict = Dictionary()
            Dictionary.add_file_to_dictionary(filename, dict,
                                              tokenizer.tokenize_line, 10)
            dict.finalize(threshold=0, nwords=-1, padding_factor=8)

            for c in string.ascii_letters:
                count = dict.get_count(dict.index(c))
                self.assertEqual(
                    counts[c], count,
                    f"{c} count is {count} but should be {counts[c]}")