Code example #1
File: t2i_test.py Project: zxlzr/token2index
    def test_count_vocab_file(self):
        """
        Test whether low-frequency tokens are ignored when building the T2I object from a vocab file with a token counter.
        """
        with warnings.catch_warnings(record=True) as caught_warnings:
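            # Passing a counter without specifying min_freq should emit exactly one warning.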
            T2I.from_file(self.vocab_path, counter=self.counter)
            self.assertEqual(len(caught_warnings), 1)

        t2i = T2I.from_file(self.vocab_path,
                            counter=self.counter,
                            min_freq=self.min_freq)
        self.assertTrue(
            self._check_freq_filtering(t2i, self.counter, self.min_freq))
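For reference, a minimal standalone sketch of the frequency filtering this test exercises. The file name, tokens, and counts below are hypothetical, and it assumes the usual semantics that tokens whose count falls below min_freq are dropped:

from collections import Counter

from t2i import T2I

# Hypothetical vocab file, first format: one token per line.
with open("vocab.txt", "w") as f:
    f.write("the\ncat\nsat\n")

# Token frequencies; "sat" falls below min_freq and should be filtered out.
counter = Counter({"the": 10, "cat": 3, "sat": 1})
t2i = T2I.from_file("vocab.txt", counter=counter, min_freq=2)

print("the" in t2i)  # True
print("sat" in t2i)  # False (assuming count < min_freq means dropped)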
Code example #2
File: t2i_test.py Project: zxlzr/token2index
    def test_max_size(self):
        """
        Test whether indexing stops once the maximum specified size of the T2I object is reached.
        """
        # 1. Test during init
        index = {n: n for n in range(10)}
        t2i1 = T2I(index, max_size=3)
        self.assertEqual(len(t2i1), 3)
        self.assertTrue(all([i not in t2i1 for i in range(3, 10)]))

        # With special tokens
        t2i2 = T2I(index, max_size=10, special_tokens=("<mask>", "<flask>"))
        self.assertEqual(len(t2i2), 10)
        self.assertTrue(all([i not in t2i2 for i in range(6, 10)]))

        # 2. Test using build()
        corpus = "this is a long test sentence with exactly boring words"
        t2i3 = T2I.build(corpus, max_size=3)
        self.assertEqual(len(t2i3), 3)
        self.assertTrue(
            all([token not in t2i3 for token in corpus.split()[3:]]))
        self.assertTrue(all([i not in t2i3.indices() for i in range(3, 10)]))

        # With special tokens
        t2i4 = T2I.build(corpus,
                         max_size=10,
                         special_tokens=("<mask>", "<flask>"))
        self.assertEqual(len(t2i4), 10)
        self.assertTrue(
            all([token not in t2i4 for token in corpus.split()[6:]]))

        # 3. Test when building from file
        t2i5 = T2I.from_file(self.vocab_path, max_size=18)
        self.assertEqual(len(t2i5), 18)
        self.assertTrue(all([token not in t2i5 for token in self.tokens[16:]]))

        # With special tokens
        t2i6 = T2I.from_file(self.vocab_path,
                             max_size=21,
                             special_tokens=("<mask>", "<flask>"))
        self.assertEqual(len(t2i6), 21)
        self.assertTrue(all([token not in t2i6 for token in self.tokens[17:]]))
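For reference, a minimal standalone sketch of the max_size cap on T2I.build, using the same corpus as the test above:

from t2i import T2I

corpus = "this is a long test sentence with exactly boring words"

# Cap the vocabulary at 3 entries; tokens past the cap never get an index.
t2i = T2I.build(corpus, max_size=3)
print(len(t2i))       # 3
print("long" in t2i)  # False: the fourth corpus token was never indexed

# Special tokens count toward max_size, so they leave less room for corpus
# tokens (here the two given specials plus the library defaults leave room
# for only the first 6 corpus tokens).
t2i_special = T2I.build(corpus, max_size=10,
                        special_tokens=("<mask>", "<flask>"))
print(len(t2i_special))  # 10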
Code example #3
File: t2i_test.py Project: zxlzr/token2index
    def test_correct_indexing(self):
        """
        Test whether indexing of new tokens is done correctly when the indices already in the T2I object are
        arbitrary. In that case, indexing should continue from the highest existing index.
        """
        t2i = T2I.from_file(self.vocab_path3)
        highest_index = max(t2i.indices())
        test_sent = "These are definitely new non-random tokens ."

        t2i = t2i.extend(test_sent)

        self.assertTrue(
            all([t2i[token] > highest_index
                 for token in test_sent.split(" ")]))
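For reference, a minimal standalone sketch of extend() continuing from the highest existing index; the starting dictionary here is hypothetical:

from t2i import T2I

# Arbitrary, non-contiguous starting indices (hypothetical values).
t2i = T2I({"hello": 0, "world": 7})
highest = max(t2i.indices())  # may also count defaults like <unk>/<eos>

t2i = t2i.extend("brand new tokens")

# Every newly added token sits above the previous highest index.
print(all(t2i[token] > highest
          for token in "brand new tokens".split()))  # True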
Code example #4
File: t2i_test.py Project: zxlzr/token2index
    def test_building_from_file(self):
        """
        Test building a T2I object from a vocab file.
        """
        # ### Proper vocab files ###
        # First vocab file format: One token per line
        t2i1 = T2I.from_file(self.vocab_path1)
        self.assertTrue(
            all(t2i1[token] == idx
                for token, idx in zip(self.tokens, range(len(self.tokens)))))

        # Second vocab file format: Token and index, separated by tab
        t2i2 = T2I.from_file(self.vocab_path2)
        self.assertTrue(
            all(t2i2[token] == idx
                for token, idx in zip(self.tokens, self.indices2)))

        # Second vocab file format, this time with higher indices
        t2i3 = T2I.from_file(self.vocab_path3)
        self.assertTrue(
            all(t2i3[token] == idx
                for token, idx in zip(self.tokens, self.indices3)))

        # Second vocab file format, but with different delimiter
        t2i4 = T2I.from_file(self.vocab_path4, delimiter="###")
        self.assertTrue(
            all(t2i4[token] == idx
                for token, idx in zip(self.tokens, self.indices2)))

        # unk, eos, special tokens already in vocab file
        t2i5 = T2I.from_file(self.vocab_path5,
                             special_tokens=("<mask>", "<flask>"))
        self.assertEqual(t2i1["<eos>"], t2i5["<eos>"])
        self.assertEqual(t2i1["<unk>"], t2i5["<unk>"])

        # unk, eos, special tokens already in vocab file, second format
        t2i5b = T2I.from_file(self.vocab_path5b,
                              special_tokens=("<mask>", "<flask>"))
        self.assertEqual(t2i1["<eos>"], t2i5b["<eos>"])
        self.assertEqual(t2i1["<unk>"], t2i5b["<unk>"])

        # ### Improper vocab files ###
        # Nonsensical format
        with self.assertRaises(ValueError):
            T2I.from_file(self.vocab_path6)

        # Mixed format
        with self.assertRaises(ValueError):
            T2I.from_file(self.vocab_path7)

        # Too many columns
        with self.assertRaises(ValueError):
            T2I.from_file(self.vocab_path8)

        # Second format, but with no integers in the second column
        with self.assertRaises(ValueError):
            T2I.from_file(self.vocab_path9)
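For reference, a minimal standalone sketch of the two vocab file formats (and the delimiter option) this test covers; the file names and tokens are hypothetical:

from t2i import T2I

# First format: one token per line, indices assigned in reading order.
with open("vocab1.txt", "w") as f:
    f.write("the\ncat\nsat\n")
t2i1 = T2I.from_file("vocab1.txt")

# Second format: token and index, separated by a tab.
with open("vocab2.txt", "w") as f:
    f.write("the\t0\ncat\t1\nsat\t2\n")
t2i2 = T2I.from_file("vocab2.txt")

# Second format with a custom delimiter.
with open("vocab3.txt", "w") as f:
    f.write("the###0\ncat###1\nsat###2\n")
t2i3 = T2I.from_file("vocab3.txt", delimiter="###")

# Files that mix formats, have extra columns, or carry non-integer
# values in the second column raise a ValueError, as the test checks.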