Example #1
    def test_all_charset(self):
        all_set = JapaneseUnicodes('all')
        self.assertTrue(all_set.in_charset('fF00'))
        self.assertTrue(all_set.in_charset('U+303f'))
        self.assertTrue(all_set.in_charset('80FD'))
        self.assertTrue(all_set.in_charset('U+3074'))
        self.assertTrue(all_set.in_charset('30cf'))
        self.assertFalse(all_set.in_charset('U+1708'))
Example #2
    def character_set(self):
        """The Japanese characters (e.g. kana, kanji) of interest.

        Preset character sets may include the following component sets:

            * hiragana
            * katakana
            * kana
            * kanji
            * punct (punctuation)
            * misc

        Returns:
            CharacterSet: The character set.

        """
        return JapaneseUnicodes(charset='all')
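
The preset names listed in the docstring above can also be combined with '+' (Example #7 below uses 'kana+kanji'). Below is a minimal usage sketch based only on the JapaneseUnicodes constructor and in_charset() calls that appear in these examples; the import path is a placeholder, not a confirmed module name:

    # Hypothetical import path; only the class name appears in the examples.
    from charsets import JapaneseUnicodes

    kana = JapaneseUnicodes('kana')        # single preset
    both = JapaneseUnicodes('kana+kanji')  # presets combined with '+'

    # in_charset() accepts code points with or without the 'U+' prefix,
    # in either letter case (compare Examples #1 and #9).
    print(kana.in_charset('U+3074'))  # True: hiragana character
    print(both.in_charset('80FD'))    # True: kanji character
    print(both.in_charset('U+3002'))  # False: punctuation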
Example #3
    def __init__(self,
                 data_dir,
                 test_split='hnsd00000',
                 dev_split=0.1,
                 dev_factor=1,
                 vocab_size=None,
                 min_freq=0,
                 reserved=('<PAD>', '<GO>', '<END>', '<UNK>'),
                 charset=JapaneseUnicodes('all'),
                 image_scope='char',
                 seq_len=None,
                 seq_maxlen=None,
                 verbose=False,
                 seed=None):
        """Initializer.

        Args:
            data_dir (str): Top level directory containing directories
                for each bibliography (e.g. 200003076, hnsd00000).
            test_split (float or str): Either the ratio of all data
                to use for testing or specific bibliography ID(s). Use
                comma-separated IDs for multiple books.
            dev_split (float or str): Either the ratio of all data
                to use for dev/val or specific bibliography ID(s). Use
                comma-separated IDs for multiple books.
            dev_factor (int): Size of development set should be
                divisible by this value. Useful for training on
                multiple GPUs.
            vocab_size (None or int): Maximum size of the vocabulary.
                If None, include all possible characters, minus those
                filtered out after applying min_freq.
            min_freq (int): Minimum frequency of tokens in vocab.
            reserved (tuple): Strings for reserved tokens,
                e.g. ("<PAD>", "<S>", "<UNK>", "</S>").
                Note: The index of each token in the tuple is used
                as its integer ID.
            charset (CharacterSet): The character set.
            image_scope (str): Image problem scope.
            seq_len (None or int): (Minimum) number of characters to
                include in an image when image_scope is 'seq'. If
                seq_maxlen is None, this is the exact sequence length.
            seq_maxlen (None or int): Maximum sequence length.
            verbose (bool): Display verbose output.
            seed (None or int): Seed for the random number generator.
                If None, DEFAULT_SEED is used.

        """
        # for bib in get_books_list('pmjtc'):
        #     assert bib in os.listdir(data_dir), (
        #         "Expected directory {} in data_dir {}. Check path or "
        #         "download data with 'download-pmjtc'".format(bib, data_dir),
        #     )
        if image_scope == 'seq':
            assert seq_len and seq_len > 0, "seq_len must be positive."
            if seq_maxlen:
                assert seq_maxlen >= seq_len, ("{} < {}".format(
                    seq_maxlen, seq_len))
        assert image_scope in ['char', 'seq', 'page', 'line']
        self.data_dir = data_dir
        self._test_ratio, self._test_heldout = _get_split(test_split)
        self._dev_ratio, self._dev_heldout = _get_split(dev_split)
        self.reserved_tokens = reserved
        self.image_scope = image_scope
        self._dev_factor = dev_factor
        self._C = charset
        self._V = vocab_size
        self._min_freq = min_freq
        self._seq_len = seq_len
        self._seq_maxlen = seq_maxlen if seq_maxlen else seq_len
        if seed is None:
            random.seed(DEFAULT_SEED)
        else:
            random.seed(seed)
        self._verbose = verbose
        self._image_meta = {}
        self._build_image_lists()
        self._build_vocabulary()
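
Taken together, the arguments documented above imply an instantiation like the sketch below. This is a hedged example: the owning class is never named in these excerpts, so the name Dataset and the data_dir value are placeholders; the remaining values mirror the defaults and the docstring.

    # Hypothetical: the class name and the data path are assumptions.
    dataset = Dataset(
        data_dir='data/pmjtc',             # one subdirectory per bibliography
        test_split='hnsd00000',            # hold out a specific bibliography ID
        dev_split=0.1,                     # or a ratio of all data for dev/val
        charset=JapaneseUnicodes('kana'),  # restrict characters to kana
        image_scope='seq',                 # sequence images...
        seq_len=3,                         # ...of at least 3 characters
        seq_maxlen=10,                     # ...and at most 10 (must be >= seq_len)
        seed=42,                           # reproducible splits
    )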
Example #4
    def character_set(self):
        return JapaneseUnicodes('kana')
Example #5
    def test_misc_charset(self):
        misc = JapaneseUnicodes('misc')
        self.assertTrue(misc.in_charset('ff00'))
        self.assertFalse(misc.in_charset('U+30ff'))
Example #6
    def test_punctuation_charset(self):
        punctuation = JapaneseUnicodes('punct')
        self.assertTrue(punctuation.in_charset('U+303f'))
        self.assertFalse(punctuation.in_charset('U+3040'))
Example #7
    def test_kana_and_kanji_charset(self):
        both = JapaneseUnicodes('kana+kanji')
        self.assertTrue(both.in_charset('U+30CF'))
        self.assertTrue(both.in_charset('80FD'))
        self.assertFalse(both.in_charset('U+3002'))
Example #8
    def test_kanji_charset(self):
        kanji = JapaneseUnicodes('kanji')
        self.assertTrue(kanji.in_charset('U+80FD'))
        self.assertFalse(kanji.in_charset('U+3074'))
Example #9
    def test_kana_charset(self):
        kana = JapaneseUnicodes('kana')
        self.assertTrue(kana.in_charset('U+3074'))
        self.assertTrue(kana.in_charset('30cf'))
        self.assertFalse(kana.in_charset('U+80FD'))