def test_all_charset(self):
    """The 'all' preset should accept code points from every component set."""
    universal = JapaneseUnicodes('all')
    # Members drawn from misc, punct, kanji, hiragana, and katakana ranges.
    for code_point in ('fF00', 'U+303f', '80FD', 'U+3074', '30cf'):
        self.assertTrue(universal.in_charset(code_point))
    # A code point outside every Japanese range must be rejected.
    self.assertFalse(universal.in_charset('U+1708'))
def character_set(self):
    """The Japanese characters (e.g. kana, kanji) of interest.

    Preset character sets may include the following component sets:

    * hiragana
    * katakana
    * kana
    * kanji
    * punct (punctuation)
    * misc

    Returns:
        CharacterSet: The character set.
    """
    all_characters = JapaneseUnicodes(charset='all')
    return all_characters
def __init__(self, data_dir, test_split='hnsd00000', dev_split=0.1,
             dev_factor=1, vocab_size=None, min_freq=0,
             reserved=('<PAD>', '<GO>', '<END>', '<UNK>'),
             charset=None, image_scope='char', seq_len=None,
             seq_maxlen=None, verbose=False, seed=None):
    """Initializer.

    Args:
        data_dir (str): Top level directory containing directories for
            each bibliography (e.g. 200003076, hnsd00000).
        test_split (float or str): Either the ratio of all data to use
            for testing or specific bibliography ID(s). Use
            comma-separated IDs for multiple books.
        dev_split (float or str): Either the ratio of all data to use
            for dev/val or specific bibliography ID(s). Use
            comma-separated IDs for multiple books.
        dev_factor (int): Size of development set should be divisible
            by this value. Useful for training on multiple GPUs.
        vocab_size (None or int): Maximum size of the vocabulary. If
            None, include all possible characters, minus those filtered
            out after applying min_freq.
        min_freq (int): Minimum frequency of tokens in vocab.
        reserved (tuple): Strings for reserved tokens, e.g. ("<PAD>",
            "<S>", "<UNK>", "</S>"). Note: Indices of token in given
            tuple will be used for its corresponding integer ID.
        charset (CharacterSet or None): The character set. If None,
            defaults to JapaneseUnicodes('all').
        image_scope (str): Image problem scope.
        seq_len (None or int): (Minimum) number of characters to
            include in image if image_scope is 'seq'. If seq_maxlen is
            None, specifies the deterministic sequence length.
        seq_maxlen (None or int): Maximum sequence length.
        verbose (bool): Display verbose output.
        seed (int): Number for seeding random number generator. If
            None, the DEFAULT_SEED is used.
    """
    if image_scope == 'seq':
        assert seq_len and seq_len > 0, "seq_len must be positive."
        if seq_maxlen:
            assert seq_maxlen >= seq_len, ("{} < {}".format(
                seq_maxlen, seq_len))
    assert image_scope in ['char', 'seq', 'page', 'line']
    self.data_dir = data_dir
    self._test_ratio, self._test_heldout = _get_split(test_split)
    self._dev_ratio, self._dev_heldout = _get_split(dev_split)
    self.reserved_tokens = reserved
    self.image_scope = image_scope
    self._dev_factor = dev_factor
    # Construct the default charset per instance. Using
    # charset=JapaneseUnicodes('all') in the signature would build a
    # single shared instance at function-definition time (the mutable
    # default argument pitfall).
    self._C = charset if charset is not None else JapaneseUnicodes('all')
    self._V = vocab_size
    self._min_freq = min_freq
    self._seq_len = seq_len
    # With no explicit maximum, sequences have deterministic length seq_len.
    self._seq_maxlen = seq_maxlen if seq_maxlen else seq_len
    # Seed the module-level RNG so data splits are reproducible.
    if seed is None:
        random.seed(DEFAULT_SEED)
    else:
        random.seed(seed)
    self._verbose = verbose
    self._image_meta = {}
    self._build_image_lists()
    self._build_vocabulary()
def character_set(self):
    """Return the character set restricted to kana.

    Returns:
        CharacterSet: A JapaneseUnicodes instance for the 'kana' preset.
    """
    return JapaneseUnicodes(charset='kana')
def test_misc_charset(self):
    """The 'misc' preset accepts U+FF00 but rejects U+30FF."""
    charset = JapaneseUnicodes('misc')
    self.assertFalse(charset.in_charset('U+30ff'))
    self.assertTrue(charset.in_charset('ff00'))
def test_punctuation_charset(self):
    """The 'punct' preset ends at U+303F; U+3040 falls outside it."""
    charset = JapaneseUnicodes('punct')
    in_set, out_of_set = 'U+303f', 'U+3040'
    self.assertTrue(charset.in_charset(in_set))
    self.assertFalse(charset.in_charset(out_of_set))
def test_kana_and_kanji_charset(self):
    """A combined 'kana+kanji' set covers both, but not punctuation."""
    combined = JapaneseUnicodes('kana+kanji')
    for member in ('U+30CF', '80FD'):
        self.assertTrue(combined.in_charset(member))
    # U+3002 (ideographic full stop) belongs to punctuation, not kana/kanji.
    self.assertFalse(combined.in_charset('U+3002'))
def test_kanji_charset(self):
    """The 'kanji' preset accepts a kanji code point and rejects kana."""
    charset = JapaneseUnicodes('kanji')
    self.assertFalse(charset.in_charset('U+3074'))
    self.assertTrue(charset.in_charset('U+80FD'))
def test_kana_charset(self):
    """The 'kana' preset accepts hiragana/katakana and rejects kanji."""
    charset = JapaneseUnicodes('kana')
    for kana_code in ('U+3074', '30cf'):
        self.assertTrue(charset.in_charset(kana_code))
    self.assertFalse(charset.in_charset('U+80FD'))