def __init__(self):
    """
    Transliteration of Thai words.

    Now supports Thai to Latin (romanization).

    Locates the "thai2rom-pytorch" corpus file (downloading it when it is
    not available locally), loads the checkpoint and rebuilds the encoder
    and decoder from the vocabularies and state dicts stored in it.
    """
    # Download the model, if it's not on your machine.
    self.__filemodel = get_corpus_path("thai2rom-pytorch")
    if not self.__filemodel:
        download("thai2rom-pytorch")
        self.__filemodel = get_corpus_path("thai2rom-pytorch")

    # map_location places the stored tensors on the same device the modules
    # are moved to below, instead of the device they were saved from.
    loader = torch.load(self.__filemodel, map_location=device)

    self._n_h = 64  # hidden dimensions for encoder
    self._n_s = 64  # hidden dimensions for decoder
    self._emb_dim = 64  # character embedding size
    self._maxlength = 100

    # Character <-> index lookup tables stored alongside the weights.
    self._char_to_ix = loader['char_to_ix']
    self._ix_to_char = loader['ix_to_char']
    self._target_char_to_ix = loader['target_char_to_ix']
    self._ix_to_target_char = loader['ix_to_target_char']

    # encoder/ decoder
    # Restore the model and construct the encoder and decoder.
    self._encoder = Encoder(
        len(self._char_to_ix), self._n_h, self._emb_dim
    ).to(device)
    self._encoder.load_state_dict(loader['encoder_state_dict'])

    self._decoder = OneStepDecoder(
        len(self._target_char_to_ix), self._n_s, self._emb_dim
    ).to(device)
    self._decoder.load_state_dict(loader['decoder_state_dict'])
def _download_install(name: str) -> None:
    """
    Download the corpus archive ``name`` if needed and unpack it.

    When no local copy of the corpus is found, version "1.0" is downloaded
    and the archive is extracted into the current working directory. When
    the corpus data directory does not exist yet, it is created and the
    archive is extracted into it.

    :param str name: corpus name
    """
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; confirm the corpus source is trusted or validate members.
    if get_corpus_path(name) is None:
        download(name, force=True, version="1.0")
        # Context manager closes the archive even if extraction fails.
        with tarfile.open(get_corpus_path(name), "r:gz") as tar:
            tar.extractall()

    data_dir = get_full_data_path(name)
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
        with tarfile.open(get_corpus_path(name)) as tar:
            tar.extractall(path=data_dir)
def test_corpus(self):
    """Check that the built-in corpus loaders and remove() return values."""
    corpus_loaders = (
        countries,
        provinces,
        thai_negations,
        thai_stopwords,
        thai_syllables,
        thai_words,
    )
    for load in corpus_loaders:
        self.assertIsNotNone(load())

    download("test")
    for corpus_name in ("test", "tnc_freq"):
        self.assertIsNotNone(remove(corpus_name))
def __init__(self):
    """
    Set up the grapheme/phoneme tables and load the model variables.

    The checkpoint path is looked up in the corpus store; when it is
    missing the corpus is downloaded and the path is looked up again.
    """
    super().__init__()

    self.graphemes = hp.graphemes
    self.phonemes = hp.phonemes
    vocab = _load_vocab()
    self.g2idx, self.idx2g, self.p2idx, self.idx2p = vocab

    checkpoint = get_corpus_path(_MODEL_NAME)
    if checkpoint is None:
        download(_MODEL_NAME)
        checkpoint = get_corpus_path(_MODEL_NAME)
    self.checkpoint = checkpoint

    self._load_variables()
def _get_path(fname: str) -> str:
    """
    Get the path of a file from pythainlp-corpus, downloading it if needed.

    :param str fname: file name
    :return: path to downloaded file
    """
    cached = get_corpus_path(fname)
    if cached:
        return cached

    # Not available locally yet: fetch it, then look the path up again.
    download(fname)
    return get_corpus_path(fname)
def __init__(self):
    """
    Thai NER

    Locates the 'thainer' model file (downloading it when no local copy
    is found) and configures the CRF tagger to use that file.
    """
    self.data_path = get_file('thainer')
    if self.data_path is None:
        download('thainer')
        self.data_path = get_file('thainer')

    self.crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=500,
        all_possible_transitions=True,
        model_filename=self.data_path,
    )
def test_corpus(self):
    """Exercise the corpus loaders and the download/remove round trip."""
    corpus_loaders = (
        countries,
        provinces,
        thai_negations,
        thai_stopwords,
        thai_syllables,
        thai_words,
        thai_female_names,
        thai_male_names,
    )
    for load in corpus_loaders:
        self.assertIsNotNone(load())

    unknown_detail = get_corpus_db_detail("XXX")
    self.assertEqual(unknown_detail, {})

    first_download = download("test")
    self.assertIsNone(first_download)
    forced_download = download("test", force=True)
    self.assertIsNone(forced_download)

    known_detail = get_corpus_db_detail("test")
    self.assertIsNotNone(known_detail)

    # First removal is expected to return something, the second a falsy value.
    self.assertIsNotNone(remove("test"))
    self.assertFalse(remove("test"))
def __init__(self):
    """
    Thai named-entity recognizer.

    Resolves the "thainer" model file, downloading it when it cannot be
    found locally, and builds the CRF tagger around that file.
    """
    corpus_name = "thainer"
    self.__data_path = get_corpus_path(corpus_name)
    if not self.__data_path:
        download(corpus_name)
        self.__data_path = get_corpus_path(corpus_name)

    crf_options = {
        "algorithm": "lbfgs",
        "c1": 0.1,
        "c2": 0.1,
        "max_iterations": 500,
        "all_possible_transitions": True,
        "model_filename": self.__data_path,
    }
    self.crf = sklearn_crfsuite.CRF(**crf_options)
def __init__(self):
    """
    Thai named-entity recognizer.

    Resolves the "thainer-1-3" model file, downloading it when it cannot
    be found locally, and builds the CRF tagger around that file.
    """
    corpus_name = "thainer-1-3"
    self.__data_path = get_corpus_path(corpus_name)
    if not self.__data_path:
        download(corpus_name)
        self.__data_path = get_corpus_path(corpus_name)

    crf_options = {
        "algorithm": "lbfgs",
        "c1": 0.1,
        "c2": 0.1,
        "max_iterations": 500,
        "all_possible_transitions": True,
        "model_filename": self.__data_path,
    }
    self.crf = sklearn_crfsuite.CRF(**crf_options)
def __init__(self):
    """
    Transliteration of Thai words.

    Now supports Thai to Latin (romanization).

    Loads the "thai2rom-pytorch-attn" checkpoint (downloading it when it
    is missing), rebuilds the encoder, attention decoder and seq2seq
    network from the stored parameters, and puts the network in eval mode.
    """
    # Download the model, if it's not on your machine.
    corpus_name = "thai2rom-pytorch-attn"
    self.__filemodel = get_corpus_path(corpus_name)
    if not self.__filemodel:
        download(corpus_name)
        self.__filemodel = get_corpus_path(corpus_name)

    checkpoint = torch.load(self.__filemodel, map_location=device)

    # Each parameter entry unpacks into exactly four constructor arguments.
    enc_input_dim, enc_emb_dim, enc_hid_dim, enc_dropout = checkpoint[
        "encoder_params"
    ]
    dec_output_dim, dec_emb_dim, dec_hid_dim, dec_dropout = checkpoint[
        "decoder_params"
    ]

    self._maxlength = 100

    self._char_to_ix = checkpoint["char_to_ix"]
    self._ix_to_char = checkpoint["ix_to_char"]
    self._target_char_to_ix = checkpoint["target_char_to_ix"]
    self._ix_to_target_char = checkpoint["ix_to_target_char"]

    # encoder/ decoder
    # Restore the model and construct the encoder and decoder.
    self._encoder = Encoder(
        enc_input_dim, enc_emb_dim, enc_hid_dim, enc_dropout
    )
    self._decoder = AttentionDecoder(
        dec_output_dim, dec_emb_dim, dec_hid_dim, dec_dropout
    )

    start_ix = self._target_char_to_ix["<start>"]
    end_ix = self._target_char_to_ix["<end>"]
    seq2seq = Seq2Seq(
        self._encoder,
        self._decoder,
        start_ix,
        end_ix,
        self._maxlength,
    )
    self._network = seq2seq.to(device)

    self._network.load_state_dict(checkpoint["model_state_dict"])
    self._network.eval()
def test_corpus(self):
    """Check corpus loader return types and download()/remove() results."""
    frozenset_loaders = (
        thai_negations,
        thai_stopwords,
        thai_syllables,
        thai_words,
        countries,
        provinces,
        thai_female_names,
        thai_male_names,
    )
    for load in frozenset_loaders:
        self.assertIsInstance(load(), frozenset)

    # corpus does not exist
    self.assertEqual(get_corpus_db_detail("XXX"), {})

    # download the first time
    self.assertTrue(download("test"))
    # force download
    self.assertTrue(download(name="test", force=True))
    # try download existing
    self.assertTrue(download(name="test"))
    # URL not exist
    self.assertFalse(download(name="test", url="wrongurl"))
    # corpus name not exist
    self.assertFalse(download(name="XxxXXxxx817d37sf"))

    # corpus exists
    self.assertIsNotNone(get_corpus_db_detail("test"))

    # remove existing, then remove non-existing
    self.assertTrue(remove("test"))
    self.assertFalse(remove("test"))

    # download a specific version, then clean up
    self.assertTrue(download(name="test", version="0.1"))
    self.assertTrue(remove("test"))
def get(self, argv):
    """
    Download the dataset named on the command line and report the result.

    :param argv: full argument list; entries from index 3 on are parsed
    """
    parser = argparse.ArgumentParser(
        description="Download a dataset",
        usage="thainlp data get <dataset_name>",
    )
    parser.add_argument(
        "dataset_name",
        type=str,
        help="dataset/corpus's name",
    )
    parsed = parser.parse_args(argv[3:])

    downloaded = corpus.download(parsed.dataset_name)
    message = "Downloaded successfully." if downloaded else "Not found."
    print(message)
def test_corpus(self):
    """Check corpus loaders, corpus database lookups and download/remove."""
    for load in (
        thai_negations,
        thai_stopwords,
        thai_syllables,
        thai_words,
        countries,
        provinces,
    ):
        self.assertIsInstance(load(), frozenset)

    detailed = provinces(details=True)
    self.assertIsInstance(detailed, list)
    self.assertEqual(
        len(provinces(details=False)), len(provinces(details=True))
    )

    for load in (thai_female_names, thai_male_names):
        self.assertIsInstance(load(), frozenset)

    # URL does not exist, should get 404 response
    missing_url = "https://example.com/XXXXXX0lkjasd/SXfmskdjKKXXX"
    self.assertIsInstance(get_corpus_db(missing_url), Response)
    # Invalid URL
    self.assertIsNone(get_corpus_db("XXXlkja3sfdXX"))
    # corpus does not exist
    self.assertEqual(get_corpus_db_detail("XXXmx3KSXX"), {})

    # download the first time
    self.assertTrue(download("test"))
    # force download
    self.assertTrue(download(name="test", force=True))
    # try download existing
    self.assertTrue(download(name="test"))
    # URL not exist
    self.assertFalse(download(name="test", url="wrongurl"))
    # corpus name not exist
    self.assertFalse(download(name="XxxXXxxx817d37sf"))

    # corpus exists
    self.assertIsNotNone(get_corpus_db_detail("test"))
    self.assertIsNotNone(get_corpus_path("test"))

    # remove existing, then remove non-existing
    self.assertTrue(remove("test"))
    self.assertFalse(remove("test"))
    # query non-existing
    self.assertIsNone(get_corpus_path("XXXkdjfBzc"))

    self.assertTrue(download(name="test", version="0.1"))
    self.assertTrue(remove("test"))
def test_corpus(self):
    """Check corpus loaders, corpus database lookups and versioned downloads."""
    for load in (
        thai_negations,
        thai_stopwords,
        thai_syllables,
        thai_words,
        countries,
        provinces,
    ):
        self.assertIsInstance(load(), frozenset)

    self.assertIsInstance(provinces(details=True), list)
    self.assertEqual(
        len(provinces(details=False)), len(provinces(details=True))
    )

    self.assertIsInstance(thai_family_names(), frozenset)
    first_family_name = list(thai_family_names())[0]
    self.assertIsInstance(first_family_name, str)
    for load in (thai_female_names, thai_male_names):
        self.assertIsInstance(load(), frozenset)

    # URL does not exist, should get 404 response
    missing_url = "https://example.com/XXXXXX0lkjasd/SXfmskdjKKXXX"
    self.assertIsInstance(get_corpus_db(missing_url), Response)
    # Invalid URL
    self.assertIsNone(get_corpus_db("XXXlkja3sfdXX"))
    # corpus does not exist
    self.assertEqual(get_corpus_db_detail("XXXmx3KSXX"), {})
    # corpus does not exist
    self.assertEqual(get_corpus_db_detail("XXXmx3KSXX", version="0.2"), {})

    # download the first time
    self.assertTrue(download("test"))
    # force download
    self.assertTrue(download(name="test", force=True))
    # try download existing
    self.assertTrue(download(name="test"))
    # URL not exist
    self.assertFalse(download(name="test", url="wrongurl"))
    # corpus name not exist
    self.assertFalse(download(name="XxxXXxxx817d37sf"))

    # corpus exists
    self.assertIsNotNone(get_corpus_db_detail("test"))
    self.assertIsNotNone(get_corpus_path("test"))

    self.assertIsNone(get_corpus_default_db("test"))
    self.assertIsNotNone(get_corpus_default_db("thainer", "1.5.1"))
    self.assertIsNotNone(get_corpus_default_db("thainer"))
    self.assertIsNone(get_corpus_default_db("thainer", "1.2"))

    # remove existing, then remove non-existing
    self.assertTrue(remove("test"))
    self.assertFalse(remove("test"))
    # query non-existing
    self.assertIsNone(get_corpus_path("XXXkdjfBzc"))

    # Versions for which download() is expected to return a falsy value.
    for version in ("0.0", "0.0.0", "0.0.1", "0.0.2", "0.0.3", "0.0.4"):
        self.assertFalse(download(name="test", version=version))

    self.assertIsNotNone(download(name="test", version="0.0.5"))
    self.assertTrue(download("test"))
    # remove existing
    self.assertIsNotNone(remove("test"))

    for version in ("0.0.6", "0.0.7", "0.0.8", "0.0.9", "0.0.10"):
        self.assertIsNotNone(download(name="test", version=version))

    # This version is expected to raise with a hash-mismatch message.
    with self.assertRaises(Exception) as context:
        self.assertIsNotNone(download(name="test", version="0.0.11"))
    raised_message = str(context.exception)
    self.assertTrue("Hash does not match expected." in raised_message)

    self.assertIsNotNone(download(name="test", version="0.1"))
    self.assertIsNotNone(remove("test"))
def __init__(self):
    """
    Thai2Rom

    Reads the 'thai2rom-dataset' corpus to build the input/target
    character vocabularies and the one-hot encoder input array, then
    loads the 'thai2rom' model and derives separate encoder and decoder
    models from its layers. Each corpus is downloaded when no local copy
    is found.
    """
    self.batch_size = 64
    self.epochs = 100
    self.latent_dim = 256
    self.num_samples = 648241

    self.data_path = get_file('thai2rom-dataset')
    if self.data_path is None:
        download('thai2rom-dataset')
        self.data_path = get_file('thai2rom-dataset')

    self.input_texts = []
    self.target_texts = []
    self.input_characters = set()
    self.target_characters = set()
    with open(self.data_path, 'r', encoding='utf-8-sig') as f:
        self.lines = f.read().split('\n')

    # Each line is "<input>\t<target>"; the last split element is skipped.
    for line in self.lines[:min(self.num_samples, len(self.lines) - 1)]:
        input_text, target_text = line.split('\t')
        if len(input_text) < 30 and len(target_text) < 90:
            target_text = '\t' + target_text + '\n'
            self.input_texts.append(input_text)
            self.target_texts.append(target_text)
            self.input_characters.update(input_text)
            self.target_characters.update(target_text)

    self.input_characters = sorted(self.input_characters)
    self.target_characters = sorted(self.target_characters)
    self.num_encoder_tokens = len(self.input_characters)
    self.num_decoder_tokens = len(self.target_characters)
    self.max_encoder_seq_length = max(len(txt) for txt in self.input_texts)
    self.max_decoder_seq_length = max(len(txt) for txt in self.target_texts)

    self.input_token_index = {
        char: i for i, char in enumerate(self.input_characters)
    }
    self.target_token_index = {
        char: i for i, char in enumerate(self.target_characters)
    }

    # One-hot encode every kept input text: [sample, position, character].
    self.encoder_input_data = np.zeros(
        (
            len(self.input_texts),
            self.max_encoder_seq_length,
            self.num_encoder_tokens,
        ),
        dtype='float32',
    )
    for i, input_text in enumerate(self.input_texts):
        # Iterate the current sample, not a value left over from parsing.
        for t, char in enumerate(input_text):
            self.encoder_input_data[i, t, self.input_token_index[char]] = 1.

    # Restore the model and construct the encoder and decoder.
    self.filemodel = get_file('thai2rom')
    if self.filemodel is None:
        download('thai2rom')
        self.filemodel = get_file('thai2rom')
    self.model = load_model(self.filemodel)

    self.encoder_inputs = self.model.input[0]  # input_1
    (
        self.encoder_outputs,
        self.state_h_enc,
        self.state_c_enc,
    ) = self.model.layers[2].output  # lstm_1
    self.encoder_states = [self.state_h_enc, self.state_c_enc]
    self.encoder_model = Model(self.encoder_inputs, self.encoder_states)

    self.decoder_inputs = self.model.input[1]  # input_2
    self.decoder_state_input_h = Input(shape=(self.latent_dim,), name='input_3')
    self.decoder_state_input_c = Input(shape=(self.latent_dim,), name='input_4')
    self.decoder_states_inputs = [
        self.decoder_state_input_h,
        self.decoder_state_input_c,
    ]
    self.decoder_lstm = self.model.layers[3]
    (
        self.decoder_outputs,
        self.state_h_dec,
        self.state_c_dec,
    ) = self.decoder_lstm(
        self.decoder_inputs, initial_state=self.decoder_states_inputs
    )
    self.decoder_states = [self.state_h_dec, self.state_c_dec]
    self.decoder_dense = self.model.layers[4]
    self.decoder_outputs = self.decoder_dense(self.decoder_outputs)
    self.decoder_model = Model(
        [self.decoder_inputs] + self.decoder_states_inputs,
        [self.decoder_outputs] + self.decoder_states,
    )

    # Index -> character lookups, the inverse of the token indexes above.
    self.reverse_input_char_index = {
        i: char for char, i in self.input_token_index.items()
    }
    self.reverse_target_char_index = {
        i: char for char, i in self.target_token_index.items()
    }
def __init__(self):
    """
    Transliteration of Thai words.

    Now supports Thai to Latin (romanization).

    Builds the character vocabularies and one-hot encoder input from the
    "thai2rom-dataset" corpus, then loads the "thai2rom" model and derives
    encoder and decoder models from its layers. Each corpus is downloaded
    when it cannot be found locally.
    """
    self.__batch_size = 64
    self.__epochs = 100
    self.__latent_dim = 256
    self.__num_samples = 648241

    dataset_name = "thai2rom-dataset"
    self.__data_path = get_corpus_path(dataset_name)
    if not self.__data_path:
        download(dataset_name)
        self.__data_path = get_corpus_path(dataset_name)

    self.__input_texts = []
    self.__target_texts = []
    self.__input_characters = set()
    self.__target_characters = set()
    with open(self.__data_path, "r", encoding="utf-8-sig") as self.__fh:
        self.__lines = self.__fh.read().split("\n")

    sample_limit = min(self.__num_samples, len(self.__lines) - 1)
    for row in self.__lines[:sample_limit]:
        source, target = row.split("\t")
        # Skip pairs that are too long on either side.
        if len(source) >= 30 or len(target) >= 90:
            continue
        target = "\t" + target + "\n"
        self.__input_texts.append(source)
        self.__target_texts.append(target)
        self.__input_characters.update(source)
        self.__target_characters.update(target)

    self.__input_characters = sorted(self.__input_characters)
    self.__target_characters = sorted(self.__target_characters)
    self.__num_encoder_tokens = len(self.__input_characters)
    self.__num_decoder_tokens = len(self.__target_characters)
    self.__max_encoder_seq_length = max(map(len, self.__input_texts))
    self.__max_decoder_seq_length = max(map(len, self.__target_texts))
    """print('Number of samples:', len(self.input_texts))
    print('Number of unique input tokens:', self.num_encoder_tokens)
    print('Number of unique output tokens:', self.num_decoder_tokens)
    print('Max sequence length for inputs:', self.max_encoder_seq_length)
    print('Max sequence length for outputs:', self.max_decoder_seq_length)"""

    self.__input_token_index = {
        ch: idx for idx, ch in enumerate(self.__input_characters)
    }
    self.__target_token_index = {
        ch: idx for idx, ch in enumerate(self.__target_characters)
    }

    # One-hot array indexed as [sample, position, character index].
    one_hot_shape = (
        len(self.__input_texts),
        self.__max_encoder_seq_length,
        self.__num_encoder_tokens,
    )
    self.__encoder_input_data = np.zeros(one_hot_shape, dtype="float32")
    for sample_idx, sample in enumerate(self.__input_texts):
        for pos, ch in enumerate(sample):
            token_idx = self.__input_token_index[ch]
            self.__encoder_input_data[sample_idx, pos, token_idx] = 1.

    # Restore the model and construct the encoder and decoder.
    model_name = "thai2rom"
    self.__filemodel = get_corpus_path(model_name)
    if not self.__filemodel:
        download(model_name)
        self.__filemodel = get_corpus_path(model_name)
    self.__model = load_model(self.__filemodel)

    self.__encoder_inputs = self.__model.input[0]  # input_1
    encoder_layer = self.__model.layers[2]  # lstm_1
    (
        self.__encoder_outputs,
        self.__state_h_enc,
        self.__state_c_enc,
    ) = encoder_layer.output
    self.__encoder_states = [self.__state_h_enc, self.__state_c_enc]
    self.__encoder_model = Model(
        self.__encoder_inputs, self.__encoder_states
    )

    self.__decoder_inputs = self.__model.input[1]  # input_2
    state_shape = (self.__latent_dim,)
    self.__decoder_state_input_h = Input(shape=state_shape, name="input_3")
    self.__decoder_state_input_c = Input(shape=state_shape, name="input_4")
    self.__decoder_states_inputs = [
        self.__decoder_state_input_h,
        self.__decoder_state_input_c,
    ]
    self.__decoder_lstm = self.__model.layers[3]
    (
        self.__decoder_outputs,
        self.__state_h_dec,
        self.__state_c_dec,
    ) = self.__decoder_lstm(
        self.__decoder_inputs,
        initial_state=self.__decoder_states_inputs,
    )
    self.__decoder_states = [self.__state_h_dec, self.__state_c_dec]
    self.__decoder_dense = self.__model.layers[4]
    self.__decoder_outputs = self.__decoder_dense(self.__decoder_outputs)
    self.__decoder_model = Model(
        [self.__decoder_inputs] + self.__decoder_states_inputs,
        [self.__decoder_outputs] + self.__decoder_states,
    )

    # Index -> character lookups, the inverse of the token indexes above.
    self.__reverse_input_char_index = {
        idx: ch for ch, idx in self.__input_token_index.items()
    }
    self.__reverse_target_char_index = {
        idx: ch for ch, idx in self.__target_token_index.items()
    }
def get_path(fname):
    """
    Return the local path of corpus file ``fname``, downloading it if needed.

    :param fname: corpus file name
    :return: whatever ``get_file`` reports for ``fname`` after the download
    """
    path = get_file(fname)
    if path is None:
        download(fname)
        path = get_file(fname)
    return path
def __init__(self):
    """
    Transliteration of Thai words.

    Now supports Thai to Latin (romanization).

    Builds the character vocabularies and one-hot encoder input from the
    "thai2rom-dataset" corpus, then loads the "thai2rom" model and derives
    encoder and decoder models from its layers. Each corpus is downloaded
    when it cannot be found locally.
    """
    self.__batch_size = 64
    self.__epochs = 100
    self.__latent_dim = 256
    self.__num_samples = 648241

    dataset_name = "thai2rom-dataset"
    self.__data_path = get_corpus_path(dataset_name)
    if not self.__data_path:
        download(dataset_name)
        self.__data_path = get_corpus_path(dataset_name)

    self.__input_texts = []
    self.__target_texts = []
    self.__input_characters = set()
    self.__target_characters = set()
    with open(self.__data_path, "r", encoding="utf-8-sig") as self.__fh:
        self.__lines = self.__fh.read().split("\n")

    sample_limit = min(self.__num_samples, len(self.__lines) - 1)
    for row in self.__lines[:sample_limit]:
        source, target = row.split("\t")
        # Skip pairs that are too long on either side.
        if len(source) >= 30 or len(target) >= 90:
            continue
        target = "\t" + target + "\n"
        self.__input_texts.append(source)
        self.__target_texts.append(target)
        self.__input_characters.update(source)
        self.__target_characters.update(target)

    self.__input_characters = sorted(self.__input_characters)
    self.__target_characters = sorted(self.__target_characters)
    self.__num_encoder_tokens = len(self.__input_characters)
    self.__num_decoder_tokens = len(self.__target_characters)
    self.__max_encoder_seq_length = max(map(len, self.__input_texts))
    self.__max_decoder_seq_length = max(map(len, self.__target_texts))
    """print('Number of samples:', len(self.input_texts))
    print('Number of unique input tokens:', self.num_encoder_tokens)
    print('Number of unique output tokens:', self.num_decoder_tokens)
    print('Max sequence length for inputs:', self.max_encoder_seq_length)
    print('Max sequence length for outputs:', self.max_decoder_seq_length)"""

    self.__input_token_index = {
        ch: idx for idx, ch in enumerate(self.__input_characters)
    }
    self.__target_token_index = {
        ch: idx for idx, ch in enumerate(self.__target_characters)
    }

    # One-hot array indexed as [sample, position, character index].
    one_hot_shape = (
        len(self.__input_texts),
        self.__max_encoder_seq_length,
        self.__num_encoder_tokens,
    )
    self.__encoder_input_data = np.zeros(one_hot_shape, dtype="float32")
    for sample_idx, sample in enumerate(self.__input_texts):
        for pos, ch in enumerate(sample):
            token_idx = self.__input_token_index[ch]
            self.__encoder_input_data[sample_idx, pos, token_idx] = 1.

    # Restore the model and construct the encoder and decoder.
    model_name = "thai2rom"
    self.__filemodel = get_corpus_path(model_name)
    if not self.__filemodel:
        download(model_name)
        self.__filemodel = get_corpus_path(model_name)
    self.__model = load_model(self.__filemodel)

    self.__encoder_inputs = self.__model.input[0]  # input_1
    encoder_layer = self.__model.layers[2]  # lstm_1
    (
        self.__encoder_outputs,
        self.__state_h_enc,
        self.__state_c_enc,
    ) = encoder_layer.output
    self.__encoder_states = [self.__state_h_enc, self.__state_c_enc]
    self.__encoder_model = Model(
        self.__encoder_inputs, self.__encoder_states
    )

    self.__decoder_inputs = self.__model.input[1]  # input_2
    state_shape = (self.__latent_dim,)
    self.__decoder_state_input_h = Input(shape=state_shape, name="input_3")
    self.__decoder_state_input_c = Input(shape=state_shape, name="input_4")
    self.__decoder_states_inputs = [
        self.__decoder_state_input_h,
        self.__decoder_state_input_c,
    ]
    self.__decoder_lstm = self.__model.layers[3]
    (
        self.__decoder_outputs,
        self.__state_h_dec,
        self.__state_c_dec,
    ) = self.__decoder_lstm(
        self.__decoder_inputs,
        initial_state=self.__decoder_states_inputs,
    )
    self.__decoder_states = [self.__state_h_dec, self.__state_c_dec]
    self.__decoder_dense = self.__model.layers[4]
    self.__decoder_outputs = self.__decoder_dense(self.__decoder_outputs)
    self.__decoder_model = Model(
        [self.__decoder_inputs] + self.__decoder_states_inputs,
        [self.__decoder_outputs] + self.__decoder_states,
    )

    # Index -> character lookups, the inverse of the token indexes above.
    self.__reverse_input_char_index = {
        idx: ch for ch, idx in self.__input_token_index.items()
    }
    self.__reverse_target_char_index = {
        idx: ch for ch, idx in self.__target_token_index.items()
    }
def download(args):
    """Download the corpus named by ``args.name`` via ``corpus.download``."""
    corpus_name = args.name
    corpus.download(corpus_name)
def get_path(fname):
    """
    Return the local path of corpus file ``fname``, downloading it if needed.

    :param fname: corpus file name
    :return: whatever ``get_file`` reports for ``fname``
    """
    located = get_file(fname)
    if located:
        return located

    # Nothing usable locally: fetch the file, then look it up again.
    download(fname)
    return get_file(fname)
def __init__(self):
    """
    Thai2Rom

    Reads the 'thai2rom-dataset' corpus to build the input/target
    character vocabularies and the one-hot encoder input array, then
    loads the 'thai2rom' model and derives separate encoder and decoder
    models from its layers. Each corpus is downloaded when no local copy
    is found.
    """
    self.batch_size = 64
    self.epochs = 100
    self.latent_dim = 256
    self.num_samples = 648241

    self.data_path = get_file('thai2rom-dataset')
    if self.data_path is None:
        download('thai2rom-dataset')
        self.data_path = get_file('thai2rom-dataset')

    self.input_texts = []
    self.target_texts = []
    self.input_characters = set()
    self.target_characters = set()
    with open(self.data_path, 'r', encoding='utf-8-sig') as f:
        self.lines = f.read().split('\n')

    # Each line is "<input>\t<target>"; the last split element is skipped.
    for line in self.lines[:min(self.num_samples, len(self.lines) - 1)]:
        input_text, target_text = line.split('\t')
        if len(input_text) < 30 and len(target_text) < 90:
            target_text = '\t' + target_text + '\n'
            self.input_texts.append(input_text)
            self.target_texts.append(target_text)
            self.input_characters.update(input_text)
            self.target_characters.update(target_text)

    self.input_characters = sorted(self.input_characters)
    self.target_characters = sorted(self.target_characters)
    self.num_encoder_tokens = len(self.input_characters)
    self.num_decoder_tokens = len(self.target_characters)
    self.max_encoder_seq_length = max(len(txt) for txt in self.input_texts)
    self.max_decoder_seq_length = max(len(txt) for txt in self.target_texts)

    self.input_token_index = {
        char: i for i, char in enumerate(self.input_characters)
    }
    self.target_token_index = {
        char: i for i, char in enumerate(self.target_characters)
    }

    # One-hot encode every kept input text: [sample, position, character].
    self.encoder_input_data = np.zeros(
        (
            len(self.input_texts),
            self.max_encoder_seq_length,
            self.num_encoder_tokens,
        ),
        dtype='float32',
    )
    for i, input_text in enumerate(self.input_texts):
        # Iterate the current sample, not a value left over from parsing.
        for t, char in enumerate(input_text):
            self.encoder_input_data[i, t, self.input_token_index[char]] = 1.

    # Restore the model and construct the encoder and decoder.
    self.filemodel = get_file('thai2rom')
    if self.filemodel is None:
        download('thai2rom')
        self.filemodel = get_file('thai2rom')
    self.model = load_model(self.filemodel)

    self.encoder_inputs = self.model.input[0]  # input_1
    (
        self.encoder_outputs,
        self.state_h_enc,
        self.state_c_enc,
    ) = self.model.layers[2].output  # lstm_1
    self.encoder_states = [self.state_h_enc, self.state_c_enc]
    self.encoder_model = Model(self.encoder_inputs, self.encoder_states)

    self.decoder_inputs = self.model.input[1]  # input_2
    self.decoder_state_input_h = Input(shape=(self.latent_dim,), name='input_3')
    self.decoder_state_input_c = Input(shape=(self.latent_dim,), name='input_4')
    self.decoder_states_inputs = [
        self.decoder_state_input_h,
        self.decoder_state_input_c,
    ]
    self.decoder_lstm = self.model.layers[3]
    (
        self.decoder_outputs,
        self.state_h_dec,
        self.state_c_dec,
    ) = self.decoder_lstm(
        self.decoder_inputs, initial_state=self.decoder_states_inputs
    )
    self.decoder_states = [self.state_h_dec, self.state_c_dec]
    self.decoder_dense = self.model.layers[4]
    self.decoder_outputs = self.decoder_dense(self.decoder_outputs)
    self.decoder_model = Model(
        [self.decoder_inputs] + self.decoder_states_inputs,
        [self.decoder_outputs] + self.decoder_states,
    )

    # Index -> character lookups, the inverse of the token indexes above.
    self.reverse_input_char_index = {
        i: char for char, i in self.input_token_index.items()
    }
    self.reverse_target_char_index = {
        i: char for char, i in self.target_token_index.items()
    }
def get_path(fname):
    """
    Return the local path of corpus file ``fname``, downloading it if needed.

    :param fname: corpus file name
    :return: whatever ``get_file`` reports for ``fname`` after the download
    """
    path = get_file(fname)
    if path is None:
        download(fname)
        path = get_file(fname)
    return path
def __init__(self):
    """
    Transliteration of Thai words.

    Now supports Thai to Latin (romanization).

    Uses fixed character vocabularies, loads the "thai2rom-v2" model
    (downloading it when it cannot be found locally) and derives encoder
    and decoder models from its layers.
    """
    # Fixed input vocabulary: character -> index.
    self.__input_token_index = {
        ' ': 0, '!': 1, '"': 2, '(': 3, ')': 4, '-': 5, '.': 6,
        '0': 7, '1': 8, '2': 9, '3': 10, '4': 11, '5': 12, '6': 13,
        '7': 14, '8': 15, '9': 16, '\xa0': 17,
        'ก': 18, 'ข': 19, 'ฃ': 20, 'ค': 21, 'ฅ': 22, 'ฆ': 23, 'ง': 24,
        'จ': 25, 'ฉ': 26, 'ช': 27, 'ซ': 28, 'ฌ': 29, 'ญ': 30, 'ฎ': 31,
        'ฏ': 32, 'ฐ': 33, 'ฑ': 34, 'ฒ': 35, 'ณ': 36, 'ด': 37, 'ต': 38,
        'ถ': 39, 'ท': 40, 'ธ': 41, 'น': 42, 'บ': 43, 'ป': 44, 'ผ': 45,
        'ฝ': 46, 'พ': 47, 'ฟ': 48, 'ภ': 49, 'ม': 50, 'ย': 51, 'ร': 52,
        'ฤ': 53, 'ล': 54, 'ฦ': 55, 'ว': 56, 'ศ': 57, 'ษ': 58, 'ส': 59,
        'ห': 60, 'ฬ': 61, 'อ': 62, 'ฮ': 63, 'ฯ': 64, 'ะ': 65, 'ั': 66,
        'า': 67, 'ำ': 68, 'ิ': 69, 'ี': 70, 'ึ': 71, 'ื': 72, 'ุ': 73,
        'ู': 74, 'ฺ': 75, 'เ': 76, 'แ': 77, 'โ': 78, 'ใ': 79, 'ไ': 80,
        'ๅ': 81, 'ๆ': 82, '็': 83, '่': 84, '้': 85, '๊': 86, '๋': 87,
        '์': 88, 'ํ': 89, '๙': 90
    }
    # Fixed output vocabulary: character -> index.
    self.__target_token_index = {
        '\t': 0, '\n': 1, ' ': 2, '!': 3, '"': 4, '(': 5, ')': 6,
        '-': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13,
        '6': 14, '7': 15, '8': 16, '9': 17,
        'a': 18, 'b': 19, 'c': 20, 'd': 21, 'e': 22, 'f': 23, 'g': 24,
        'h': 25, 'i': 26, 'k': 27, 'l': 28, 'm': 29, 'n': 30, 'o': 31,
        'p': 32, 'r': 33, 's': 34, 't': 35, 'u': 36, 'w': 37, 'y': 38
    }

    # Index -> character lookups, the inverse of the tables above.
    self.__reverse_input_char_index = {
        idx: ch for ch, idx in self.__input_token_index.items()
    }
    self.__reverse_target_char_index = {
        idx: ch for ch, idx in self.__target_token_index.items()
    }

    self.__batch_size = 64
    self.__epochs = 100
    self.__latent_dim = 256
    self.__num_encoder_tokens = 91
    self.__num_decoder_tokens = 39
    self.__max_encoder_seq_length = 20
    self.__max_decoder_seq_length = 22

    # Restore the model and construct the encoder and decoder.
    model_name = "thai2rom-v2"
    self.__filemodel = get_corpus_path(model_name)
    if not self.__filemodel:
        download(model_name)
        self.__filemodel = get_corpus_path(model_name)
    self.__model = load_model(self.__filemodel)

    self.__encoder_inputs = self.__model.input[0]  # input_1
    encoder_layer = self.__model.layers[2]  # lstm_1
    (
        self.__encoder_outputs,
        self.__state_h_enc,
        self.__state_c_enc,
    ) = encoder_layer.output
    self.__encoder_states = [self.__state_h_enc, self.__state_c_enc]
    self.__encoder_model = Model(
        self.__encoder_inputs, self.__encoder_states
    )

    self.__decoder_inputs = self.__model.input[1]  # input_2
    state_shape = (self.__latent_dim,)
    self.__decoder_state_input_h = Input(shape=state_shape, name="input_3")
    self.__decoder_state_input_c = Input(shape=state_shape, name="input_4")
    self.__decoder_states_inputs = [
        self.__decoder_state_input_h,
        self.__decoder_state_input_c,
    ]
    self.__decoder_lstm = self.__model.layers[3]
    (
        self.__decoder_outputs,
        self.__state_h_dec,
        self.__state_c_dec,
    ) = self.__decoder_lstm(
        self.__decoder_inputs,
        initial_state=self.__decoder_states_inputs,
    )
    self.__decoder_states = [self.__state_h_dec, self.__state_c_dec]
    self.__decoder_dense = self.__model.layers[4]
    self.__decoder_outputs = self.__decoder_dense(self.__decoder_outputs)
    self.__decoder_model = Model(
        [self.__decoder_inputs] + self.__decoder_states_inputs,
        [self.__decoder_outputs] + self.__decoder_states,
    )