Example #1
0
 def __init__(self):
     """
     Transliteration of Thai words
     Now supports Thai to Latin (romanization)

     Locates the "thai2rom-pytorch" corpus file (downloading it when
     no local path is reported), reads the character lookup tables
     stored in it and restores the encoder and decoder weights.
     """
     # Download the model, if it's not on your machine.
     self.__filemodel = get_corpus_path("thai2rom-pytorch")
     if not self.__filemodel:
         download("thai2rom-pytorch")
         self.__filemodel = get_corpus_path("thai2rom-pytorch")
     # map_location remaps stored tensors onto `device`, so a checkpoint
     # saved on another device (e.g. CUDA) still loads on this one.
     loader = torch.load(self.__filemodel, map_location=device)
     self._n_h = 64  # hidden dimensions for encoder
     self._n_s = 64  # hidden dimensions for decoder
     self._emb_dim = 64  # character embedding size
     self._maxlength = 100
     # Lookup tables stored alongside the weights in the checkpoint.
     self._char_to_ix = loader['char_to_ix']
     self._ix_to_char = loader['ix_to_char']
     self._target_char_to_ix = loader['target_char_to_ix']
     self._ix_to_target_char = loader['ix_to_target_char']
     # encoder/ decoder
     # Restore the model and construct the encoder and decoder.
     # The first constructor argument is the size of the matching
     # lookup table.
     self._encoder = Encoder(len(self._char_to_ix), self._n_h,
                             self._emb_dim).to(device)
     self._encoder.load_state_dict(loader['encoder_state_dict'])
     self._decoder = OneStepDecoder(len(self._target_char_to_ix), self._n_s,
                                    self._emb_dim).to(device)
     self._decoder.load_state_dict(loader['decoder_state_dict'])
Example #2
0
def _download_install(name: str) -> None:
    """
    Make sure corpus ``name`` is downloaded and unpacked.

    When ``get_corpus_path(name)`` is None, version "1.0" of the corpus
    is force-downloaded and its gzip'd tar archive is extracted. After
    that, if the directory given by ``get_full_data_path(name)`` does not
    exist, it is created and the archive is extracted into it.

    :param str name: corpus name
    """
    if get_corpus_path(name) is None:
        download(name, force=True, version="1.0")
        # NOTE(review): no ``path`` is passed, so this extraction lands in
        # the current working directory - confirm that is intended.
        # NOTE(review): extractall() trusts the member paths inside a
        # downloaded archive; consider validating members or a filter.
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(get_corpus_path(name), "r:gz") as tar:
            tar.extractall()

    data_dir = get_full_data_path(name)
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
        with tarfile.open(get_corpus_path(name)) as tar:
            tar.extractall(path=data_dir)
Example #3
0
 def test_corpus(self):
     # Every bundled corpus loader must return something.
     corpus_loaders = (
         countries,
         provinces,
         thai_negations,
         thai_stopwords,
         thai_syllables,
         thai_words,
     )
     for load in corpus_loaders:
         self.assertIsNotNone(load())

     # Fetch the "test" corpus, then check that remove() returns a
     # value for it and for "tnc_freq".
     download("test")
     for corpus_name in ("test", "tnc_freq"):
         self.assertIsNotNone(remove(corpus_name))
Example #4
0
 def __init__(self):
     """
     Set up the grapheme/phoneme tables and locate the model checkpoint.

     The checkpoint named by ``_MODEL_NAME`` is downloaded when
     ``get_corpus_path`` returns None; ``_load_variables()`` runs last.
     """
     super().__init__()
     self.graphemes = hp.graphemes
     self.phonemes = hp.phonemes

     vocab = _load_vocab()
     self.g2idx, self.idx2g, self.p2idx, self.idx2p = vocab

     checkpoint = get_corpus_path(_MODEL_NAME)
     if checkpoint is None:
         download(_MODEL_NAME)
         checkpoint = get_corpus_path(_MODEL_NAME)
     self.checkpoint = checkpoint

     self._load_variables()
Example #5
0
 def test_corpus(self):
     # None of the bundled corpus loaders may return None.
     for load_corpus in (
         countries,
         provinces,
         thai_negations,
         thai_stopwords,
         thai_syllables,
         thai_words,
     ):
         loaded = load_corpus()
         self.assertIsNotNone(loaded)

     # After fetching "test", remove() must return a value for both
     # "test" and "tnc_freq".
     download("test")
     removed_test = remove("test")
     self.assertIsNotNone(removed_test)
     removed_tnc = remove("tnc_freq")
     self.assertIsNotNone(removed_tnc)
Example #6
0
def _get_path(fname: str) -> str:
    """
    Look up a pythainlp-corpus file, downloading it when it is missing.

    :param str fname: file name
    :return: path reported by ``get_corpus_path`` (after a download,
             if the first lookup came back empty)
    """
    located = get_corpus_path(fname)
    if located:
        return located

    # Nothing registered locally yet: fetch it, then look it up again.
    download(fname)
    return get_corpus_path(fname)
Example #7
0
def _get_path(fname: str) -> str:
    """
    Return the path of a pythainlp-corpus file, fetching it on demand.

    :param str fname: file name
    :return: result of ``get_corpus_path(fname)``; when the first lookup
             is falsy the file is downloaded and looked up once more
    """
    cached = get_corpus_path(fname)
    if cached:
        return cached

    download(fname)
    fetched = get_corpus_path(fname)
    return fetched
Example #8
0
 def __init__(self):
     """
     Thai NER

     Looks up the 'thainer' model file, downloading it first when
     ``get_file`` returns None, and creates the CRF object with that
     path as its ``model_filename``.
     """
     self.data_path = get_file('thainer')
     # Identity check: None is a singleton (PEP 8).
     if self.data_path is None:
         download('thainer')
         self.data_path = get_file('thainer')
     self.crf = sklearn_crfsuite.CRF(
         algorithm='lbfgs',
         c1=0.1,
         c2=0.1,
         max_iterations=500,
         all_possible_transitions=True,
         model_filename=self.data_path,
     )
Example #9
0
 def test_corpus(self):
     # Each bundled corpus loader must return something.
     corpus_loaders = (
         countries,
         provinces,
         thai_negations,
         thai_stopwords,
         thai_syllables,
         thai_words,
         thai_female_names,
         thai_male_names,
     )
     for load in corpus_loaders:
         self.assertIsNotNone(load())

     # An unknown corpus name gives an empty detail record.
     unknown_detail = get_corpus_db_detail("XXX")
     self.assertEqual(unknown_detail, {})

     # download() is expected to return None, forced or not.
     plain_result = download("test")
     self.assertIsNone(plain_result)
     forced_result = download("test", force=True)
     self.assertIsNone(forced_result)

     known_detail = get_corpus_db_detail("test")
     self.assertIsNotNone(known_detail)

     # The first removal returns a value; the second one is falsy.
     first_removal = remove("test")
     self.assertIsNotNone(first_removal)
     second_removal = remove("test")
     self.assertFalse(second_removal)
Example #10
0
 def __init__(self):
     """
     Thai named-entity recognizer

     Resolves the "thainer" corpus path (downloading the corpus when the
     lookup is falsy) and creates the CRF object pointing at that file.
     """
     corpus_name = "thainer"
     model_file = get_corpus_path(corpus_name)
     if not model_file:
         download(corpus_name)
         model_file = get_corpus_path(corpus_name)
     self.__data_path = model_file

     crf_options = {
         "algorithm": "lbfgs",
         "c1": 0.1,
         "c2": 0.1,
         "max_iterations": 500,
         "all_possible_transitions": True,
         "model_filename": self.__data_path,
     }
     self.crf = sklearn_crfsuite.CRF(**crf_options)
Example #11
0
 def __init__(self):
     """
     Thai named-entity recognizer

     Resolves the "thainer-1-3" corpus path (downloading the corpus when
     the lookup is falsy) and creates the CRF object for that file.
     """
     corpus_name = "thainer-1-3"
     resolved = get_corpus_path(corpus_name)
     if not resolved:
         # No local copy is registered: fetch it and look it up again.
         download(corpus_name)
         resolved = get_corpus_path(corpus_name)
     self.__data_path = resolved

     self.crf = sklearn_crfsuite.CRF(
         model_filename=self.__data_path,
         algorithm="lbfgs",
         max_iterations=500,
         all_possible_transitions=True,
         c1=0.1,
         c2=0.1,
     )
Example #12
0
    def __init__(self):
        """
        Transliteration of Thai words
        Now supports Thai to Latin (romanization)

        Loads the "thai2rom-pytorch-attn" checkpoint (downloading it when
        no local path is reported), rebuilds the encoder, the attention
        decoder and the Seq2Seq wrapper from the stored parameters, and
        puts the network into eval mode.
        """
        corpus_name = "thai2rom-pytorch-attn"

        # Download the model, if it's not on your machine.
        checkpoint_path = get_corpus_path(corpus_name)
        if not checkpoint_path:
            download(corpus_name)
            checkpoint_path = get_corpus_path(corpus_name)
        self.__filemodel = checkpoint_path

        checkpoint = torch.load(self.__filemodel, map_location=device)

        # Constructor arguments are stored in the checkpoint as 4-tuples.
        enc_input_dim, enc_emb_dim, enc_hid_dim, enc_dropout = checkpoint[
            "encoder_params"]
        dec_output_dim, dec_emb_dim, dec_hid_dim, dec_dropout = checkpoint[
            "decoder_params"]

        self._maxlength = 100

        # Character <-> index lookup tables for both sides.
        self._char_to_ix = checkpoint["char_to_ix"]
        self._ix_to_char = checkpoint["ix_to_char"]
        self._target_char_to_ix = checkpoint["target_char_to_ix"]
        self._ix_to_target_char = checkpoint["ix_to_target_char"]

        # Restore the model and construct the encoder and decoder.
        self._encoder = Encoder(
            enc_input_dim, enc_emb_dim, enc_hid_dim, enc_dropout
        )
        self._decoder = AttentionDecoder(
            dec_output_dim, dec_emb_dim, dec_hid_dim, dec_dropout
        )

        start_ix = self._target_char_to_ix["<start>"]
        end_ix = self._target_char_to_ix["<end>"]
        seq2seq = Seq2Seq(
            self._encoder,
            self._decoder,
            start_ix,
            end_ix,
            self._maxlength,
        )
        self._network = seq2seq.to(device)

        self._network.load_state_dict(checkpoint["model_state_dict"])
        self._network.eval()
Example #13
0
    def test_corpus(self):
        # Every bundled word/name list is exposed as a frozenset.
        frozenset_loaders = (
            thai_negations,
            thai_stopwords,
            thai_syllables,
            thai_words,
            countries,
            provinces,
            thai_female_names,
            thai_male_names,
        )
        for load in frozenset_loaders:
            self.assertIsInstance(load(), frozenset)

        # corpus does not exist
        self.assertEqual(get_corpus_db_detail("XXX"), {})

        # download the first time
        self.assertTrue(download("test"))
        # force download
        self.assertTrue(download(name="test", force=True))
        # try download existing
        self.assertTrue(download(name="test"))
        # URL not exist
        self.assertFalse(download(name="test", url="wrongurl"))
        # corpus name not exist
        self.assertFalse(download(name="XxxXXxxx817d37sf"))

        # corpus exists
        self.assertIsNotNone(get_corpus_db_detail("test"))

        # remove existing
        self.assertTrue(remove("test"))
        # remove non-existing
        self.assertFalse(remove("test"))

        # A pinned version can be fetched and removed again.
        self.assertTrue(download(name="test", version="0.1"))
        self.assertTrue(remove("test"))
Example #14
0
 def get(self, argv):
     """
     Handle ``thainlp data get <dataset_name>``.

     The dataset name is parsed from ``argv[3:]`` and handed to
     ``corpus.download``; a one-line result message is printed.
     """
     parser = argparse.ArgumentParser(
         usage="thainlp data get <dataset_name>",
         description="Download a dataset",
     )
     parser.add_argument(
         "dataset_name", help="dataset/corpus's name", type=str
     )
     dataset_name = parser.parse_args(argv[3:]).dataset_name

     downloaded = corpus.download(dataset_name)
     print("Downloaded successfully." if downloaded else "Not found.")
Example #15
0
    def test_corpus(self):
        # Word lists and place names come back as frozensets.
        for load in (
            thai_negations,
            thai_stopwords,
            thai_syllables,
            thai_words,
            countries,
            provinces,
        ):
            self.assertIsInstance(load(), frozenset)

        # provinces(details=True) is a list with one entry per province.
        detailed = provinces(details=True)
        self.assertIsInstance(detailed, list)
        self.assertEqual(
            len(provinces(details=False)), len(provinces(details=True))
        )

        for load_names in (thai_female_names, thai_male_names):
            self.assertIsInstance(load_names(), frozenset)

        # URL does not exist, should get 404 response
        missing_db = get_corpus_db(
            "https://example.com/XXXXXX0lkjasd/SXfmskdjKKXXX"
        )
        self.assertIsInstance(missing_db, Response)
        # Invalid URL
        self.assertIsNone(get_corpus_db("XXXlkja3sfdXX"))

        # corpus does not exist
        self.assertEqual(get_corpus_db_detail("XXXmx3KSXX"), {})

        # download the first time
        self.assertTrue(download("test"))
        # force download
        self.assertTrue(download(name="test", force=True))
        # try download existing
        self.assertTrue(download(name="test"))
        # URL not exist
        self.assertFalse(download(name="test", url="wrongurl"))
        # corpus name not exist
        self.assertFalse(download(name="XxxXXxxx817d37sf"))

        # corpus exists
        self.assertIsNotNone(get_corpus_db_detail("test"))
        self.assertIsNotNone(get_corpus_path("test"))

        # remove existing
        self.assertTrue(remove("test"))
        # remove non-existing
        self.assertFalse(remove("test"))
        # query non-existing
        self.assertIsNone(get_corpus_path("XXXkdjfBzc"))

        # A pinned version can be fetched and removed again.
        self.assertTrue(download(name="test", version="0.1"))
        self.assertTrue(remove("test"))
Example #16
0
    def test_corpus(self):
        # Word lists and place names come back as frozensets.
        for load in (
            thai_negations,
            thai_stopwords,
            thai_syllables,
            thai_words,
            countries,
            provinces,
        ):
            self.assertIsInstance(load(), frozenset)

        # provinces(details=True) is a list with one entry per province.
        self.assertIsInstance(provinces(details=True), list)
        self.assertEqual(
            len(provinces(details=False)), len(provinces(details=True))
        )

        # Family names: a frozenset whose members are strings.
        self.assertIsInstance(thai_family_names(), frozenset)
        first_family_name = list(thai_family_names())[0]
        self.assertIsInstance(first_family_name, str)
        for load_names in (thai_female_names, thai_male_names):
            self.assertIsInstance(load_names(), frozenset)

        # URL does not exist, should get 404 response
        missing_db = get_corpus_db(
            "https://example.com/XXXXXX0lkjasd/SXfmskdjKKXXX"
        )
        self.assertIsInstance(missing_db, Response)
        # Invalid URL
        self.assertIsNone(get_corpus_db("XXXlkja3sfdXX"))

        # corpus does not exist
        self.assertEqual(get_corpus_db_detail("XXXmx3KSXX"), {})
        self.assertEqual(
            get_corpus_db_detail("XXXmx3KSXX", version="0.2"), {}
        )

        # download the first time
        self.assertTrue(download("test"))
        # force download
        self.assertTrue(download(name="test", force=True))
        # try download existing
        self.assertTrue(download(name="test"))
        # URL not exist
        self.assertFalse(download(name="test", url="wrongurl"))
        # corpus name not exist
        self.assertFalse(download(name="XxxXXxxx817d37sf"))

        # corpus exists
        self.assertIsNotNone(get_corpus_db_detail("test"))
        self.assertIsNotNone(get_corpus_path("test"))

        # Default-db lookups, by name and by name + version.
        self.assertIsNone(get_corpus_default_db("test"))
        self.assertIsNotNone(get_corpus_default_db("thainer", "1.5.1"))
        self.assertIsNotNone(get_corpus_default_db("thainer"))
        self.assertIsNone(get_corpus_default_db("thainer", "1.2"))

        # remove existing
        self.assertTrue(remove("test"))
        # remove non-existing
        self.assertFalse(remove("test"))
        # query non-existing
        self.assertIsNone(get_corpus_path("XXXkdjfBzc"))

        # These version strings are expected to be refused.
        refused_versions = (
            "0.0", "0.0.0", "0.0.1", "0.0.2", "0.0.3", "0.0.4",
        )
        for version in refused_versions:
            self.assertFalse(download(name="test", version=version))

        self.assertIsNotNone(download(name="test", version="0.0.5"))
        self.assertTrue(download("test"))
        # remove existing
        self.assertIsNotNone(remove("test"))

        # These version strings must each return a value.
        for version in ("0.0.6", "0.0.7", "0.0.8", "0.0.9", "0.0.10"):
            self.assertIsNotNone(download(name="test", version=version))

        # Version 0.0.11 is expected to fail the hash check.
        with self.assertRaises(Exception) as context:
            self.assertIsNotNone(download(name="test", version="0.0.11"))
        self.assertIn(
            "Hash does not match expected.", str(context.exception)
        )

        self.assertIsNotNone(download(name="test", version="0.1"))
        self.assertIsNotNone(remove("test"))
Example #17
0
    def __init__(self):
        '''
        Thai2Rom

        Reads the 'thai2rom-dataset' file to rebuild the input and target
        character vocabularies, then loads the 'thai2rom' model and
        derives separate encoder and decoder models from its layers.
        Either file is downloaded first when get_file() returns None.
        '''
        self.batch_size = 64
        self.epochs = 100
        self.latent_dim = 256
        self.num_samples = 648241
        self.data_path = get_file('thai2rom-dataset')
        if self.data_path is None:
            download('thai2rom-dataset')
            self.data_path = get_file('thai2rom-dataset')
        self.input_texts = []
        self.target_texts = []
        self.input_characters = set()
        self.target_characters = set()
        # NOTE: the file handle and loop variables below are bound on self,
        # as before, so any existing reads of those attributes keep working.
        with open(self.data_path, 'r', encoding='utf-8-sig') as self.f:
            self.lines = self.f.read().split('\n')
        # The final element of the split is excluded by the slice bound.
        for self.line in self.lines[: min(self.num_samples, len(self.lines) - 1)]:
            self.input_text, self.target_text = self.line.split('\t')
            # Only pairs under the length limits are kept.
            if len(self.input_text) < 30 and len(self.target_text) < 90:
                # Targets are wrapped in a leading tab and trailing newline
                # (presumably start/end markers - confirm against training).
                self.target_text = '\t' + self.target_text + '\n'
                self.input_texts.append(self.input_text)
                self.target_texts.append(self.target_text)
                for self.char in self.input_text:
                    if self.char not in self.input_characters:
                        self.input_characters.add(self.char)
                for self.char in self.target_text:
                    if self.char not in self.target_characters:
                        self.target_characters.add(self.char)
        self.input_characters = sorted(self.input_characters)
        self.target_characters = sorted(self.target_characters)
        self.num_encoder_tokens = len(self.input_characters)
        self.num_decoder_tokens = len(self.target_characters)
        self.max_encoder_seq_length = max([len(self.txt) for self.txt in self.input_texts])
        self.max_decoder_seq_length = max([len(self.txt) for self.txt in self.target_texts])
        # Character -> index tables over the sorted vocabularies.
        self.input_token_index = {char: i for i, char in enumerate(self.input_characters)}
        self.target_token_index = {char: i for i, char in enumerate(self.target_characters)}
        self.encoder_input_data = np.zeros((len(self.input_texts), self.max_encoder_seq_length, self.num_encoder_tokens), dtype='float32')
        # One-hot encode each kept input text. The inner loop must walk the
        # current `input_text`, not `self.input_text`, which still holds the
        # last line read from the dataset.
        for i, input_text in enumerate(self.input_texts):
            for t, char in enumerate(input_text):
                self.encoder_input_data[i, t, self.input_token_index[char]] = 1.
        # Restore the model and construct the encoder and decoder.
        self.filemodel = get_file('thai2rom')
        if self.filemodel is None:
            download('thai2rom')
            self.filemodel = get_file('thai2rom')
        self.model = load_model(self.filemodel)
        self.encoder_inputs = self.model.input[0]   # input_1
        self.encoder_outputs, self.state_h_enc, self.state_c_enc = self.model.layers[2].output   # lstm_1
        self.encoder_states = [self.state_h_enc, self.state_c_enc]
        self.encoder_model = Model(self.encoder_inputs, self.encoder_states)
        self.decoder_inputs = self.model.input[1]   # input_2
        # Extra inputs so the decoder states can be supplied by the caller.
        self.decoder_state_input_h = Input(shape=(self.latent_dim,), name='input_3')
        self.decoder_state_input_c = Input(shape=(self.latent_dim,), name='input_4')
        self.decoder_states_inputs = [self.decoder_state_input_h, self.decoder_state_input_c]
        self.decoder_lstm = self.model.layers[3]
        self.decoder_outputs, self.state_h_dec, self.state_c_dec = self.decoder_lstm(self.decoder_inputs, initial_state=self.decoder_states_inputs)
        self.decoder_states = [self.state_h_dec, self.state_c_dec]
        self.decoder_dense = self.model.layers[4]
        self.decoder_outputs = self.decoder_dense(self.decoder_outputs)
        self.decoder_model = Model([self.decoder_inputs] + self.decoder_states_inputs, [self.decoder_outputs] + self.decoder_states)

        # Index -> character tables, the inverse of the token indexes.
        self.reverse_input_char_index = dict((i, char) for char, i in self.input_token_index.items())
        self.reverse_target_char_index = dict((i, char) for char, i in self.target_token_index.items())
Example #18
0
    def __init__(self):
        """
        Transliteration of Thai words
        Now supports Thai to Latin (romanization)

        Rebuilds the character vocabularies from the "thai2rom-dataset"
        corpus, then loads the "thai2rom" model and wires up separate
        encoder and decoder models from its layers. Either corpus is
        downloaded first when its path lookup is falsy.
        """
        self.__batch_size = 64
        self.__epochs = 100
        self.__latent_dim = 256
        self.__num_samples = 648241

        dataset_path = get_corpus_path("thai2rom-dataset")
        if not dataset_path:
            download("thai2rom-dataset")
            dataset_path = get_corpus_path("thai2rom-dataset")
        self.__data_path = dataset_path

        self.__input_texts = []
        self.__target_texts = []
        self.__input_characters = set()
        self.__target_characters = set()

        with open(self.__data_path, "r", encoding="utf-8-sig") as handle:
            self.__fh = handle
            self.__lines = handle.read().split("\n")

        # The final element of the split is excluded by the slice bound.
        row_limit = min(self.__num_samples, len(self.__lines) - 1)
        for row in self.__lines[:row_limit]:
            source, target = row.split("\t")
            # Skip pairs that reach either length limit.
            if len(source) >= 30 or len(target) >= 90:
                continue
            target = "\t" + target + "\n"
            self.__input_texts.append(source)
            self.__target_texts.append(target)
            self.__input_characters.update(source)
            self.__target_characters.update(target)

        self.__input_characters = sorted(self.__input_characters)
        self.__target_characters = sorted(self.__target_characters)
        self.__num_encoder_tokens = len(self.__input_characters)
        self.__num_decoder_tokens = len(self.__target_characters)
        self.__max_encoder_seq_length = max(map(len, self.__input_texts))
        self.__max_decoder_seq_length = max(map(len, self.__target_texts))
        """print('Number of samples:', len(self.input_texts))
        print('Number of unique input tokens:', self.num_encoder_tokens)
        print('Number of unique output tokens:', self.num_decoder_tokens)
        print('Max sequence length for inputs:', self.max_encoder_seq_length)
        print('Max sequence length for outputs:', self.max_decoder_seq_length)"""
        # Character -> index tables over the sorted vocabularies.
        self.__input_token_index = {
            ch: pos for pos, ch in enumerate(self.__input_characters)
        }
        self.__target_token_index = {
            ch: pos for pos, ch in enumerate(self.__target_characters)
        }

        # One-hot encode every kept input text.
        onehot_shape = (
            len(self.__input_texts),
            self.__max_encoder_seq_length,
            self.__num_encoder_tokens,
        )
        self.__encoder_input_data = np.zeros(onehot_shape, dtype="float32")
        for text_ix, text in enumerate(self.__input_texts):
            for char_ix, ch in enumerate(text):
                token_ix = self.__input_token_index[ch]
                self.__encoder_input_data[text_ix, char_ix, token_ix] = 1.

        # Restore the model and construct the encoder and decoder.
        model_path = get_corpus_path("thai2rom")
        if not model_path:
            download("thai2rom")
            model_path = get_corpus_path("thai2rom")
        self.__filemodel = model_path
        self.__model = load_model(self.__filemodel)

        self.__encoder_inputs = self.__model.input[0]  # input_1
        encoder_lstm_out = self.__model.layers[2].output  # lstm_1
        (self.__encoder_outputs,
         self.__state_h_enc,
         self.__state_c_enc) = encoder_lstm_out
        self.__encoder_states = [self.__state_h_enc, self.__state_c_enc]
        self.__encoder_model = Model(self.__encoder_inputs,
                                     self.__encoder_states)

        self.__decoder_inputs = self.__model.input[1]  # input_2
        state_shape = (self.__latent_dim, )
        self.__decoder_state_input_h = Input(shape=state_shape,
                                             name="input_3")
        self.__decoder_state_input_c = Input(shape=state_shape,
                                             name="input_4")
        self.__decoder_states_inputs = [
            self.__decoder_state_input_h,
            self.__decoder_state_input_c,
        ]
        self.__decoder_lstm = self.__model.layers[3]
        decoder_lstm_out = self.__decoder_lstm(
            self.__decoder_inputs,
            initial_state=self.__decoder_states_inputs,
        )
        (self.__decoder_outputs,
         self.__state_h_dec,
         self.__state_c_dec) = decoder_lstm_out
        self.__decoder_states = [self.__state_h_dec, self.__state_c_dec]
        self.__decoder_dense = self.__model.layers[4]
        self.__decoder_outputs = self.__decoder_dense(self.__decoder_outputs)
        self.__decoder_model = Model(
            [self.__decoder_inputs] + self.__decoder_states_inputs,
            [self.__decoder_outputs] + self.__decoder_states,
        )

        # Index -> character tables, the inverse of the token indexes.
        self.__reverse_input_char_index = {
            pos: ch for ch, pos in self.__input_token_index.items()
        }
        self.__reverse_target_char_index = {
            pos: ch for ch, pos in self.__target_token_index.items()
        }
Example #19
0
def get_path(fname):
    """
    Return the path of corpus file ``fname``, downloading it if needed.

    :param fname: file name passed to ``get_file`` and ``download``
    :return: value of ``get_file(fname)``; when the first lookup is None
             the file is downloaded and looked up again
    """
    path = get_file(fname)
    # Identity check: None is a singleton (PEP 8).
    if path is None:
        download(fname)
        path = get_file(fname)
    return path
Example #20
0
    def __init__(self):
        """
        Transliteration of Thai words
        Now supports Thai to Latin (romanization)

        Builds the input/target character vocabularies from the
        "thai2rom-dataset" corpus, loads the "thai2rom" model and derives
        separate encoder and decoder models from its layers. Each corpus
        is downloaded first when its path lookup is falsy.
        """
        self.__batch_size = 64
        self.__epochs = 100
        self.__latent_dim = 256
        self.__num_samples = 648241

        corpus_file = get_corpus_path("thai2rom-dataset")
        if not corpus_file:
            download("thai2rom-dataset")
            corpus_file = get_corpus_path("thai2rom-dataset")
        self.__data_path = corpus_file

        self.__input_texts = []
        self.__target_texts = []
        self.__input_characters = set()
        self.__target_characters = set()

        with open(self.__data_path, "r", encoding="utf-8-sig") as reader:
            self.__fh = reader
            self.__lines = reader.read().split("\n")

        # The final element of the split is excluded by the slice bound.
        sample_count = min(self.__num_samples, len(self.__lines) - 1)
        for record in self.__lines[:sample_count]:
            thai, roman = record.split("\t")
            # Pairs that reach either length limit are skipped.
            if not (len(thai) < 30 and len(roman) < 90):
                continue
            roman = "\t" + roman + "\n"
            self.__input_texts.append(thai)
            self.__target_texts.append(roman)
            self.__input_characters.update(thai)
            self.__target_characters.update(roman)

        self.__input_characters = sorted(self.__input_characters)
        self.__target_characters = sorted(self.__target_characters)
        self.__num_encoder_tokens = len(self.__input_characters)
        self.__num_decoder_tokens = len(self.__target_characters)
        self.__max_encoder_seq_length = max(
            len(text) for text in self.__input_texts
        )
        self.__max_decoder_seq_length = max(
            len(text) for text in self.__target_texts
        )
        """print('Number of samples:', len(self.input_texts))
        print('Number of unique input tokens:', self.num_encoder_tokens)
        print('Number of unique output tokens:', self.num_decoder_tokens)
        print('Max sequence length for inputs:', self.max_encoder_seq_length)
        print('Max sequence length for outputs:', self.max_decoder_seq_length)"""
        # Character -> index tables over the sorted vocabularies.
        self.__input_token_index = {
            symbol: index
            for index, symbol in enumerate(self.__input_characters)
        }
        self.__target_token_index = {
            symbol: index
            for index, symbol in enumerate(self.__target_characters)
        }

        # One-hot encode every kept input text.
        self.__encoder_input_data = np.zeros(
            (
                len(self.__input_texts),
                self.__max_encoder_seq_length,
                self.__num_encoder_tokens,
            ),
            dtype="float32",
        )
        for sample_ix, sample in enumerate(self.__input_texts):
            for step, symbol in enumerate(sample):
                column = self.__input_token_index[symbol]
                self.__encoder_input_data[sample_ix, step, column] = 1.

        # Restore the model and construct the encoder and decoder.
        weights_file = get_corpus_path("thai2rom")
        if not weights_file:
            download("thai2rom")
            weights_file = get_corpus_path("thai2rom")
        self.__filemodel = weights_file
        self.__model = load_model(self.__filemodel)

        self.__encoder_inputs = self.__model.input[0]  # input_1
        lstm_1_output = self.__model.layers[2].output  # lstm_1
        (
            self.__encoder_outputs,
            self.__state_h_enc,
            self.__state_c_enc,
        ) = lstm_1_output
        self.__encoder_states = [self.__state_h_enc, self.__state_c_enc]
        self.__encoder_model = Model(
            self.__encoder_inputs, self.__encoder_states
        )

        self.__decoder_inputs = self.__model.input[1]  # input_2
        self.__decoder_state_input_h = Input(
            shape=(self.__latent_dim,), name="input_3"
        )
        self.__decoder_state_input_c = Input(
            shape=(self.__latent_dim,), name="input_4"
        )
        self.__decoder_states_inputs = [
            self.__decoder_state_input_h,
            self.__decoder_state_input_c,
        ]
        self.__decoder_lstm = self.__model.layers[3]
        lstm_2_output = self.__decoder_lstm(
            self.__decoder_inputs,
            initial_state=self.__decoder_states_inputs,
        )
        (
            self.__decoder_outputs,
            self.__state_h_dec,
            self.__state_c_dec,
        ) = lstm_2_output
        self.__decoder_states = [self.__state_h_dec, self.__state_c_dec]
        self.__decoder_dense = self.__model.layers[4]
        self.__decoder_outputs = self.__decoder_dense(self.__decoder_outputs)
        self.__decoder_model = Model(
            [self.__decoder_inputs] + self.__decoder_states_inputs,
            [self.__decoder_outputs] + self.__decoder_states,
        )

        # Index -> character tables, the inverse of the token indexes.
        self.__reverse_input_char_index = {
            index: symbol
            for symbol, index in self.__input_token_index.items()
        }
        self.__reverse_target_char_index = {
            index: symbol
            for symbol, index in self.__target_token_index.items()
        }
Example #21
0
 def download(args):
     """Download the corpus named by ``args.name`` via ``corpus.download``."""
     corpus_name = args.name
     corpus.download(corpus_name)
Example #22
0
def get_path(fname):
    """
    Return the path of corpus file ``fname``, downloading it if needed.

    :param fname: file name passed to ``get_file`` and ``download``
    :return: value of ``get_file(fname)``; when the first lookup is falsy
             the file is downloaded and looked up again
    """
    found = get_file(fname)
    if found:
        return found

    download(fname)
    return get_file(fname)
Example #23
0
    def __init__(self):
        '''
        Thai2Rom

        Reads the 'thai2rom-dataset' file to rebuild the input and target
        character vocabularies, then loads the 'thai2rom' model and
        derives separate encoder and decoder models from its layers.
        Either file is downloaded first when get_file() returns None.
        '''
        self.batch_size = 64
        self.epochs = 100
        self.latent_dim = 256
        self.num_samples = 648241
        self.data_path = get_file('thai2rom-dataset')
        if self.data_path is None:
            download('thai2rom-dataset')
            self.data_path = get_file('thai2rom-dataset')
        self.input_texts = []
        self.target_texts = []
        self.input_characters = set()
        self.target_characters = set()
        # NOTE: the file handle and loop variables below are bound on self,
        # as before, so any existing reads of those attributes keep working.
        with open(self.data_path, 'r', encoding='utf-8-sig') as self.f:
            self.lines = self.f.read().split('\n')
        # The final element of the split is excluded by the slice bound.
        for self.line in self.lines[:min(self.num_samples,
                                         len(self.lines) - 1)]:
            self.input_text, self.target_text = self.line.split('\t')
            # Only pairs under the length limits are kept.
            if len(self.input_text) < 30 and len(self.target_text) < 90:
                # Targets are wrapped in a leading tab and trailing newline
                # (presumably start/end markers - confirm against training).
                self.target_text = '\t' + self.target_text + '\n'
                self.input_texts.append(self.input_text)
                self.target_texts.append(self.target_text)
                for self.char in self.input_text:
                    if self.char not in self.input_characters:
                        self.input_characters.add(self.char)
                for self.char in self.target_text:
                    if self.char not in self.target_characters:
                        self.target_characters.add(self.char)
        self.input_characters = sorted(self.input_characters)
        self.target_characters = sorted(self.target_characters)
        self.num_encoder_tokens = len(self.input_characters)
        self.num_decoder_tokens = len(self.target_characters)
        self.max_encoder_seq_length = max(
            [len(self.txt) for self.txt in self.input_texts])
        self.max_decoder_seq_length = max(
            [len(self.txt) for self.txt in self.target_texts])
        # Character -> index tables over the sorted vocabularies.
        self.input_token_index = {
            char: i for i, char in enumerate(self.input_characters)
        }
        self.target_token_index = {
            char: i for i, char in enumerate(self.target_characters)
        }
        self.encoder_input_data = np.zeros(
            (len(self.input_texts), self.max_encoder_seq_length,
             self.num_encoder_tokens),
            dtype='float32')
        # One-hot encode each kept input text. The inner loop must walk the
        # current `input_text`, not `self.input_text`, which still holds the
        # last line read from the dataset.
        for i, input_text in enumerate(self.input_texts):
            for t, char in enumerate(input_text):
                self.encoder_input_data[i, t,
                                        self.input_token_index[char]] = 1.
        # Restore the model and construct the encoder and decoder.
        self.filemodel = get_file('thai2rom')
        if self.filemodel is None:
            download('thai2rom')
            self.filemodel = get_file('thai2rom')
        self.model = load_model(self.filemodel)
        self.encoder_inputs = self.model.input[0]  # input_1
        self.encoder_outputs, self.state_h_enc, self.state_c_enc = self.model.layers[
            2].output  # lstm_1
        self.encoder_states = [self.state_h_enc, self.state_c_enc]
        self.encoder_model = Model(self.encoder_inputs, self.encoder_states)
        self.decoder_inputs = self.model.input[1]  # input_2
        # Extra inputs so the decoder states can be supplied by the caller.
        self.decoder_state_input_h = Input(shape=(self.latent_dim, ),
                                           name='input_3')
        self.decoder_state_input_c = Input(shape=(self.latent_dim, ),
                                           name='input_4')
        self.decoder_states_inputs = [
            self.decoder_state_input_h, self.decoder_state_input_c
        ]
        self.decoder_lstm = self.model.layers[3]
        self.decoder_outputs, self.state_h_dec, self.state_c_dec = self.decoder_lstm(
            self.decoder_inputs, initial_state=self.decoder_states_inputs)
        self.decoder_states = [self.state_h_dec, self.state_c_dec]
        self.decoder_dense = self.model.layers[4]
        self.decoder_outputs = self.decoder_dense(self.decoder_outputs)
        self.decoder_model = Model(
            [self.decoder_inputs] + self.decoder_states_inputs,
            [self.decoder_outputs] + self.decoder_states)

        # Index -> character tables, the inverse of the token indexes.
        self.reverse_input_char_index = dict(
            (i, char) for char, i in self.input_token_index.items())
        self.reverse_target_char_index = dict(
            (i, char) for char, i in self.target_token_index.items())
def get_path(fname):
    """
    Return the local path of the corpus file ``fname``, fetching it if needed.

    The path is looked up with ``get_file``; when that lookup yields ``None``
    the corpus is requested once through ``download`` and looked up again.

    :param fname: corpus name handed to ``get_file`` and ``download``
    :return: the result of the last ``get_file`` call (may still be ``None``
             if the download did not make the file available)
    """
    path = get_file(fname)
    # Identity test: ``== None`` can be fooled by objects overriding __eq__.
    if path is None:
        download(fname)
        path = get_file(fname)
    return path
Example #25
0
    def __init__(self):
        """
        Transliteration of Thai words: Thai to Latin (romanization).

        Sets up the fixed character vocabularies, loads the saved Keras
        model registered as "thai2rom-v2" (requesting a download first when
        no local path is reported), and derives separate encoder and decoder
        models from the loaded model's inputs and layers.
        """
        # Source-side characters listed in index order: the position of a
        # character in this tuple is its token index.
        input_chars = (
            ' ', '!', '"', '(', ')', '-', '.',
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
            '\xa0',
            'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช',
            'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด',
            'ต', 'ถ', 'ท', 'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ',
            'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ', 'ล', 'ฦ', 'ว', 'ศ',
            'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ',
            'ฯ', 'ะ', 'ั', 'า', 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ',
            'เ', 'แ', 'โ', 'ใ', 'ไ', 'ๅ', 'ๆ',
            '็', '่', '้', '๊', '๋', '์', 'ํ',
            '๙',
        )
        # Target-side characters, again in index order.
        target_chars = (
            '\t', '\n', ' ', '!', '"', '(', ')', '-',
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
            'k', 'l', 'm', 'n', 'o', 'p',
            'r', 's', 't', 'u', 'w', 'y',
        )

        # character -> index lookups
        self.__input_token_index = {
            char: index for index, char in enumerate(input_chars)
        }
        self.__target_token_index = {
            char: index for index, char in enumerate(target_chars)
        }
        # index -> character lookups (inverse of the two maps above)
        self.__reverse_input_char_index = {
            index: char for char, index in self.__input_token_index.items()
        }
        self.__reverse_target_char_index = {
            index: char for char, index in self.__target_token_index.items()
        }

        # Fixed settings stored on the instance.
        self.__batch_size = 64
        self.__epochs = 100
        self.__latent_dim = 256
        self.__num_encoder_tokens = 91
        self.__num_decoder_tokens = 39
        self.__max_encoder_seq_length = 20
        self.__max_decoder_seq_length = 22

        # Locate the saved model; ask for a download only when the first
        # lookup reports no usable path.
        corpus_name = "thai2rom-v2"
        model_path = get_corpus_path(corpus_name)
        if not model_path:
            download(corpus_name)
            model_path = get_corpus_path(corpus_name)
        self.__filemodel = model_path
        self.__model = load_model(self.__filemodel)

        # Encoder model: first model input -> the two state tensors produced
        # by layer 2.
        self.__encoder_inputs = self.__model.input[0]  # input_1
        encoder_lstm = self.__model.layers[2]  # lstm_1
        (
            self.__encoder_outputs,
            self.__state_h_enc,
            self.__state_c_enc,
        ) = encoder_lstm.output
        self.__encoder_states = [self.__state_h_enc, self.__state_c_enc]
        self.__encoder_model = Model(
            self.__encoder_inputs, self.__encoder_states
        )

        # Decoder model: second model input plus two explicit state inputs,
        # run through layer 3 and then layer 4.
        self.__decoder_inputs = self.__model.input[1]  # input_2
        state_shape = (self.__latent_dim, )
        self.__decoder_state_input_h = Input(shape=state_shape,
                                             name="input_3")
        self.__decoder_state_input_c = Input(shape=state_shape,
                                             name="input_4")
        self.__decoder_states_inputs = [
            self.__decoder_state_input_h,
            self.__decoder_state_input_c,
        ]
        self.__decoder_lstm = self.__model.layers[3]
        (
            self.__decoder_outputs,
            self.__state_h_dec,
            self.__state_c_dec,
        ) = self.__decoder_lstm(
            self.__decoder_inputs,
            initial_state=self.__decoder_states_inputs,
        )
        self.__decoder_states = [self.__state_h_dec, self.__state_c_dec]
        self.__decoder_dense = self.__model.layers[4]
        # Replace the raw layer-3 output with the layer-4 projection of it.
        self.__decoder_outputs = self.__decoder_dense(self.__decoder_outputs)
        decoder_model_inputs = (
            [self.__decoder_inputs] + self.__decoder_states_inputs
        )
        decoder_model_outputs = (
            [self.__decoder_outputs] + self.__decoder_states
        )
        self.__decoder_model = Model(
            decoder_model_inputs,
            decoder_model_outputs,
        )