def __init__(self): """ Transliteration of Thai words Now supports Thai to Latin (romanization) """ # Download the model, if it's not on your machine. self.__filemodel = get_corpus_path("thai2rom-pytorch") if not self.__filemodel: download("thai2rom-pytorch") self.__filemodel = get_corpus_path("thai2rom-pytorch") loader = torch.load(self.__filemodel) self._n_h = 64 # hidden dimensions for encoder self._n_s = 64 # hidden dimensions for decoder self._emb_dim = 64 # character embedding size self._maxlength = 100 self._char_to_ix = loader['char_to_ix'] self._ix_to_char = loader['ix_to_char'] self._target_char_to_ix = loader['target_char_to_ix'] self._ix_to_target_char = loader['ix_to_target_char'] # encoder/ decoder # Restore the model and construct the encoder and decoder. self._encoder = Encoder(len(self._char_to_ix), self._n_h, self._emb_dim).to(device) self._encoder.load_state_dict(loader['encoder_state_dict']) self._decoder = OneStepDecoder(len(self._target_char_to_ix), self._n_s, self._emb_dim).to(device) self._decoder.load_state_dict(loader['decoder_state_dict'])
def test_corpus(self):
    self.assertIsInstance(thai_negations(), frozenset)
    self.assertIsInstance(thai_stopwords(), frozenset)
    self.assertIsInstance(thai_syllables(), frozenset)
    self.assertIsInstance(thai_words(), frozenset)
    self.assertIsInstance(countries(), frozenset)
    self.assertIsInstance(provinces(), frozenset)
    self.assertIsInstance(provinces(details=True), list)
    self.assertEqual(
        len(provinces(details=False)), len(provinces(details=True))
    )
    self.assertIsInstance(thai_family_names(), frozenset)
    self.assertIsInstance(list(thai_family_names())[0], str)
    self.assertIsInstance(thai_female_names(), frozenset)
    self.assertIsInstance(thai_male_names(), frozenset)

    self.assertIsInstance(
        get_corpus_db("https://example.com/XXXXXX0lkjasd/SXfmskdjKKXXX"),
        Response,
    )  # URL does not exist, should get a 404 response
    self.assertIsNone(get_corpus_db("XXXlkja3sfdXX"))  # invalid URL

    self.assertEqual(
        get_corpus_db_detail("XXXmx3KSXX"), {}
    )  # corpus does not exist
    self.assertEqual(
        get_corpus_db_detail("XXXmx3KSXX", version="0.2"), {}
    )  # corpus does not exist

    self.assertTrue(download("test"))  # download for the first time
    self.assertTrue(download(name="test", force=True))  # force download
    self.assertTrue(download(name="test"))  # try to download an existing corpus
    self.assertFalse(
        download(name="test", url="wrongurl")
    )  # URL does not exist
    self.assertFalse(
        download(name="XxxXXxxx817d37sf")
    )  # corpus name does not exist
    self.assertIsNotNone(get_corpus_db_detail("test"))  # corpus exists
    self.assertIsNotNone(get_corpus_path("test"))  # corpus exists
    self.assertTrue(remove("test"))  # remove an existing corpus
    self.assertFalse(remove("test"))  # remove a non-existing corpus
    self.assertIsNone(get_corpus_path("XXXkdjfBzc"))  # query a non-existing corpus

    self.assertFalse(download(name="test", version="0.0"))
    self.assertFalse(download(name="test", version="0.0.0"))
    self.assertFalse(download(name="test", version="0.0.1"))
    self.assertFalse(download(name="test", version="0.0.2"))
    self.assertFalse(download(name="test", version="0.0.3"))
    self.assertFalse(download(name="test", version="0.0.4"))
    self.assertIsNotNone(download(name="test", version="0.0.5"))
    self.assertTrue(download("test"))
    self.assertIsNotNone(remove("test"))  # remove an existing corpus
    self.assertIsNotNone(download(name="test", version="0.0.6"))
    self.assertIsNotNone(download(name="test", version="0.0.7"))
    self.assertIsNotNone(download(name="test", version="0.0.8"))
    self.assertIsNotNone(download(name="test", version="0.0.9"))
    self.assertIsNotNone(download(name="test", version="0.0.10"))
    with self.assertRaises(Exception) as context:
        self.assertIsNotNone(download(name="test", version="0.0.11"))
    self.assertTrue(
        "Hash does not match expected." in str(context.exception)
    )
    self.assertIsNotNone(download(name="test", version="0.1"))
    self.assertIsNotNone(remove("test"))
def _download_install(name: str) -> None:
    if get_corpus_path(name) is None:
        download(name, force=True, version="1.0")
        tar = tarfile.open(get_corpus_path(name), "r:gz")
        tar.extractall()
        tar.close()
    if not os.path.exists(get_full_data_path(name)):
        os.mkdir(get_full_data_path(name))
        with tarfile.open(get_corpus_path(name)) as tar:
            tar.extractall(path=get_full_data_path(name))
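# Usage sketch (added for illustration): _download_install() fetches a
# corpus archive on first use and unpacks it into the local data directory.
# The corpus name below is hypothetical; substitute a real .tar.gz corpus
# from the PyThaiNLP corpus catalog.
# _download_install("example-model")  # hypothetical corpus name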
def __init__(self):
    super().__init__()
    self.graphemes = hp.graphemes
    self.phonemes = hp.phonemes
    self.g2idx, self.idx2g, self.p2idx, self.idx2p = _load_vocab()

    # Get the model checkpoint; download it if it is not available locally.
    self.checkpoint = get_corpus_path(_MODEL_NAME)
    if self.checkpoint is None:
        download(_MODEL_NAME)
        self.checkpoint = get_corpus_path(_MODEL_NAME)

    self._load_variables()
def _get_path(fname: str) -> str:
    """
    Get the path of a file in the pythainlp-corpus repository,
    downloading the file first if it is not available locally.

    :param str fname: file name
    :return: path to the downloaded file
    """
    path = get_corpus_path(fname)
    if not path:
        download(fname)
        path = get_corpus_path(fname)
    return path
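# Usage sketch (added for illustration): _get_path() is the generic
# "download on first use" helper that this codebase repeats in many of the
# __init__ methods in this section. The file name below is hypothetical.
# model_path = _get_path("example-corpus-file")  # downloads once, then cached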
def _lst20_tagger():
    global _LST20_TAGGER
    if not _LST20_TAGGER:
        path = get_corpus_path(_LST20_TAGGER_NAME)
        with open(path, "rb") as fh:
            _LST20_TAGGER = pickle.load(fh)
    return _LST20_TAGGER
def __init__(self): """ Thai named-entity recognizer """ self.__data_path = get_corpus_path("thainer") if not self.__data_path: download("thainer") self.__data_path = get_corpus_path("thainer") self.crf = sklearn_crfsuite.CRF( algorithm="lbfgs", c1=0.1, c2=0.1, max_iterations=500, all_possible_transitions=True, model_filename=self.__data_path, )
def __init__(self, version: str = "1.5") -> None:
    """
    Thai named-entity recognizer.

    :param str version: Thai NER version.
                        Supports Thai NER 1.4 and 1.5.
                        The default value is `1.5`.
    """
    self.crf = CRFTagger()

    if version == "1.4":
        self.crf.open(get_corpus_path("thainer-1.4", version="1.4"))
        self.pos_tag_name = "orchid_ud"
    else:
        self.crf.open(get_corpus_path(_CORPUS_NAME, version="1.5"))
        self.pos_tag_name = "lst20"
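# Usage sketch (added for illustration): assuming this __init__ belongs to a
# named-entity recognizer class, here called ThaiNER purely as a placeholder
# name, version selection would look like:
#
#     ner_old = ThaiNER(version="1.4")  # thainer-1.4 model, orchid_ud POS tags
#     ner_new = ThaiNER()               # default: version 1.5, lst20 POS tags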
def __init__(self):
    # Get the model; it will be downloaded if it's not available locally.
    self.__model_filename = get_corpus_path(_MODEL_NAME)

    loader = torch.load(self.__model_filename, map_location=device)

    INPUT_DIM, E_EMB_DIM, E_HID_DIM, E_DROPOUT = loader["encoder_params"]
    OUTPUT_DIM, D_EMB_DIM, D_HID_DIM, D_DROPOUT = loader["decoder_params"]

    self._maxlength = 100

    self._char_to_ix = loader["char_to_ix"]
    self._ix_to_char = loader["ix_to_char"]
    self._target_char_to_ix = loader["target_char_to_ix"]
    self._ix_to_target_char = loader["ix_to_target_char"]

    # Restore the model and construct the encoder and decoder.
    self._encoder = Encoder(INPUT_DIM, E_EMB_DIM, E_HID_DIM, E_DROPOUT)
    self._decoder = AttentionDecoder(
        OUTPUT_DIM, D_EMB_DIM, D_HID_DIM, D_DROPOUT
    )

    self._network = Seq2Seq(
        self._encoder,
        self._decoder,
        self._target_char_to_ix["<start>"],
        self._target_char_to_ix["<end>"],
        self._maxlength,
    ).to(device)
    self._network.load_state_dict(loader["model_state_dict"])
    self._network.eval()
def __init__(self): """ Thai named-entity recognizer """ self.__data_path = get_corpus_path("thainer-1-3") if not self.__data_path: download("thainer-1-3") self.__data_path = get_corpus_path("thainer-1-3") self.crf = sklearn_crfsuite.CRF( algorithm="lbfgs", c1=0.1, c2=0.1, max_iterations=500, all_possible_transitions=True, model_filename=self.__data_path, )
def _lst20_tagger():
    global _LST20_TAGGER
    if not _LST20_TAGGER:
        path = get_corpus_path(_LST20_TAGGER_NAME)
        with open(path, encoding="utf-8-sig") as fh:
            _LST20_TAGGER = json.load(fh)
    return _LST20_TAGGER
def get_model() -> Word2VecKeyedVectors:
    """
    Get the word vector model.

    :return: `gensim` word2vec model
    :rtype: gensim.models.keyedvectors.Word2VecKeyedVectors
    """
    path = get_corpus_path(_MODEL_NAME)
    return KeyedVectors.load_word2vec_format(path, binary=True)
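# Usage sketch (added for illustration): get_model() returns a plain gensim
# KeyedVectors object, so the standard gensim query API applies. The query
# word is illustrative and must be in the model's vocabulary.
model = get_model()
print(model.most_similar("แมว", topn=5))  # five nearest neighbours of "cat"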
def trigram_word_freqs() -> defaultdict:
    """
    Get trigram word frequencies from the Thai National Corpus (TNC).
    """
    _path = get_corpus_path(_TRIGRAM)
    _word_freqs = defaultdict(int)
    with open(_path, "r", encoding="utf-8-sig") as fh:
        for line in fh.readlines():
            _temp = line.strip().split(" ")
            _word_freqs[(_temp[0], _temp[1], _temp[2])] = int(_temp[-1])

    return _word_freqs
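# Usage sketch (added for illustration): the returned defaultdict is keyed by
# (w1, w2, w3) tuples and yields 0 for unseen trigrams. The example trigram
# is illustrative.
trigrams = trigram_word_freqs()
print(trigrams[("ผม", "ไม่", "รู้")])  # TNC count, or 0 if the trigram is unseen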
def __init__(self): """ Transliteration of Thai words Now supports Thai to Latin (romanization) """ # Download the model, if it's not on your machine. self.__filemodel = get_corpus_path("thai2rom-pytorch-attn") if not self.__filemodel: download("thai2rom-pytorch-attn") self.__filemodel = get_corpus_path("thai2rom-pytorch-attn") loader = torch.load(self.__filemodel, map_location=device) INPUT_DIM, E_EMB_DIM, E_HID_DIM, E_DROPOUT = loader["encoder_params"] OUTPUT_DIM, D_EMB_DIM, D_HID_DIM, D_DROPOUT = loader["decoder_params"] self._maxlength = 100 self._char_to_ix = loader["char_to_ix"] self._ix_to_char = loader["ix_to_char"] self._target_char_to_ix = loader["target_char_to_ix"] self._ix_to_target_char = loader["ix_to_target_char"] # encoder/ decoder # Restore the model and construct the encoder and decoder. self._encoder = Encoder(INPUT_DIM, E_EMB_DIM, E_HID_DIM, E_DROPOUT) self._decoder = AttentionDecoder(OUTPUT_DIM, D_EMB_DIM, D_HID_DIM, D_DROPOUT) self._network = Seq2Seq( self._encoder, self._decoder, self._target_char_to_ix["<start>"], self._target_char_to_ix["<end>"], self._maxlength, ).to(device) self._network.load_state_dict(loader["model_state_dict"]) self._network.eval()
def get_model() -> Word2VecKeyedVectors:
    """
    **DEPRECATED: use the WordVector class instead**

    Get the word vector model.

    :return: `gensim` word2vec model
    :rtype: gensim.models.keyedvectors.Word2VecKeyedVectors
    """
    warnings.warn(
        "get_model is deprecated, use WordVector class instead",
        DeprecationWarning,
    )
    path = get_corpus_path(_MODEL_NAME)
    return KeyedVectors.load_word2vec_format(path, binary=True)
def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequencies from the OSCAR corpus
    (ICU word tokenization).
    """
    _path = get_corpus_path(_FILENAME)
    _word_freqs = defaultdict(int)
    with open(_path, "r", encoding="utf-8-sig") as fh:
        _data = fh.readlines()
    del _data[0]  # skip the header row
    for line in _data:
        _temp = line.strip().split(",")
        if _temp[0] != " " and '"' not in _temp[0]:
            _word_freqs[_temp[0]] = int(_temp[-1])
        elif _temp[0] == " ":
            _word_freqs["<s/>"] = int(_temp[-1])

    return _word_freqs
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequencies from the OSCAR corpus (ICU word tokenization).
    """
    word_freqs = []
    _path = get_corpus_path(_FILENAME)
    with open(_path, "r", encoding="utf-8") as f:
        _data = f.readlines()
    del _data[0]  # skip the header row
    for line in _data:
        _temp = line.strip().split(",")
        if len(_temp) >= 2:
            if _temp[0] != " " and '"' not in _temp[0]:
                word_freqs.append((_temp[0], int(_temp[1])))
            elif _temp[0] == " ":
                word_freqs.append(("<s/>", int(_temp[1])))

    return word_freqs
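# Usage sketch (added for illustration): both loaders above parse the same
# comma-separated frequency file; unigram_word_freqs() gives dict-style
# lookup, while word_freqs() keeps file order as (word, count) pairs. The
# query word is illustrative.
unigrams = unigram_word_freqs()
print(unigrams["ไทย"])  # 0 if absent, thanks to defaultdict(int)
print(word_freqs()[:3])  # first three (word, count) tuples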
def load_wordvector(self, model_name: str):
    """
    Load a word vector model.

    :param str model_name: model name
    """
    self.model_name = model_name
    self.model = KeyedVectors.load_word2vec_format(
        get_corpus_path(self.model_name),
        binary=True,
        unicode_errors="ignore",
    )
    self.WV_DIM = self.model.vector_size

    if self.model_name == "thai2fit_wv":
        self.tokenize = THAI2FIT_TOKENIZER.word_tokenize
    else:
        self.tokenize = word_tokenize
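# Usage sketch (added for illustration): load_wordvector() is a method of
# PyThaiNLP's WordVector class; the constructor signature shown here is an
# assumption based on recent releases. After loading, the wrapped gensim
# model is reachable as .model.
from pythainlp.word_vector import WordVector

wv = WordVector(model_name="thai2fit_wv")  # loads via load_wordvector()
print(wv.model.most_similar("แมว", topn=3))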
def __init__(self): """ Transliteration of Thai words Now supports Thai to Latin (romanization) """ self.__batch_size = 64 self.__epochs = 100 self.__latent_dim = 256 self.__num_samples = 648241 self.__data_path = get_corpus_path("thai2rom-dataset") if not self.__data_path: download("thai2rom-dataset") self.__data_path = get_corpus_path("thai2rom-dataset") self.__input_texts = [] self.__target_texts = [] self.__input_characters = set() self.__target_characters = set() with open(self.__data_path, "r", encoding="utf-8-sig") as self.__fh: self.__lines = self.__fh.read().split("\n") for line in self.__lines[: min(self.__num_samples, len(self.__lines) - 1)]: input_text, target_text = line.split("\t") if len(input_text) < 30 and len(target_text) < 90: target_text = "\t" + target_text + "\n" self.__input_texts.append(input_text) self.__target_texts.append(target_text) for char in input_text: if char not in self.__input_characters: self.__input_characters.add(char) for char in target_text: if char not in self.__target_characters: self.__target_characters.add(char) self.__input_characters = sorted(list(self.__input_characters)) self.__target_characters = sorted(list(self.__target_characters)) self.__num_encoder_tokens = len(self.__input_characters) self.__num_decoder_tokens = len(self.__target_characters) self.__max_encoder_seq_length = max([len(text) for text in self.__input_texts]) self.__max_decoder_seq_length = max([len(text) for text in self.__target_texts]) """print('Number of samples:', len(self.input_texts)) print('Number of unique input tokens:', self.num_encoder_tokens) print('Number of unique output tokens:', self.num_decoder_tokens) print('Max sequence length for inputs:', self.max_encoder_seq_length) print('Max sequence length for outputs:', self.max_decoder_seq_length)""" self.__input_token_index = dict( [(char, i) for i, char in enumerate(self.__input_characters)] ) self.__target_token_index = dict( [(char, i) for i, char in enumerate(self.__target_characters)] ) self.__encoder_input_data = np.zeros( ( len(self.__input_texts), self.__max_encoder_seq_length, self.__num_encoder_tokens, ), dtype="float32", ) for i, input_text in enumerate(self.__input_texts): for t, char in enumerate(input_text): self.__encoder_input_data[i, t, self.__input_token_index[char]] = 1. # Restore the model and construct the encoder and decoder. 
    self.__filemodel = get_corpus_path("thai2rom")
    if not self.__filemodel:
        download("thai2rom")
        self.__filemodel = get_corpus_path("thai2rom")
    self.__model = load_model(self.__filemodel)

    self.__encoder_inputs = self.__model.input[0]  # input_1
    (
        self.__encoder_outputs,
        self.__state_h_enc,
        self.__state_c_enc,
    ) = self.__model.layers[2].output  # lstm_1
    self.__encoder_states = [self.__state_h_enc, self.__state_c_enc]
    self.__encoder_model = Model(self.__encoder_inputs, self.__encoder_states)

    self.__decoder_inputs = self.__model.input[1]  # input_2
    self.__decoder_state_input_h = Input(
        shape=(self.__latent_dim,), name="input_3"
    )
    self.__decoder_state_input_c = Input(
        shape=(self.__latent_dim,), name="input_4"
    )
    self.__decoder_states_inputs = [
        self.__decoder_state_input_h,
        self.__decoder_state_input_c,
    ]
    self.__decoder_lstm = self.__model.layers[3]
    (
        self.__decoder_outputs,
        self.__state_h_dec,
        self.__state_c_dec,
    ) = self.__decoder_lstm(
        self.__decoder_inputs, initial_state=self.__decoder_states_inputs
    )
    self.__decoder_states = [self.__state_h_dec, self.__state_c_dec]
    self.__decoder_dense = self.__model.layers[4]
    self.__decoder_outputs = self.__decoder_dense(self.__decoder_outputs)
    self.__decoder_model = Model(
        [self.__decoder_inputs] + self.__decoder_states_inputs,
        [self.__decoder_outputs] + self.__decoder_states,
    )

    self.__reverse_input_char_index = dict(
        (i, char) for char, i in self.__input_token_index.items()
    )
    self.__reverse_target_char_index = dict(
        (i, char) for char, i in self.__target_token_index.items()
    )
def _download() -> str:
    path = get_corpus_path("thai2fit_wv")
    if not path:
        download_data("thai2fit_wv")
        path = get_corpus_path("thai2fit_wv")
    return path
def __init__(self): """ Transliteration of Thai words Now supports Thai to Latin (romanization) """ self.__input_token_index = { ' ': 0, '!': 1, '"': 2, '(': 3, ')': 4, '-': 5, '.': 6, '0': 7, '1': 8, '2': 9, '3': 10, '4': 11, '5': 12, '6': 13, '7': 14, '8': 15, '9': 16, '\xa0': 17, 'ก': 18, 'ข': 19, 'ฃ': 20, 'ค': 21, 'ฅ': 22, 'ฆ': 23, 'ง': 24, 'จ': 25, 'ฉ': 26, 'ช': 27, 'ซ': 28, 'ฌ': 29, 'ญ': 30, 'ฎ': 31, 'ฏ': 32, 'ฐ': 33, 'ฑ': 34, 'ฒ': 35, 'ณ': 36, 'ด': 37, 'ต': 38, 'ถ': 39, 'ท': 40, 'ธ': 41, 'น': 42, 'บ': 43, 'ป': 44, 'ผ': 45, 'ฝ': 46, 'พ': 47, 'ฟ': 48, 'ภ': 49, 'ม': 50, 'ย': 51, 'ร': 52, 'ฤ': 53, 'ล': 54, 'ฦ': 55, 'ว': 56, 'ศ': 57, 'ษ': 58, 'ส': 59, 'ห': 60, 'ฬ': 61, 'อ': 62, 'ฮ': 63, 'ฯ': 64, 'ะ': 65, 'ั': 66, 'า': 67, 'ำ': 68, 'ิ': 69, 'ี': 70, 'ึ': 71, 'ื': 72, 'ุ': 73, 'ู': 74, 'ฺ': 75, 'เ': 76, 'แ': 77, 'โ': 78, 'ใ': 79, 'ไ': 80, 'ๅ': 81, 'ๆ': 82, '็': 83, '่': 84, '้': 85, '๊': 86, '๋': 87, '์': 88, 'ํ': 89, '๙': 90 } self.__target_token_index = { '\t': 0, '\n': 1, ' ': 2, '!': 3, '"': 4, '(': 5, ')': 6, '-': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, 'a': 18, 'b': 19, 'c': 20, 'd': 21, 'e': 22, 'f': 23, 'g': 24, 'h': 25, 'i': 26, 'k': 27, 'l': 28, 'm': 29, 'n': 30, 'o': 31, 'p': 32, 'r': 33, 's': 34, 't': 35, 'u': 36, 'w': 37, 'y': 38 } self.__reverse_input_char_index = dict( (i, char) for char, i in self.__input_token_index.items()) self.__reverse_target_char_index = dict( (i, char) for char, i in self.__target_token_index.items()) self.__batch_size = 64 self.__epochs = 100 self.__latent_dim = 256 self.__num_encoder_tokens = 91 self.__num_decoder_tokens = 39 self.__max_encoder_seq_length = 20 self.__max_decoder_seq_length = 22 # Restore the model and construct the encoder and decoder. self.__filemodel = get_corpus_path("thai2rom-v2") if not self.__filemodel: download("thai2rom-v2") self.__filemodel = get_corpus_path("thai2rom-v2") self.__model = load_model(self.__filemodel) self.__encoder_inputs = self.__model.input[0] # input_1 self.__encoder_outputs, self.__state_h_enc, self.__state_c_enc = self.__model.layers[ 2].output # lstm_1 self.__encoder_states = [self.__state_h_enc, self.__state_c_enc] self.__encoder_model = Model(self.__encoder_inputs, self.__encoder_states) self.__decoder_inputs = self.__model.input[1] # input_2 self.__decoder_state_input_h = Input(shape=(self.__latent_dim, ), name="input_3") self.__decoder_state_input_c = Input(shape=(self.__latent_dim, ), name="input_4") self.__decoder_states_inputs = [ self.__decoder_state_input_h, self.__decoder_state_input_c, ] self.__decoder_lstm = self.__model.layers[3] self.__decoder_outputs, self.__state_h_dec, self.__state_c_dec = self.__decoder_lstm( self.__decoder_inputs, initial_state=self.__decoder_states_inputs) self.__decoder_states = [self.__state_h_dec, self.__state_c_dec] self.__decoder_dense = self.__model.layers[4] self.__decoder_outputs = self.__decoder_dense(self.__decoder_outputs) self.__decoder_model = Model( [self.__decoder_inputs] + self.__decoder_states_inputs, [self.__decoder_outputs] + self.__decoder_states, )
def __init__(self):
    self.thai2fit_wv = get_corpus_path("thai2fit_wv")
    self.load_w2v()
def __init__(self): """ Transliteration of Thai words Now supports Thai to Latin (romanization) """ self.__batch_size = 64 self.__epochs = 100 self.__latent_dim = 256 self.__num_samples = 648241 self.__data_path = get_corpus_path("thai2rom-dataset") if not self.__data_path: download("thai2rom-dataset") self.__data_path = get_corpus_path("thai2rom-dataset") self.__input_texts = [] self.__target_texts = [] self.__input_characters = set() self.__target_characters = set() with open(self.__data_path, "r", encoding="utf-8-sig") as self.__fh: self.__lines = self.__fh.read().split("\n") for line in self.__lines[:min(self.__num_samples, len(self.__lines) - 1)]: input_text, target_text = line.split("\t") if len(input_text) < 30 and len(target_text) < 90: target_text = "\t" + target_text + "\n" self.__input_texts.append(input_text) self.__target_texts.append(target_text) for char in input_text: if char not in self.__input_characters: self.__input_characters.add(char) for char in target_text: if char not in self.__target_characters: self.__target_characters.add(char) self.__input_characters = sorted(list(self.__input_characters)) self.__target_characters = sorted(list(self.__target_characters)) self.__num_encoder_tokens = len(self.__input_characters) self.__num_decoder_tokens = len(self.__target_characters) self.__max_encoder_seq_length = max( [len(text) for text in self.__input_texts]) self.__max_decoder_seq_length = max( [len(text) for text in self.__target_texts]) """print('Number of samples:', len(self.input_texts)) print('Number of unique input tokens:', self.num_encoder_tokens) print('Number of unique output tokens:', self.num_decoder_tokens) print('Max sequence length for inputs:', self.max_encoder_seq_length) print('Max sequence length for outputs:', self.max_decoder_seq_length)""" self.__input_token_index = dict([ (char, i) for i, char in enumerate(self.__input_characters) ]) self.__target_token_index = dict([ (char, i) for i, char in enumerate(self.__target_characters) ]) self.__encoder_input_data = np.zeros( ( len(self.__input_texts), self.__max_encoder_seq_length, self.__num_encoder_tokens, ), dtype="float32", ) for i, input_text in enumerate(self.__input_texts): for t, char in enumerate(input_text): self.__encoder_input_data[i, t, self.__input_token_index[char]] = 1. # Restore the model and construct the encoder and decoder. 
self.__filemodel = get_corpus_path("thai2rom") if not self.__filemodel: download("thai2rom") self.__filemodel = get_corpus_path("thai2rom") self.__model = load_model(self.__filemodel) self.__encoder_inputs = self.__model.input[0] # input_1 self.__encoder_outputs, self.__state_h_enc, self.__state_c_enc = self.__model.layers[ 2].output # lstm_1 self.__encoder_states = [self.__state_h_enc, self.__state_c_enc] self.__encoder_model = Model(self.__encoder_inputs, self.__encoder_states) self.__decoder_inputs = self.__model.input[1] # input_2 self.__decoder_state_input_h = Input(shape=(self.__latent_dim, ), name="input_3") self.__decoder_state_input_c = Input(shape=(self.__latent_dim, ), name="input_4") self.__decoder_states_inputs = [ self.__decoder_state_input_h, self.__decoder_state_input_c, ] self.__decoder_lstm = self.__model.layers[3] self.__decoder_outputs, self.__state_h_dec, self.__state_c_dec = self.__decoder_lstm( self.__decoder_inputs, initial_state=self.__decoder_states_inputs) self.__decoder_states = [self.__state_h_dec, self.__state_c_dec] self.__decoder_dense = self.__model.layers[4] self.__decoder_outputs = self.__decoder_dense(self.__decoder_outputs) self.__decoder_model = Model( [self.__decoder_inputs] + self.__decoder_states_inputs, [self.__decoder_outputs] + self.__decoder_states, ) self.__reverse_input_char_index = dict( (i, char) for char, i in self.__input_token_index.items()) self.__reverse_target_char_index = dict( (i, char) for char, i in self.__target_token_index.items())
features["word.next_isspace"] = next_word.isspace() features["word.next_isdigit"] = next_word.isdigit() features["word.next_postag"] = next_pos else: features["EOS"] = True # End of Sequence return features def _extract_features(doc): return [_doc2features(doc, i) for i in range(len(doc))] _CORPUS_NAME = "lst20-cls" tagger = pycrfsuite.Tagger() tagger.open(get_corpus_path(_CORPUS_NAME)) def segment(doc: List[str]) -> List[List[str]]: word_tags = pos_tag(doc, corpus="lst20") features = _extract_features(word_tags) word_markers = list(zip(doc, tagger.tag(features))) clauses = [] temp = [] len_doc = len(doc) - 1 for i, word_marker in enumerate(word_markers): word, marker = word_marker if marker == "E_CLS" or i == len_doc: temp.append(word) clauses.append(temp)
    rm_useless_newlines,
    rm_useless_spaces,
    spec_add_spaces,
    ungroup_emoji,
)
from pythainlp.util import reorder_vowels

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

_MODEL_NAME_LSTM = "wiki_lm_lstm"
_ITOS_NAME_LSTM = "wiki_itos_lstm"

# Pretrained model paths
THWIKI_LSTM = dict(
    wgts_fname=get_corpus_path(_MODEL_NAME_LSTM),
    itos_fname=get_corpus_path(_ITOS_NAME_LSTM),
)

# Preprocessing rules for Thai text (dense features)
pre_rules_th = [
    replace_rep_after,
    fix_html,
    reorder_vowels,
    spec_add_spaces,
    rm_useless_spaces,
    rm_useless_newlines,
    rm_brackets,
    replace_url,
]
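# Usage sketch (added for illustration): pre_rules_th is a plain list of
# str -> str callables applied in order, so a preprocessor is just a fold
# over the list. This helper is an assumption for illustration, not part of
# the module.
def _preprocess(text: str) -> str:
    # Apply each preprocessing rule in sequence.
    for rule in pre_rules_th:
        text = rule(text)
    return text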
def __init__(self): """ Thai named-entity recognizer. """ self.crf = CRFTagger() self.crf.open(get_corpus_path(_CORPUS_NAME))
def _lst20_tagger():
    global _LST20_TAGGER
    if not _LST20_TAGGER:
        _LST20_TAGGER = PerceptronTagger(
            path=get_corpus_path(_LST20_TAGGER_NAME, version="0.2.3")
        )
    return _LST20_TAGGER
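# Usage sketch (added for illustration): the _lst20_tagger() variants in
# this section all lazily build a module-level singleton on first call.
# Through PyThaiNLP's public API, the perceptron variant corresponds to:
#
#     from pythainlp.tag import pos_tag
#     pos_tag(["ผม", "รัก", "คุณ"], engine="perceptron", corpus="lst20")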
from typing import List

from symspellpy import SymSpell, Verbosity

from pythainlp.corpus import get_corpus_path
from pythainlp.corpus import path_pythainlp_corpus
from pythainlp.tokenize import word_tokenize

_UNIGRAM = "tnc_freq.txt"
_BIGRAM = "tnc_bigram_word_freqs"

sym_spell = SymSpell()
sym_spell.load_dictionary(
    path_pythainlp_corpus(_UNIGRAM),
    0,
    1,
    separator="\t",
    encoding="utf-8-sig",
)
sym_spell.load_bigram_dictionary(
    get_corpus_path(_BIGRAM),
    0,
    2,
    separator="\t",
    encoding="utf-8-sig",
)


def spell(text: str, max_edit_distance: int = 2) -> List[str]:
    # Each symspellpy suggestion stringifies as "term, distance, count";
    # keep only the suggested term.
    return [
        str(suggestion).split(",")[0]
        for suggestion in sym_spell.lookup(
            text, Verbosity.CLOSEST, max_edit_distance=max_edit_distance
        )
    ]


def correct(text: str, max_edit_distance: int = 1) -> str:
    # Return the top-ranked suggestion for the input text.
    return spell(text, max_edit_distance=max_edit_distance)[0]
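# Usage sketch (added for illustration): spell() returns candidate
# corrections ranked by symspellpy, and correct() picks the top one.
# The misspelled input and its correction are illustrative.
print(spell("เหตการณ์"))    # e.g. ["เหตุการณ์", ...]
print(correct("เหตการณ์"))  # e.g. "เหตุการณ์"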