def test_sylabelize(self):
    text = "Thủ tướng: Chỉ số thị trường chứng khoán Việt Nam trong top 3 thế giới"
    actual = ViTokenizer.sylabelize(text)
    expected = [
        'Thủ', 'tướng', ':', 'Chỉ', 'số', 'thị', 'trường', 'chứng', 'khoán',
        'Việt', 'Nam', 'trong', 'top', '3', 'thế', 'giới'
    ]
    self.assertEqual(actual, expected)
def test_sylabelize_2(self):
    # tokenize_sets.txt appears to hold one case per 3 lines:
    # input text, expected segmentation, separator.
    with open("tokenize_sets.txt") as f:
        texts = f.readlines()
    n = int(len(texts) / 3)
    for i in range(n + 1):
        text = texts[i * 3].strip()
        expected = texts[i * 3 + 1].strip()
        actual_text = " ".join(ViTokenizer.sylabelize(text))
        self.assertEqual(actual_text, expected)
def get_sentences(paragraph):
    # Split the paragraph into sentences (avoiding abbreviation-like periods),
    # then word-segment each sentence.
    tokenized_sentences = [
        ViTokenizer.tokenize(sentence)
        for sentence in re.split(
            r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s+', paragraph)
    ]
    # Glue digit/"-"/"/" sequences (e.g. dates) back together, then wrap each
    # token in a 1-tuple.
    sentences = [[(token, ) for token in re.sub(
        r'(?<=\d\s[/-])\s|(?=\s[/-]\s\d)\s', '', sentence).split()]
                 for sentence in tokenized_sentences]
    return sentences
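A minimal usage sketch for get_sentences above; the sample paragraph and the `__main__` guard are invented for illustration, and the imports shown are the ones the function itself relies on.

import re
from pyvi import ViTokenizer

if __name__ == "__main__":
    paragraph = "Hà Nội là thủ đô của Việt Nam. Thành phố nằm bên bờ sông Hồng."
    for sentence in get_sentences(paragraph):
        # Each sentence is a list of 1-tuples, e.g. [('Hà_Nội',), ('là',), ...]
        print(sentence)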
class FeatureTransformer(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.tokenizer = ViTokenizer()
        self.pos_tagger = ViPosTagger()

    def fit(self, *_):
        return self

    def transform(self, X, y=None, **fit_params):
        result = X.apply(lambda text: self.tokenizer.tokenize(text))
        return result
class WorldTransformer(BaseEstimator, TransformerMixin):

    def __init__(self):
        super().__init__()
        self.tokenizer = ViTokenizer()
        self.pos_tagger = ViPosTagger()

    def fit(self, *_):
        return self

    def transform(self, X):
        result = X.apply(lambda text: self.tokenizer.tokenize(text))
        return result
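Both transformer classes keep a ViPosTagger that transform() never calls; for context, this is roughly how pyvi's part-of-speech tagger is used on word-segmented text (the example sentence is just for illustration):

from pyvi import ViTokenizer, ViPosTagger

# postagging() takes word-segmented text and returns (tokens, POS tags).
tokens, tags = ViPosTagger.postagging(
    ViTokenizer.tokenize("Trường đại học Bách Khoa Hà Nội"))
print(list(zip(tokens, tags)))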
class FeaturesTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, stop_words_fn):
        self.tokenizer = ViTokenizer()
        self.SPECIAL_CHARACTER = '0123456789%@$.,=+-!;/()*"&^:#|\n\t\''
        self.STOP_WORDS = load_stop_words(stop_words_fn)

    def fit(self, *_):
        return self

    def remove_stop_words(self, text):
        # Drop stop words and digit/punctuation tokens after word segmentation.
        return ' '.join([
            token for token in re.split('\\s+', text)
            if token not in self.STOP_WORDS
            and token not in self.SPECIAL_CHARACTER
        ])

    def transform(self, X, y=None, **fit_params):
        return [self.remove_stop_words(self.tokenizer.tokenize(x)) for x in X]
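A sketch of how FeaturesTransformer could slot into a scikit-learn Pipeline; the TF-IDF step, the LogisticRegression classifier, the stop-word file name, and the train/test variable names are assumptions for illustration, not taken from the source.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Hypothetical wiring: segment + strip stop words, vectorize, then classify.
pipeline = Pipeline([
    ('features', FeaturesTransformer(stop_words_fn='vietnamese-stopwords.txt')),
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000)),
])
# pipeline.fit(train_texts, train_labels)
# predictions = pipeline.predict(test_texts)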
# print(content[0])
# print(ViTokenizer.tokenize(content[0]))
content = content[:1000]
# content = content[:1000]
# content = content[2000:3000]
print(len(content))

label = []
for i in range(0, len(content)):
    # Pull the topic tag out of each document, keep it as the label,
    # then strip it from the text before normalizing and word-segmenting.
    topic = regex.search(content[i])
    label.append(topic[0])
    temp_str = content[i].replace(topic[0], '')
    temp_str = ' '.join(
        [x.strip(SPECIAL_CHARACTER).lower() for x in temp_str.split()])
    temp_str = ViTokenizer.tokenize(temp_str)
    content[i] = temp_str
# print(label)

encoder = LabelEncoder()
label = encoder.fit_transform(label)
# print(label)

# load the data into a data frame
df = pd.DataFrame(content, columns=['sentence'])
df['label'] = label
# print(df)
labels = df['label'].values
# print(labels)
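For reference, LabelEncoder simply maps each distinct topic string to an integer index; a small standalone illustration with made-up labels:

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
# Made-up topic strings purely to show the mapping.
print(encoder.fit_transform(['the-thao', 'kinh-te', 'the-thao']))  # [1 0 1]
print(list(encoder.classes_))  # ['kinh-te', 'the-thao']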
def tokenize(terms):
    # Word-segment the query, then wrap each token in double quotes,
    # turning pyvi's underscores back into spaces for phrase search.
    terms = ViTokenizer.tokenize(terms)
    terms = [
        f"\"{re.sub(r'_', ' ', term)}\"" for term in re.findall(r'\S+', terms)
    ]
    return ' '.join(terms)
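A quick, assumed usage of the quoting helper above (the query string is invented; the exact segmentation depends on pyvi's model):

import re
from pyvi import ViTokenizer

print(tokenize("chứng khoán Việt Nam"))
# If pyvi segments the compounds, this prints something like:
# "chứng khoán" "Việt Nam"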
def tokenize(s):
    # Lowercase and replace numbers (e.g. "3,5", "2023") with a placeholder
    # before word segmentation.
    s = re.sub(r'\d+([.,]\d+)?', '__NUM__', s.lower())
    tokenized = ViTokenizer.tokenize(s)
    return tokenized
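And a short, assumed check of the number normalization (the input sentence is invented for the example):

import re
from pyvi import ViTokenizer

print(tokenize("Giá tăng 3,5% trong năm 2023"))
# "3,5" and "2023" become __NUM__ before segmentation, so the output
# contains __NUM__ placeholders instead of the digits.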