Example No. 1
def test_sylabelize(self):
    # sylabelize splits raw text into syllables and punctuation marks,
    # without joining multi-syllable words.
    text = "Thủ tướng: Chỉ số thị trường chứng khoán Việt Nam trong top 3 thế giới"
    actual = ViTokenizer.sylabelize(text)
    expected = [
        'Thủ', 'tướng', ':', 'Chỉ', 'số', 'thị', 'trường', 'chứng',
        'khoán', 'Việt', 'Nam', 'trong', 'top', '3', 'thế', 'giới'
    ]
    self.assertEqual(actual, expected)
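For comparison, a quick sketch of sylabelize next to tokenize on the same sentence (the outputs noted in the comments are indicative and depend on pyvi's models):

from pyvi import ViTokenizer

text = "Thủ tướng: Chỉ số thị trường chứng khoán Việt Nam trong top 3 thế giới"
# sylabelize returns a flat list of syllables and punctuation, as asserted above.
print(ViTokenizer.sylabelize(text))
# tokenize returns a single string and joins multi-syllable words with
# underscores, e.g. "Thủ_tướng".
print(ViTokenizer.tokenize(text))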
Example No. 2
def test_sylabelize_2(self):
    # tokenize_sets.txt appears to hold one case per three lines: the raw
    # text, the expected syllable string, and a blank separator (with no
    # separator after the last case, hence the n + 1 iterations).
    texts = open("tokenize_sets.txt", encoding="utf-8").readlines()
    n = int(len(texts) / 3)
    for i in range(n + 1):
        text = texts[i * 3].strip()
        expected = texts[i * 3 + 1].strip()
        actual_text = " ".join(ViTokenizer.sylabelize(text))
        self.assertEqual(actual_text, expected)
Example No. 3
def get_sentences(paragraph):
    # Split the paragraph into sentences on ., ? or ! followed by
    # whitespace, guarding against abbreviations such as "e.g." and
    # "Mr.", then word-segment each sentence.
    tokenized_sentences = [
        ViTokenizer.tokenize(sentence) for sentence in re.split(
            r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s+', paragraph)
    ]
    # Close up spaces around "/" and "-" between digits (e.g. "20 / 11"
    # becomes "20/11"), then wrap each token in a 1-tuple.
    sentences = [[(token, ) for token in re.sub(
        r'(?<=\d\s[/-])\s|(?=\s[/-]\s\d)\s', '', sentence).split()]
                 for sentence in tokenized_sentences]
    return sentences
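A hypothetical call, assuming re and ViTokenizer are imported; the made-up paragraph illustrates the nested list of single-token tuples the function returns:

import re
from pyvi import ViTokenizer

paragraph = "Hà Nội mùa thu rất đẹp. Bạn đã đến thăm chưa?"
for sentence in get_sentences(paragraph):
    # each sentence is a list of 1-tuples,
    # e.g. [('Hà_Nội',), ('mùa_thu',), ...] depending on segmentation
    print(sentence)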
Example No. 4
class FeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.tokenizer = ViTokenizer()
        self.pos_tagger = ViPosTagger()

    def fit(self, *_):
        return self

    def transform(self, X, y=None, **fit_params):
        # X is expected to be a pandas Series of raw documents;
        # each one is replaced by its word-segmented form.
        result = X.apply(lambda text: self.tokenizer.tokenize(text))
        return result
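A sketch of how a transformer like this is typically chained with a vectorizer in a scikit-learn Pipeline; the two-document corpus here is made up for illustration:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

pipeline = Pipeline([
    ('segment', FeatureTransformer()),   # word-segment each raw document
    ('tfidf', TfidfVectorizer()),        # vectorize the segmented strings
])
X = pd.Series(["Chỉ số chứng khoán tăng mạnh", "Thời tiết Hà Nội hôm nay"])
features = pipeline.fit_transform(X)
print(features.shape)  # (2, vocabulary size)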
Example No. 5
class WorldTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        self.tokenizer = ViTokenizer()
        self.pos_tagger = ViPosTagger()

    def fit(self, *_):
        return self

    def transform(self, X):
        result = X.apply(lambda text: self.tokenizer.tokenize(text))
        return result
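Aside from the redundant super().__init__() call and the stricter transform(self, X) signature, WorldTransformer is identical to FeatureTransformer in Example No. 4, so the same Pipeline sketch applies.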
Example No. 6
class FeaturesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words_fn):
        self.tokenizer = ViTokenizer()
        self.SPECIAL_CHARACTER = '0123456789%@$.,=+-!;/()*"&^:#|\n\t\''
        self.STOP_WORDS = load_stop_words(stop_words_fn)

    def fit(self, *_):
        return self

    def remove_stop_words(self, text):
        # Drop stop words, plus any single-character token that appears
        # in SPECIAL_CHARACTER (membership in a string can only ever
        # match one-character tokens).
        return ' '.join([
            token for token in re.split(r'\s+', text)
            if token not in self.STOP_WORDS
            and token not in self.SPECIAL_CHARACTER
        ])

    def transform(self, X, y=None, **fit_params):
        return [self.remove_stop_words(self.tokenizer.tokenize(x)) for x in X]
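A minimal sketch of using this transformer; load_stop_words is not shown in the snippet, so a stand-in that reads one stop word per line is assumed here, and the file name is hypothetical:

def load_stop_words(path):
    # hypothetical stand-in for the helper the snippet relies on:
    # one stop word per line, returned as a set
    with open(path, encoding='utf-8') as f:
        return {line.strip() for line in f}

transformer = FeaturesTransformer('stop_words.txt')  # assumed file name
docs = transformer.transform(["Hà Nội là thủ đô của Việt Nam"])
print(docs)  # segmented documents minus stop words and stray special characters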
Example No. 7
# print(content[0])
# print(ViTokenizer.tokenize(content[0]))

content = content[:1000]
# content = content[:1000]
# content = content[2000:3000]
print(len(content))

label = []
for i in range(0, len(content)):
    # `regex` is a pattern compiled elsewhere that captures the topic
    # label at the start of each document.
    topic = regex.search(content[i])
    label.append(topic[0])
    # Remove the label from the body, strip special characters,
    # lowercase, and word-segment the remainder.
    temp_str = content[i].replace(topic[0], '')
    temp_str = ' '.join(
        [x.strip(SPECIAL_CHARACTER).lower() for x in temp_str.split()])
    temp_str = ViTokenizer.tokenize(temp_str)
    content[i] = temp_str

# print(label)

encoder = LabelEncoder()
label = encoder.fit_transform(label)

# print(label)

# load the data into a DataFrame
df = pd.DataFrame(content, columns=['sentence'])
df['label'] = label
# print(df)
labels = df['label'].values
# print(labels)
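For clarity, LabelEncoder simply maps each distinct topic string to an integer assigned in sorted order; a toy round trip:

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
print(encoder.fit_transform(['sport', 'news', 'sport', 'tech']))  # [1 0 1 2]
print(encoder.classes_)  # ['news' 'sport' 'tech']
print(encoder.inverse_transform([2, 0]))  # ['tech' 'news']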
Example No. 8
def __init__(self, stop_words_fn):
    # constructor of FeaturesTransformer (Example No. 6)
    self.tokenizer = ViTokenizer()
    self.SPECIAL_CHARACTER = '0123456789%@$.,=+-!;/()*"&^:#|\n\t\''
    self.STOP_WORDS = load_stop_words(stop_words_fn)
Example No. 9
def __init__(self):
    # constructor of FeatureTransformer (Example No. 4)
    self.tokenizer = ViTokenizer()
    self.pos_tagger = ViPosTagger()
Example No. 10
def tokenize(terms):
    # Word-segment the query, then turn pyvi's underscores back into
    # spaces and wrap each word in double quotes (e.g. for building an
    # exact-phrase search query).
    terms = ViTokenizer.tokenize(terms)
    terms = [
        f"\"{re.sub(r'_', ' ', term)}\"" for term in re.findall(r'\S+', terms)
    ]
    return ' '.join(terms)
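A hypothetical query, with the likely output noted in the comment (the exact grouping depends on pyvi's segmentation):

print(tokenize('học sinh giỏi'))
# likely: "học sinh" "giỏi"  (if pyvi segments "học_sinh" as one word)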
Example No. 11
def tokenize(s):
    # Collapse integers and simple decimals (e.g. "2,5") to a __NUM__
    # placeholder, then word-segment the lowercased text.
    s = re.sub(r'\d+([.,]\d+)?', '__NUM__', s.lower())
    tokenized = ViTokenizer.tokenize(s)
    return tokenized
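An indicative example of the number normalization (how pyvi segments the __NUM__ placeholder itself may vary):

print(tokenize('Giá tăng 2,5 lần trong năm 2023'))
# roughly: 'giá tăng __NUM__ lần trong năm __NUM__'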