def __init__(self, bi_grams_path='bi_grams.txt', tri_grams_path='tri_grams.txt'):
    """
    Load the n-gram vocabularies this tokenizer relies on.

    :param bi_grams_path: path to bi-grams set
    :param tri_grams_path: path to tri-grams set
    """
    self.bi_grams, self.tri_grams = (
        load_n_grams(path) for path in (bi_grams_path, tri_grams_path)
    )
def __init__(self, bi_grams_path='bi_grams.txt', tri_grams_path='tri_grams.txt'):
    """
    Initial config.

    :param bi_grams_path: path to bi-grams set
    :param tri_grams_path: path to tri-grams set
    """
    # Load both n-gram sets through the shared helper.
    for attr, path in (('bi_grams', bi_grams_path), ('tri_grams', tri_grams_path)):
        setattr(self, attr, load_n_grams(path))
def __init__(self,
             bi_grams_path='/home/tuannm/mine/vnexpress-texts-classification/tokenization/bi_grams.txt',
             tri_grams_path='/home/tuannm/mine/vnexpress-texts-classification/tokenization/tri_grams.txt'):
    """
    Initial config.

    NOTE(review): the defaults are machine-specific absolute paths; callers
    on any other machine must pass explicit paths — consider relocating
    these defaults to a config file.

    :param bi_grams_path: path to bi-grams set
    :param tri_grams_path: path to tri-grams set
    """
    grams = [load_n_grams(path) for path in (bi_grams_path, tri_grams_path)]
    self.bi_grams, self.tri_grams = grams
def __init__(self,
             bi_grams_path='./chatbot/tokenization/bi_grams.txt',
             tri_grams_path='./chatbot/tokenization/tri_grams.txt'):
    """
    Initial config.

    Defaults are resolved relative to the process working directory.

    :param bi_grams_path: path to bi-grams set
    :param tri_grams_path: path to tri-grams set
    """
    self.bi_grams = load_n_grams(bi_grams_path)
    self.tri_grams = load_n_grams(tri_grams_path)
def __init__(self, config_root_path="", bi_grams_path="bi_grams.txt",
             tri_grams_path="tri_grams.txt", crf_config_path="crf_config.txt",
             features_path="crf_features.txt",
             model_path="vi-segmentation.crfsuite",
             load_data_f_file=load_data_from_dir,
             base_lib="sklearn_crfsuite"):
    """
    Initial config.

    :param config_root_path: directory holding the config files
        (bi_grams.txt, tri_grams.txt, ...); prepended by plain string
        concatenation, so it should end with a path separator when non-empty
    :param bi_grams_path: path to bi-grams set
    :param tri_grams_path: path to tri-grams set
    :param crf_config_path: path to crf model config file
    :param features_path: path to feature config file
    :param model_path: path to save or load model to/from file
    :param load_data_f_file: method used to load data from file, returning
        sentences and labels
    :param base_lib: CRF backend library, default 'sklearn_crfsuite';
        the other choice is 'pycrfsuite'
    """
    root = config_root_path
    self.bi_grams = load_n_grams(root + bi_grams_path)
    self.tri_grams = load_n_grams(root + tri_grams_path)
    self.crf_config = load_crf_config(root + crf_config_path)
    self.features_cfg_arr = load_crf_config(root + features_path)
    # Index of the current token inside the feature window.
    self.center_id = int((len(self.features_cfg_arr) - 1) / 2)
    # Maps each feature name from the config to the callable computing it;
    # the n-gram features also receive the token's position in the window.
    self.function_dict = {
        'bias': lambda word, *args: 1.0,
        'word.lower()': lambda word, *args: word.lower(),
        'word.isupper()': lambda word, *args: word.isupper(),
        'word.istitle()': lambda word, *args: word.istitle(),
        'word.isdigit()': lambda word, *args: word.isdigit(),
        'word.bi_gram()':
            lambda word, word1, relative_id, *args:
                self._check_bi_gram([word, word1], relative_id),
        'word.tri_gram()':
            lambda word, word1, word2, relative_id, *args:
                self._check_tri_gram([word, word1, word2], relative_id),
    }
    self.model_path = model_path
    self.load_data_from_file = load_data_f_file
    self.tagger = None
    self.base_lib = base_lib
def __init__(self, bi_grams_path='bi_grams.txt', tri_grams_path='tri_grams.txt',
             crf_config_path='crf_config.txt', features_path='crf_features.txt',
             model_path='vi-segmentation.crfsuite',
             load_data_f_file=load_data_from_dir):
    """
    Initial config.

    :param bi_grams_path: path to bi-grams set
    :param tri_grams_path: path to tri-grams set
    :param crf_config_path: path to crf model config file
    :param features_path: path to feature config file
    :param model_path: path to save or load model to/from file
    :param load_data_f_file: method used to load data from file, returning
        sentences and labels
    """
    self.bi_grams = load_n_grams(bi_grams_path)
    self.tri_grams = load_n_grams(tri_grams_path)
    self.crf_config = load_crf_config(crf_config_path)
    self.features_cfg_arr = load_crf_config(features_path)
    # Index of the current token inside the feature window.
    self.center_id = int((len(self.features_cfg_arr) - 1) / 2)
    # Feature-name -> callable dispatch table; n-gram features also take
    # the token's relative position within the window.
    self.function_dict = {
        'bias': lambda word, *args: 1.0,
        'lower': lambda word, *args: word.lower(),
        'isupper': lambda word, *args: word.isupper(),
        'istitle': lambda word, *args: word.istitle(),
        'isdigit': lambda word, *args: word.isdigit(),
        'bi_gram':
            lambda word, word1, relative_id, *args:
                self._check_bi_gram([word, word1], relative_id),
        'tri_gram':
            lambda word, word1, word2, relative_id, *args:
                self._check_tri_gram([word, word1, word2], relative_id),
    }
    self.model_path = model_path
    self.load_data_from_file = load_data_f_file
    self.tagger = None
def __init__(self, root_path="", bi_grams_path='bi_grams.txt',
             tri_grams_path='tri_grams.txt', crf_config_path='crf_config.txt',
             features_path='crf_features.txt', model_path='vi-word-segment',
             load_data_f_file=load_data_from_dir,
             base_lib='sklearn_crfsuite'):
    """
    Initial config.

    :param root_path: directory holding the config files; prepended by plain
        string concatenation, so it should end with a path separator when
        non-empty
    :param bi_grams_path: path to bi-grams set
    :param tri_grams_path: path to tri-grams set
    :param crf_config_path: path to crf model config file
    :param features_path: path to feature config file
    :param model_path: path to save or load model to/from file
    :param load_data_f_file: method used to load data from file, returning
        sentences and labels
    :param base_lib: CRF backend library, default 'sklearn_crfsuite';
        the other choice is 'pycrfsuite'
    """
    self.bi_grams = load_n_grams(root_path + bi_grams_path)
    self.tri_grams = load_n_grams(root_path + tri_grams_path)
    self.crf_config = load_crf_config(root_path + crf_config_path)
    self.features_crf_arg = load_crf_config(root_path + features_path)
    # Index of the current token inside the feature window.
    self.center_id = int((len(self.features_crf_arg) - 1) / 2)
    # Feature-name -> callable dispatch table; n-gram features also take
    # the token's relative position within the window.
    self.function_dict = {
        'bias': lambda word, *args: 1.0,
        'word.lower()': lambda word, *args: word.lower(),
        'word.isupper()': lambda word, *args: word.isupper(),
        'word.istitle()': lambda word, *args: word.istitle(),
        'word.isdigit()': lambda word, *args: word.isdigit(),
        'word.bi_gram()':
            lambda word, word1, relative_id, *args:
                self._check_bi_gram([word, word1], relative_id),
        'word.tri_gram()':
            lambda word, word1, word2, relative_id, *args:
                self._check_tri_gram([word, word1, word2], relative_id),
    }
    self.model_path = model_path
    self.load_data_from_file = load_data_f_file
    self.tagger = None
    self.base_lib = base_lib
import re; from base_tokenizer import BaseTokenizer from utils import load_n_grams from pyvi.pyvi import ViTokenizer, ViPosTagger from sklearn.base import TransformerMixin, BaseEstimator p text = 'tốc độ truyền thông tin ngày càng nhanh' biGrams= load_n_grams('bi-grams.txt'); triGrams = load_n_grams('tri-grams.txt') def tokenize(text): syllables = BaseTokenizer.syllablize(text) lenofSen = len(syllables) currId = 0 result = [] done = False while (currId < lenofSen) and (not done): currWord = syllables[currId] print(currWord) if currId >= (lenofSen - 1): # nêu là từ cuối cùng của câu result.append(currWord) done = True else: nextWord = syllables[currId + 1] twoWord = ' '.join([currWord.lower(), nextWord.lower()])