Example #1
    def __init__(self,
                 bi_grams_path='bi_grams.txt',
                 tri_grams_path='tri_grams.txt'):
        # lib_grams_path = 'VNTQcorpus-big.txt'):

        self.bi_grams = load_n_grams(bi_grams_path)
        self.tri_grams = load_n_grams(tri_grams_path)
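
Every example on this page relies on a load_n_grams helper that is not shown here. As a rough point of reference, here is a minimal sketch of what such a loader might look like, assuming the n-gram file stores one space-separated n-gram per line; the file format and the use of a set are assumptions for illustration, not details taken from the examples.

def load_n_grams(path):
    # Hypothetical sketch: read one n-gram per line into a set
    # so the segmenter can do fast membership checks.
    n_grams = set()
    with open(path, encoding='utf-8') as f:
        for line in f:
            n_gram = line.strip().lower()
            if n_gram:
                n_grams.add(n_gram)
    return n_grams
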
Example #2
 def __init__(self, bi_grams_path='bi_grams.txt', tri_grams_path='tri_grams.txt'):
     """
     Initial config
     :param bi_grams_path: path to bi-grams set
     :param tri_grams_path: path to tri-grams set
     """
     self.bi_grams = load_n_grams(bi_grams_path)
     self.tri_grams = load_n_grams(tri_grams_path)
Example #3
 def __init__(self,
              bi_grams_path='/home/tuannm/mine/vnexpress-texts-classification/tokenization/bi_grams.txt',
              tri_grams_path='/home/tuannm/mine/vnexpress-texts-classification/tokenization/tri_grams.txt'):
     """
     Initial config
     :param bi_grams_path: path to bi-grams set
     :param tri_grams_path: path to tri-grams set
     """
     self.bi_grams = load_n_grams(bi_grams_path)
     self.tri_grams = load_n_grams(tri_grams_path)
Example #4
 def __init__(self,
              bi_grams_path='./chatbot/tokenization/bi_grams.txt',
              tri_grams_path='./chatbot/tokenization/tri_grams.txt'):
     """
     Initial config
     :param bi_grams_path: path to bi-grams set
     :param tri_grams_path: path to tri-grams set
     """
     self.bi_grams = load_n_grams(bi_grams_path)
     self.tri_grams = load_n_grams(tri_grams_path)
Example #5
    def __init__(self,
                 config_root_path="",
                 bi_grams_path="bi_grams.txt",
                 tri_grams_path="tri_grams.txt",
                 crf_config_path="crf_config.txt",
                 features_path="crf_features.txt",
                 model_path="vi-segmentation.crfsuite",
                 load_data_f_file=load_data_from_dir,
                 base_lib="sklearn_crfsuite"):
        """
        Initial config
        :param config_root_path: path to directory where you put config files such as bi_grams.txt, tri_grams.txt, ...
        :param bi_grams_path: path to bi-grams set
        :param tri_grams_path: path to tri-grams set
        :param crf_config_path: path to crf model config file
        :param features_path: path to feature config file
        :param model_path: path to save or load model to/from file
        :param load_data_f_file: method used to load data from file and return sentences and labels
        :param base_lib: library to use for the CRF algorithm; default is sklearn_crfsuite, the other choice is pycrfsuite
        """

        self.bi_grams = load_n_grams(config_root_path + bi_grams_path)
        self.tri_grams = load_n_grams(config_root_path + tri_grams_path)
        self.crf_config = load_crf_config(config_root_path + crf_config_path)
        self.features_cfg_arr = load_crf_config(config_root_path +
                                                features_path)
        self.center_id = int((len(self.features_cfg_arr) - 1) / 2)
        self.function_dict = {
            'bias': lambda word, *args: 1.0,
            'word.lower()': lambda word, *args: word.lower(),
            'word.isupper()': lambda word, *args: word.isupper(),
            'word.istitle()': lambda word, *args: word.istitle(),
            'word.isdigit()': lambda word, *args: word.isdigit(),
            'word.bi_gram()': lambda word, word1, relative_id, *args:
                self._check_bi_gram([word, word1], relative_id),
            'word.tri_gram()': lambda word, word1, word2, relative_id, *args:
                self._check_tri_gram([word, word1, word2], relative_id)
        }
        self.model_path = model_path
        self.load_data_from_file = load_data_f_file
        self.tagger = None
        self.base_lib = base_lib
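
In this example, features_cfg_arr appears to hold one row of feature settings per position in a context window, so center_id = int((len(self.features_cfg_arr) - 1) / 2) picks the middle row; with 5 rows, for instance, it evaluates to 2. The bi-gram and tri-gram feature lambdas delegate to _check_bi_gram and _check_tri_gram, which are not shown on this page. A minimal sketch of what they might do, assuming they simply test whether the lowercased, space-joined syllables appear in the sets returned by load_n_grams (how relative_id is actually used is also an assumption):

    def _check_bi_gram(self, words, relative_id):
        # Hypothetical: membership test of the syllable pair in the bi-gram set
        return ' '.join(words).lower() in self.bi_grams

    def _check_tri_gram(self, words, relative_id):
        # Hypothetical: membership test of the syllable triple in the tri-gram set
        return ' '.join(words).lower() in self.tri_grams
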
Example #6
 def __init__(self,
              bi_grams_path='bi_grams.txt',
              tri_grams_path='tri_grams.txt',
              crf_config_path='crf_config.txt',
              features_path='crf_features.txt',
              model_path='vi-segmentation.crfsuite',
              load_data_f_file=load_data_from_dir):
     """
     Initial config
     :param bi_grams_path: path to bi-grams set
     :param tri_grams_path: path to tri-grams set
     :param crf_config_path: path to crf model config file
     :param features_path: path to feature config file
     :param model_path: path to save or load model to/from file
      :param load_data_f_file: method used to load data from file and return sentences and labels
     """
     self.bi_grams = load_n_grams(bi_grams_path)
     self.tri_grams = load_n_grams(tri_grams_path)
     self.crf_config = load_crf_config(crf_config_path)
     self.features_cfg_arr = load_crf_config(features_path)
     self.center_id = int((len(self.features_cfg_arr) - 1) / 2)
      self.function_dict = {
          'bias': lambda word, *args: 1.0,
          'lower': lambda word, *args: word.lower(),
          'isupper': lambda word, *args: word.isupper(),
          'istitle': lambda word, *args: word.istitle(),
          'isdigit': lambda word, *args: word.isdigit(),
          'bi_gram': lambda word, word1, relative_id, *args:
              self._check_bi_gram([word, word1], relative_id),
          'tri_gram': lambda word, word1, word2, relative_id, *args:
              self._check_tri_gram([word, word1, word2], relative_id)
      }
     self.model_path = model_path
     self.load_data_from_file = load_data_f_file
     self.tagger = None
Example #7
 def __init__(self,
              root_path="",
              bi_grams_path='bi_grams.txt',
              tri_grams_path='tri_grams.txt',
              crf_config_path='crf_config.txt',
              features_path='crf_features.txt',
              model_path='vi-word-segment',
              load_data_f_file=load_data_from_dir,
              base_lib='sklearn_crfsuite'):
     self.bi_grams = load_n_grams(root_path + bi_grams_path)
     self.tri_grams = load_n_grams(root_path + tri_grams_path)
     self.crf_config = load_crf_config(root_path + crf_config_path)
     self.features_crf_arg = load_crf_config(root_path + features_path)
     self.center_id = int((len(self.features_crf_arg) - 1) / 2)
      self.function_dict = {
          'bias': lambda word, *args: 1.0,
          'word.lower()': lambda word, *args: word.lower(),
          'word.isupper()': lambda word, *args: word.isupper(),
          'word.istitle()': lambda word, *args: word.istitle(),
          'word.isdigit()': lambda word, *args: word.isdigit(),
          'word.bi_gram()': lambda word, word1, relative_id, *args:
              self._check_bi_gram([word, word1], relative_id),
          'word.tri_gram()': lambda word, word1, word2, relative_id, *args:
              self._check_tri_gram([word, word1, word2], relative_id)
      }
     self.model_path = model_path
     self.load_data_from_file = load_data_f_file
     self.tagger = None
     self.base_lib = base_lib
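
Examples 5 through 7 are variants of the same CRF-based word segmenter constructor. Note that config_root_path (root_path in this last variant) is concatenated directly with each file name rather than combined with os.path.join, so the root string must end in a path separator. A minimal usage sketch under that assumption; the class name CRFSegmenter is a placeholder for whichever class defines these __init__ methods:

# Hypothetical usage; CRFSegmenter is a placeholder class name.
segmenter = CRFSegmenter(
    config_root_path='./tokenization/',  # trailing '/' is required because paths are concatenated
    model_path='vi-segmentation.crfsuite',
    base_lib='sklearn_crfsuite')         # or 'pycrfsuite'
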
Example #8
import re
from base_tokenizer import BaseTokenizer
from utils import load_n_grams
from pyvi.pyvi import ViTokenizer, ViPosTagger
from sklearn.base import TransformerMixin, BaseEstimator

text = 'tốc độ truyền thông tin ngày càng nhanh'


biGrams = load_n_grams('bi-grams.txt')
triGrams = load_n_grams('tri-grams.txt')


def tokenize(text):

    syllables = BaseTokenizer.syllablize(text)
    lenofSen = len(syllables)
    currId = 0
    result = []
    done = False
    while (currId < lenofSen) and (not done):

        currWord = syllables[currId]
        print(currWord)
        if currId >= (lenofSen - 1):  # if this is the last word of the sentence
            result.append(currWord)
            done = True
        else:
            nextWord = syllables[currId + 1]
            twoWord = ' '.join([currWord.lower(), nextWord.lower()])