Example #1
import os

import pandas as pd

# SortedWords, Preprocessor and QuickSort are project-local helpers; they are
# assumed to be importable from the surrounding package.


class StatisticTokenize():

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_list = SortedWords()
        self.preprocessor = Preprocessor()

    def create_model(self):
        # Build the initial model from the dictionary file: each line ends
        # with a one-character role tag, and line[:-2] strips the tag and its
        # separator to recover the word.
        with open(self.current_dir + '/../resources/vi_dict.txt', 'r') as fo:
            lines_file = fo.read().split('\n')

        self.tag_model = {'word': [],
                          'role': [],
                          'rate': []}
        for line in lines_file:
            if not line:
                # skip blank lines (e.g. a trailing newline), which would
                # otherwise raise an IndexError on line[-1]
                continue
            if len(line) == 2:
                # a two-character line adds another role to the previous entry
                self.tag_model['role'][-1] += '-' + line[-1]
            else:
                self.tag_model['word'].append(line[:-2])
                self.tag_model['role'].append(line[-1])
                self.tag_model['rate'].append(1)

    def read_model(self):
        # Reload a previously saved model from tag_model.csv into the
        # dict-of-lists form used elsewhere in the class.
        pd_file = pd.read_csv(self.current_dir + '/../resources/tag_model.csv')
        self.tag_model = {'word': [],
                          'role': [],
                          'rate': []}
        for i in range(len(pd_file['word'])):
            self.tag_model['word'].append(pd_file['word'][i])
            self.tag_model['role'].append(pd_file['role'][i])
            self.tag_model['rate'].append(pd_file['rate'][i])

    def sort_model(self):
        # Sort the model by word and hand the sorted word list to SortedWords
        # so that statistic_doc can look words up by position.
        quick_sort = QuickSort()
        self.tag_model = quick_sort.get_dataframe(self.tag_model, ['word', 'role', 'rate'])
        self.sorted_list.set(self.tag_model['word'])

    def read_text_train(self):
        # Load the training corpus (one sentence per line) and pass the lines
        # through the preprocessor.
        with open(self.current_dir + '/../resources/VNESEcorpus.txt', 'r') as fo:
            self.text_train = fo.read()
        self.text_train = self.text_train.split('\n')
        self.text_train = self.preprocessor.remove_mark_docs(self.text_train)

    def statistic(self):
        for sentence in self.text_train:
            self.statistic_doc(sentence)

    def statistic_doc(self, text):
        # count every 1- to 3-word n-gram in the sentence that already exists
        # in the model
        text = text.split(' ')
        for len_word in range(1, 4):
            for i in range(len(text) - len_word + 1):
                word = ' '.join(text[i:i + len_word])
                position = self.sorted_list.find(word)

                if self.sorted_list.words[position] == word:
                    self.tag_model['rate'][position] += 1

    def save_model(self):
        self.tag_model = pd.DataFrame.from_dict(self.tag_model)
        self.tag_model.to_csv(self.current_dir + '/../resources/tag_model.csv', index=False)
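
For context, a minimal driver sketch of how StatisticTokenize appears to be meant to be run end to end; the call order below is an assumption inferred from the methods, not something documented by the source.

# hypothetical training run; the method order is an inferred assumption
tokenizer = StatisticTokenize()
tokenizer.create_model()       # seed the model from vi_dict.txt
tokenizer.sort_model()         # sort entries so n-gram lookups work by position
tokenizer.read_text_train()    # load and clean VNESEcorpus.txt
tokenizer.statistic()          # count 1- to 3-word n-grams over the corpus
tokenizer.save_model()         # write the counts back to tag_model.csv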
Example #2
class StatisticTokenize():
    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_list = SortedWords()
        self.preprocessor = Preprocessor()

    def create_model(self):
        with open(self.current_dir + '/../resources/vi_dict.txt', 'r') as fo:
            lines_file = fo.read().split('\n')

        self.tag_model = {'word': [], 'role': [], 'rate': []}
        for line in lines_file:
            if not line:
                # skip blank lines, which would otherwise raise an IndexError
                continue
            if len(line) == 2:
                self.tag_model['role'][-1] += '-' + line[-1]
            else:
                self.tag_model['word'].append(line[:-2])
                self.tag_model['role'].append(line[-1])
                self.tag_model['rate'].append(1)

    def read_model(self):
        pd_file = pd.read_csv(self.current_dir + '/../resources/tag_model.csv')
        self.tag_model = {'word': [], 'role': [], 'rate': []}
        for i in range(len(pd_file['word'])):
            self.tag_model['word'].append(pd_file['word'][i])
            self.tag_model['role'].append(pd_file['role'][i])
            self.tag_model['rate'].append(pd_file['rate'][i])

    def sort_model(self):
        quick_sort = QuickSort()
        self.tag_model = quick_sort.get_dataframe(self.tag_model,
                                                  ['word', 'role', 'rate'])
        self.sorted_list.set(self.tag_model['word'])

    def read_text_train(self):
        with open(self.current_dir + '/../resources/VNESEcorpus.txt',
                  'r') as fo:
            self.text_train = fo.read()
        self.text_train = self.text_train.split('\n')
        self.text_train = self.preprocessor.remove_mark_docs(self.text_train)

    def statistic(self):
        for sentence in self.text_train:
            self.statistic_doc(sentence)

    def statistic_doc(self, text):
        text = text.split(' ')
        for len_word in range(1, 4):
            for i in range(len(text) - len_word + 1):
                word = ' '.join(text[i:i + len_word])
                position = self.sorted_list.find(word)

                if self.sorted_list.words[position] == word:
                    self.tag_model['rate'][position] += 1

    def save_model(self):
        self.tag_model = pd.DataFrame.from_dict(self.tag_model)
        self.tag_model.to_csv(self.current_dir + '/../resources/tag_model.csv',
                              index=False)
Example #3
import math
import os

import pandas as pd

# SortedWords and Preprocessor are project-local helpers; they are assumed to
# be importable from the surrounding package.


class DocToVector():

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_words = SortedWords()
        self.get_dictionary()
        self.preprocess = Preprocessor()

    def get_dictionary(self):
        pd_file = pd.read_csv(self.current_dir + '/../resources/word_frequency.csv')
        self.sorted_words.set(pd_file['word'])

    def add_to_dict_counter(self, docs):
        # NOTE: this currently only splits the documents via the preprocessor;
        # the result is not stored or used yet
        docs = self.preprocess.split_space(docs)

    def tf_idf(self, frequency_sentence, frequency_docs, num_words):
        # log-damped term frequency times log(frequency_docs / num_words);
        # the 1.0 factor forces float division under Python 2
        return (1 + math.log(frequency_sentence)) * math.log(frequency_docs * 1.0 / num_words)
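
As a quick orientation for the weighting above, a worked call with made-up numbers; constructing DocToVector assumes word_frequency.csv is present, and the values are chosen only to show the arithmetic.

# hypothetical inputs: a term seen 3 times in the sentence, with
# frequency_docs=20 against num_words=1000
doc_to_vector = DocToVector()
weight = doc_to_vector.tf_idf(frequency_sentence=3,
                              frequency_docs=20,
                              num_words=1000)
# (1 + ln(3)) * ln(20 / 1000) = 2.0986 * (-3.9120) ≈ -8.21
# note the second factor is negative whenever frequency_docs < num_words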
Example #4
class DocToVector():
    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_words = SortedWords()
        self.get_dictionary()
        self.preprocess = Preprocessor()

    def get_dictionary(self):
        pd_file = pd.read_csv(self.current_dir +
                              '/../resources/word_frequency.csv')
        self.sorted_words.set(pd_file['word'])

    def add_to_dict_counter(self, docs):
        docs = self.preprocess.split_space(docs)

    def tf_idf(self, frequency_sentence, frequency_docs, num_words):
        return (1 + math.log(frequency_sentence)) * math.log(
            frequency_docs * 1.0 / num_words)
Example #5
import os

import pandas as pd

# SortedWords is a project-local helper; it is assumed to be importable from
# the surrounding package.


class SeparateWord():
    '''
    Split combined words whose parts are joined by one of the linking
    strings: ['s', 'en', 'n'].
    '''
    separate_words = ['s', 'en', 'n']

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_words = SortedWords()
        self.get_dictionary()

    def get_dictionary(self):
        pd_file = pd.read_csv(self.current_dir + '/../resources/en_dict.csv')
        self.sorted_words.set(pd_file['word'])

    def find(self, word):
        # dispatch on the word's shape: hyphenated words go to format_A_B,
        # words containing 'en' are tried by format_AsB first, and everything
        # else (including failed format_AsB attempts) falls back to format_AB
        if '-' in word:
            self.format_A_B(word)
        elif self.separate_words[1] in word and self.format_AsB(word):
            pass
        else:
            self.format_AB(word)

    def format_A_B(self, word):
        '''
        word has the format: A-B

        Args:
            word: input word

        Returns:
            bool: True if the input word can be separated
        '''
        split_word = word.split('-')
        if self.sorted_words.exist(split_word[1]):
            if self.sorted_words.exist(split_word[0]):
                print(split_word[0] + ' ' + split_word[1])
                return True
            elif split_word[0][-1] == 's' and self.sorted_words.exist(
                    split_word[0][:-1]):
                # also accept A-B when A only carries a trailing linking 's'
                print(split_word[0][:-1] + ' ' + split_word[1])
                return True
        return False

    def format_AsB(self, word):
        '''
        word has the format: A[separator]B

        Args:
            word: input word

        Returns:
            bool: True if the input word can be separated
        '''
        sep = self.separate_words[1]
        for i in range(1, len(word) - len(sep)):
            # try every position of the separator and check that both halves
            # are dictionary words
            if (word[i:i + len(sep)] == sep and
                    self.sorted_words.exist(word[:i]) and
                    self.sorted_words.exist(word[i + len(sep):])):
                print(word[:i] + ' ' + word[i + len(sep):])
                return True
        return False

    def format_AB(self, word):
        '''
        word has the format: AB

        Args:
            word: input word

        Returns:
            bool: True if the input word can be separated
        '''
        word_position = self.sorted_words.find(word)
        # walk backwards through dictionary entries that share the word's
        # first letter, looking for a prefix whose remainder is also a word
        while (word[0] == self.sorted_words.words[word_position][0]
               and word_position > 0):
            word_position -= 1
            prefix = self.sorted_words.words[word_position]
            if (word.startswith(prefix)
                    and self.sorted_words.exist(word[len(prefix):])):
                print(prefix + ' ' + word[len(prefix):])
                return True
        return False
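
To make the dispatch in find() concrete, a short usage sketch; the example words are hypothetical placeholders, and whether a split is actually printed depends on what en_dict.csv contains.

# hypothetical calls; each helper prints the split it finds and returns True,
# otherwise it returns False
separator = SeparateWord()
separator.find('work-load')     # hyphenated words are handled by format_A_B
separator.find('workenload')    # words containing 'en' are tried by format_AsB
separator.find('workload')      # everything else falls back to format_AB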
Example #6
class SeparateWord():
    '''
    Split combined words whose parts are joined by one of the linking
    strings: ['s', 'en', 'n'].
    '''
    separate_words = ['s', 'en', 'n']

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_words = SortedWords()
        self.get_dictionary()

    def get_dictionary(self):
        pd_file = pd.read_csv(self.current_dir + '/../resources/en_dict.csv')
        self.sorted_words.set(pd_file['word'])

    def find(self, word):
        if '-' in word:
            self.format_A_B(word)
        elif self.separate_words[1] in word and self.format_AsB(word):
            pass
        else:
            self.format_AB(word)

    def format_A_B(self, word):
        '''
        word has format: A-B

        Args:
            word: input word

        Returns:
            bool: True if the input word can be separated
        '''
        split_word = word.split('-')
        if self.sorted_words.exist(split_word[1]):
            if self.sorted_words.exist(split_word[0]):
                print(split_word[0] + ' ' + split_word[1])
                return True
            elif split_word[0][-1] == 's' and self.sorted_words.exist(split_word[0][:-1]):
                print(split_word[0][:-1] + ' ' + split_word[1])
                return True
        return False

    def format_AsB(self, word):
        '''
        word has format: A[character separate]B

        Args:
            word: input word

        Returns:
            bool: True if the input word can be separated
        '''
        id_separate = 1
        for i in range(1, len(word) - len(self.separate_words[id_separate])):
            if word[i:i + len(self.separate_words[id_separate])] == self.separate_words[id_separate] and self.sorted_words.exist(word[:i]) and self.sorted_words.exist(word[i + len(self.separate_words[id_separate]):]):
                print(word[:i] + ' ' + word[i + len(self.separate_words[id_separate]):])
                return True
        return False

    def format_AB(self, word):
        '''
        word has format: AB

        Args:
            word: input word

        Returns:
            bool: True if the input word can be separated
        '''
        word_position = self.sorted_words.find(word)
        while word[0] == self.sorted_words.words[word_position][0] and word_position > 0:
            word_position -= 1
            if word.startswith(self.sorted_words.words[word_position]) and self.sorted_words.exist(word[len(self.sorted_words.words[word_position]):]):
                print(self.sorted_words.words[word_position] + ' ' + word[len(self.sorted_words.words[word_position]):])
                return True
        return False