class StatisticTokenize():
    """Build, persist and update a word/role/rate tagging model.

    The model is a dict of three parallel lists ('word', 'role', 'rate')
    built from dictionary/corpus files under ../resources/.
    """

    def __init__(self):
        # Resource paths are resolved relative to this module's directory.
        self.current_dir = os.path.dirname(__file__)
        self.sorted_list = SortedWords()
        self.preprocessor = Preprocessor()

    def create_model(self):
        """Initialise the tag model from the raw dictionary file.

        Each dictionary line ends with a one-character role tag; a line of
        exactly two characters is treated as an extra role for the word on
        the previous line (roles are joined with '-').
        """
        # 'with' guarantees the handle is closed (the original leaked it).
        with open(self.current_dir + '/../resources/vi_dict.txt', 'r') as fo:
            lines_file = fo.read().split('\n')
        self.tag_model = {'word': [], 'role': [], 'rate': []}
        for line in lines_file:
            if not line:
                # split('\n') yields a trailing '' for files ending in a
                # newline; the original crashed on it (''[-1] -> IndexError).
                continue
            if len(line) == 2:
                # Continuation line: append an alternative role to the
                # most recently added word.
                self.tag_model['role'][-1] = self.tag_model['role'][-1] + '-' + line[-1]
            else:
                self.tag_model['word'].append(line[:-2])
                self.tag_model['role'].append(line[-1])
                self.tag_model['rate'].append(1)

    def read_model(self):
        """Load a previously saved tag model from CSV into parallel lists."""
        pd_file = pd.read_csv(self.current_dir + '/../resources/tag_model.csv')
        self.tag_model = {
            'word': list(pd_file['word']),
            'role': list(pd_file['role']),
            'rate': list(pd_file['rate']),
        }

    def sort_model(self):
        """Sort the model by word and index the words for binary lookup."""
        quick_sort = QuickSort()
        self.tag_model = quick_sort.get_dataframe(
            self.tag_model, ['word', 'role', 'rate'])
        self.sorted_list.set(self.tag_model['word'])

    def read_text_train(self):
        """Read the training corpus, one sentence per line, and strip marks."""
        # 'with' guarantees the handle is closed (the original leaked it).
        with open(self.current_dir + '/../resources/VNESEcorpus.txt', 'r') as fo:
            self.text_train = fo.read()
        self.text_train = self.text_train.split('\n')
        self.text_train = self.preprocessor.remove_mark_docs(self.text_train)

    def statistic(self):
        """Accumulate n-gram counts over every training sentence."""
        for sentence in self.text_train:
            self.statistic_doc(sentence)

    def statistic_doc(self, text):
        """Count occurrences of 1- to 3-token n-grams of *text* in the model.

        Only exact matches are counted: the entry at the position returned
        by sorted_list.find() must equal the candidate n-gram.
        """
        tokens = text.split(' ')
        for n in range(1, 4):
            for start in range(len(tokens) - n + 1):
                candidate = ' '.join(tokens[start:start + n])
                position = self.sorted_list.find(candidate)
                if self.sorted_list.words[position] == candidate:
                    self.tag_model['rate'][position] += 1

    def save_model(self):
        """Persist the tag model as CSV.

        NOTE(review): this rebinds self.tag_model from a dict to a
        DataFrame as a side effect — later dict-style list mutation
        (e.g. statistic_doc) would behave differently afterwards.
        """
        self.tag_model = pd.DataFrame.from_dict(self.tag_model)
        self.tag_model.to_csv(
            self.current_dir + '/../resources/tag_model.csv', index=False)
class StatisticTokenize():
    """Statistical tokenizer model: builds word/role/rate tables from
    dictionary and corpus resources and saves/loads them as CSV."""

    def __init__(self):
        # All resource files live relative to this module.
        self.current_dir = os.path.dirname(__file__)
        self.sorted_list = SortedWords()
        self.preprocessor = Preprocessor()

    def create_model(self):
        """Build the initial model from the raw dictionary file.

        A normal line is <word><space?><role-char>; a 2-character line
        contributes an additional role to the previous word.
        """
        # Context manager closes the file; the original never closed it.
        with open(self.current_dir + '/../resources/vi_dict.txt', 'r') as fo:
            lines_file = fo.read().split('\n')
        self.tag_model = {'word': [], 'role': [], 'rate': []}
        for line in lines_file:
            # Guard against empty lines (trailing '' from split('\n'));
            # the original raised IndexError on line[-1] for them.
            if not line:
                continue
            if len(line) == 2:
                # Extra role for the previous word, joined with '-'.
                self.tag_model['role'][-1] = self.tag_model['role'][-1] + '-' + line[-1]
            else:
                self.tag_model['word'].append(line[:-2])
                self.tag_model['role'].append(line[-1])
                self.tag_model['rate'].append(1)

    def read_model(self):
        """Restore the saved tag model from CSV into parallel lists."""
        pd_file = pd.read_csv(self.current_dir + '/../resources/tag_model.csv')
        self.tag_model = {'word': [], 'role': [], 'rate': []}
        for word, role, rate in zip(pd_file['word'], pd_file['role'], pd_file['rate']):
            self.tag_model['word'].append(word)
            self.tag_model['role'].append(role)
            self.tag_model['rate'].append(rate)

    def sort_model(self):
        """Sort the model by word and feed the words to the sorted index."""
        quick_sort = QuickSort()
        self.tag_model = quick_sort.get_dataframe(
            self.tag_model, ['word', 'role', 'rate'])
        self.sorted_list.set(self.tag_model['word'])

    def read_text_train(self):
        """Load training sentences (one per line) and remove punctuation marks."""
        # Context manager closes the file; the original never closed it.
        with open(self.current_dir + '/../resources/VNESEcorpus.txt', 'r') as fo:
            self.text_train = fo.read()
        self.text_train = self.text_train.split('\n')
        self.text_train = self.preprocessor.remove_mark_docs(self.text_train)

    def statistic(self):
        """Run n-gram counting over every sentence of the training text."""
        for sentence in self.text_train:
            self.statistic_doc(sentence)

    def statistic_doc(self, text):
        """Increment rates for every 1/2/3-token n-gram of *text* found
        verbatim in the sorted word list."""
        tokens = text.split(' ')
        for length in range(1, 4):
            for i in range(len(tokens) - length + 1):
                phrase = ' '.join(tokens[i:i + length])
                position = self.sorted_list.find(phrase)
                # Count only exact hits at the looked-up position.
                if self.sorted_list.words[position] == phrase:
                    self.tag_model['rate'][position] += 1

    def save_model(self):
        """Write the model to CSV.

        NOTE(review): self.tag_model becomes a DataFrame after this call —
        confirm no later code relies on the dict-of-lists shape.
        """
        self.tag_model = pd.DataFrame.from_dict(self.tag_model)
        self.tag_model.to_csv(
            self.current_dir + '/../resources/tag_model.csv', index=False)
class DocToVector():
    """Turn documents into vector weights using a word-frequency dictionary."""

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_words = SortedWords()
        self.get_dictionary()
        self.preprocess = Preprocessor()

    def get_dictionary(self):
        """Load the word-frequency CSV into the sorted word index."""
        frequency_table = pd.read_csv(
            self.current_dir + '/../resources/word_frequency.csv')
        self.sorted_words.set(frequency_table['word'])

    def add_to_dict_counter(self, docs):
        # NOTE(review): the split result is assigned locally and then
        # discarded — this method looks unfinished; confirm intent.
        docs = self.preprocess.split_space(docs)

    def tf_idf(self, frequency_sentence, frequency_docs, num_words):
        """Return a tf-idf style weight: (1 + ln(tf)) * ln(df / N).

        NOTE(review): the second factor divides frequency_docs by
        num_words, which is the inverse of the classic idf ratio —
        confirm this orientation is intentional. Also assumes
        frequency_sentence > 0 (ln(0) would raise).
        """
        tf_part = 1 + math.log(frequency_sentence)
        idf_part = math.log(frequency_docs * 1.0 / num_words)
        return tf_part * idf_part
class DocToVector():
    """Document-to-vector helper backed by a word-frequency dictionary."""

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_words = SortedWords()
        self.get_dictionary()
        self.preprocess = Preprocessor()

    def get_dictionary(self):
        """Populate the sorted word index from the frequency CSV."""
        csv_path = self.current_dir + '/../resources/word_frequency.csv'
        self.sorted_words.set(pd.read_csv(csv_path)['word'])

    def add_to_dict_counter(self, docs):
        # NOTE(review): nothing is returned or stored — appears to be an
        # unfinished stub; verify against callers.
        docs = self.preprocess.split_space(docs)

    def tf_idf(self, frequency_sentence, frequency_docs, num_words):
        """Compute (1 + ln(frequency_sentence)) * ln(frequency_docs / num_words).

        NOTE(review): the log ratio is inverted relative to textbook idf
        (normally ln(N/df)) — confirm intentional. frequency_sentence
        must be positive or math.log raises ValueError.
        """
        ratio = frequency_docs * 1.0 / num_words
        return (1 + math.log(frequency_sentence)) * math.log(ratio)
class SeparateWord():
    '''
    Separate combined words by character: [s, en, n].

    Prints the separated form of a compound word when both halves exist
    in the English dictionary.
    '''
    separate_words = ['s', 'en', 'n']

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_words = SortedWords()
        self.get_dictionary()

    def get_dictionary(self):
        """Load the English dictionary CSV into the sorted word index."""
        pd_file = pd.read_csv(self.current_dir + '/../resources/en_dict.csv')
        self.sorted_words.set(pd_file['word'])

    def find(self, word):
        """Dispatch *word* to the separation strategy matching its shape.

        NOTE(review): the helpers' boolean results are discarded here, so
        the only observable effect is their printed output — confirm that
        is the intended contract.
        """
        if '-' in word:
            self.format_A_B(word)
        elif self.separate_words[1] in word and self.format_AsB(word):
            pass
        else:
            self.format_AB(word)

    def format_A_B(self, word):
        '''
        word has format: A-B

        Args:
            word: input word

        Returns:
            bool: True if input word can separate
        '''
        split_word = word.split('-')
        if self.sorted_words.exist(split_word[1]):
            if self.sorted_words.exist(split_word[0]):
                # print() call form keeps output identical while being
                # valid on both Python 2 and 3 (original used a Python-2
                # print statement, a syntax error on Python 3).
                print(split_word[0] + ' ' + split_word[1])
                return True
            elif split_word[0][-1] == 's' and self.sorted_words.exist(
                    split_word[0][:-1]):
                # Plural left part: drop trailing 's' and retry.
                print(split_word[0][:-1] + ' ' + split_word[1])
                return True
        return False

    def format_AsB(self, word):
        '''
        word has format: A[character separate]B

        Args:
            word: input word

        Returns:
            bool: True if input word can separate
        '''
        id_separate = 1
        separator = self.separate_words[id_separate]
        for i in range(1, len(word) - len(separator)):
            # Try word = A + separator + B with both A and B in the dict.
            if (word[i:i + len(separator)] == separator
                    and self.sorted_words.exist(word[:i])
                    and self.sorted_words.exist(word[i + len(separator):])):
                print(word[:i] + ' ' + word[i + len(separator):])
                return True
        return False

    def format_AB(self, word):
        '''
        word has format: AB

        Args:
            word: input word

        Returns:
            bool: True if input word can separate
        '''
        word_position = self.sorted_words.find(word)
        # Walk backwards while entries share the word's first letter.
        # NOTE(review): this lands on the first entry whose initial letter
        # differs (or index 0) and tests only that single candidate as a
        # prefix — verify this matches SortedWords.find's semantics.
        while word[0] == self.sorted_words.words[word_position][0] \
                and word_position > 0:
            word_position -= 1
        candidate = self.sorted_words.words[word_position]
        if word.startswith(candidate) and self.sorted_words.exist(
                word[len(candidate):]):
            print(candidate + ' ' + word[len(candidate):])
            return True
        return False
class SeparateWord():
    '''
    Separate combined words by character: [s, en, n].

    Each format_* method prints the two-part split when both parts are
    found in the English dictionary and returns whether a split was made.
    '''
    separate_words = ['s', 'en', 'n']

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_words = SortedWords()
        self.get_dictionary()

    def get_dictionary(self):
        """Read the English dictionary and index its words."""
        pd_file = pd.read_csv(self.current_dir + '/../resources/en_dict.csv')
        self.sorted_words.set(pd_file['word'])

    def find(self, word):
        """Route *word* to the strategy matching its surface form.

        NOTE(review): the helper return values are ignored; only the
        printed output is observable — confirm intended.
        """
        if '-' in word:
            self.format_A_B(word)
        elif self.separate_words[1] in word and self.format_AsB(word):
            pass
        else:
            self.format_AB(word)

    def format_A_B(self, word):
        '''
        word has format: A-B

        Args:
            word: input word

        Returns:
            bool: True if input word can separate
        '''
        split_word = word.split('-')
        if self.sorted_words.exist(split_word[1]):
            # Converted Python-2 print statements to print() calls:
            # identical output, valid on Python 3 as well.
            if self.sorted_words.exist(split_word[0]):
                print(split_word[0] + ' ' + split_word[1])
                return True
            elif split_word[0][-1] == 's' and self.sorted_words.exist(
                    split_word[0][:-1]):
                print(split_word[0][:-1] + ' ' + split_word[1])
                return True
        return False

    def format_AsB(self, word):
        '''
        word has format: A[character separate]B

        Args:
            word: input word

        Returns:
            bool: True if input word can separate
        '''
        id_separate = 1
        sep = self.separate_words[id_separate]
        sep_len = len(sep)
        for i in range(1, len(word) - sep_len):
            # Accept word = A + sep + B when A and B are both dictionary words.
            if (word[i:i + sep_len] == sep
                    and self.sorted_words.exist(word[:i])
                    and self.sorted_words.exist(word[i + sep_len:])):
                print(word[:i] + ' ' + word[i + sep_len:])
                return True
        return False

    def format_AB(self, word):
        '''
        word has format: AB

        Args:
            word: input word

        Returns:
            bool: True if input word can separate
        '''
        word_position = self.sorted_words.find(word)
        # Step back while the indexed entry starts with the same letter.
        # NOTE(review): only the single entry where the scan stops is tried
        # as a prefix — confirm against SortedWords.find's contract.
        while word[0] == self.sorted_words.words[word_position][0] \
                and word_position > 0:
            word_position -= 1
        head = self.sorted_words.words[word_position]
        if word.startswith(head) and self.sorted_words.exist(word[len(head):]):
            print(head + ' ' + word[len(head):])
            return True
        return False