import os

import pandas as pd

# SortedWords, Preprocessor and QuickSort are project-local helpers; their
# import paths are assumed to match the surrounding package layout.


class StatisticTokenize:
    """Build a word/role/rate tag model from a Vietnamese dictionary and
    count n-gram frequencies over a training corpus."""

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_list = SortedWords()
        self.preprocessor = Preprocessor()

    def create_model(self):
        # Each dictionary line is '<word> <role letter>'; a bare 2-character
        # line appends an extra role to the previous word instead.
        with open(self.current_dir + '/../resources/vi_dict.txt', 'r') as fo:
            lines_file = fo.read().split('\n')
        self.tag_model = {'word': [], 'role': [], 'rate': []}
        for line in lines_file:
            if len(line) == 2:
                self.tag_model['role'][-1] += '-' + line[-1]
            else:
                self.tag_model['word'].append(line[:-2])
                self.tag_model['role'].append(line[-1])
                self.tag_model['rate'].append(1)

    def read_model(self):
        # Reload a previously saved model from CSV.
        pd_file = pd.read_csv(self.current_dir + '/../resources/tag_model.csv')
        self.tag_model = {
            'word': list(pd_file['word']),
            'role': list(pd_file['role']),
            'rate': list(pd_file['rate']),
        }

    def sort_model(self):
        # Sort the model by word so SortedWords can binary-search it.
        quick_sort = QuickSort()
        self.tag_model = quick_sort.get_dataframe(
            self.tag_model, ['word', 'role', 'rate'])
        self.sorted_list.set(self.tag_model['word'])

    def read_text_train(self):
        with open(self.current_dir + '/../resources/VNESEcorpus.txt', 'r') as fo:
            self.text_train = fo.read().split('\n')
        self.text_train = self.preprocessor.remove_mark_docs(self.text_train)

    def statistic(self):
        for sentence in self.text_train:
            self.statistic_doc(sentence)

    def statistic_doc(self, text):
        # Count every 1-, 2- and 3-gram of the sentence that exists in the
        # sorted word list.
        text = text.split(' ')
        for len_word in range(1, 4):
            for i in range(len(text) - len_word + 1):
                word = ' '.join(text[i:i + len_word])
                position = self.sorted_list.find(word)
                if self.sorted_list.words[position] == word:
                    self.tag_model['rate'][position] += 1

    def save_model(self):
        self.tag_model = pd.DataFrame.from_dict(self.tag_model)
        self.tag_model.to_csv(
            self.current_dir + '/../resources/tag_model.csv', index=False)
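# Hypothetical driver, not part of the original source: the call order below
# is inferred from the methods themselves -- build (or reload) the tag model,
# sort it so binary search works, then count n-grams over the corpus and
# persist the result.
def run_statistic_tokenize():
    tokenizer = StatisticTokenize()
    tokenizer.create_model()  # or tokenizer.read_model() to reuse a saved model
    tokenizer.sort_model()
    tokenizer.read_text_train()
    tokenizer.statistic()
    tokenizer.save_model()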
class NaiveBayesClassify:
    """Classify texts into two classes (c1 = positive, c2 = negative) with a
    multinomial Naive Bayes model and add-one (Laplace) smoothing."""

    def __init__(self):
        self.current_dir = os.path.dirname(__file__)
        self.sorted_words = SortedWords()
        self.preprocessor = Preprocessor()

    def set_doc_c1(self):
        with open(self.current_dir + '/../resources/pos_text.txt', 'r') as fo:
            self.text_c1 = fo.read().split('\n')
        self.text_c1 = self.preprocessor.remove_mark_docs(self.text_c1)

    def set_doc_c2(self):
        with open(self.current_dir + '/../resources/neg_text.txt', 'r') as fo:
            self.text_c2 = fo.read().split('\n')
        self.text_c2 = self.preprocessor.remove_mark_docs(self.text_c2)

    def set_doc_classify(self):
        with open(self.current_dir + '/../resources/classify_text.txt', 'r') as fo:
            self.text_classify = fo.read().split('\n')
        self.text_classify = self.preprocessor.remove_mark_docs(
            self.text_classify)

    def probability_cal(self):
        # Class priors P(c1) and P(c2) from the document counts.
        self.p_c1 = len(self.text_c1) * 1.0 / (
            len(self.text_c1) + len(self.text_c2))
        self.p_c2 = 1 - self.p_c1
        # Build the vocabulary from both classes.
        for sentence in self.text_c1 + self.text_c2:
            for w in sentence.split(' '):
                self.sorted_words.add(word=w)
        # Per-class term frequencies over the shared vocabulary.
        self.tf_c1 = [0] * len(self.sorted_words.words)
        for sentence in self.text_c1:
            for w in sentence.split(' '):
                self.tf_c1[self.sorted_words.find(w)] += 1
        self.tf_c2 = [0] * len(self.sorted_words.words)
        for sentence in self.text_c2:
            for w in sentence.split(' '):
                self.tf_c2[self.sorted_words.find(w)] += 1

    def train(self):
        # Total token count per class; denominator of the smoothed estimate.
        sum_tf_c1 = sum(self.tf_c1)
        sum_tf_c2 = sum(self.tf_c2)
        vocab_size = len(self.sorted_words.words)
        self.probability_c1 = []
        self.probability_c2 = []
        for i in range(vocab_size):
            # Laplace smoothing: P(w|c) = (tf(w, c) + 1) / (|V| + sum tf(c)).
            self.probability_c1.append(
                (self.tf_c1[i] + 1.0) / (vocab_size + sum_tf_c1))
            self.probability_c2.append(
                (self.tf_c2[i] + 1.0) / (vocab_size + sum_tf_c2))

    def get_rate_probability_c1(self, word):
        # Likelihood ratio P(word|c1) / P(word|c2); unseen words are neutral.
        position = self.sorted_words.find(word)
        if word == self.sorted_words.words[position]:
            return self.probability_c1[position] / self.probability_c2[position]
        return 1

    def test(self):
        for sentence in self.text_classify:
            self.test_doc(sentence)

    def test_doc(self, text):
        # Decide c1 iff P(x|c1) * P(c1) >= P(x|c2) * P(c2), written as a
        # running product of per-word likelihood ratios.
        rate_px_c1 = 1
        for w in text.split(' '):
            rate_px_c1 *= self.get_rate_probability_c1(w)
        if rate_px_c1 * self.p_c1 >= self.p_c2:
            print(text + ': belong c1')
        else:
            print(text + ': belong c2')
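# Hypothetical driver, not part of the original source: the call order below
# is inferred from the methods -- load both training classes and the texts to
# classify, estimate the term frequencies and smoothed probabilities, then
# classify each input line.
def run_naive_bayes_classify():
    classifier = NaiveBayesClassify()
    classifier.set_doc_c1()
    classifier.set_doc_c2()
    classifier.set_doc_classify()
    classifier.probability_cal()
    classifier.train()
    classifier.test()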