def wd_seg(self):
    def _file_seg_(filename):
        _f = codecs.open(filename, 'r', 'utf-8')
        txt = _f.read()
        seg_list = [_wd for _wd in jieba.cut(txt)]
        # word count before removing stopwords
        _stopwd = len(seg_list)
        for sw in self.stop_wd_list:
            while sw.strip() in seg_list:
                seg_list.remove(sw.strip())
        # word count after removing stopwords
        stopwd_ = len(seg_list)
        _f.close()
        return seg_list, _stopwd, stopwd_

    # iterate over the files collected by get_sets_of_root_path_tree
    for iter_filename in self.set_dict['files']:
        # mirror the corpus directory tree under a parallel "<root>-seg" tree
        log_dir = '\\'.join(iter_filename.split('.')[0].split('\\')[:-3]) + '-seg\\' + \
                  '\\'.join(iter_filename.split('.')[0].split('\\')[-3:-1])
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        log_name = iter_filename.split('.')[0].split('\\')[-1] + '.log'
        wd_list, _wds, wds_ = _file_seg_(iter_filename)
        Log.log_blue_running("Processed " + self.set_dict['category'] + ' - ' + iter_filename + ',' +
                             ' word count reduced from ' + str(_wds) + ' to ' + str(wds_))
        # write the segmented words, space separated, skipping blanks and newlines
        f = codecs.open(log_dir + '\\' + log_name, 'w', 'utf-8')
        for wd in wd_list:
            if wd != u' ' and wd != u'\n':
                f.write(wd)
                f.write(' ')
        f.close()
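def _demo_segment(text, stop_words):
    """Illustrative sketch only, not part of the original class: segment a
    unicode string with jieba and drop stopwords, mirroring what wd_seg does
    per file. The function name and arguments here are hypothetical."""
    import jieba
    seg_list = [wd for wd in jieba.cut(text)]
    before = len(seg_list)                      # token count before stopword removal
    seg_list = [wd for wd in seg_list
                if wd.strip() and wd not in stop_words]
    after = len(seg_list)                       # token count after stopword removal
    return seg_list, before, after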
def text_classification_polynomial(self, test_category):
    logfile = open('tc_log.log', 'a')
    prior_pos = self._get_prior_possibility_()

    def p_possibility(p_pos_list):
        # sum of log-likelihoods over every word occurrence
        log_p_pos = map(math.log, p_pos_list)
        log_p = reduce(lambda x, y: x + y, log_p_pos)
        return log_p

    def classification(p_possibility_list, category):
        # each entry is {log posterior: category}; pick the category whose
        # log posterior is largest and check it against the true label
        best = max(p_possibility_list, key=lambda d: list(d.keys())[0])
        p = list(best.values())[0]
        return p, p == category

    test_num = len(self._test_set_dict_[test_category])
    if_right = 0
    for _iter_file in self._test_set_dict_[test_category]:
        wd_list = self.get_wd_list(_iter_file)
        category_pos = []
        for _iter_category in range(len(self._wd_list_category_)):
            # total number of word occurrences seen for this category
            wd_account = reduce(
                lambda x, y: x + y,
                self._wd_list_category_[_iter_category]['words'].values())
            _p_category = []
            # Laplace smoothing denominator: N_c + |V|
            denominator = wd_account + len(self._wd_list_total_)
            for wd in wd_list.keys():
                if wd in self._wd_list_category_[_iter_category]['words']:
                    nij = self._wd_list_category_[_iter_category]['words'][wd]
                else:
                    nij = 0
                # add one likelihood term per occurrence of the word
                for _iter_account_ in range(wd_list[wd]):
                    _p = (nij + 1) * 1.0 / denominator
                    _p_category.append(_p)
            category_pos.append({
                p_possibility(_p_category) +
                math.log(prior_pos[_iter_category]['prior_possibility']):
                    self._wd_list_category_[_iter_category]['category']
            })
        if classification(category_pos, test_category)[1]:
            if_right += 1
    Log.log_blue_running('Category: ' + test_category + ' total ' + str(test_num) +
                         ', correct: ' + str(if_right) + ' Accuracy: ' +
                         str((if_right * 1.0 / test_num) * 100) + '%')
    logfile.write('Category: ' + test_category + ' total ' + str(test_num) +
                  ', correct: ' + str(if_right) + ' Accuracy: ' +
                  str((if_right * 1.0 / test_num) * 100) + '%' + '\n')
    logfile.close()
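def _demo_polynomial_score(doc_word_counts, category_word_counts, vocab_size, prior):
    """Illustrative sketch only: the multinomial Naive Bayes score that
    text_classification_polynomial accumulates for one category, i.e.
        log P(c) + sum_w count(w, d) * log((n_wc + 1) / (N_c + |V|))
    with Laplace (add-one) smoothing. All names here are hypothetical."""
    import math
    n_c = sum(category_word_counts.values())    # total word occurrences in this category
    denominator = n_c + vocab_size               # smoothing denominator N_c + |V|
    score = math.log(prior)
    for wd, count in doc_word_counts.items():
        n_wc = category_word_counts.get(wd, 0)   # occurrences of wd in the category
        score += count * math.log((n_wc + 1.0) / denominator)
    return score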
def _bayes_train_(self):
    self._wd_list_category_ = []
    if os.path.exists(self._cache_path_total_):
        # caches exist: load the corpus-wide and per-category word lists
        Log.log_blue_running('Corpus cache loaded.')
        self._wd_list_total_ = self.load_cache(self._cache_path_total_)
        for _iter_cache_file in range(len(self._cache_path_category_)):
            self._wd_list_category_.append({
                'category': self._cache_path_category_[_iter_cache_file]['category'],
                'words': self.load_cache(
                    self._cache_path_category_[_iter_cache_file]['cache_path'])
            })
    else:
        Log.log_blue_running('Naive Bayes train module.')
        wd_list_total = {}
        for _iter_basename in self._train_set_dict_:
            characteristic_wd_category = {}
            for _iter_filename in self._train_set_dict_[_iter_basename]:
                tmp = TextClassification.get_wd_list(_iter_filename)
                # accumulate corpus-wide word frequencies
                for wd in tmp:
                    if wd in wd_list_total:
                        wd_list_total[wd] += tmp[wd]
                    else:
                        wd_list_total[wd] = tmp[wd]
                # keep at most the 10 most frequent words of this document
                words_freq = sorted(tmp.items(), key=lambda t: -t[1])
                characteristic_wd_file = dict(words_freq[:10])
                # merge the document's characteristic words into the category's
                for wd in characteristic_wd_file:
                    if wd in characteristic_wd_category:
                        characteristic_wd_category[wd] += characteristic_wd_file[wd]
                    else:
                        characteristic_wd_category[wd] = characteristic_wd_file[wd]
            self.create_cache(
                characteristic_wd_category,
                self._train_set_dir_ + '\\' + _iter_basename + '.json')
            # keep the in-memory per-category list in sync with the cache file
            self._wd_list_category_.append({
                'category': _iter_basename,
                'words': characteristic_wd_category
            })
            Log.log_blue_running('Category: ' + _iter_basename + ' cache created.')
        self._wd_list_total_ = wd_list_total
        self.create_cache(
            self._wd_list_total_,
            os.path.abspath(self._train_set_dir_ + '\\..') + '\\wdlist-total.json')
        Log.log_blue_running('Corpus word-lists cache created.')
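def _demo_top_n_words(word_counts, n=10):
    """Illustrative sketch only: the per-document feature selection used in
    _bayes_train_, which keeps the n most frequent words of a document as its
    characteristic words. The helper name and default n are hypothetical."""
    ranked = sorted(word_counts.items(), key=lambda t: -t[1])
    return dict(ranked[:n])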
def get_stop_wd_list():
    with codecs.open('stopwd.txt', 'r', 'utf-8') as f:
        text = f.readlines()
    # entries keep their trailing newlines; callers strip them when matching
    stop_wd_list = [word for word in text]
    Log.log_blue_running('Stopwords total : ' + str(len(stop_wd_list)) + '\n')
    return stop_wd_list