def train(self):
    """Train ``self.classifier`` on the positive and negative corpora.

    Every line of ``pos_path`` and then of ``neg_path`` (both opened as
    UTF-8 text) is stripped, segmented with ``jieba.cut`` and passed through
    ``filter_stop``. Each result is paired with the label ``'pos'`` or
    ``'neg'`` and the complete sample list is handed to
    ``self.classifier.train``.
    """
    samples = []
    # Positive samples are collected first, then negative ones.
    for corpus_path, label in ((pos_path, 'pos'), (neg_path, 'neg')):
        with open(corpus_path, encoding='utf8', mode='r') as corpus:
            for raw_line in corpus:
                tokens = filter_stop(jieba.cut(raw_line.strip()))
                samples.append([tokens, label])
    self.classifier.train(data=samples)
def load_paper_data():
    """Read ``paper_path`` and return one filtered token sequence per line.

    The file is decoded as gb2312 with undecodable bytes ignored. Each line
    is split on whitespace and its first field is discarded. Of every
    remaining field only the text before the first ``'/'`` is kept and run
    through ``delete_punctuation``; fields that end up empty are dropped.
    The surviving words are passed to ``filter_stop`` and the result is
    collected when it is truthy.

    Returns:
        list: the ``filter_stop`` results, in file order.
    """
    sentences = []
    with open(paper_path, encoding='gb2312', mode='r', errors='ignore') as corpus:
        for raw_line in corpus:
            # NOTE(review): presumably the first field is a line identifier
            # and the part after '/' a tag -- confirm against the corpus.
            cleaned = (delete_punctuation(field.split('/')[0])
                       for field in raw_line.split()[1:])
            tokens = filter_stop(word for word in cleaned if word != '')
            if tokens:
                sentences.append(tokens)
    return sentences
def train(self):
    """Train ``self.classifier`` on the documents listed in ``category_path``.

    ``category_path`` maps a label to a directory. Every entry returned by
    ``fetch_file_path`` for that directory is opened as gb2312 text
    (undecodable bytes ignored). Its lines are stripped, their spaces
    removed, and everything is concatenated into one string. That string is
    segmented with ``jieba.cut``, passed through ``filter_stop`` and stored
    together with the label.
    """
    samples = []
    for label, directory in category_path.items():
        for file_name in fetch_file_path(directory):
            document_path = os.path.join(directory, file_name)
            with open(document_path, mode='r', encoding='gb2312', errors='ignore') as doc:
                # Collapse the whole file into a single string without
                # surrounding whitespace or inner spaces.
                text = ''.join(row.strip().replace(' ', '') for row in doc)
                samples.append([filter_stop(jieba.cut(text)), label])
    self.classifier.train(data=samples)
def simhash(self, doc1, doc2):
    """Print the ``hamming`` value computed for two documents.

    Both documents are segmented with ``jieba.cut`` and passed through
    ``filter_stop``. The occurrences of each remaining word are counted,
    each count table is reduced with ``self.merge_word``, and the two
    results are compared with ``hamming``. The comparison value is printed;
    nothing is returned.
    """
    def count_words(words):
        # Plain dict of word -> occurrences, keys in first-seen order.
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        return counts

    first_tokens = filter_stop(jieba.cut(doc1))
    second_tokens = filter_stop(jieba.cut(doc2))
    first_counts = count_words(first_tokens)
    second_counts = count_words(second_tokens)
    first_merged = self.merge_word(first_counts)
    second_merged = self.merge_word(second_counts)
    print(hamming(first_merged, second_merged))
def predict(self, key):
    """Find the highest-scoring document for a query and show its words.

    After ``self.load()``, the query is segmented with ``jieba.cut`` and
    passed through ``filter_stop``. ``self.get_dl_avgdl`` and ``self.get_K``
    yield the value ``K`` that is passed to ``self.word_score``. For every
    row index of ``self.tf_idf.tf_list`` the per-term scores are summed; the
    first index with the strictly greatest positive total wins (index 0 when
    no total exceeds 0). The winner is printed and handed to
    ``self.get_doc_word``.
    """
    self.load()
    terms = filter_stop(jieba.cut(key))
    dl, avgdl = self.get_dl_avgdl(terms)
    K = self.get_K(dl, avgdl)
    best_index, best_score = 0, 0
    doc_count = self.tf_idf.tf_list.shape[0]
    for index in range(doc_count):
        # NOTE(review): ``terms`` is iterated again for every document, so
        # this assumes ``filter_stop`` returns a re-iterable sequence -- confirm.
        total = 0
        for term in terms:
            total += self.word_score(term, index, K)
        if total > best_score:
            best_score, best_index = total, index
    print('max', best_index, best_score)
    self.get_doc_word(best_index)
def predict(self, sent):
    """Return ``self.classifier.predict`` for the filtered tokens of ``sent``.

    ``sent`` is segmented with ``jieba.cut`` and passed through
    ``filter_stop`` before being handed to the classifier.
    """
    return self.classifier.predict(filter_stop(jieba.cut(sent)))
def classify(self, sent):
    """Return ``self.classifier.predict`` for the filtered tokens of ``sent``.

    Leading and trailing whitespace is stripped from ``sent`` before it is
    segmented with ``jieba.cut`` and passed through ``filter_stop``.
    """
    return self.classifier.predict(filter_stop(jieba.cut(sent.strip())))