def norm_train(self, class0, class1):
    """Preprocess the two training classes and z-normalize them.

    Caches the raw training matrix and its mean/std on the instance so the
    test split can later be scaled with the same statistics, then returns
    ``(normalized training data, training labels)``.
    """
    # NOTE(review): storing the result as self.norm_train shadows this
    # method on the instance after the first call -- preserved as-is
    # because callers may read the attribute.
    processor = prop.Process(class0, class1)
    self.train = processor.preprocess()
    self.mu = self.train.mean()
    self.sigma = self.train.std()
    self.norm_train, self.label_train = processor.normalize(flag=True)
    return self.norm_train, self.label_train
def post_process(sent, key, username):
    """Build a one-entry event dict for ``key`` from a sentence.

    Tokenizes and POS-tags ``sent``, then uses the project's Process helper
    to geolocate the tweet author and count people involved. Returns
    ``{key: {'lat': ..., 'lon': ..., 'num_involved': ...}}``.
    """
    # Tokenize, then part-of-speech tag the tokens.
    tokens = nk.word_tokenize(sent)
    tagged = nk.pos_tag(tokens)
    proc = preprocess.Process(tagged)
    lat, lon = proc.geolocate(username)
    return {
        key: {
            'lat': lat,
            'lon': lon,
            'num_involved': proc.num_involved(),
        }
    }
def norm_test(self, class0, class1):
    """Preprocess the two test classes and normalize with training stats.

    Requires ``norm_train`` to have run first so that ``self.mu`` and
    ``self.sigma`` hold the training mean/std; returns
    ``(normalized test data, test labels)``.
    """
    processor = prop.Process(class0, class1)
    self.test = processor.preprocess()
    # Scale the test split with the TRAINING mean/std so both splits share
    # one feature scale.
    normalized, labels = processor.normalize(self.mu, self.sigma, flag=False)
    self.norm_test = normalized
    self.label_test = labels
    return self.norm_test, self.label_test
# NOTE(review): this chunk begins mid-method -- the enclosing `def` (by the
# driver code below, apparently Bayes.classify) is outside the visible
# source; the collapsed original is reconstructed at method indentation.
        # Select the feature vocabulary and train a multinomial naive Bayes
        # classifier on the training split.
        feature_words = words_dict(all_words_list, deleteN, stopwords_set)
        train_feature_list, test_feature_list = TextFeatures(
            train_data_list, test_data_list, feature_words)
        classifier = MultinomialNB().fit(train_feature_list, train_class_list)
        # Vectorize the text to classify with the same vocabulary, then
        # predict its raw class label.
        train_feature_list, predict_feature_list = TextFeatures(
            train_data_list, longtext, feature_words)
        class_typetmp = classifier.predict(predict_feature_list)[0]
        # Map the raw label to a display name via the space-separated
        # mapping file (one "label name" pair per line).
        class_listfile = "/Users/wcswang/Desktop/GraPro/Poetic_Language/Bayes/database/classlist.txt"
        class_transfer = {}
        with open(class_listfile, 'r') as f:
            for line in f.readlines():
                l = line.split(" ")
                # NOTE(review): l[1] keeps its trailing newline -- looks
                # like a latent bug; confirm whether callers strip it.
                class_transfer[l[0]] = l[1]
        class_type = class_transfer[class_typetmp]
        return class_type


if __name__ == '__main__':
    # Demo driver (Python 2): extract TF-IDF keywords from a sample poem
    # and print its predicted class.
    print "start"
    idffile = "/Users/wcswang/Desktop/GraPro/Poetic_Language/poem/idf.txt"
    predict_poem = "/Users/wcswang/Desktop/GraPro/Poetic_Language/Bayes/database/sample/C00001/76.txt"
    tfidf = TFIDFunction.TFIDF(idffile)
    sentence = preprocess.Process(predict_poem)
    tags = tfidf.extract_keywords(sentence)
    Bay = Bayes()
    class_type = Bay.classify(tags)
    print class_type
# NOTE(review): this chunk begins inside TFIDF.__init__ -- the class header
# and the start of __init__ are outside the visible source; the collapsed
# original is reconstructed at class indentation.
        # Cache the per-term IDF table and its mean (used as the fallback
        # weight for terms missing from the table).
        self.idf_freq = self.idf_loader.idf_freq
        self.mean_idf = self.idf_loader.mean_idf

    def extract_keywords(self, sentence, topK=20):
        """Return up to ``topK`` terms of ``sentence`` ranked by TF-IDF.

        The sentence is segmented with the project's poem segmenter; term
        frequencies are normalized by the total token count and weighted by
        the loaded IDF table (mean IDF for unseen terms). When ``topK`` is
        falsy, all terms are returned.
        """
        # Segment the poem into terms.
        #seg_list = segment(sentence)
        jieba = jiebaFunction.jieba()
        seg_list, seg_count = jieba.PoemSegment(sentence)
        # Raw term frequencies.
        freq = {}
        for w in seg_list:
            freq[w] = freq.get(w, 0.0) + 1.0
        total = sum(freq.values())
        for k in freq:
            # Compute TF-IDF: tf (count/total) times idf (mean when the
            # term is absent from the IDF table).
            freq[k] *= self.idf_freq.get(k, self.mean_idf) / total
        # Sort terms by descending TF-IDF weight.
        tags = sorted(freq, key=freq.__getitem__, reverse=True)
        if topK:
            return tags[:topK]
        else:
            return tags


# Demo driver (Python 2): print the top keywords of a sample poem.
idffile = "/Users/wcswang/Desktop/GraPro/Poetic_Language/poem/idf.txt"
document = "/Users/wcswang/Desktop/GraPro/Poetic_Language/Bayes/database/sample/C00001/76.txt"
topK = 10
tdidf = TFIDF(idffile)
sentence = preprocess.Process(document)
print sentence
tags = tdidf.extract_keywords(sentence)
for tag in tags:
    print tag.encode('utf-8')