# Example #1
 def norm_train(self, class0, class1):
     """Preprocess the training data and return its normalized form.

     Side effects: caches the preprocessed training set on ``self.train``
     and stores its mean/std on ``self.mu``/``self.sigma`` so they can be
     reused later (e.g. by ``norm_test``).
     """
     processor = prop.Process(class0, class1)
     self.train = processor.preprocess()
     # Fit the normalization statistics on the training data only.
     self.mu = self.train.mean()
     self.sigma = self.train.std()
     self.norm_train, self.label_train = processor.normalize(flag=True)
     return self.norm_train, self.label_train
# Example #2
def post_process(sent, key, username):
    """Extract location and headcount information from *sent* for event *key*.

    The sentence is tokenized and part-of-speech tagged, then the project's
    ``preprocess.Process`` helper geolocates the event (relative to
    *username*) and counts the people involved.

    Returns a dict of the form
    ``{key: {'lat': ..., 'lon': ..., 'num_involved': ...}}``.
    """
    # Tokenize, then part-of-speech tag the sentence.
    tagged = nk.pos_tag(nk.word_tokenize(sent))
    processor = preprocess.Process(tagged)
    latitude, longitude = processor.geolocate(username)
    return {
        key: {
            'lat': latitude,
            'lon': longitude,
            'num_involved': processor.num_involved(),
        }
    }
# Example #3
 def norm_test(self, class0, class1):
     """Preprocess the test data and normalize it with the training stats.

     Requires ``self.mu`` and ``self.sigma`` to already be set (by a
     prior call to ``norm_train``), so the test set is scaled with the
     statistics fitted on the training data.
     """
     processor = prop.Process(class0, class1)
     self.test = processor.preprocess()
     self.norm_test, self.label_test = processor.normalize(
         self.mu, self.sigma, flag=False)
     return self.norm_test, self.label_test
            feature_words = words_dict(all_words_list, deleteN, stopwords_set)
            train_feature_list, test_feature_list = TextFeatures(
                train_data_list, test_data_list, feature_words)
        classifier = MultinomialNB().fit(train_feature_list, train_class_list)
        train_feature_list, predict_feature_list = TextFeatures(
            train_data_list, longtext, feature_words)
        class_typetmp = classifier.predict(predict_feature_list)[0]

        class_listfile = "/Users/wcswang/Desktop/GraPro/Poetic_Language/Bayes/database/classlist.txt"
        class_transfer = {}
        with open(class_listfile, 'r') as f:
            for line in f.readlines():
                l = line.split(" ")
                class_transfer[l[0]] = l[1]
        class_type = class_transfer[class_typetmp]
        return class_type


if __name__ == '__main__':

    print "start"

    # Hard-coded paths to the precomputed IDF table and the poem to classify.
    # NOTE(review): absolute user-specific paths — consider making these
    # configurable.
    idffile = "/Users/wcswang/Desktop/GraPro/Poetic_Language/poem/idf.txt"
    predict_poem = "/Users/wcswang/Desktop/GraPro/Poetic_Language/Bayes/database/sample/C00001/76.txt"
    # Extract TF-IDF keywords from the poem, then classify them with the
    # naive-Bayes model.
    tfidf = TFIDFunction.TFIDF(idffile)
    sentence = preprocess.Process(predict_poem)
    tags = tfidf.extract_keywords(sentence)
    Bay = Bayes()
    class_type = Bay.classify(tags)
    print class_type
        self.idf_freq = self.idf_loader.idf_freq
        self.mean_idf = self.idf_loader.mean_idf

    def extract_keywords(self, sentence, topK=20):
        """Return the top *topK* keywords of *sentence*, ranked by TF-IDF.

        The sentence is segmented with the project's jieba wrapper; words
        missing from the IDF table fall back to the mean IDF. When *topK*
        is falsy, all keywords are returned in descending TF-IDF order.
        """
        segmenter = jiebaFunction.jieba()
        words, _word_count = segmenter.PoemSegment(sentence)
        # Raw term frequencies.
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0.0) + 1.0
        total = sum(counts.values())
        # TF-IDF score per word: count * idf / total.
        scores = {w: c * self.idf_freq.get(w, self.mean_idf) / total
                  for w, c in counts.items()}
        ranked = sorted(scores, key=scores.get, reverse=True)
        return ranked[:topK] if topK else ranked


idffile = "/Users/wcswang/Desktop/GraPro/Poetic_Language/poem/idf.txt"
document = "/Users/wcswang/Desktop/GraPro/Poetic_Language/Bayes/database/sample/C00001/76.txt"
topK = 10
tdidf = TFIDF(idffile)
sentence = preprocess.Process(document)
print sentence
tags = tdidf.extract_keywords(sentence)
for tag in tags:
    print tag.encode('utf-8')