def getData(topK=20, withFlag=True, withWeight=False):
    # Extract noun keywords from data_file with TF-IDF and return (word, count) pairs.
    jiebaAnalyse.set_stop_words("../resources/stopWord.txt")
    stopWords = set()
    with open("../resources/stopWord.txt", 'r', encoding='utf8') as f:
        for word in f:
            # strip the newline and lowercase so the membership test below can match
            stopWords.add(word.strip().lower())
    data = []
    with open(data_file, 'r', encoding='utf8') as f:
        for line in f:
            data.append(line.strip())
            # data.extend(jiebaAnalyse.extract_tags(line, topK=50, allowPOS=['n']))
    # text, weight = jiebaAnalyse.extract_tags(" ".join(data), topK=20, allowPOS=['n'], withWeight=True)
    # print(text)
    # Keep only nouns, then weight term frequency by IDF (jieba's default IDF table).
    allowPOS = frozenset(['n'])
    words = jieba.posseg.dt.cut(" ".join(data))
    idf_freq, median_idf = IDFLoader(DEFAULT_IDF).get_idf()
    freq = {}
    word_count = {}
    for w in words:
        if allowPOS:
            if w.flag not in allowPOS:
                continue
            elif not withFlag:
                w = w.word
        wc = w.word if allowPOS and withFlag else w
        if len(wc.strip()) < 2 or wc.lower() in stopWords:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
        word_count[wc] = word_count.get(wc, 0) + 1
    total = sum(freq.values())
    for k in freq:
        kw = k.word if allowPOS and withFlag else k
        freq[k] *= idf_freq.get(kw, median_idf) / total
    if withWeight:
        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(freq, key=freq.__getitem__, reverse=True)
    if topK:
        res = []
        for w, _ in tags[:topK]:
            if withWeight:
                res.append((w.word, word_count[w.word]))
            else:
                res.append((w, word_count[w]))
        res = sorted(res, key=itemgetter(1), reverse=True)
        return res
    return tags
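# Usage sketch (printTopKeywords is a hypothetical helper, not part of the
# original module): with the defaults above, getData() returns a list of
# (word, count) tuples sorted by count, assuming data_file and the stop-word
# file exist at the configured paths.
def printTopKeywords(topK=10):
    for word, count in getData(topK=topK):
        print("%s\t%d" % (word, count))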
def extract_tag():
    # Tokenize with jieba_fast and print the top noun keywords of each line.
    jieba.load_userdict(dictionary)
    data = []
    jiebaAnalyse.set_stop_words("../resources/stopWord.txt")
    with open(data_file, 'r', encoding='utf8') as f:
        for line in f:
            data.append(jiebaAnalyse.extract_tags(line, topK=50, allowPOS=['n']))
    print(data)
def wordCloud():
    from wordcloud import WordCloud
    data = []
    jiebaAnalyse.set_stop_words(stop_words_path=stop_file)
    with open(data_file, 'r', encoding='utf8') as f:
        for line in f:
            data.append(line.strip())
            # data.extend(jiebaAnalyse.extract_tags(line, topK=50, allowPOS=['n']))
    text = jiebaAnalyse.extract_tags(" ".join(data), topK=20,
                                     allowPOS=['n'], withWeight=True)
    # print(text)
    # extract_tags returns (word, weight) tuples when withWeight=True,
    # so join only the words before handing them to WordCloud.
    wordcloud = WordCloud(font_path='../resources/simsun.ttf').generate(
        " ".join(word for word, weight in text))
    import matplotlib.pyplot as plt
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
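# Alternative sketch (wordCloudWeighted is hypothetical, not in the original):
# since extract_tags already returns TF-IDF weights, they can drive the cloud
# directly through WordCloud.generate_from_frequencies instead of a joined
# string. Uses the readFile helper defined further down.
def wordCloudWeighted():
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    text = jiebaAnalyse.extract_tags(" ".join(readFile(data_file)), topK=20,
                                     allowPOS=['n'], withWeight=True)
    cloud = WordCloud(font_path='../resources/simsun.ttf') \
        .generate_from_frequencies(dict(text))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()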
from flask import Flask
from gevent.pywsgi import WSGIServer
from operator import itemgetter
# The jieba / jiebaAnalyse names used above are assumed to be jieba_fast aliases,
# matching the existing jieba_fast.analyse.tfidf import.
import jieba_fast as jieba
import jieba_fast.posseg
import jieba_fast.analyse as jiebaAnalyse
from jieba_fast.analyse.tfidf import IDFLoader, DEFAULT_IDF
from constant.const import const
from db.db_mongo import getMongoData
from db.db_mysql import saveDataToDb, clearDbData, findAllShopId, findAllMallId
from util.logger import logger
from config.config import updateConfig, getConfig
from datetime import datetime

dictionary = '../resources/dict.txt'
stop_file = '../resources/stopWord.txt'
data_file = '../resources/data.txt'

app = Flask(__name__)
jiebaAnalyse.set_stop_words(stop_file)
stopWords = set()


# Read a file into a list of stripped lines
def readFile(filename, mode='r', encoding='utf8'):
    data = []
    with open(filename, mode, encoding=encoding) as f:
        for line in f:
            data.append(line.strip())
        # for no, word in enumerate(f):
        #     data.append(word)
    return data


# Get the results
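# Usage sketch: readFile can replace the inline reading loops in the functions
# above, e.g. (kept as comments so nothing runs at import time):
#   data = readFile(data_file)
#   stopWords = set(w.lower() for w in readFile(stop_file))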
# Read how many entries have been processed so far
# (the enclosing def is inferred from the getAllCount() call below)
def getAllCount():
    count = 0
    for line in open('config/wordsCount.txt'):
        count = int(line)
    return count


# After a batch has been processed, write the corrected count back
def resetAllCount(nums):
    with open('config/wordsCount.txt', 'w') as f:
        f.write(str(nums))


if __name__ == '__main__':
    # Load the stop words
    analyse.set_stop_words('config/stopWords.txt')
    # Start index
    beginNum = getAllCount()
    # Fetch the Weibo IDs
    ids = json.loads(getAllIdJson())
    # End index
    endNum = len(ids)
    for index, _id in enumerate(ids[beginNum:endNum]):
        weibo = json.loads(getAllByIdJson(_id))
        if 'nick_name' not in weibo:
            continue
        ID, name, vector, nums = getKeyWords(weibo)
        weiboNum = getWeiboCount()