# Module-level imports needed by the Weibo statistics helpers below.
# filer (text cleaning), getText / getLocation / getKeywordList / getTopic /
# getMood / getFriends / remove_at / merege_iter, the Mongo connection helpers
# and the globals keywords_list / topic / category_list are project code defined elsewhere.
import gc
import linecache
import os
import pickle
from collections import defaultdict
from random import randint
from time import time

import jieba
from gensim import corpora
from gensim.models import LdaModel, TfidfModel
from nltk import FreqDist

import filer


def keyword_location(file_path_list):
    dismiss_count = 0
    output_dic = {}
    for current_file in file_path_list:
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                location = getLocation(line_section)
                keyword_list = getKeywordList(line_section)
                current_text = getText(line_section)
                # Dismiss weibo that are too short (< 5 words after cleaning).
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if len(word_list) < 5:
                    dismiss_count += 1
                    continue
                if location not in output_dic:
                    print(location)
                    output_dic[location] = [0 for i in range(len(keywords_list))]
                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    output_dic[location][keywords_list.index(keyword)] += 1
            e_t = time()
            print(output_dic)
            print('dismiss count: ' + str(dismiss_count))
            print('current collection process time: ' + str(e_t - s_t))
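def _demo_keyword_location():
    # Usage sketch (assumption, not project code): every statistics helper in
    # this module takes a list of tab-separated Weibo dump files; the glob
    # pattern below is an example path only.
    import glob
    keyword_location(sorted(glob.glob('/Volumes/data/chinadream/raw/*.txt')))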
def topic_friends(file_path_list):
    dismiss_count = 0
    output_dic = {}
    for current_file in file_path_list:
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                friends = getFriends(line_section)
                current_text = getText(line_section)
                # Dismiss weibo that are too short (< 5 words after cleaning).
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if len(word_list) < 5:
                    dismiss_count += 1
                    continue
                current_topic = getTopic(current_text)
                if str(friends) not in output_dic:
                    print(friends)
                    output_dic[str(friends)] = [0 for i in range(len(topic))]
                output_dic[str(friends)][topic.index(current_topic)] += 1
            e_t = time()
            print(output_dic)
            print('dismiss count: ' + str(dismiss_count))
            print('current collection process time: ' + str(e_t - s_t))
    # Save the friends-count x topic distribution to disk.
    with open('data/topic_friends.pickle', 'wb') as output_file:
        pickle.dump(output_dic, output_file)
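def _demo_load_topic_friends():
    # Usage sketch (assumption, not project code): reload the distribution
    # pickled by topic_friends() above for later analysis.
    with open('data/topic_friends.pickle', 'rb') as f:
        return pickle.load(f)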
def topic_keyword(file_path_list):
    dismiss_count = 0
    output_dic = {}
    for current_file in file_path_list:
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                keyword_list = getKeywordList(line_section)
                # print(keyword_list)
                current_text = getText(line_section)
                # Dismiss weibo that are too short (< 5 words after cleaning).
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if len(word_list) < 5:
                    dismiss_count += 1
                    continue
                current_topic = getTopic(current_text)
                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    if keyword not in output_dic:
                        print(keyword)
                        output_dic[keyword] = [0 for i in range(len(topic))]
                    output_dic[keyword][topic.index(current_topic)] += 1
            e_t = time()
            print('current file process time: ' + str(e_t - s_t))
            print(output_dic)
            print('dismiss count: ' + str(dismiss_count))
def keyword_emotion(file_path_list):
    output_dic = {}
    for current_keyword in keywords_list:
        output_dic[current_keyword] = {}
    for current_file in file_path_list:
        print(current_file)
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                current_text = getText(line_section)
                # Dismiss weibo that are too short (< 5 words after cleaning).
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if len(word_list) < 5:
                    continue
                mood = getMood(line_section)
                keyword_list = getKeywordList(line_section)
                # Tally every keyword occurrence per mood.
                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    # Initialise the mood counter on first sight, then increment.
                    if str(mood) not in output_dic[keyword]:
                        output_dic[keyword][str(mood)] = 0
                    output_dic[keyword][str(mood)] += 1
            e_t = time()
            print('current file process time: ' + str(e_t - s_t))
    print(output_dic)
    return output_dic
def extract_word_freq(file_folder_path):
    out_folder = 'data/location_wordfreq/'
    jieba.load_userdict("data/user_dict.txt")
    stop_word = []
    with open('data/stop_word.txt', 'r', encoding='utf-8') as sw_f:
        for item in sw_f:
            stop_word.append(item.strip())
    current_keyword = file_folder_path.split('/')[-1]
    if current_keyword != '健康':
        return
    print(current_keyword)
    # Ban the keyword itself and every jieba sub-token of it from the counts.
    current_keyword_cut_list = current_keyword.split(',')
    keyword_ban_list = [ban_word for ban_word in current_keyword_cut_list]
    for ban_word in current_keyword_cut_list:
        ban_word_list = jieba.cut(ban_word)
        for cut_ban_word in ban_word_list:
            if cut_ban_word not in keyword_ban_list:
                keyword_ban_list.append(cut_ban_word)
    print(keyword_ban_list)
    file_name_list = os.listdir(file_folder_path)
    temp_list = [file_folder_path + '/' + filename for filename in file_name_list]
    for current_file in temp_list:  # one xxx.txt file per city
        current_city_filename = current_file.split('/')[-1]
        with open(current_file, 'r', encoding='utf-8') as file:
            keyword_out_dic = {}
            weibo_cut_list = []
            line_count = 0
            for temp_line in file:
                weibo_origin = filer.filer(temp_line).replace('/', '')
                weibo_cut = list(jieba.cut(weibo_origin))
                for items in weibo_cut:
                    if (items not in stop_word and items not in keyword_ban_list
                            and len(items.strip()) > 0):
                        print(items)
                        weibo_cut_list.append(items)
                line_count += 1
                # Count word frequencies in 10000-line batches to bound memory.
                if line_count >= 10000:
                    fd = FreqDist(weibo_cut_list)  # word-frequency statistics from nltk
                    keyword_out_dic = merege_iter(keyword_out_dic, fd)
                    line_count = 0
                    weibo_cut_list = []
            if line_count > 0:
                fd = FreqDist(weibo_cut_list)  # word-frequency statistics from nltk
                keyword_out_dic = merege_iter(keyword_out_dic, fd)
                line_count = 0
                weibo_cut_list = []
            sort_dict = sorted(keyword_out_dic.items(), key=lambda d: d[1], reverse=True)
            temp_count = 0
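def _merege_iter_sketch(acc_dict, freq_dist):
    # Minimal sketch of the merege_iter helper called in extract_word_freq();
    # the real implementation lives elsewhere in the project. Presumably it
    # folds an nltk FreqDist into the accumulating {token: count} dictionary.
    for token, count in freq_dist.items():
        acc_dict[token] = acc_dict.get(token, 0) + count
    return acc_dict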
def __init__(self, cfg, logger, job):
    self.starttime = time.time()
    self.cfg = cfg
    self.logger = logger
    self.job = job
    self.filename = '%s/%s__%s.job_history' % (cfg.run_dir, job['name'], job['env'])
    self.file = filer(cfg, logger, self.filename)
def keyword_mentionrepost(file_path_list):
    dismiss_count = 0
    all_keywords_list = [
        '健康', '事业有成', '发展机会', '生活幸福', '有房', '出名', '家庭幸福', '好工作',
        '平等机会', '白手起家', '成为富人', '个体自由', '安享晚年', '收入足够', '个人努力',
        '祖国强大', '中国经济持续发展', '父辈更好'
    ]
    output_dic = {}
    output_dic['at'] = [0 for i in all_keywords_list]
    output_dic['at_total'] = [0 for i in all_keywords_list]
    output_dic['repost'] = [0 for i in all_keywords_list]
    output_dic['repost_total'] = [0 for i in all_keywords_list]
    print(output_dic)
    for current_file in file_path_list:
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                location = getLocation(line_section)
                keyword_list = getKeywordList(line_section)
                current_text = getText(line_section)
                current_text = filer.filer(current_text)
                # Count mention (@) and repost (//) markers before stripping
                # the @ symbols from the text.
                current_at = current_text.count('@')
                current_repost = current_text.count('//')
                current_text = remove_at(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if len(word_list) < 5:
                    dismiss_count += 1
                    continue
                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    keyword_index = all_keywords_list.index(keyword)
                    output_dic['at'][keyword_index] += current_at
                    output_dic['at_total'][keyword_index] += 1
                    output_dic['repost'][keyword_index] += current_repost
                    output_dic['repost_total'][keyword_index] += 1
            e_t = time()
            print(output_dic)
            print('dismiss count: ' + str(dismiss_count))
            print('current collection process time: ' + str(e_t - s_t))
def keyword_time(file_path_list):
    city_list = [
        '北京', '天津', '内蒙古', '新疆', '河北', '甘肃', '宁夏', '山西', '陕西', '青海',
        '山东', '河南', '安徽', '辽宁', '吉林', '黑龙江', '江苏', '浙江', '上海', '湖北',
        '湖南', '四川', '重庆', '贵州', '云南', '广西', '江西', '福建', '广东', '海南',
        '西藏', '台湾', '香港', '澳门', '海外', '其他'
    ]
    output_dic = {}
    for current_file in file_path_list:
        print(current_file)
        with open(current_file, 'r', encoding='utf-8') as f:
            # One time bucket per input file, keyed by the file name stem.
            current_time = current_file.split('/')[-1].split('.')[0]
            print(current_time)
            if current_time in output_dic:
                continue
            output_dic[current_time] = {}
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                current_text = getText(line_section)
                # Dismiss weibo that are too short (< 5 words after cleaning).
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if len(word_list) < 5:
                    continue
                location = getLocation(line_section)
                if len(location) == 0:
                    continue
                current_city_list = location.split()
                if len(current_city_list) == 0:
                    continue
                current_city = current_city_list[0]
                keyword_list = getKeywordList(line_section)
                # Tally every keyword occurrence per province/city.
                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    if keyword not in output_dic[current_time]:
                        print(keyword)
                        output_dic[current_time][keyword] = [
                            0 for i in range(len(city_list))
                        ]
                    output_dic[current_time][keyword][city_list.index(current_city)] += 1
            e_t = time()
            print('current file process time: ' + str(e_t - s_t))
            print(output_dic)
def predict(weibo):
    # print(weibo)
    weibo = filer.filer(weibo)
    word_list = [word for word in jieba.cut(weibo)]
    # Refuse to classify weibo that are too short (< 5 words after cleaning).
    if len(word_list) < 5:
        return -1
    # Start from the class priors, then multiply in P(word | category) for
    # every known word (naive Bayes style scoring).
    result_list = list(pre_probability_dict.values())
    for word in word_list:
        if word in pre_category_probability_dict:
            for i in range(0, len(category_list)):
                result_list[i] *= pre_category_probability_dict[word][i]
    # Return the category with the highest score.
    max_category = 0
    for i in range(0, len(category_list)):
        if result_list[i] > result_list[max_category]:
            max_category = i
    return category_list[max_category]
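# The three globals predict() reads are built elsewhere in the project from
# labelled training weibo. A hypothetical illustration of their shapes (the
# categories and values below are made up, not project data):
#
#   category_list                  -> ['A', 'B', 'C']
#   pre_probability_dict           -> {'A': 0.3, 'B': 0.3, 'C': 0.4}   # class priors P(c)
#   pre_category_probability_dict  -> {'word': [P(word|A), P(word|B), P(word|C)], ...}
#
# predict() multiplies each prior by the per-word likelihoods of every word in
# the weibo and returns the argmax category, or -1 when the cleaned text has
# fewer than 5 words.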
def keyword_percent(file_path_list):
    dismiss_count = 0
    output_dic_gt5 = {}
    output_dic_all = {}
    for current_file in file_path_list:
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                current_text = getText(line_section)
                keyword_list = getKeywordList(line_section)
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                # Tally every keyword regardless of text length.
                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    if keyword not in output_dic_all:
                        print(keyword)
                        output_dic_all[keyword] = 0
                    output_dic_all[keyword] += 1
                # Dismiss weibo that are too short (< 5 words after cleaning).
                if len(word_list) < 5:
                    dismiss_count += 1
                    continue
                # Tally again, counting only weibo with at least 5 words.
                for keyword in keyword_list:
                    keyword = keyword.replace(',', '')
                    if keyword not in output_dic_gt5:
                        print(keyword)
                        output_dic_gt5[keyword] = 0
                    output_dic_gt5[keyword] += 1
            e_t = time()
            print('all')
            print(output_dic_all)
            print('greater than 5')
            print(output_dic_gt5)
            print('dismiss count: ' + str(dismiss_count))
            print('current collection process time: ' + str(e_t - s_t))
def topic_percent(file_path_list):
    dismiss_count = 0
    output_list = [0 for i in range(len(topic))]
    for current_file in file_path_list:
        with open(current_file, 'r', encoding='utf-8') as f:
            s_t = time()
            for line in f:
                line_section = line.split('\t')
                current_text = getText(line_section)
                # Dismiss weibo that are too short (< 5 words after cleaning).
                current_text = filer.filer(current_text)
                word_list = [word for word in jieba.cut(current_text)]
                if len(word_list) < 5:
                    dismiss_count += 1
                    continue
                current_topic = getTopic(current_text)
                output_list[topic.index(current_topic)] += 1
            e_t = time()
            print(output_list)
            print('dismiss count: ' + str(dismiss_count))
            print('current collection process time: ' + str(e_t - s_t))
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=DESC,
    epilog=textwrap.dedent(EPILOG))
parser.add_argument("-c", "--config", type=str, metavar="config",
                    help="job_tracker config file", default="job_tracker.cfg")
parser.add_argument("-j", "--job", type=str, metavar="job",
                    help="Job to analyze", required=True)
parser.add_argument("-e", "--env", type=str, metavar="env",
                    choices=['P', 'I'], default='I', help="Environment")

# parse the args and call the appropriate command function
args = parser.parse_args()

cfg = setup_config()
cfg.load_config(args.config)
logger = logging.getLogger("/tmp/jt-analyzer.log")

file_name = '%s/%s__%s.job_history' % (cfg.run_dir, args.job, args.env)
file_handler = filer(cfg, logger, file_name)

print "\nGetting log entries from %s\n" % file_name
print "FROM - UNTIL : RESULT\n--------------------------------------------------------------"

job_history = file_handler.read_content()
for item in job_history:
    # item = dict(eval(item.rstrip()))
    # print item
    # item = dict(eval(item))
    item['epoche_until'] = convert_epoche_to_timestamp(item['epoche_until'])
    item['epoche_from'] = convert_epoche_to_timestamp(item['epoche_from'])
    if item['epoche_from'] == item['epoche_until']:
def year_keyword_location_lda(current_year, current_keyword, mood_list='all'):
    jieba.load_userdict("data/user_dict.txt")
    stop_word = []
    with open('data/stop_word.txt', 'r', encoding='utf-8') as sw_f:
        for item in sw_f:
            stop_word.append(item.strip())

    # Fully segment the keyword so its own tokens can be excluded from the corpus.
    current_keyword_cut_list = current_keyword.split(',')
    current_keyword_banned_list = []
    for temp1 in current_keyword_cut_list:
        cut_list = jieba.cut(temp1)
        for temp2 in cut_list:
            current_keyword_banned_list.append(temp2)

    time_keyword_folder = '/Volumes/data/chinadream/time_keyword_emotion_location'
    timemonth_list = os.listdir(time_keyword_folder)
    current_year_timemonth_list = []
    for current_timemonth in timemonth_list:
        if current_timemonth[0:4] == str(current_year):
            current_year_timemonth_list.append(current_timemonth)

    corpus_text = []
    corpus_city = {}
    count = 0
    print('Start extracting corpus')
    for current_timemonth in current_year_timemonth_list:
        if mood_list == 'all':
            current_mood_path = time_keyword_folder + '/' + current_timemonth
            current_mood_list = os.listdir(current_mood_path)
            for current_mood in current_mood_list:
                if current_mood == '.DS_Store':
                    continue
                current_folder_keyword_list = os.listdir(
                    current_mood_path + '/' + str(current_mood))
                if current_keyword not in current_folder_keyword_list:
                    continue
                current_city_path = (current_mood_path + '/' + str(current_mood)
                                     + '/' + current_keyword)
                current_city_list = os.listdir(current_city_path)
                for current_city in current_city_list:
                    if current_city == '.DS_Store':
                        continue
                    if current_city in corpus_city:
                        city_index = corpus_city[current_city]
                        origin_text = corpus_text[city_index]
                    else:
                        origin_text = []
                    current_city_file_path = current_city_path + '/' + current_city
                    with open(current_city_file_path, 'r', encoding='utf-8') as city_f:
                        for temp_line in city_f:
                            weibo_origin = filer.filer(temp_line).replace('/', '')
                            if len(weibo_origin) == 0:
                                continue
                            weibo_cut = list(jieba.cut(weibo_origin))
                            weibo_cut_list = []
                            for items in weibo_cut:
                                if items not in stop_word and len(items.strip()) > 0:
                                    if items in current_keyword_banned_list:
                                        continue
                                    weibo_cut_list.append(items)
                            if len(weibo_cut_list) < 5:
                                continue
                            for current_cut in weibo_cut_list:
                                origin_text.append(current_cut)
                    # No qualifying corpus was extracted for this city.
                    if len(origin_text) == 0:
                        continue
                    if current_city in corpus_city:
                        # Update the existing corpus entry.
                        corpus_text[city_index] = origin_text
                    else:
                        # Append a new corpus entry and record its index.
                        corpus_text.append(origin_text)
                        corpus_city[current_city] = count
                        count += 1
                    # Free memory.
                    del origin_text
                    gc.collect()

    # Drop rare tokens (frequency <= 3) before building the dictionary.
    frequency = defaultdict(int)
    for city_file in corpus_text:
        for token in city_file:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 3]
             for text in corpus_text]
    word_count_dict = corpora.Dictionary(texts)
    corpus = [word_count_dict.doc2bow(text) for text in texts]

    print('Computing tf-idf')
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    del tfidf
    gc.collect()

    print('Building the LDA model')
    lda = LdaModel(corpus=corpus_tfidf, id2word=word_count_dict, num_topics=100)

    output_model_folder = ('data/year_keyword_location_lda/model/' + str(current_year)
                           + '_' + current_keyword + '_' + mood_list)
    if not os.path.exists(output_model_folder):
        os.makedirs(output_model_folder)
    model_file = (output_model_folder + '/' + str(current_year) + '_'
                  + current_keyword + '_' + mood_list + '_lda.model')
    dictionary_file = (output_model_folder + '/' + str(current_year) + '_'
                       + current_keyword + '_' + mood_list + '_lda.dict')
    corpus_file = (output_model_folder + '/' + str(current_year) + '_'
                   + current_keyword + '_' + mood_list + '_lda.mm')
    word_count_dict.save(dictionary_file)
    corpora.MmCorpus.serialize(corpus_file, corpus)
    lda.save(model_file)

    # db = conntoMongoKeywordLocation_topic()
    # current_collection = db[collection_name_1]
    # data_toinsert = {
    #     'topic_distri': '1',
    #     'keyword': current_keyword,
    #     'all_topic': str(lda.print_topics(-1))
    # }
    # result = current_collection.insert_one(data_toinsert)

    # Write results to file.
    output_folder = ('result/year_keyword_location_lda/' + str(current_year)
                     + '_' + current_keyword + '_' + mood_list)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # print_topics() returns 20 topics by default.
    lda_topic_list = lda.print_topics()
    with open(output_folder + '/lda_topics.txt', 'a+', encoding='utf-8') as output_file:
        output_file.write(str(current_year) + '_' + current_keyword + '_' + mood_list)
        output_file.write('\n')
        for current_topic in lda_topic_list:
            output_file.write(str(current_topic))
            output_file.write('\n')

    for city_name, city_index in corpus_city.items():
        city_corpus = corpus[city_index]
        # Topic distribution of this city's document.
        doc_lda = lda.get_document_topics(city_corpus)
        # print(doc_lda)
        # db = conntoMongoKeywordLocation()
        # collection_name_1 = str(current_year) + '_' + current_keyword + '_' + mood_list
        # current_collection = db[collection_name_1]
        # data_toinsert = {
        #     'city': city_name,
        #     'topic_distribution': str(doc_lda)
        # }
        # result = current_collection.insert_one(data_toinsert)
        with open(output_folder + '/city_topics.txt', 'a+', encoding='utf-8') as output_file:
            output_city_name = city_name.split('.')[0]
            output_file.write(output_city_name)
            output_file.write('\n')
            output_file.write(str(doc_lda))
            output_file.write('\n')
    return
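def _demo_reload_year_lda(current_year, current_keyword, mood_list='all'):
    # Usage sketch (assumption, not project code): reload the artifacts saved
    # by year_keyword_location_lda() above and print a few topics. The paths
    # follow the naming scheme used in that function.
    prefix = ('data/year_keyword_location_lda/model/' + str(current_year) + '_'
              + current_keyword + '_' + mood_list + '/' + str(current_year)
              + '_' + current_keyword + '_' + mood_list)
    lda = LdaModel.load(prefix + '_lda.model')
    dictionary = corpora.Dictionary.load(prefix + '_lda.dict')
    corpus = corpora.MmCorpus(prefix + '_lda.mm')
    for topic_id, words in lda.print_topics(num_topics=5):
        print(topic_id, words)
    return lda, dictionary, corpus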
def keyword_location_lda(mongo_server='127.0.0.1'):
    jieba.load_userdict("data/user_dict.txt")
    stop_word = []
    weibocityfilefolder = '/Volumes/data/chinadream/city/'
    # keyword_finished = []
    # db = conntoMongoKeywordLocation_topic()
    # for keyword_result in db['topic'].find():
    #     keyword_finished.append(keyword_result['keyword'])
    with open('data/stop_word.txt', 'r', encoding='utf-8') as sw_f:
        for item in sw_f:
            stop_word.append(item.strip())

    keyword_folder = '/Volumes/data/chinadream/keyword_location/'
    folderlist = os.listdir(keyword_folder)
    for current_keyword in folderlist:
        # if current_keyword in keyword_finished:
        #     continue
        print(current_keyword)
        current_keyword_folder = keyword_folder + current_keyword + '/'
        current_keyword_cut_list = current_keyword.split(',')
        current_keyword_banned_list = []
        if (current_keyword == '个人努力' or current_keyword == '健康'
                or not os.listdir(current_keyword_folder)):
            continue
        # Fully segment the keyword so its own tokens can be excluded from the corpus.
        for temp1 in current_keyword_cut_list:
            cut_list = jieba.cut(temp1)
            for temp2 in cut_list:
                current_keyword_banned_list.append(temp2)

        current_city_file_list = os.listdir(current_keyword_folder)
        # Each list in corpus_text holds all tokens for one city.
        corpus_text = []
        corpus_city = {}
        count = 0
        for current_city_file in current_city_file_list:
            print(current_city_file)
            corpus_numbers = 0
            origin_text = []
            open_keyword_file_path = current_keyword_folder + current_city_file
            open_keyword_file = linecache.getlines(open_keyword_file_path)
            # open_keyword_file = open(open_keyword_file_path, 'r', encoding='utf-8')
            temp_line_num = len(open_keyword_file)
            max_weiboDoc = 5000
            if temp_line_num < max_weiboDoc:
                for temp_line_lineNum in range(temp_line_num):
                    temp_line = open_keyword_file[temp_line_lineNum]
                    current_topic = getTopic(temp_line)
                    if current_topic == '娱乐':
                        continue
                    weibo_origin = filer.filer(temp_line).replace('/', '')
                    if len(weibo_origin) == 0:
                        continue
                    weibo_cut = list(jieba.cut(weibo_origin))
                    weibo_cut_list = []
                    for items in weibo_cut:
                        if items not in stop_word and len(items.strip()) > 0:
                            if items in current_keyword_banned_list:
                                continue
                            weibo_cut_list.append(items)
                    if len(weibo_cut_list) < 5:
                        continue
                    for current_cut in weibo_cut_list:
                        origin_text.append(current_cut)
            else:
                # Randomly sample at most max_weiboDoc qualifying weibo from
                # this city file.
                used_set = set()
                linenumber_list = [i for i in range(temp_line_num)]
                while len(used_set) < max_weiboDoc and len(linenumber_list) > 0:
                    a = randint(0, len(linenumber_list) - 1)
                    line_no = linenumber_list.pop(a)
                    temp_line = open_keyword_file[line_no]
                    current_topic = getTopic(temp_line)
                    if current_topic == '娱乐':
                        continue
                    weibo_origin = filer.filer(temp_line).replace('/', '')
                    if len(weibo_origin) == 0:
                        continue
                    weibo_cut = list(jieba.cut(weibo_origin))
                    weibo_cut_list = []
                    for items in weibo_cut:
                        if items not in stop_word and len(items.strip()) > 0:
                            if items in current_keyword_banned_list:
                                continue
                            weibo_cut_list.append(items)
                    if len(weibo_cut_list) < 5:
                        continue
                    for current_cut in weibo_cut_list:
                        origin_text.append(current_cut)
                    used_set.add(line_no)
            if len(origin_text) == 0:
                continue
            linecache.clearcache()
            print(len(origin_text))
            corpus_city[current_city_file] = count
            corpus_text.append(origin_text)
            count += 1
            del origin_text
            gc.collect()

        # Drop rare tokens (frequency <= 3) before building the dictionary.
        frequency = defaultdict(int)
        for city_file in corpus_text:
            for token in city_file:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 3]
                 for text in corpus_text]
        word_count_dict = corpora.Dictionary(texts)
        corpus = [word_count_dict.doc2bow(text) for text in texts]

        print('Computing tf-idf')
        tfidf = TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        del tfidf
        gc.collect()

        print('Building the LDA model')
        lda = LdaModel(corpus=corpus_tfidf, id2word=word_count_dict, num_topics=8)
        model_file = 'data/keyword_location/model/' + current_keyword + '_lda.model'
        lda.save(model_file)

        for city_name, city_index in corpus_city.items():
            city_corpus = corpus[city_index]
            # Topic distribution of this city's document.
            doc_lda = lda.get_document_topics(city_corpus)
            db = conntoMongoKeywordLocation()
            current_collection = db[current_keyword]
            data_toinsert = {
                'city': city_name,
                'topic_distribution': str(doc_lda)
            }
            result = current_collection.insert_one(data_toinsert)

        db = conntoMongoKeywordLocation_topic()
        current_collection = db['topic']
        data_toinsert = {
            'keyword': current_keyword,
            'all_topic': str(lda.print_topics(-1))
        }
        result = current_collection.insert_one(data_toinsert)

        # Write to file
        # output_file = codecs.open('result/keyword_location/city_topic/' + current_keyword + '_city_topics.txt', 'a+', encoding='utf-8')
        # output_file.write(city_name)
        # output_file.write('\t')
        # output_file.write(str(doc_lda))
        # output_file.write('\n')
    return
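# The Mongo connection helpers used above (conntoMongoKeywordLocation,
# conntoMongoKeywordLocation_topic) are defined elsewhere in the project.
# Minimal sketch of what such a helper presumably looks like; the database
# name and port below are assumptions, not project configuration.
def _conn_to_mongo_sketch(mongo_server='127.0.0.1', db_name='keyword_location'):
    from pymongo import MongoClient
    client = MongoClient(mongo_server, 27017)
    return client[db_name]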