import json
import time

# Assumed project-level dependencies (not defined in this file): r, r_cluster
# and r_sensitive are Redis connections; es is an Elasticsearch client; and
# RUN_TYPE, DAY, MONTH, ES_SENSITIVE_INDEX, DOCTYPE_SENSITIVE_INDEX,
# sensitive_score_dict and mappings come from the project's configuration.
# Hedged stand-ins for datetime2ts/ts2datetime are sketched below.


def cal_class_ratio():
    # Bucket every user active on the given day into one of eight classes,
    # keyed by three binary indicators: retweet count, be-retweeted count
    # and daily weibo count.
    ratio_results = {}
    date = '2013-09-07'
    ts = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    while 1:
        if scan_count == 1000000:  # hard cap on the number of scanned users
            break
        results = r_cluster.hscan('activity_' + str(ts), scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            activity_dict_string = r_cluster.hget('activity_' + str(ts), uid)
            activity_dict = json.loads(activity_dict_string)
            weibo_count = 0
            for time_seg in activity_dict:
                weibo_count += int(activity_dict[time_seg])
            # indicator 3: posted at least 6 weibo that day
            if weibo_count >= 6:
                indic_3 = '1'
            else:
                indic_3 = '0'
            # indicator 1: retweeted at least 8 distinct users
            retweet_results = r.hgetall('retweet_' + str(uid))
            retweet_count = len(retweet_results)
            if retweet_count >= 8:
                indic_1 = '1'
            else:
                indic_1 = '0'
            # indicator 2: retweeted by at least 9 distinct users
            be_retweet_results = r.hgetall('be_retweet_' + str(uid))
            be_retweet_count = len(be_retweet_results)
            if be_retweet_count >= 9:
                indic_2 = '1'
            else:
                indic_2 = '0'
            key = indic_1 + indic_2 + indic_3
            try:
                ratio_results[key] += 1
            except KeyError:
                ratio_results[key] = 1
            # write the user classes to per-class CSV files
            '''
            if key == '001':
                writer1.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key == '111':
                writer2.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key == '101':
                writer3.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key == '011':
                writer4.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key == '110':
                writer5.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key == '010':
                writer6.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            '''
        if int(scan_cursor) == 0:  # scan wrapped around: stop to avoid double counting
            break
    print 'ratio_results:', ratio_results
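# writer1..writer6 in the commented-out block above are not defined in this
# file. A hedged sketch of how they might be created (csv is standard library;
# the file names are hypothetical):
#
#     import csv
#     writer1 = csv.writer(open('class_001.csv', 'wb'))
#     writer2 = csv.writer(open('class_111.csv', 'wb'))
#     writer3 = csv.writer(open('class_101.csv', 'wb'))
#     writer4 = csv.writer(open('class_011.csv', 'wb'))
#     writer5 = csv.writer(open('class_110.csv', 'wb'))
#     writer6 = csv.writer(open('class_010.csv', 'wb'))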
def cal_ave_weibo():
    # Average number of weibo per scanned user for the given day.
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        if scan_count == 1000000:  # hard cap on the number of scanned users
            break
        results = r_cluster.hscan('activity_' + str(timestamp), scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            # range(0, 1) covers only the given day; widen it to average over more days
            for i in range(0, 1):
                ts = timestamp - 24 * 3600 * i
                activity_dict_string = r_cluster.hget('activity_' + str(ts), uid)
                if activity_dict_string:
                    activity_dict = json.loads(activity_dict_string)
                    weibo_count = 0
                    for time_seg in activity_dict:
                        weibo_count += int(activity_dict[time_seg])
                    all_count += weibo_count
        if int(scan_cursor) == 0:  # scan wrapped around: stop to avoid double counting
            break
    # approximate: the denominator assumes each scan step returned ~1000 users
    ave_count = float(all_count) / scan_count
    print 'ave_count:', ave_count
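# datetime2ts/ts2datetime are normally imported from the project's time
# utilities. A minimal stand-in, assuming 'YYYY-MM-DD' date strings and
# local-time day boundaries (an assumption; the real helpers may differ):
def datetime2ts(date):
    # 'YYYY-MM-DD' -> Unix timestamp at local midnight
    return int(time.mktime(time.strptime(date, '%Y-%m-%d')))


def ts2datetime(ts):
    # Unix timestamp -> 'YYYY-MM-DD' in local time
    return time.strftime('%Y-%m-%d', time.localtime(ts))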
def main():
    if RUN_TYPE:
        now_ts = time.time() - DAY  # the previous day
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2016-03-16'))
        now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts  # key updated today
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts + "_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_" + str(del_month)  # key older than one month, to be dropped
    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date
    iter_count = 0
    bulk_action = []
    mappings(ES_SENSITIVE_INDEX)
    total_number = r.hlen(sensitive_string)
    scan_cursor = 0
    print total_number
    while 1:
        re_scan = r.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1]  # dict of uid -> sensitive_words_dict (JSON string)
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX,
                                        doc_type=DOCTYPE_SENSITIVE_INDEX,
                                        body={"ids": uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid])
                    current_sensitive_score = 0
                    for k, v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            current_sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
                    if item['found']:  # the user already has a document
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key)  # drop the score older than one month
                        revise_item['uid'] = uid
                        # today's sensitivity score
                        revise_item[update_sensitive_key] = current_sensitive_score
                        # today's sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # today's sensitive-word string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # deltas against the previous day and the week/month averages
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # recompute the week/month average, variance and sum
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    else:  # first document for this user
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX,
                                doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:  # flush the remaining documents
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX,
                doc_type=DOCTYPE_SENSITIVE_INDEX)
    print iter_count
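# compute_week/compute_month are imported from elsewhere in the project. The
# stand-ins below are a hedged sketch, assuming they return the (average,
# variance, sum) of the per-day 'sensitive_score_<ts>' fields over a trailing
# 7- or 30-day window; _compute_window is a hypothetical helper, and the real
# implementations may window or weight the days differently.
def _compute_window(revise_item, now_ts, days):
    scores = []
    for i in range(days):
        # normalize each day to its midnight timestamp, matching the key scheme
        day_ts = datetime2ts(ts2datetime(now_ts - i * DAY))
        scores.append(revise_item.get('sensitive_score_' + str(day_ts), 0))
    total = sum(scores)
    ave = float(total) / days
    var = sum((s - ave) ** 2 for s in scores) / days
    return ave, var, total


def compute_week(revise_item, now_ts):
    return _compute_window(revise_item, now_ts, 7)


def compute_month(revise_item, now_ts):
    return _compute_window(revise_item, now_ts, 30)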
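# Hedged entry point, assuming the module is meant to run as a script (the
# top-level print statements suggest so); call the statistics helpers here
# instead if that is the intended use.
if __name__ == '__main__':
    main()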