def main():
    """Export heavily-retweeted users to CSV and print a retweet histogram.

    Walks the Redis keyspace behind ``r`` with SCAN, and for every key named
    ``be_retweet_<uid>`` takes the number of fields in that hash as the
    user's count (presumably one field per retweeting user -- confirm
    against the writer of these hashes).

    Side effects:
      * Rewrites ``core_list.csv`` with one ``uid, count`` row for each user
        whose count is >= 1000, sorted by count in descending order.
      * Prints, for every lower bound in ``range_list``, the number of users
        whose count is strictly greater than the bound (``N``) and the sum
        of the distinct count values above the bound (``Fw``).
    """
    output_path = '/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_list.csv'
    key_prefix = 'be_retweet_'
    save_threshold = 1000
    scan_budget = 1000000  # upper bound on the requested SCAN volume
    range_list = [100000, 10000, 1000, 100, 0]

    dis_dict = {}   # count value -> number of users having exactly that count
    save_list = []  # (uid, count) pairs at or above save_threshold

    scan_cursor = 0
    scan_count = 0
    while scan_count < scan_budget:
        results = r.scan(scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for redis_key in results[1]:
            if not redis_key.startswith(key_prefix):
                continue
            uid = redis_key[len(key_prefix):]
            # HLEN yields the field count without transferring the hash.
            be_retweet_user_count = r.hlen(redis_key)
            dis_dict[be_retweet_user_count] = \
                dis_dict.get(be_retweet_user_count, 0) + 1
            if be_retweet_user_count >= save_threshold:
                save_list.append((uid, be_retweet_user_count))
        # Cursor 0 means the iteration is complete; continuing would start
        # a second pass over the keyspace and count every key again.
        if int(scan_cursor) == 0:
            break

    # Open the output only once the scan has succeeded, so a Redis failure
    # does not leave a truncated file behind.  Binary mode is what the
    # Python 2 csv module expects.
    with open(output_path, 'wb') as f:
        writer = csv.writer(f)
        for uid, count in sorted(save_list, key=lambda x: x[1], reverse=True):
            writer.writerow([uid, count])

    n_dict = {}
    fw_dict = {}
    for rf, dis_count in dis_dict.items():
        for range_up in range_list:
            if rf > range_up:
                # NOTE(review): Fw adds each distinct count value once and is
                # not weighted by dis_count -- confirm this is intended.
                fw_dict[range_up] = fw_dict.get(range_up, 0) + rf
                n_dict[range_up] = n_dict.get(range_up, 0) + dis_count

    print('Rf, N, Fw:')
    for range_up in range_list:
        print('%s %s %s' % (range_up,
                            n_dict.get(range_up, 0),
                            fw_dict.get(range_up, 0)))
def cal_class_ratio():
    """Count users per three-bit activity class for 2013-09-07 and print it.

    Iterates the ``activity_<ts>`` hash on ``r_cluster`` with HSCAN, where
    ``ts`` is ``datetime2ts('2013-09-07')``.  Each field is treated as a uid
    and each value as a JSON object whose values are summed into the user's
    weibo count.  Every user gets a three-character key
    ``indic_1 + indic_2 + indic_3``:

      * ``indic_1`` is '1' when the ``retweet_<uid>`` hash on ``r`` has at
        least 8 fields,
      * ``indic_2`` is '1' when the ``be_retweet_<uid>`` hash on ``r`` has at
        least 9 fields,
      * ``indic_3`` is '1' when the summed weibo count is at least 6.

    The resulting ``{key: number of users}`` mapping is printed; nothing is
    returned.
    """
    date = '2013-09-07'
    activity_key = 'activity_' + str(datetime2ts(date))
    scan_budget = 1000000  # upper bound on the requested HSCAN volume

    ratio_results = {}
    scan_cursor = 0
    scan_count = 0
    while scan_count < scan_budget:
        results = r_cluster.hscan(activity_key, scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        # HSCAN already returns field -> value pairs, so the value is used
        # directly instead of issuing one extra HGET per user.
        for uid, activity_dict_string in results[1].items():
            activity_dict = json.loads(activity_dict_string)
            weibo_count = sum(int(value) for value in activity_dict.values())

            # HLEN yields the field count without transferring the hash.
            retweet_count = r.hlen('retweet_' + str(uid))
            be_retweet_count = r.hlen('be_retweet_' + str(uid))

            indic_1 = '1' if retweet_count >= 8 else '0'
            indic_2 = '1' if be_retweet_count >= 9 else '0'
            indic_3 = '1' if weibo_count >= 6 else '0'
            key = indic_1 + indic_2 + indic_3
            ratio_results[key] = ratio_results.get(key, 0) + 1
            # NOTE: a per-class CSV export (writer1..writer6, keyed on
            # '001', '111', '101', '011', '110', '010') used to be sketched
            # here but was disabled; it has been dropped as dead code.
        # Cursor 0 means the hash has been fully iterated; continuing would
        # walk it again and count every user a second time.
        if int(scan_cursor) == 0:
            break

    print('ratio_results: %s' % (ratio_results,))
def cal_class_ratio():
    """Count users per three-bit activity class for 2013-09-07 and print it.

    Iterates the ``activity_<ts>`` hash on ``r_cluster`` with HSCAN, where
    ``ts`` is ``datetime2ts('2013-09-07')``.  Each field is treated as a uid
    and each value as a JSON object whose values are summed into the user's
    weibo count.  Every user gets a three-character key
    ``indic_1 + indic_2 + indic_3``:

      * ``indic_1`` is '1' when the ``retweet_<uid>`` hash on ``r`` has at
        least 8 fields,
      * ``indic_2`` is '1' when the ``be_retweet_<uid>`` hash on ``r`` has at
        least 9 fields,
      * ``indic_3`` is '1' when the summed weibo count is at least 6.

    The resulting ``{key: number of users}`` mapping is printed; nothing is
    returned.
    """
    date = '2013-09-07'
    activity_key = 'activity_' + str(datetime2ts(date))
    scan_budget = 1000000  # upper bound on the requested HSCAN volume

    ratio_results = {}
    scan_cursor = 0
    scan_count = 0
    while scan_count < scan_budget:
        results = r_cluster.hscan(activity_key, scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        # HSCAN already returns field -> value pairs, so the value is used
        # directly instead of issuing one extra HGET per user.
        for uid, activity_dict_string in results[1].items():
            activity_dict = json.loads(activity_dict_string)
            weibo_count = sum(int(value) for value in activity_dict.values())

            # HLEN yields the field count without transferring the hash.
            retweet_count = r.hlen('retweet_' + str(uid))
            be_retweet_count = r.hlen('be_retweet_' + str(uid))

            indic_1 = '1' if retweet_count >= 8 else '0'
            indic_2 = '1' if be_retweet_count >= 9 else '0'
            indic_3 = '1' if weibo_count >= 6 else '0'
            key = indic_1 + indic_2 + indic_3
            ratio_results[key] = ratio_results.get(key, 0) + 1
            # NOTE: a per-class CSV export (writer1..writer6, keyed on
            # '001', '111', '101', '011', '110', '010') used to be sketched
            # here but was disabled; it has been dropped as dead code.
        # Cursor 0 means the hash has been fully iterated; continuing would
        # walk it again and count every user a second time.
        if int(scan_cursor) == 0:
            break

    print('ratio_results: %s' % (ratio_results,))
def main():
    """Export heavily-retweeted users to CSV and print a retweet histogram.

    Walks the Redis keyspace behind ``r`` with SCAN, and for every key named
    ``be_retweet_<uid>`` takes the number of fields in that hash as the
    user's count (presumably one field per retweeting user -- confirm
    against the writer of these hashes).

    Side effects:
      * Rewrites ``core_list.csv`` with one ``uid, count`` row for each user
        whose count is >= 1000, sorted by count in descending order.
      * Prints, for every lower bound in ``range_list``, the number of users
        whose count is strictly greater than the bound (``N``) and the sum
        of the distinct count values above the bound (``Fw``).
    """
    output_path = '/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_list.csv'
    key_prefix = 'be_retweet_'
    save_threshold = 1000
    scan_budget = 1000000  # upper bound on the requested SCAN volume
    range_list = [100000, 10000, 1000, 100, 0]

    dis_dict = {}   # count value -> number of users having exactly that count
    save_list = []  # (uid, count) pairs at or above save_threshold

    scan_cursor = 0
    scan_count = 0
    while scan_count < scan_budget:
        results = r.scan(scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for redis_key in results[1]:
            if not redis_key.startswith(key_prefix):
                continue
            uid = redis_key[len(key_prefix):]
            # HLEN yields the field count without transferring the hash.
            be_retweet_user_count = r.hlen(redis_key)
            dis_dict[be_retweet_user_count] = \
                dis_dict.get(be_retweet_user_count, 0) + 1
            if be_retweet_user_count >= save_threshold:
                save_list.append((uid, be_retweet_user_count))
        # Cursor 0 means the iteration is complete; continuing would start
        # a second pass over the keyspace and count every key again.
        if int(scan_cursor) == 0:
            break

    # Open the output only once the scan has succeeded, so a Redis failure
    # does not leave a truncated file behind.  Binary mode is what the
    # Python 2 csv module expects.
    with open(output_path, 'wb') as f:
        writer = csv.writer(f)
        for uid, count in sorted(save_list, key=lambda x: x[1], reverse=True):
            writer.writerow([uid, count])

    n_dict = {}
    fw_dict = {}
    for rf, dis_count in dis_dict.items():
        for range_up in range_list:
            if rf > range_up:
                # NOTE(review): Fw adds each distinct count value once and is
                # not weighted by dis_count -- confirm this is intended.
                fw_dict[range_up] = fw_dict.get(range_up, 0) + rf
                n_dict[range_up] = n_dict.get(range_up, 0) + dis_count

    print('Rf, N, Fw:')
    for range_up in range_list:
        print('%s %s %s' % (range_up,
                            n_dict.get(range_up, 0),
                            fw_dict.get(range_up, 0)))
def cal_core_class():
    """Classify every uid listed in ``core_list.csv`` and write ``core_class.csv``.

    For each row of the input CSV the first column is taken as the uid and
    the following are computed:

      * ``retweet_count``: number of fields in the ``retweet_<uid>`` hash
        on ``r``;
      * ``be_retweet_count``: number of fields in the ``be_retweet_<uid>``
        hash on ``r``;
      * ``ave_weibo_count``: the sum of all values found in the user's
        ``activity_<ts>`` JSON entries on ``r_cluster`` for the seven days
        ending at ``datetime2ts('2013-09-07')``, divided by 7.

    The class key is ``indic_1 + indic_2 + indic_3`` with thresholds 8, 9
    and 6 respectively.  Rows ``[uid, key, retweet_count, be_retweet_count,
    ave_weibo_count]`` are written sorted by ``be_retweet_count`` in
    descending order, and the number of users in class '011' is printed.
    """
    input_path = '/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_list.csv'
    output_path = '/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_class.csv'
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    day_seconds = 24 * 3600
    window_days = 7

    result_list = []
    count011 = 0
    # Binary mode is what the Python 2 csv module expects.
    with open(input_path, 'rb') as f_r:
        for line in csv.reader(f_r):
            if not line:
                # csv.reader yields [] for blank lines; there is no uid.
                continue
            uid = line[0]
            # HLEN yields the field count without transferring the hash.
            retweet_count = r.hlen('retweet_' + str(uid))
            be_retweet_count = r.hlen('be_retweet_' + str(uid))

            weibo_count = 0
            for i in range(window_days):
                ts = timestamp - day_seconds * i
                activity_string = r_cluster.hget('activity_' + str(ts), str(uid))
                if not activity_string:
                    continue  # no activity recorded for this user that day
                activity_dict = json.loads(activity_string)
                # int() keeps this in line with cal_class_ratio, which
                # coerces the same per-segment values before summing.
                weibo_count += sum(int(value)
                                   for value in activity_dict.values())
            ave_weibo_count = float(weibo_count) / window_days

            indic_1 = '1' if retweet_count >= 8 else '0'
            indic_2 = '1' if be_retweet_count >= 9 else '0'
            indic_3 = '1' if ave_weibo_count >= 6 else '0'
            key = indic_1 + indic_2 + indic_3
            if key == '011':
                count011 += 1
            result_list.append(
                [uid, key, retweet_count, be_retweet_count, ave_weibo_count])

    # The output is opened only after all rows were computed, so a failure
    # above does not leave a truncated core_class.csv behind.
    with open(output_path, 'wb') as f_w:
        writer = csv.writer(f_w)
        for item in sorted(result_list, key=lambda x: x[3], reverse=True):
            writer.writerow(item)

    print('count011: %s' % count011)
def cal_core_class():
    """Classify every uid listed in ``core_list.csv`` and write ``core_class.csv``.

    For each row of the input CSV the first column is taken as the uid and
    the following are computed:

      * ``retweet_count``: number of fields in the ``retweet_<uid>`` hash
        on ``r``;
      * ``be_retweet_count``: number of fields in the ``be_retweet_<uid>``
        hash on ``r``;
      * ``ave_weibo_count``: the sum of all values found in the user's
        ``activity_<ts>`` JSON entries on ``r_cluster`` for the seven days
        ending at ``datetime2ts('2013-09-07')``, divided by 7.

    The class key is ``indic_1 + indic_2 + indic_3`` with thresholds 8, 9
    and 6 respectively.  Rows ``[uid, key, retweet_count, be_retweet_count,
    ave_weibo_count]`` are written sorted by ``be_retweet_count`` in
    descending order, and the number of users in class '011' is printed.
    """
    input_path = '/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_list.csv'
    output_path = '/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_class.csv'
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    day_seconds = 24 * 3600
    window_days = 7

    result_list = []
    count011 = 0
    # Binary mode is what the Python 2 csv module expects.
    with open(input_path, 'rb') as f_r:
        for line in csv.reader(f_r):
            if not line:
                # csv.reader yields [] for blank lines; there is no uid.
                continue
            uid = line[0]
            # HLEN yields the field count without transferring the hash.
            retweet_count = r.hlen('retweet_' + str(uid))
            be_retweet_count = r.hlen('be_retweet_' + str(uid))

            weibo_count = 0
            for i in range(window_days):
                ts = timestamp - day_seconds * i
                activity_string = r_cluster.hget('activity_' + str(ts), str(uid))
                if not activity_string:
                    continue  # no activity recorded for this user that day
                activity_dict = json.loads(activity_string)
                # int() keeps this in line with cal_class_ratio, which
                # coerces the same per-segment values before summing.
                weibo_count += sum(int(value)
                                   for value in activity_dict.values())
            ave_weibo_count = float(weibo_count) / window_days

            indic_1 = '1' if retweet_count >= 8 else '0'
            indic_2 = '1' if be_retweet_count >= 9 else '0'
            indic_3 = '1' if ave_weibo_count >= 6 else '0'
            key = indic_1 + indic_2 + indic_3
            if key == '011':
                count011 += 1
            result_list.append(
                [uid, key, retweet_count, be_retweet_count, ave_weibo_count])

    # The output is opened only after all rows were computed, so a failure
    # above does not leave a truncated core_class.csv behind.
    with open(output_path, 'wb') as f_w:
        writer = csv.writer(f_w)
        for item in sorted(result_list, key=lambda x: x[3], reverse=True):
            writer.writerow(item)

    print('count011: %s' % count011)
def cal_ave_fans():
    """Print the summed ``be_retweet_*`` hash sizes divided by the scan volume.

    Walks the Redis keyspace behind ``r`` with SCAN, adds up the number of
    fields of every ``be_retweet_<uid>`` hash, and prints that total divided
    by ``scan_count`` as ``ave_count``.  Nothing is returned.
    """
    # Original author's note: "test there should use r_dict".
    key_prefix = 'be_retweet_'
    scan_budget = 1000000  # upper bound on the requested SCAN volume

    all_count = 0
    scan_cursor = 0
    scan_count = 0
    while scan_count < scan_budget:
        results = r.scan(scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for redis_key in results[1]:
            if redis_key.startswith(key_prefix):
                # HLEN yields the field count without transferring the hash.
                all_count += r.hlen(redis_key)
        # Cursor 0 means the iteration is complete; continuing would start
        # a second pass over the keyspace and add every hash again.
        if int(scan_cursor) == 0:
            break

    # NOTE(review): the denominator is the requested scan volume (1000 per
    # SCAN call), not the number of be_retweet_* keys actually seen -- confirm
    # that this is the intended average.  scan_count is at least 1000 here
    # because the loop body always runs once.
    ave_count = float(all_count) / scan_count
    print('ave_count: %s' % ave_count)