def get_extend(all_set):
    """Expand a user set with the uids found in their retweet/comment records.

    A random sample of at most ``RECOMMEND_IN_AUTO_RANDOM_SIZE`` uids is drawn
    from ``all_set``.  The sample's ``uid_retweet`` and ``uid_comment``
    documents are fetched with ``mget``, decoded from JSON, merged with
    ``union_dict`` and the merged keys are handed to ``filter_out``.

    NOTE(review): this file defines ``get_extend`` a second time further
    down; the later definition replaces this one when the module is loaded.

    :param all_set: iterable of uids to sample from.
    :return: the value returned by ``filter_out`` for the merged uid keys.
    """
    # step0: sample the users to look up
    all_user_list = list(all_set)
    if RECOMMEND_IN_AUTO_RANDOM_SIZE > len(all_user_list):
        sample_uids = all_user_list
    else:
        sample_uids = random.sample(all_user_list,
                                    RECOMMEND_IN_AUTO_RANDOM_SIZE)
    db_number = get_db_num()

    # step1: fetch retweet documents (best effort: a failed lookup is
    # treated as "no data" rather than aborting the recommendation)
    retweet_index_name = retweet_index_name_pre + str(db_number)
    try:
        retweet_docs = es_retweet.mget(index=retweet_index_name,
                                       doc_type=retweet_index_type,
                                       body={'ids': sample_uids})['docs']
    except Exception:
        retweet_docs = []

    # step2: fetch comment documents.  The keyword used to be misspelled
    # ``indexd``, so the index name was never passed to mget and the bare
    # ``except`` hid the failure.
    comment_index_name = comment_index_name_pre + str(db_number)
    try:
        comment_docs = es_comment.mget(index=comment_index_name,
                                       doc_type=comment_index_type,
                                       body={'ids': sample_uids})['docs']
    except Exception:
        comment_docs = []

    # step1.2 / step2.2: decode the JSON payload of every found document;
    # missing or malformed documents are skipped.
    retweet_comment_dict_list = []
    for docs, field in ((retweet_docs, 'uid_retweet'),
                        (comment_docs, 'uid_comment')):
        for doc in docs:
            try:
                if doc['found']:
                    retweet_comment_dict_list.append(
                        json.loads(doc['_source'][field]))
            except (KeyError, TypeError, ValueError):
                continue

    # step3: merge all per-user dicts
    union_retweet_comment = union_dict(retweet_comment_dict_list)
    # step4: filter the merged uids (list() keeps Python 2 list semantics
    # when running on Python 3, where keys() is a view)
    return filter_out(list(union_retweet_comment.keys()))
def get_extend(all_set):
    """Expand a user set with the uids found in their retweet/comment records.

    At most ``RECOMMEND_IN_AUTO_RANDOM_SIZE`` uids are sampled at random from
    ``all_set``; their ``uid_retweet`` and ``uid_comment`` documents are
    fetched with ``mget``, JSON-decoded, merged through ``union_dict`` and
    the merged keys are passed to ``filter_out``.

    NOTE(review): an earlier definition of ``get_extend`` exists above in
    this file; this later one is the definition that is in effect once the
    module has been loaded.

    :param all_set: iterable of uids to sample from.
    :return: the value returned by ``filter_out`` for the merged uid keys.
    """
    # step0: sample the users to look up
    candidates = list(all_set)
    if RECOMMEND_IN_AUTO_RANDOM_SIZE > len(candidates):
        sampled = candidates
    else:
        sampled = random.sample(candidates, RECOMMEND_IN_AUTO_RANDOM_SIZE)
    db_number = get_db_num()

    # step1: retweet documents; lookup failures degrade to "no data"
    retweet_index_name = retweet_index_name_pre + str(db_number)
    try:
        retweet_result = es_retweet.mget(index=retweet_index_name,
                                         doc_type=retweet_index_type,
                                         body={'ids': sampled})['docs']
    except Exception:
        retweet_result = []

    # step2: comment documents.  ``index=`` was previously misspelled as
    # ``indexd=``, so the index name never reached mget and the bare
    # ``except`` swallowed the resulting error.
    comment_index_name = comment_index_name_pre + str(db_number)
    try:
        comment_result = es_comment.mget(index=comment_index_name,
                                         doc_type=comment_index_type,
                                         body={'ids': sampled})['docs']
    except Exception:
        comment_result = []

    # step1.2 / step2.2: collect the decoded payload of each found document
    decoded = []
    sources = (
        (retweet_result, 'uid_retweet'),
        (comment_result, 'uid_comment'),
    )
    for docs, field in sources:
        for doc in docs:
            try:
                if doc['found']:
                    decoded.append(json.loads(doc['_source'][field]))
            except (KeyError, TypeError, ValueError):
                # document without a usable payload: ignore it
                continue

    # step3: merge, step4: filter
    merged = union_dict(decoded)
    # list() keeps the Python 2 "list of keys" contract under Python 3
    return filter_out(list(merged.keys()))
def _community_parse_docs(docs, field):
    """Return ``{doc _id: decoded JSON of _source[field]}`` for found docs."""
    parsed = {}
    for doc in docs:
        try:
            if doc['found']:
                parsed[doc['_id']] = json.loads(doc['_source'][field])
        except (KeyError, TypeError, ValueError):
            # not found / malformed document: skip it
            continue
    return parsed


def _community_records(iter_uid, counts, uid2uname):
    """Build ``[uid, uname, ruid, runame, count]`` rows, skipping self-links."""
    # fall back to the uid itself when no user name is known
    iter_name = uid2uname.get(iter_uid, iter_uid)
    return [
        [iter_uid, iter_name, ruid, uid2uname.get(ruid, ruid), count]
        for ruid, count in counts.items()
        if ruid != iter_uid
    ]


def get_community_coreuser_socail(uid_list, timestamp):
    """Compute core users of a group and their strongest interactions.

    For every uid in ``uid_list`` the retweet, comment, be_retweet and
    be_comment documents of the database selected by ``get_db_num(timestamp)``
    are loaded in batches of ``GROUP_ITER_COUNT``.  ``filter_union_dict``
    splits and merges the per-user counts into an "in" and an "out" dict,
    which are turned into rows ``[uid, uname, ruid, runame, count]``.

    :param uid_list: sliceable sequence of the group's uids.
    :param timestamp: value passed to ``get_db_num`` to pick the database.
    :return: tuple ``(core_uidlist, outer_uidlist, core_user_socail,
        core_outer_socail)`` where

        * ``core_user_socail``  - "in" rows with count > 2, sorted by count
          descending;
        * ``core_uidlist``      - distinct first-column uids of those rows;
        * ``core_outer_socail`` - at most 30 "out" rows with count > 10 that
          touch a core uid, sorted by count descending;
        * ``outer_uidlist``     - first column of ``core_outer_socail``.
    """
    min_core_count = 2      # "in" rows must exceed this count
    min_outer_count = 10    # "out" rows must exceed this count
    max_outer_rows = 30     # cap on returned "out" rows

    uid2uname = get_user_name(uid_list)

    # step1: get db number and the index names derived from it
    db_suffix = str(get_db_num(timestamp))
    retweet_index_name = retweet_index_name_pre + db_suffix
    be_retweet_index_name = be_retweet_index_name_pre + db_suffix
    comment_index_name = comment_index_name_pre + db_suffix
    be_comment_index_name = be_comment_index_name_pre + db_suffix

    all_in_record = []
    all_out_record = []

    # step2: split uid list into batches for mget
    for start in range(0, len(uid_list), GROUP_ITER_COUNT):
        iter_uid_list = uid_list[start:start + GROUP_ITER_COUNT]
        body = {'ids': iter_uid_list}

        # step3-6: every lookup is best effort; a failing request is treated
        # as an empty result so the remaining data can still be used.
        try:
            retweet_result = es_retweet.mget(
                index=retweet_index_name, doc_type=retweet_index_type,
                body=body)['docs']
        except Exception:
            retweet_result = []
        try:
            comment_result = es_comment.mget(
                index=comment_index_name, doc_type=comment_index_type,
                body=body)['docs']
        except Exception:
            comment_result = []
        try:
            # NOTE(review): other functions in this file use the name
            # ``be_retweet_type`` for this doc type; if only one of the two
            # names exists, the lookup using the other fails silently here.
            be_retweet_result = es_retweet.mget(
                index=be_retweet_index_name, doc_type=be_retweet_index_type,
                body=body)['docs']
        except Exception:
            be_retweet_result = []
        try:
            be_comment_result = es_comment.mget(
                index=be_comment_index_name, doc_type=be_comment_index_type,
                body=body)['docs']
        except Exception:
            be_comment_result = []

        # {uid1: {ruid1: count1, ruid2: count2}, uid2: {...}, ...}
        retweet_dict = _community_parse_docs(retweet_result, 'uid_retweet')
        comment_dict = _community_parse_docs(comment_result, 'uid_comment')
        be_retweet_dict = _community_parse_docs(be_retweet_result,
                                                'uid_be_retweet')
        be_comment_dict = _community_parse_docs(be_comment_result,
                                                'uid_be_comment')

        for iter_uid in iter_uid_list:
            # step7: union retweet & comment
            filter_in_dict, filter_out_dict = filter_union_dict(
                [retweet_dict.get(iter_uid, {}),
                 comment_dict.get(iter_uid, {})],
                uid_list, 'in&out')

            # step8: record the retweet/comment relation inside the group
            all_in_record.extend(
                _community_records(iter_uid, filter_in_dict, uid2uname))

            # step9: add be_retweet/be_comment to the out-of-group relation
            filter_out_dict = filter_union_dict(
                [filter_out_dict,
                 be_retweet_dict.get(iter_uid, {}),
                 be_comment_dict.get(iter_uid, {})],
                uid_list, 'out')

            # step10: record the relation with users outside the group
            all_out_record.extend(
                _community_records(iter_uid, filter_out_dict, uid2uname))

    # step11: sort interactions by count, strongest first
    sort_in_record = sorted(all_in_record, key=lambda row: row[4],
                            reverse=True)
    sort_out_record = sorted(all_out_record, key=lambda row: row[4],
                             reverse=True)

    core_user_socail = [row for row in sort_in_record
                        if row[4] > min_core_count]
    core_uidlist = list(set(row[0] for row in core_user_socail))

    # Built once instead of once per row.
    core_uid_set = set(core_uidlist)
    # ``uid.split()`` is kept from the original membership test: it yields
    # [uid] for a whitespace-free string and nothing for an empty one.
    # Filtering preserves the descending order of sort_out_record, so no
    # second sort is needed before slicing.
    core_outer_socail = [
        row for row in sort_out_record
        if (core_uid_set.intersection(row[2].split())
            or core_uid_set.intersection(row[0].split()))
        and row[4] > min_outer_count
    ][:max_outer_rows]

    # NOTE(review): column 0 holds the uid taken from ``uid_list`` and
    # column 2 the uid it interacted with; confirm that column 0 is really
    # the intended source for the "outer" uid list.
    outer_uidlist = [row[0] for row in core_outer_socail]

    return core_uidlist, outer_uidlist, core_user_socail, core_outer_socail
def _seed_doc_counts(docs, position, field):
    """Decode ``docs[position]['_source'][field]``; ``{}`` if unavailable."""
    try:
        return json.loads(docs[position]['_source'][field])
    except (IndexError, KeyError, TypeError, ValueError):
        # failed lookup, document not found, or malformed payload
        return {}


def detect_by_seed_users(seed_users):
    """Collect the uids that interact with the given seed users.

    The retweet, be_retweet, comment and be_comment documents of every seed
    user are fetched from the database selected by
    ``get_db_num(time.time())`` and merged per seed with ``union_dict``.

    :param seed_users: sequence of seed uids used as mget ids.
    :return: list of distinct uids appearing in the merged per-seed results,
        in first-seen order.
    """
    # Both relation types are always searched.  These used to be written as
    # comparisons (``==``), which never bound the names.
    retweet_mark = 1
    comment_mark = 1

    group_uid_list = []
    all_union_result_dict = {}

    # Start from empty results so a disabled or failed lookup contributes
    # nothing instead of leaving a name unbound.
    retweet_result = []
    be_retweet_result = []
    comment_result = []
    be_comment_result = []

    # get retweet/comment es db_number
    db_number = get_db_num(time.time())

    # step1: mget retweet and be_retweet (best effort)
    if retweet_mark == 1:
        retweet_index_name = retweet_index_name_pre + str(db_number)
        be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
        try:
            retweet_result = es_retweet.mget(
                index=retweet_index_name, doc_type=retweet_index_type,
                body={'ids': seed_users}, _source=True)['docs']
        except Exception:
            retweet_result = []
        try:
            # NOTE(review): ``get_community_coreuser_socail`` in this file
            # uses ``be_retweet_index_type`` for this doc type; confirm which
            # of the two names is actually defined.
            be_retweet_result = es_retweet.mget(
                index=be_retweet_index_name, doc_type=be_retweet_type,
                body={'ids': seed_users}, _source=True)['docs']
        except Exception:
            be_retweet_result = []

    # step2: mget comment and be_comment (best effort)
    if comment_mark == 1:
        comment_index_name = comment_index_name_pre + str(db_number)
        be_comment_index_name = be_comment_index_name_pre + str(db_number)
        try:
            comment_result = es_comment.mget(
                index=comment_index_name, doc_type=comment_index_type,
                body={'ids': seed_users}, _source=True)['docs']
        except Exception:
            comment_result = []
        try:
            be_comment_result = es_comment.mget(
                index=be_comment_index_name, doc_type=be_comment_index_type,
                body={'ids': seed_users}, _source=True)['docs']
        except Exception:
            be_comment_result = []

    # step3: union retweet/be_retweet/comment/be_comment result.
    # The position counter was never advanced before, so every seed user
    # read the first document.  Assumes the mget response lists docs in the
    # same order as the requested ids.
    for position, iter_search_uid in enumerate(seed_users):
        union_result = union_dict(
            _seed_doc_counts(retweet_result, position, 'uid_retweet'),
            _seed_doc_counts(be_retweet_result, position, 'uid_be_retweet'),
            _seed_doc_counts(comment_result, position, 'uid_comment'),
            _seed_doc_counts(be_comment_result, position, 'uid_be_comment'))
        all_union_result_dict[iter_search_uid] = union_result

    # step4: extract all uids from all_union_result_dict (this step was only
    # a TODO note before, so the function always returned an empty list).
    # NOTE(review): presumably union_dict returns a dict keyed by uid; a seed
    # uid is only included when it shows up in some merged result - confirm
    # whether seeds should always be part of the group.
    seen = set()
    for iter_search_uid in seed_users:
        for uid in all_union_result_dict.get(iter_search_uid, {}):
            if uid not in seen:
                seen.add(uid)
                group_uid_list.append(uid)

    return group_uid_list
def _hop_doc_counts(docs, position, field):
    """Decode ``docs[position]['_source'][field]``; ``{}`` if unavailable."""
    try:
        return json.loads(docs[position]['_source'][field])
    except (IndexError, KeyError, TypeError, ValueError):
        # failed lookup, document not found, or malformed payload
        return {}


def get_structure_user(seed_uid_list, structure_dict, filter_dict):
    """Find portrait users reachable from the seeds via retweets/comments.

    Starting from ``seed_uid_list`` the retweet/be_retweet and
    comment/be_comment documents are followed for ``structure_dict['hop']``
    rounds; each round searches the uids discovered by the previous one.
    The merged uids are sorted by their ``union_dict`` value (descending),
    looked up in the user-portrait index, and kept when importance and
    influence fall inside the ranges given by ``filter_dict``.

    :param seed_uid_list: sliceable sequence of seed uids.
    :param structure_dict: mapping with ``'retweet'`` and ``'comment'``
        (a lookup is done when the value equals 1 after ``int()``) and
        ``'hop'`` (number of rounds).
    :param filter_dict: mapping with ``'importance'`` and ``'influence'``
        (each a mapping with ``'gte'`` and ``'lt'``) and ``'count'``.
    :return: list of portrait uids; collection stops after the batch in
        which the list grows beyond ``count * DETECT_COUNT_EXPAND``.
    """
    retweet_mark = int(structure_dict['retweet'])
    comment_mark = int(structure_dict['comment'])
    hop = int(structure_dict['hop'])

    # get retweet/comment es db_number
    db_suffix = str(get_db_num(time.time()))

    # iterate hop by hop over the retweet/be_retweet/comment/be_comment users
    iter_hop_user_list = seed_uid_list
    all_union_result = {}
    for _hop_index in range(hop):
        hop_union_result = {}
        for start in range(0, len(iter_hop_user_list), DETECT_ITER_COUNT):
            iter_search_user_list = iter_hop_user_list[
                start:start + DETECT_ITER_COUNT]
            body = {'ids': iter_search_user_list}

            # Reset per batch so a disabled or failed lookup contributes
            # nothing instead of leaving a name unbound.
            retweet_result = []
            be_retweet_result = []
            comment_result = []
            be_comment_result = []

            # step1: mget retweet and be_retweet (best effort)
            if retweet_mark == 1:
                retweet_index_name = retweet_index_name_pre + db_suffix
                be_retweet_index_name = be_retweet_index_name_pre + db_suffix
                try:
                    retweet_result = es_retweet.mget(
                        index=retweet_index_name,
                        doc_type=retweet_index_type,
                        body=body, _source=True)['docs']
                except Exception:
                    retweet_result = []
                try:
                    # NOTE(review): ``get_community_coreuser_socail`` in this
                    # file uses ``be_retweet_index_type`` for this doc type;
                    # confirm which of the two names is actually defined.
                    be_retweet_result = es_retweet.mget(
                        index=be_retweet_index_name,
                        doc_type=be_retweet_type,
                        body=body, _source=True)['docs']
                except Exception:
                    be_retweet_result = []

            # step2: mget comment and be_comment (best effort)
            if comment_mark == 1:
                comment_index_name = comment_index_name_pre + db_suffix
                be_comment_index_name = be_comment_index_name_pre + db_suffix
                try:
                    comment_result = es_comment.mget(
                        index=comment_index_name,
                        doc_type=comment_index_type,
                        body=body, _source=True)['docs']
                except Exception:
                    comment_result = []
                try:
                    be_comment_result = es_comment.mget(
                        index=be_comment_index_name,
                        doc_type=be_comment_index_type,
                        body=body, _source=True)['docs']
                except Exception:
                    be_comment_result = []

            # step3: union the four relation types for every searched uid.
            # The position counter was never advanced before, so every uid of
            # the batch read the first document.  Assumes the mget response
            # lists docs in the same order as the requested ids.
            for position in range(len(iter_search_user_list)):
                union_result = union_dict(
                    _hop_doc_counts(retweet_result, position, 'uid_retweet'),
                    _hop_doc_counts(be_retweet_result, position,
                                    'uid_be_retweet'),
                    _hop_doc_counts(comment_result, position, 'uid_comment'),
                    _hop_doc_counts(be_comment_result, position,
                                    'uid_be_comment'))
                hop_union_result = union_dict(hop_union_result, union_result)

        # drop the users that were searched in this hop
        for iter_hop_user_item in iter_hop_user_list:
            hop_union_result.pop(iter_hop_user_item, None)
        # The next hop slices this value, so it must be a real list
        # (``keys()`` is a non-sliceable view on Python 3).
        iter_hop_user_list = list(hop_union_result.keys())
        # accumulate over all hops
        all_union_result = union_dict(all_union_result, hop_union_result)

    # step5: identify who is in user_portrait, strongest relation first
    sort_all_union_result = sorted(all_union_result.items(),
                                   key=lambda pair: pair[1], reverse=True)
    ranked_uids = [pair[0] for pair in sort_all_union_result]

    filter_importance_from = filter_dict['importance']['gte']
    filter_importance_to = filter_dict['importance']['lt']
    filter_influence_from = filter_dict['influence']['gte']
    filter_influence_to = filter_dict['influence']['lt']
    max_result_count = filter_dict['count'] * DETECT_COUNT_EXPAND

    in_portrait_result = []
    for start in range(0, len(ranked_uids), DETECT_ITER_COUNT):
        iter_user_list = ranked_uids[start:start + DETECT_ITER_COUNT]
        try:
            portrait_result = es_user_portrait.mget(
                index=portrait_index_name, doc_type=portrait_index_type,
                body={'ids': iter_user_list}, _source=True)['docs']
        except Exception:
            portrait_result = []

        for portrait_item in portrait_result:
            if not portrait_item.get('found'):
                continue
            source = portrait_item['_source']
            # NOTE(review): the upper bounds come from keys named 'lt' but
            # are tested inclusively (<=), as in the original code.
            if (filter_importance_from <= source['importance']
                    <= filter_importance_to
                    and filter_influence_from <= source['influence']
                    <= filter_influence_to):
                in_portrait_result.append(portrait_item['_id'])

        # enough candidates collected: stop querying further batches
        if len(in_portrait_result) > max_result_count:
            break

    return in_portrait_result