def get_extend(all_set):
    """Expand a user set with the users they retweeted or commented on.

    A random sample of at most ``RECOMMEND_IN_AUTO_RANDOM_SIZE`` uids is taken
    from ``all_set``. The retweet and comment documents of the sampled uids are
    fetched with ``mget``, the JSON-encoded ``uid_retweet`` / ``uid_comment``
    fields are decoded, merged with ``union_dict`` and the merged keys are
    handed to ``filter_out``.

    Args:
        all_set: iterable of uids; they are used as document ids in the
            retweet and comment indices.

    Returns:
        The value returned by ``filter_out`` for the merged uid keys.
    """
    retweet_comment_dict_list = []
    # step0: work on a bounded random sample of the input users
    all_user_list = list(all_set)
    if RECOMMEND_IN_AUTO_RANDOM_SIZE > len(all_user_list):
        sampled_uids = all_user_list
    else:
        sampled_uids = random.sample(all_user_list,
                                     RECOMMEND_IN_AUTO_RANDOM_SIZE)
    db_number = get_db_num()
    # step1/step2: fetch the retweet docs, then the comment docs, and decode
    # the per-user relation dict stored in each found document
    sources = (
        (es_retweet, retweet_index_name_pre, retweet_index_type, 'uid_retweet'),
        (es_comment, comment_index_name_pre, comment_index_type, 'uid_comment'),
    )
    for es_client, index_name_pre, index_type, field in sources:
        index_name = index_name_pre + str(db_number)
        try:
            # NOTE: the comment lookup used to pass ``indexd=``; that keyword
            # error was swallowed, so comment relations were never included.
            docs = es_client.mget(index=index_name, doc_type=index_type,
                                  body={'ids': sampled_uids})['docs']
        except Exception:
            # best-effort: a failed lookup contributes no relations
            docs = []
        for doc in docs:
            try:
                if doc['found']:
                    retweet_comment_dict_list.append(
                        json.loads(doc['_source'][field]))
            except (KeyError, TypeError, ValueError):
                # error entries or malformed payloads are skipped
                pass
    # step3: union dict list
    union_retweet_comment = union_dict(retweet_comment_dict_list)
    # step4: filter in user portrait
    return filter_out(union_retweet_comment.keys())
def get_extend(all_set):
    """Expand a user set with the users they retweeted or commented on.

    A random sample of at most ``RECOMMEND_IN_AUTO_RANDOM_SIZE`` uids is taken
    from ``all_set``. The retweet and comment documents of the sampled uids are
    fetched with ``mget``, the JSON-encoded ``uid_retweet`` / ``uid_comment``
    fields are decoded, merged with ``union_dict`` and the merged keys are
    handed to ``filter_out``.

    Args:
        all_set: iterable of uids; they are used as document ids in the
            retweet and comment indices.

    Returns:
        The value returned by ``filter_out`` for the merged uid keys.
    """
    retweet_comment_dict_list = []
    # step0: work on a bounded random sample of the input users
    all_user_list = list(all_set)
    if RECOMMEND_IN_AUTO_RANDOM_SIZE > len(all_user_list):
        sampled_uids = all_user_list
    else:
        sampled_uids = random.sample(all_user_list,
                                     RECOMMEND_IN_AUTO_RANDOM_SIZE)
    db_number = get_db_num()
    # step1/step2: fetch the retweet docs, then the comment docs, and decode
    # the per-user relation dict stored in each found document
    sources = (
        (es_retweet, retweet_index_name_pre, retweet_index_type, 'uid_retweet'),
        (es_comment, comment_index_name_pre, comment_index_type, 'uid_comment'),
    )
    for es_client, index_name_pre, index_type, field in sources:
        index_name = index_name_pre + str(db_number)
        try:
            # NOTE: the comment lookup used to pass ``indexd=``; that keyword
            # error was swallowed, so comment relations were never included.
            docs = es_client.mget(index=index_name, doc_type=index_type,
                                  body={'ids': sampled_uids})['docs']
        except Exception:
            # best-effort: a failed lookup contributes no relations
            docs = []
        for doc in docs:
            try:
                if doc['found']:
                    retweet_comment_dict_list.append(
                        json.loads(doc['_source'][field]))
            except (KeyError, TypeError, ValueError):
                # error entries or malformed payloads are skipped
                pass
    # step3: union dict list
    union_retweet_comment = union_dict(retweet_comment_dict_list)
    # step4: filter in user portrait
    return filter_out(union_retweet_comment.keys())
Example #3
0
def _mget_json_field(es_client, index_name, doc_type, ids, field):
    """Return {doc ``_id``: decoded JSON of ``_source[field]``} for found docs.

    The lookup is best-effort: a failed mget yields an empty dict, and docs
    that are not found or whose payload cannot be decoded are skipped.
    """
    try:
        docs = es_client.mget(index=index_name, doc_type=doc_type,
                              body={'ids': ids})['docs']
    except Exception:
        return {}
    decoded = {}
    for doc in docs:
        doc_id = doc['_id']
        try:
            if doc['found']:
                decoded[doc_id] = json.loads(doc['_source'][field])
        except (KeyError, TypeError, ValueError):
            # error entries / malformed payloads carry no usable relations
            pass
    return decoded


def _interaction_records(uid, counts, uid2uname):
    """Build ``[uid, uid_name, ruid, ruid_name, count]`` rows for ``uid``.

    Self-relations (``ruid == uid``) are dropped; a name falls back to the uid
    itself when ``uid2uname`` has no entry for it.
    """
    uid_name = uid2uname.get(uid, uid)
    return [[uid, uid_name, ruid, uid2uname.get(ruid, ruid), counts[ruid]]
            for ruid in counts if uid != ruid]


def get_community_coreuser_socail(uid_list, timestamp):
    """Collect the strongest retweet/comment relations of a user group.

    ``uid_list`` is processed in batches of ``GROUP_ITER_COUNT``. For every
    batch the retweet, comment, be_retweet and be_comment documents are
    fetched; for every uid the decoded relation dicts are combined through
    ``filter_union_dict`` into an "in" and an "out" dict (presumably relations
    inside vs. outside ``uid_list`` -- verify against ``filter_union_dict``),
    and each relation becomes a ``[uid, uid_name, ruid, ruid_name, count]``
    record.

    Args:
        uid_list: sliceable sequence of uids (document ids of the indices).
        timestamp: passed to ``get_db_num`` to select the index suffix.

    Returns:
        A 4-tuple ``(core_uidlist, outer_uidlist, core_user_socail,
        core_outer_socail)``:
        * ``core_user_socail``: "in" records with count > 2, highest first.
        * ``core_uidlist``: distinct first-column uids of those records
          (order is arbitrary, it comes from a set).
        * ``core_outer_socail``: at most 30 "out" records with count > 10
          that involve a core uid, highest count first.
        * ``outer_uidlist``: first column of ``core_outer_socail``.
    """
    uid2uname = get_user_name(uid_list)
    #step1: get db number
    db_num = get_db_num(timestamp)
    retweet_index_name = retweet_index_name_pre + str(db_num)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_num)
    comment_index_name = comment_index_name_pre + str(db_num)
    be_comment_index_name = be_comment_index_name_pre + str(db_num)

    all_in_record = []   # [[uid, uname, ruid, runame, count], ...]
    all_out_record = []  # same row layout as all_in_record
    #step2: split uid list to iter mget
    for start in range(0, len(uid_list), GROUP_ITER_COUNT):
        iter_uid_list = uid_list[start:start + GROUP_ITER_COUNT]
        #step3-6: mget retweet / comment / be_retweet / be_comment,
        # each as {uid: {ruid: count, ...}}
        retweet_dict = _mget_json_field(
            es_retweet, retweet_index_name, retweet_index_type,
            iter_uid_list, 'uid_retweet')
        comment_dict = _mget_json_field(
            es_comment, comment_index_name, comment_index_type,
            iter_uid_list, 'uid_comment')
        be_retweet_dict = _mget_json_field(
            es_retweet, be_retweet_index_name, be_retweet_index_type,
            iter_uid_list, 'uid_be_retweet')
        be_comment_dict = _mget_json_field(
            es_comment, be_comment_index_name, be_comment_index_type,
            iter_uid_list, 'uid_be_comment')

        for iter_uid in iter_uid_list:
            #step7: union retweet&comment
            filter_in_dict, filter_out_dict = filter_union_dict(
                [retweet_dict.get(iter_uid, {}),
                 comment_dict.get(iter_uid, {})],
                uid_list, 'in&out')
            #step8: record the retweet/comment relation in group uid
            all_in_record.extend(
                _interaction_records(iter_uid, filter_in_dict, uid2uname))
            #step9: record the retweet/comment/be_retweet/be_comment
            # relation out group uid
            filter_out_dict = filter_union_dict(
                [filter_out_dict,
                 be_retweet_dict.get(iter_uid, {}),
                 be_comment_dict.get(iter_uid, {})],
                uid_list, 'out')
            all_out_record.extend(
                _interaction_records(iter_uid, filter_out_dict, uid2uname))

    #step11: keep the strong in-group relations, highest count first.
    # Filtering before the (stable) sort gives the same order as sorting
    # everything first and filtering afterwards.
    core_user_socail = sorted(
        (record for record in all_in_record if record[4] > 2),
        key=lambda record: record[4], reverse=True)
    core_uidlist = list(set(record[0] for record in core_user_socail))
    core_uid_set = set(core_uidlist)

    def touches_core(uid):
        # uid.split() mirrors the original membership test on the uid string
        return bool(core_uid_set.intersection(uid.split()))

    core_outer_candidates = [
        record for record in all_out_record
        if (touches_core(record[2]) or touches_core(record[0]))
        and record[4] > 10
    ]
    core_outer_socail = sorted(core_outer_candidates,
                               key=lambda record: record[4],
                               reverse=True)[0:30]
    # NOTE(review): column 0 is the uid taken from uid_list, not the related
    # uid in column 2 -- confirm this is the intended "outer" uid list.
    outer_uidlist = [record[0] for record in core_outer_socail]

    return core_uidlist, outer_uidlist, core_user_socail, core_outer_socail
Example #4
0
def detect_by_seed_users(seed_users):
    """Collect the retweet/comment relations of the given seed users.

    For every seed uid the ``uid_retweet``, ``uid_be_retweet``,
    ``uid_comment`` and ``uid_be_comment`` dicts are fetched with ``mget``,
    decoded and merged with ``union_dict``.

    Args:
        seed_users: sequence of seed uids (document ids of the indices).

    Returns:
        ``group_uid_list``. The step that extracts uids from the merged
        relations is not implemented yet, so this is currently always an
        empty list.
    """
    # Both relation types are always searched. These were written as
    # ``retweet_mark == 1`` (a comparison with no effect) instead of
    # assignments.
    retweet_mark = 1
    comment_mark = 1

    group_uid_list = []
    all_union_result_dict = {}
    #get retweet/comment es db_number
    now_ts = time.time()
    db_number = get_db_num(now_ts)

    def fetch_docs(es_client, index_name, doc_type):
        # best-effort mget: a failed request counts as "no documents"
        try:
            return es_client.mget(index=index_name, doc_type=doc_type,
                                  body={'ids': seed_users},
                                  _source=True)['docs']
        except Exception:
            return []

    def doc_field(docs, position, field):
        # decode the JSON relation dict of the doc at ``position``;
        # missing docs, missing fields and malformed JSON give {}
        try:
            return json.loads(docs[position]['_source'][field])
        except (IndexError, KeyError, TypeError, ValueError):
            return {}

    retweet_result = []
    be_retweet_result = []
    comment_result = []
    be_comment_result = []
    #step1: mget retweet and be_retweet
    if retweet_mark == 1:
        retweet_result = fetch_docs(
            es_retweet, retweet_index_name_pre + str(db_number),
            retweet_index_type)
        # uses be_retweet_index_type, matching the naming of the other
        # three doc types (was ``be_retweet_type``)
        be_retweet_result = fetch_docs(
            es_retweet, be_retweet_index_name_pre + str(db_number),
            be_retweet_index_type)
    #step2: mget comment and be_comment
    if comment_mark == 1:
        comment_result = fetch_docs(
            es_comment, comment_index_name_pre + str(db_number),
            comment_index_type)
        be_comment_result = fetch_docs(
            es_comment, be_comment_index_name_pre + str(db_number),
            be_comment_index_type)

    #step3: union retweet/be_retweet/comment/be_comment result
    # mget returns docs in request order, so the seed's position selects its
    # doc (the old counter was never advanced and always read docs[0]).
    for position, iter_search_uid in enumerate(seed_users):
        #union four type user set
        union_result = union_dict(
            doc_field(retweet_result, position, 'uid_retweet'),
            doc_field(be_retweet_result, position, 'uid_be_retweet'),
            doc_field(comment_result, position, 'uid_comment'),
            doc_field(be_comment_result, position, 'uid_be_comment'))
        all_union_result_dict[iter_search_uid] = union_result

    # TODO: a conversion step is still missing here -- extract all uids from
    # all_union_result_dict into group_uid_list.
    return group_uid_list
Example #5
0
def _structure_mget_docs(es_client, index_name, doc_type, ids):
    """Return the mget ``docs`` list for ``ids``; [] when the request fails."""
    try:
        return es_client.mget(index=index_name, doc_type=doc_type,
                              body={'ids': ids}, _source=True)['docs']
    except Exception:
        return []


def _structure_doc_field(docs, position, field):
    """Decode JSON ``_source[field]`` of ``docs[position]``; {} if unavailable."""
    try:
        return json.loads(docs[position]['_source'][field])
    except (IndexError, KeyError, TypeError, ValueError):
        return {}


def get_structure_user(seed_uid_list, structure_dict, filter_dict):
    """Find users related to the seeds within ``hop`` relation steps.

    Starting from ``seed_uid_list``, each hop fetches the retweet/be_retweet
    and comment/be_comment relation dicts of the current users (in batches of
    ``DETECT_ITER_COUNT``), merges them with ``union_dict`` and uses the newly
    found uids (minus the users just searched) as the next hop's input. The
    merged result is then sorted by value, highest first, and checked against
    the user-portrait index.

    Args:
        seed_uid_list: sliceable sequence of seed uids.
        structure_dict: mapping with ``'retweet'`` and ``'comment'`` (a
            relation type is searched when its value is 1) and ``'hop'``
            (number of expansion rounds); values are converted with ``int``.
        filter_dict: mapping with ``'importance'`` and ``'influence'`` (each
            a mapping with ``'gte'`` and ``'lt'`` bounds) and ``'count'``.

    Returns:
        List of portrait doc ``_id`` values whose importance and influence lie
        within the bounds. Collection stops after the first batch that brings
        the list above ``filter_dict['count'] * DETECT_COUNT_EXPAND`` entries.
    """
    retweet_mark = int(structure_dict['retweet'])
    comment_mark = int(structure_dict['comment'])
    hop = int(structure_dict['hop'])
    #get retweet/comment es db_number
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    retweet_index_name = retweet_index_name_pre + str(db_number)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
    comment_index_name = comment_index_name_pre + str(db_number)
    be_comment_index_name = be_comment_index_name_pre + str(db_number)

    #iter to find seed uid list retweet/be_retweet/comment/be_comment user set by hop
    iter_hop_user_list = seed_uid_list
    all_union_result = dict()
    for _ in range(hop):   # hop number control
        hop_union_result = dict()
        for start in range(0, len(iter_hop_user_list), DETECT_ITER_COUNT):
            iter_search_user_list = iter_hop_user_list[start:start + DETECT_ITER_COUNT]
            # a disabled relation type simply contributes no docs
            retweet_result = []
            be_retweet_result = []
            comment_result = []
            be_comment_result = []
            #step1: mget retweet and be_retweet
            if retweet_mark == 1:
                retweet_result = _structure_mget_docs(
                    es_retweet, retweet_index_name, retweet_index_type,
                    iter_search_user_list)
                # uses be_retweet_index_type, matching the naming of the
                # other three doc types (was ``be_retweet_type``)
                be_retweet_result = _structure_mget_docs(
                    es_retweet, be_retweet_index_name, be_retweet_index_type,
                    iter_search_user_list)
            #step2: mget comment and be_comment
            if comment_mark == 1:
                comment_result = _structure_mget_docs(
                    es_comment, comment_index_name, comment_index_type,
                    iter_search_user_list)
                be_comment_result = _structure_mget_docs(
                    es_comment, be_comment_index_name, be_comment_index_type,
                    iter_search_user_list)
            #step3: union retweet/be_retweet/comment/be_comment result
            # mget returns docs in request order, so the position inside the
            # batch selects each uid's doc (the old counter was never
            # advanced, so docs[0] was merged once per uid).
            for position in range(len(iter_search_user_list)):
                #union four type user set
                union_result = union_dict(
                    _structure_doc_field(retweet_result, position, 'uid_retweet'),
                    _structure_doc_field(be_retweet_result, position, 'uid_be_retweet'),
                    _structure_doc_field(comment_result, position, 'uid_comment'),
                    _structure_doc_field(be_comment_result, position, 'uid_be_comment'))
                hop_union_result = union_dict(hop_union_result, union_result)

        #pop seed uid self
        for iter_hop_user_item in iter_hop_user_list:
            hop_union_result.pop(iter_hop_user_item, None)
        #get new iter_hop_user_list; list() keeps it sliceable on Python 3,
        # where keys() is a view
        iter_hop_user_list = list(hop_union_result.keys())
        #get all union result
        all_union_result = union_dict(all_union_result, hop_union_result)

    #step5: identify the who is in user_portrait
    sort_all_union_result = sorted(all_union_result.items(),
                                   key=lambda x: x[1], reverse=True)
    ranked_uid_list = [item[0] for item in sort_all_union_result]
    in_portrait_result = []
    # NOTE(review): the 'lt' bounds are applied inclusively (<=), exactly as
    # before -- confirm whether an exclusive upper bound was intended.
    filter_importance_from = filter_dict['importance']['gte']
    filter_importance_to = filter_dict['importance']['lt']
    filter_influence_from = filter_dict['influence']['gte']
    filter_influence_to = filter_dict['influence']['lt']
    for start in range(0, len(ranked_uid_list), DETECT_ITER_COUNT):
        iter_user_list = ranked_uid_list[start:start + DETECT_ITER_COUNT]
        portrait_result = _structure_mget_docs(
            es_user_portrait, portrait_index_name, portrait_index_type,
            iter_user_list)
        for portrait_item in portrait_result:
            if portrait_item['found'] == True:
                portrait_source = portrait_item['_source']
                if filter_importance_from <= portrait_source['importance'] <= filter_importance_to:
                    if filter_influence_from <= portrait_source['influence'] <= filter_influence_to:
                        in_portrait_result.append(portrait_item['_id'])
        # enough candidates collected; stop querying further batches
        if len(in_portrait_result) > (filter_dict['count'] * DETECT_COUNT_EXPAND):
            break

    return in_portrait_result