Example #1
0
def filter_retweet_count(user_set):
    """Split *user_set* by distinct-retweeter count.

    Users whose distinct retweeter count is below the module-level
    ``retweet_threshold`` -- or who have no retweet record at all --
    are kept as candidates; the rest are logged to the csv ``writer``.

    :param user_set: sliceable sequence of uid strings
    :return: list of uids that passed the filter
    """
    FILTER_ITER_COUNT = 100  # mget batch size
    results = []
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    retweet_index_name = retweet_index_name_pre + str(db_number)
    search_user_count = len(user_set)
    iter_search_count = 0
    while iter_search_count < search_user_count:
        iter_search_user_list = user_set[iter_search_count:iter_search_count +
                                         FILTER_ITER_COUNT]
        # best effort: an ES failure just skips this batch
        try:
            retweet_result = es_retweet.mget(index=retweet_index_name, doc_type=retweet_index_type,
                    body={'ids': iter_search_user_list}, _source=True)['docs']
        except Exception:
            retweet_result = []
        for retweet_item in retweet_result:
            user = retweet_item['_id']
            if retweet_item['found']:
                # distinct retweeters (set), not raw retweet events
                retweet_set = set(json.loads(retweet_item['_source']['uid_retweet']))
                if len(retweet_set) < retweet_threshold:
                    results.append(user)
                else:
                    writer.writerow([user, 'retweet'])
            else:
                # no retweet record at all: keep as candidate
                results.append(user)

        iter_search_count += FILTER_ITER_COUNT
    return results
def filter_retweet_count(user_set):
    """Filter *user_set* by distinct-retweeter count (duplicate variant).

    BUG FIX: the batch counter used to be incremented only inside
    ``if retweet_result:``, so a failed or empty mget batch made the
    while-loop spin forever.  The increment now runs every iteration.

    :param user_set: sliceable sequence of uid strings
    :return: list of uids that passed the filter
    """
    FILTER_ITER_COUNT = 100  # mget batch size
    results = []
    now_ts = time.time()
    db_number = get_db_num(now_ts)
    retweet_index_name = retweet_index_name_pre + str(db_number)
    search_user_count = len(user_set)
    iter_search_count = 0
    while iter_search_count < search_user_count:
        iter_search_user_list = user_set[iter_search_count:iter_search_count + FILTER_ITER_COUNT]
        # best effort: an ES failure just skips this batch
        try:
            retweet_result = es_retweet.mget(index=retweet_index_name, doc_type=retweet_index_type,
                    body={'ids': iter_search_user_list}, _source=True)['docs']
        except Exception:
            retweet_result = []
        for retweet_item in retweet_result:
            user = retweet_item['_id']
            if retweet_item['found']:
                # distinct retweeters, deduplicated via set()
                retweet_set = set(json.loads(retweet_item['_source']['uid_retweet']))
                if len(retweet_set) < retweet_threshold:
                    results.append(user)
                else:
                    writer.writerow([user, 'retweet'])
            else:
                # no retweet record: keep as candidate
                results.append(user)

        # FIX: must advance unconditionally, or empty batches loop forever
        iter_search_count += FILTER_ITER_COUNT
    return results
Example #3
0
def extend_network(task_name, ts):

    index_name = task_name
    # mu qian can yu de yonghu shu
    query_uid = {
        "query":{
            "filtered":{
                "filter":{
                    "range":{
                        "timestamp":{
                            "lt": ts
                        }
                    }
                }
            }
        },
        "aggs":{
            "uid_count":{"cardinality":{"field": "uid"}}
        }
    }
    uid_count = es_prediction.search(index=index_name, doc_type="text", \
            body=query_uid)["aggregations"]["uid_count"]["value"]

    try:
        extend_retweet_threshold = float(r_stimulation.get("extend_retweet_threshold"))
    except:
        r_stimulation.set("extend_retweet_threshold", 10000)
        extend_retweet_threshold = 10000

    user_list = organize_network(task_name, ts)
    exist_user_set = set(user_list)
    in_user_list = list() ####已存在的用户列表
    in_user_info = []
    count = 0
    all_user_dict = dict() ## participate user >>> extended list
    list_len = len(user_list)
    len_1000 = list_len/1000
    for i in range(len_1000+1):
        tmp_uid = user_list[i*1000: (i+1)*1000]
        es_results = es_retweet.mget(index=index_be_retweet,doc_type=index_type_be_retweet, body={"ids":tmp_uid})["docs"]
        for item in es_results:
            if item["found"]:
                count +=1
                if count % 1000 == 0:
                    print "extend network: ", count
                uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"])
                retweet_count = len(uid_be_retweet)
                if retweet_count < extend_retweet_threshold: # 对外扩展的阈值
                    continue
                uid_retweet_list = uid_be_retweet.keys()
                uid_retweet_list = list(set(uid_retweet_list)-exist_user_set)
                all_user_dict[item["_id"]] = uid_retweet_list # 扩展的用户
                retweet_count = len(uid_be_retweet)
                in_user_list.append(item["_id"])
                in_user_info.append([math.log(retweet_count+1), math.log(uid_count+1)])

    return uid_count,in_user_list, in_user_info, all_user_dict
def extend_network(task_name):
    file_name = task_name + ".txt"
    f = open(task_name + ".txt", "w")
    line = 0
    user2number_dict = dict()  # mapping: number-uid
    number2user_dict = dict()
    count = 0
    user_list = organize_network(task_name)
    list_len = len(user_list)
    len_1000 = list_len / 1000
    for i in range(len_1000 + 1):
        tmp_uid = user_list[i * 1000:(i + 1) * 1000]
        es_results = es_retweet.mget(index=index_be_retweet,
                                     doc_type=index_type_be_retweet,
                                     body={"ids": tmp_uid})["docs"]
        for item in es_results:
            if item["found"]:
                print count
                uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"])
                be_retweet_list = uid_be_retweet.keys()
                uid = item["_id"]
                if user2number_dict.has_key(uid):
                    uid_count = user2number_dict[uid]
                else:
                    count += 1
                    uid_count = count
                    user2number_dict[uid] = count
                    number2user_dict[count] = uid
                for each in be_retweet_list:
                    if user2number_dict.has_key(each):
                        each_number = user2number_dict[each]
                    else:
                        count += 1
                        user2number_dict[each] = count
                        number2user_dict[count] = uid
                        each_number = count
                    if each_number != uid_count:
                        f.write(str(uid_count) + " " + str(each_number) + "\n")
                        line += 1

    f.close()
    cmd = 'sed -i "" -e "1i %s %s" %s' % (count, line, file_name)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)

    es_prediction.update(index=index_manage_interfere_task, doc_type=type_manage_interfere_task,\
            id=task_name, body={"doc":{"network_exist": "1"}})
    print "finish: ", count

    file_user = open("user_" + task_name + ".txt", "w")
    for uid in user2number_dict.keys():
        file_user.write(str(uid) + '\n')
def extend_network(task_name):

    # mu qian can yu de yonghu shu
    query_uid = {
        "query": {
            "range": {
                "timestamp": {
                    "lt": ts
                }
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {
                    "field": "uid"
                }
            }
        }
    }
    uid_count = es.search(index=index_name, doc_type="text", \
            body=query_uid)["aggregations"]["uid_count"]["value"]

    file_name = task_name + ".txt"
    f = open(task_name + ".txt", "w")
    line = 0
    count = 0
    user_list = organize_network(task_name)
    important_user_list = list()  ####
    important_user_info = []
    list_len = len(user_list)
    len_1000 = list_len / 1000
    for i in range(len_1000 + 1):
        tmp_uid = user_list[i * 1000:(i + 1) * 1000]
        es_results = es_retweet.mget(index=index_be_retweet,
                                     doc_type=index_type_be_retweet,
                                     body={"ids": tmp_uid})["docs"]
        for item in es_results:
            if item["found"]:
                print count
                uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"])
                retweet_count = len(uid_be_retweet)
                if retweet_count < 100:
                    continue
                important_user_list.append(item["_id"])
                important_user_info.append([retweet_count, uid_count])

    print "finish: ", count
def get_extend(all_set):
    """Sample users from *all_set*, union their retweet and comment
    neighbours, and return the neighbours not in the user portrait.

    BUG FIX: ``es_comment.mget`` was called with the misspelled keyword
    ``indexd``; the resulting TypeError was swallowed by the bare
    except, so the comment network was always silently empty.

    :param all_set: iterable of uids
    :return: result of :func:`filter_out` over the unioned neighbour uids
    """
    retweet_comment_dict_list = []
    # step0: randomly sample users to bound the fan-out
    all_user_list = list(all_set)
    if RECOMMEND_IN_AUTO_RANDOM_SIZE > len(all_user_list):
        sample_users = all_user_list
    else:
        sample_users = random.sample(all_user_list, RECOMMEND_IN_AUTO_RANDOM_SIZE)
    db_number = get_db_num()
    # step1: retweet neighbours
    retweet_index_name = retweet_index_name_pre + str(db_number)
    try:
        retweet_result = es_retweet.mget(index=retweet_index_name, doc_type=retweet_index_type,
                body={'ids': sample_users})['docs']
    except Exception:
        retweet_result = []
    for retweet_item in retweet_result:
        try:
            if retweet_item['found'] == True:
                retweet_comment_dict_list.append(
                    json.loads(retweet_item['_source']['uid_retweet']))
        except Exception:
            pass
    # step2: comment neighbours
    comment_index_name = comment_index_name_pre + str(db_number)
    try:
        # FIX: keyword was misspelled `indexd`
        comment_result = es_comment.mget(index=comment_index_name, doc_type=comment_index_type,
                body={'ids': sample_users})['docs']
    except Exception:
        comment_result = []
    for comment_item in comment_result:
        try:
            if comment_item['found'] == True:
                retweet_comment_dict_list.append(
                    json.loads(comment_item['_source']['uid_comment']))
        except Exception:
            pass
    # step3: union the per-user dicts; step4: drop users in user portrait
    union_retweet_comment_list = union_dict(retweet_comment_dict_list)
    return filter_out(union_retweet_comment_list.keys())
def get_extend(all_set):
    """Duplicate variant of :func:`get_extend`: sample users, union their
    retweet/comment neighbours, and filter against the user portrait.

    BUG FIX: the ``es_comment.mget`` call used the misspelled keyword
    ``indexd``, and the bare except hid the TypeError -- comment data
    was never retrieved.  Fixed to ``index``.

    :param all_set: iterable of uids
    :return: result of :func:`filter_out` over the unioned neighbour uids
    """
    retweet_comment_dict_list = []
    # step0: random sample keeps the query size bounded
    candidates = list(all_set)
    if RECOMMEND_IN_AUTO_RANDOM_SIZE > len(candidates):
        sampled = candidates
    else:
        sampled = random.sample(candidates, RECOMMEND_IN_AUTO_RANDOM_SIZE)
    db_number = get_db_num()
    # step1: retweet neighbours
    retweet_index_name = retweet_index_name_pre + str(db_number)
    try:
        retweet_result = es_retweet.mget(index=retweet_index_name, doc_type=retweet_index_type,
                body={'ids': sampled})['docs']
    except Exception:
        retweet_result = []
    for retweet_item in retweet_result:
        try:
            if retweet_item['found'] == True:
                retweet_comment_dict_list.append(
                    json.loads(retweet_item['_source']['uid_retweet']))
        except Exception:
            pass
    # step2: comment neighbours
    comment_index_name = comment_index_name_pre + str(db_number)
    try:
        # FIX: keyword was misspelled `indexd`
        comment_result = es_comment.mget(index=comment_index_name, doc_type=comment_index_type,
                body={'ids': sampled})['docs']
    except Exception:
        comment_result = []
    for comment_item in comment_result:
        try:
            if comment_item['found'] == True:
                retweet_comment_dict_list.append(
                    json.loads(comment_item['_source']['uid_comment']))
        except Exception:
            pass
    # step3: union; step4: drop users already in the user portrait
    union_retweet_comment_list = union_dict(retweet_comment_dict_list)
    return filter_out(union_retweet_comment_list.keys())
def get_friends_list(recommend_set_list):
    """Return up to 500 uids that retweet the given users.

    SECURITY FIX: the stored ``uid_be_retweet`` blob was parsed with
    ``eval``; every other reader of this field in the module uses
    ``json.loads``, which is both safe and equivalent here.

    :param recommend_set_list: list of uids to look up
    :return: list of retweeter uids, truncated to 500
    """
    friend_list = []
    if not recommend_set_list:
        return friend_list
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    # today's redis/ES shard number
    db_number = get_db_num(now_date_ts)
    search_result = es_retweet.mget(index=be_retweet_index_name_pre + str(db_number),
                                    doc_type=be_retweet_index_type,
                                    body={"ids": recommend_set_list})["docs"]
    for item in search_result:
        if not item['found']:
            continue
        # FIX: was eval(data) on stored text
        data = json.loads(item['_source']['uid_be_retweet'])
        friend_list.extend(data.keys())

    return friend_list[:500]
Example #9
0
def get_community_coreuser_socail(uid_list, timestamp):
    """Compute core-user social links inside a community plus the top
    outward links to non-community users.

    The four mget/parse stanzas and the two identical record-building
    loops of the original were deduplicated into the private helpers
    below; dead locals and commented-out debug code were removed.

    :param uid_list: community member uids
    :param timestamp: selects the retweet/comment db shard
    :return: (core_uidlist, outer_uidlist, core_user_socail,
              core_outer_socail); each social record has the shape
              [src_uid, src_name, dst_uid, dst_name, interaction_count].
    """
    uid2uname = get_user_name(uid_list)
    db_num = get_db_num(timestamp)
    retweet_index_name = retweet_index_name_pre + str(db_num)
    be_retweet_index_name = be_retweet_index_name_pre + str(db_num)
    comment_index_name = comment_index_name_pre + str(db_num)
    be_comment_index_name = be_comment_index_name_pre + str(db_num)

    def _mget_relations(es_conn, index_name, doc_type, ids, field):
        # mget `ids` and return {uid: parsed relation dict} for found
        # docs; ES/parse failures degrade to "no data" as before.
        try:
            docs = es_conn.mget(index=index_name, doc_type=doc_type,
                                body={'ids': ids})['docs']
        except Exception:
            docs = []
        relations = {}
        for item in docs:
            try:
                if item['found'] == True:
                    relations[item['_id']] = json.loads(item['_source'][field])
            except Exception:
                pass
        return relations

    def _relation_records(src_uid, relation_dict):
        # rows [src_uid, src_name, dst_uid, dst_name, count]; self-links
        # are skipped; names fall back to the uid when unknown.
        records = []
        for ruid in relation_dict:
            if ruid == src_uid:
                continue
            records.append([src_uid, uid2uname.get(src_uid, src_uid),
                            ruid, uid2uname.get(ruid, ruid),
                            relation_dict[ruid]])
        return records

    all_in_record = []
    all_out_record = []
    all_user_count = len(uid_list)
    iter_count = 0
    while iter_count < all_user_count:
        iter_uid_list = uid_list[iter_count:iter_count + GROUP_ITER_COUNT]
        retweet_dict = _mget_relations(es_retweet, retweet_index_name,
                                       retweet_index_type, iter_uid_list, 'uid_retweet')
        comment_dict = _mget_relations(es_comment, comment_index_name,
                                       comment_index_type, iter_uid_list, 'uid_comment')
        be_retweet_dict = _mget_relations(es_retweet, be_retweet_index_name,
                                          be_retweet_index_type, iter_uid_list, 'uid_be_retweet')
        be_comment_dict = _mget_relations(es_comment, be_comment_index_name,
                                          be_comment_index_type, iter_uid_list, 'uid_be_comment')
        for iter_uid in iter_uid_list:
            # in-group links come from retweet + comment only
            filter_in_dict, filter_out_dict = filter_union_dict(
                [retweet_dict.get(iter_uid, {}), comment_dict.get(iter_uid, {})],
                uid_list, 'in&out')
            all_in_record.extend(_relation_records(iter_uid, filter_in_dict))
            # out-group links additionally include be_retweet/be_comment
            filter_out_dict = filter_union_dict(
                [filter_out_dict,
                 be_retweet_dict.get(iter_uid, {}),
                 be_comment_dict.get(iter_uid, {})],
                uid_list, 'out')
            all_out_record.extend(_relation_records(iter_uid, filter_out_dict))
        iter_count += GROUP_ITER_COUNT

    # rank interactions by count (index 4 of each record)
    sort_in_record = sorted(all_in_record, key=lambda x: x[4], reverse=True)
    sort_out_record = sorted(all_out_record, key=lambda x: x[4], reverse=True)

    # core users: in-group interaction count > 2
    core_user_socail = [item for item in sort_in_record if item[4] > 2]
    core_uidlist = list(set([item[0] for item in core_user_socail]))
    core_uidset = set(core_uidlist)
    # outward links touching a core user with count > 10, top 30
    core_outer_socail_temp = [
        item for item in sort_out_record
        if (set(item[2].split()) & core_uidset
            or set(item[0].split()) & core_uidset)
        and item[4] > 10
    ]
    core_outer_socail = sorted(core_outer_socail_temp,
                               key=lambda x: x[4],
                               reverse=True)[0:30]
    outer_uidlist = [item[0] for item in core_outer_socail]

    return core_uidlist, outer_uidlist, core_user_socail, core_outer_socail
Example #10
0
def detect_by_seed_users(seed_users):
    """Union the retweet/be_retweet/comment/be_comment neighbours of
    every seed user into ``all_union_result_dict``.

    BUG FIXES:
      * ``retweet_mark == 1`` / ``comment_mark == 1`` were comparisons
        against undefined names (NameError at runtime); they are now
        assignments.
      * ``union_count`` was never advanced, so every seed uid consumed
        the first mget document; ``enumerate`` now tracks the position.

    :param seed_users: list of seed uids
    :return: group_uid_list (still empty -- the extraction of uids from
        all_union_result_dict is unimplemented, see TODO below)
    """
    retweet_mark = 1
    comment_mark = 1

    group_uid_list = []
    all_union_result_dict = {}
    # select the retweet/comment shard for today
    now_ts = time.time()
    db_number = get_db_num(now_ts)

    # defaults so step3 is safe even when a mark is disabled
    retweet_result = []
    be_retweet_result = []
    comment_result = []
    be_comment_result = []

    # step1: mget retweet and be_retweet
    if retweet_mark == 1:
        retweet_index_name = retweet_index_name_pre + str(db_number)
        be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
        try:
            retweet_result = es_retweet.mget(index=retweet_index_name, doc_type=retweet_index_type, \
                                             body={'ids':seed_users}, _source=True)['docs']
        except Exception:
            retweet_result = []
        try:
            be_retweet_result = es_retweet.mget(index=be_retweet_index_name, doc_type=be_retweet_type, \
                                                body={'ids':seed_users} ,_source=True)['docs']
        except Exception:
            be_retweet_result = []
    # step2: mget comment and be_comment
    if comment_mark == 1:
        comment_index_name = comment_index_name_pre + str(db_number)
        be_comment_index_name = be_comment_index_name_pre + str(db_number)
        try:
            comment_result = es_comment.mget(index=comment_index_name, doc_type=comment_index_type, \
                                             body={'ids':seed_users}, _source=True)['docs']
        except Exception:
            comment_result = []
        try:
            be_comment_result = es_comment.mget(index=be_comment_index_name, doc_type=be_comment_index_type, \
                                            body={'ids':seed_users}, _source=True)['docs']
        except Exception:
            be_comment_result = []

    # step3: union the four neighbour dicts per seed user;
    # mget preserves input order, so position `union_count` matches the uid
    for union_count, iter_search_uid in enumerate(seed_users):
        try:
            uid_retweet_dict = json.loads(retweet_result[union_count]['_source']['uid_retweet'])
        except Exception:
            uid_retweet_dict = {}
        try:
            uid_be_retweet_dict = json.loads(be_retweet_result[union_count]['_source']['uid_be_retweet'])
        except Exception:
            uid_be_retweet_dict = {}
        try:
            uid_comment_dict = json.loads(comment_result[union_count]['_source']['uid_comment'])
        except Exception:
            uid_comment_dict = {}
        try:
            uid_be_comment_dict = json.loads(be_comment_result[union_count]['_source']['uid_be_comment'])
        except Exception:
            uid_be_comment_dict = {}
        union_result = union_dict(uid_retweet_dict, uid_be_retweet_dict,
                                  uid_comment_dict, uid_be_comment_dict)
        all_union_result_dict[iter_search_uid] = union_result

    # TODO: extract all uids from all_union_result_dict into group_uid_list
    # (the original left this conversion/extraction step unimplemented)
    return group_uid_list
Example #11
0
def get_structure_user(seed_uid_list, structure_dict, filter_dict):
    """Breadth-first expand seed users over the retweet/comment networks
    for ``structure_dict['hop']`` hops, then keep the expanded users that
    exist in the user portrait and pass the importance/influence filters.

    BUG FIX: ``union_count`` was never incremented in the per-uid loop,
    so every uid in a batch read the first mget document; ``enumerate``
    now tracks the batch position (mget preserves input order).

    :param seed_uid_list: starting uids
    :param structure_dict: {'retweet': 0/1, 'comment': 0/1, 'hop': n}
    :param filter_dict: importance/influence gte-lt ranges plus 'count'
    :return: list of uids present in the user portrait and in range
    """
    retweet_mark = int(structure_dict['retweet'])
    comment_mark = int(structure_dict['comment'])
    hop = int(structure_dict['hop'])
    # select the retweet/comment shard for today
    now_ts = time.time()
    db_number = get_db_num(now_ts)

    iter_hop_user_list = seed_uid_list
    iter_count = 0
    all_union_result = dict()
    while iter_count < hop:   # expand one hop per pass
        iter_count += 1
        search_user_count = len(iter_hop_user_list)
        hop_union_result = dict()
        iter_search_count = 0
        while iter_search_count < search_user_count:
            iter_search_user_list = iter_hop_user_list[iter_search_count: iter_search_count + DETECT_ITER_COUNT]
            # defaults so step3 is safe when a mark is disabled
            retweet_result = []
            be_retweet_result = []
            comment_result = []
            be_comment_result = []
            # step1: mget retweet and be_retweet
            if retweet_mark == 1:
                retweet_index_name = retweet_index_name_pre + str(db_number)
                be_retweet_index_name = be_retweet_index_name_pre + str(db_number)
                try:
                    retweet_result = es_retweet.mget(index=retweet_index_name, doc_type=retweet_index_type, \
                                                     body={'ids':iter_search_user_list}, _source=True)['docs']
                except Exception:
                    retweet_result = []
                try:
                    be_retweet_result = es_retweet.mget(index=be_retweet_index_name, doc_type=be_retweet_type, \
                                                        body={'ids':iter_search_user_list} ,_source=True)['docs']
                except Exception:
                    be_retweet_result = []
            # step2: mget comment and be_comment
            if comment_mark == 1:
                comment_index_name = comment_index_name_pre + str(db_number)
                be_comment_index_name = be_comment_index_name_pre + str(db_number)
                try:
                    comment_result = es_comment.mget(index=comment_index_name, doc_type=comment_index_type, \
                                                     body={'ids':iter_search_user_list}, _source=True)['docs']
                except Exception:
                    comment_result = []
                try:
                    be_comment_result = es_comment.mget(index=be_comment_index_name, doc_type=be_comment_index_type, \
                                                    body={'ids':iter_search_user_list}, _source=True)['docs']
                except Exception:
                    be_comment_result = []
            # step3: union the four neighbour dicts per uid in this batch
            for union_count, iter_search_uid in enumerate(iter_search_user_list):
                try:
                    uid_retweet_dict = json.loads(retweet_result[union_count]['_source']['uid_retweet'])
                except Exception:
                    uid_retweet_dict = {}
                try:
                    uid_be_retweet_dict = json.loads(be_retweet_result[union_count]['_source']['uid_be_retweet'])
                except Exception:
                    uid_be_retweet_dict = {}
                try:
                    uid_comment_dict = json.loads(comment_result[union_count]['_source']['uid_comment'])
                except Exception:
                    uid_comment_dict = {}
                try:
                    uid_be_comment_dict = json.loads(be_comment_result[union_count]['_source']['uid_be_comment'])
                except Exception:
                    uid_be_comment_dict = {}
                union_result = union_dict(uid_retweet_dict, uid_be_retweet_dict,
                                          uid_comment_dict, uid_be_comment_dict)
                hop_union_result = union_dict(hop_union_result, union_result)
            # step4: advance to the next batch
            iter_search_count += DETECT_ITER_COUNT

        # drop the seed uids themselves from this hop's expansion
        for iter_hop_user_item in iter_hop_user_list:
            hop_union_result.pop(iter_hop_user_item, None)
        # the next hop starts from the newly discovered users
        iter_hop_user_list = hop_union_result.keys()
        all_union_result = union_dict(all_union_result, hop_union_result)

    # step5: keep expanded users who are in the user portrait and in range
    sort_all_union_result = sorted(all_union_result.items(), key=lambda x: x[1], reverse=True)
    iter_count = 0
    all_count = len(sort_all_union_result)
    in_portrait_result = []
    importance_from = filter_dict['importance']['gte']
    importance_to = filter_dict['importance']['lt']
    influence_from = filter_dict['influence']['gte']
    influence_to = filter_dict['influence']['lt']
    while iter_count < all_count:
        iter_user_list = [item[0] for item in sort_all_union_result[iter_count:iter_count + DETECT_ITER_COUNT]]
        try:
            portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, \
                    body={'ids':iter_user_list}, _source=True)['docs']
        except Exception:
            portrait_result = []
        for portrait_item in portrait_result:
            if portrait_item['found'] == True:
                source = portrait_item['_source']
                if importance_from <= source['importance'] <= importance_to and \
                        influence_from <= source['influence'] <= influence_to:
                    in_portrait_result.append(portrait_item['_id'])
        # stop early once enough candidates have been gathered
        if len(in_portrait_result) > (filter_dict['count'] * DETECT_COUNT_EXPAND):
            break
        iter_count += DETECT_ITER_COUNT

    return in_portrait_result
Example #12
0
def predict_user_influence(task_name, stop_time, ts):
    future_total = 0 # 未来传播总量
    current_total = 0 # 可控范围
    uid_count, in_user_list, in_user_info, all_user_dict = extend_network(task_name, ts)

    with open("gbdt.pkl", "r") as f:
        gbdt = pickle.load(f)

    # 已出现的重要用户阈值
    try:
        in_user_threshold = float(r_stimulation.get("in_user_threshold"))
    except:
        r_stimulation.set("in_user_threshold", 1000)
        in_user_threshold = 1000


    in_results = gbdt.predict(in_user_info)
    print "len(in_user_list): ", len(in_user_list)
    prediction_in = dict()
    for i in range(len(in_user_list)):
        if math.exp(in_results[i]) > in_user_threshold: # 1000
            prediction_in[in_user_list[i]] = math.exp(in_results[i])


    future_dict = dict()
    count = 0
    for k,v in all_user_dict.iteritems():
        uid = k
        print "k: ", k
        print "v: ", len(v)
        tmp_prediction_list = [] # tmp storage
        tmp_uid_list = []
        if 1:
            user_list = v
            list_len = len(user_list)
            len_1000 = list_len/1000
            for i in range(len_1000+1):
                tmp_uid = user_list[i*1000: (i+1)*1000]
                if not tmp_uid:
                    continue
                es_results = es_retweet.mget(index=index_be_retweet,doc_type=index_type_be_retweet, body={"ids":tmp_uid})["docs"]
                for item in es_results:
                    if item["found"]:
                        count += 1
                        uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"])
                        retweet_count = len(uid_be_retweet)
                        if retweet_count < 1000:
                            continue
                        tmp = []
                        tmp.append(math.log(retweet_count+1))
                        tmp.append(math.log(uid_count+1))
                        tmp_prediction_list.append(tmp)
                        tmp_uid_list.append(item["_id"])
                        if count % 1000 == 0:
                            iter_prediction_list, t1, t2 = prediction_model(uid,gbdt, tmp_prediction_list, tmp_uid_list, future_dict)
                            future_dict = iter_prediction_list
                            tmp_prediction_list = []
                            tmp_uid_list = []
                            future_total += t1
                            current_total += t2
                            print "iter prediction: ", count

        if tmp_prediction_list:
            iter_prediction_list, t1, t2 = prediction_model(uid,gbdt, tmp_prediction_list, tmp_uid_list, future_dict)
            future_dict = iter_prediction_list
            future_total += t1
            current_total += t2
            print "future_dict: ", future_dict

    # storage
    save_results(task_name, ts, prediction_in, future_dict)

    # do left things
    dispose_results(task_name, ts, future_total, current_total)


    # update processing state
    es_prediction.update(index=index_manage_interfere_task,doc_type=type_manage_interfere_task,\
            id=task_name, body={"doc":{"stimulation_processing_status":"0", "update_time": ts, "scan_text_finish":"0"}})

    # stop task
    if ts >= stop_time:
        es_prediction.update(index=index_manage_interfere_task,doc_type=\
                type_manage_interfere_task,id=task_name,body={"doc":{"finish":"1"}})