def cal_text_work(item):
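    # count the hashtags (#...# markup) in one message item and merge them into that user's per-day counts in redis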
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    text = item['text']
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
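    # hashtag pattern: text wrapped between '#' marks, allowing ASCII letters, '-', '_' and CJK characters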
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        # all hashtags here are unicode
        hashtag_dict = dict()
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except:
                hashtag_dict[hashtag] = 1
        try:
            hashtag_count_string = r_cluster.hget('hashtag_'+str(ts), str(uid))
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except:
                    hashtag_count_dict[hashtag] = count
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_count_dict))
        except:
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_dict))
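A minimal calling sketch (added for illustration, not from the original source): it only relies on the fields cal_text_work reads (uid, timestamp, text); the values below are made up, and the same redis handle and time helpers as above are assumed.

sample_item = {
    'uid': '1234567890',            # hypothetical user id
    'timestamp': 1463155200,        # example epoch timestamp
    'text': u'test #话题# text',     # one hashtag wrapped in '#'
}
cal_text_work(sample_item)          # merges {u'话题': 1} into the 'hashtag_<day_ts>' hash for this uid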
Example No. 2
def update_day_hashtag(uid_list):
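    # aggregate the last WEEK days of per-user hashtag counts from the 'hashtag_<day_ts>' redis hashes into one '&'-joined string per uid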
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts

    for i in range(WEEK,0,-1):
        ts = now_date_ts - DAY*i
        count = 0
        hashtag_results = r_cluster_3.hmget('hashtag_'+str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                try:
                    results[uid][hashtag] += 1
                except:
                    results[uid][hashtag] = 1
            count += 1
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = '&'.join(user_hashtag_dict.keys())
        all_results[uid] = {'hashtag': hashtag_string, 'hashtag_dict':json.dumps(user_hashtag_dict)}
    return all_results
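A hedged usage sketch (not part of the original source), assuming the project's redis handle r_cluster_3 and the DAY/WEEK/RUN_TYPE/test_ts settings are configured as in the rest of this file; the uids are hypothetical.

demo_uids = ['1111111111', '2222222222']       # hypothetical uids
day_hashtags = update_day_hashtag(demo_uids)
for uid, info in day_hashtags.iteritems():     # Python 2 iteration, matching the file
    print uid, info['hashtag']                 # '&'-joined hashtags for that user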
Example No. 3
def scan_offline_task():
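    # pull pending keyword-rank tasks (status == 0) from ES, queue them on the 'task_user_rank' redis list, and mark them as picked up (status = -1)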

    query = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "status": 0
                    }
                }]
            }
        },
        "size": 1000
    }
    results = es_user_portrait.search(index=USER_RANK_KEYWORD_TASK_INDEX,
                                      doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                      body=query)['hits']['hits']
    if results:
        for item in results:
            task_id = item['_id']
            iter_item = item['_source']
            search_type = iter_item['search_type']
            pre = iter_item['pre']
            during = iter_item['during']
            start_time = iter_item['start_time']
            keyword = json.loads(iter_item['keyword'])
            search_key = iter_item['user_ts']
            number = iter_item['number']
            sort_norm = iter_item['sort_norm']
            sort_scope = iter_item['sort_scope']
            time = iter_item['time']
            isall = iter_item['isall']
            redis_task.lpush(
                "task_user_rank",
                json.dumps([
                    task_id, search_type, pre, during, start_time, keyword,
                    search_key, sort_norm, sort_scope, time, isall, number
                ]))
            iter_item['status'] = -1
            task_id = item['_id']
            #print item
            es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                                   doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                   id=task_id,
                                   body=iter_item)
def update_day_sensitive(uid_list):
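    # aggregate the last WEEK days of per-user sensitive-word counts; the sensitivity score only weights the most recent day's words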
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    today_sensitive_dict = {}
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        #print 'sensitive_results:', sensitive_results
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            sensitive_item = sensitive_results[count]
            if uid not in today_sensitive_dict:
                today_sensitive_dict[uid] = {}
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            for sensitive in sensitive_dict:
                try:
                    results[uid][sensitive] += 1
                except:
                    results[uid][sensitive] = 1
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_dict[uid][sensitive] += 1
                    except:
                        today_sensitive_dict[uid][sensitive] = 1
            count += 1
    #print 'results:', results
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        #print 'uid,sensitive:', uid, user_sensitive_dict
        sensitive_score = 0
        today_sensitive_dict_user = today_sensitive_dict[uid]
        for item in today_sensitive_dict_user:
            k = item
            v = today_sensitive_dict_user[k]
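            # r_sensitive maps each sensitive word to a severity stage; sensitive_score_dict turns that stage into a weight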
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
            #print 'sensitive_score:', sensitive_score
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        #print 'uid, sensitive:', uid, sensitive_string, sensitive_score
        all_results[uid] = {'sensitive_string': sensitive_string, 'sensitive_dict':json.dumps(user_sensitive_dict),\
                'sensitive': sensitive_score}
    #print 'all_results:', all_results
    return all_results
Example No. 5
def update_day_sensitive(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    today_sensitive_dict = {}
    for i in range(WEEK,0,-1):
        ts = now_date_ts - DAY*i
        count = 0
        sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list)
        #print 'sensitive_results:', sensitive_results
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            sensitive_item = sensitive_results[count]
            if uid not in today_sensitive_dict:
                today_sensitive_dict[uid] = {}
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            for sensitive in sensitive_dict:
                try:
                    results[uid][sensitive] += 1
                except:
                    results[uid][sensitive] = 1
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_dict[uid][sensitive] += 1
                    except:
                        today_sensitive_dict[uid][sensitive] = 1
            count += 1
    #print 'results:', results
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        #print 'uid,sensitive:', uid, user_sensitive_dict
        sensitive_score = 0
        today_sensitive_dict_user = today_sensitive_dict[uid]
        for item in today_sensitive_dict_user:
            k = item
            v = today_sensitive_dict_user[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
            #print 'sensitive_score:', sensitive_score
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        #print 'uid, sensitive:', uid, sensitive_string, sensitive_score
        all_results[uid] = {'sensitive_string': sensitive_string, 'sensitive_dict':json.dumps(user_sensitive_dict),\
                'sensitive': sensitive_score}
    #print 'all_results:', all_results
    return all_results
Example No. 6
def scan_offline_task():
    
    query = {"query":{"bool":{"must":[{"term":{"status":0}}]}},"size":1000}
    results = es_user_portrait.search(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type = USER_RANK_KEYWORD_TASK_TYPE,body=query)['hits']['hits']
    if results :
        for item in results:
            task_id = item['_id']
            iter_item = item['_source']
            search_type = iter_item['search_type']          
            pre = iter_item['pre']
            during =  iter_item['during'] 
            start_time =  iter_item['start_time']  
            keyword = json.loads(iter_item['keyword'])
            search_key = iter_item['user_ts']
            number = iter_item['number']
            sort_norm = iter_item['sort_norm']
            sort_scope = iter_item['sort_scope']
            time = iter_item['time']
            isall = iter_item['isall']
            redis_task.lpush("task_user_rank", json.dumps([task_id, search_type , pre , during , start_time , keyword , search_key , sort_norm , sort_scope  ,time , isall, number]))
            iter_item['status'] = -1 
            task_id = item['_id']
            #print item
            es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id, body=iter_item)
Example No. 7
def update_day_hashtag(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                try:
                    results[uid][hashtag] += 1
                except:
                    results[uid][hashtag] = 1
            count += 1
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = '&'.join(user_hashtag_dict.keys())
        all_results[uid] = {
            'hashtag': hashtag_string,
            'hashtag_dict': json.dumps(user_hashtag_dict)
        }
    return all_results
Example No. 8
                ]))
            iter_item['status'] = -1
            task_id = item['_id']
            #print item
            es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                                   doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                   id=task_id,
                                   body=iter_item)


def cron_task(data):
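    # data holds the fields in the order pushed by scan_offline_task:
    # [task_id, search_type, pre, during, start_time, keyword, search_key, sort_norm, sort_scope, time, isall, number]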
    key_words_search(data[0], data[1], data[2], data[3], data[4], data[5],
                     data[6], data[7], data[8], data[9], data[10], data[11])


if __name__ == "__main__":

    scan_offline_task()
    while 1:
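        # drain the queue filled by scan_offline_task; each entry is a json-encoded task list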
        data = redis_task.rpop("task_user_rank")
        print data
        #"""
        if data:
            try:
                cron_task(json.loads(data))
            except Exception, e:
                print e, '&error&', ts2date(time.time())
        else:
            break
        #"""
Example No. 9
import json

from time_utils import datetime2ts
from global_utils import R_CLUSTER_FLOW3

uidlist = []
f = open("uid_list_0520.txt")
for line in f:
    uid = line.strip()
    uidlist.append(uid)
f.close()

data = []
dates = [
    "2016-05-14", "2016-05-15", "2016-05-16", "2016-05-17", "2016-05-18",
    "2016-05-19", "2016-05-20"
]
tss = [datetime2ts(d) for d in dates]
for ts in tss:
    ns = "hashtag_" + str(ts)
    hashtag_list = R_CLUSTER_FLOW3.hmget(ns, uidlist)
    hashtag_list = [json.loads(h) if h else None for h in hashtag_list]
    uhlist = zip(uidlist, hashtag_list)
    uhtlist = []
    for uh in uhlist:
        uh = list(uh)
        uh.append(ts)
        uhtlist.append(uh)
    data.extend(uhtlist)

with open("hashtag_0521.txt", "w") as fw:
    for d in data:
        if d[1] is not None:
            fw.write("%s\n" % json.dumps(d))

at_data = []
Example No. 10
            time = iter_item['time']
            isall = iter_item['isall']
            redis_task.lpush("task_user_rank", json.dumps([task_id, search_type , pre , during , start_time , keyword , search_key , sort_norm , sort_scope  ,time , isall, number]))
            iter_item['status'] = -1 
            task_id = item['_id']
            #print item
            es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id, body=iter_item)


def cron_task(data):
    key_words_search(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9], data[10], data[11])
    

if __name__ == "__main__":

    scan_offline_task()
    while 1:
        data = redis_task.rpop("task_user_rank")
        print data
        #"""
        if data:
            try:
                cron_task(json.loads(data))
            except Exception, e:
                print e, '&error&', ts2date(time.time())
        else:
            break
        #"""
            
    
Example No. 11
            statusnum = 0
    else:
        uname = uid
        location = ''
        try:
            fansnum = bci_history_dict['fields']['user_fansnum'][0]
        except:
            fansnum = 0
        try:
            statusnum = bci_history_dict['fields']['weibo_month_sum'][0]
        except:
            statusnum = 0
    if status == 'show_in':
        if user_type == "sensitive":
            tmp_ts = datetime2ts(date) - DAY
            tmp_data = r_cluster.hget("sensitive_"+str(tmp_ts), uid)
            if tmp_data:
                sensitive_dict = json.loads(tmp_data)
                sensitive_words = sensitive_dict.keys()
            else:
                sensitive_words = []
            if sensitive_history_dict.get('fields',0):
                #print sensitive_history_dict['fields'][sensitive_string][0]
                #print top_sensitive
                sensitive_value = math.log(sensitive_history_dict['fields'][sensitive_string][0]/float(top_sensitive)*9+1, 10)*100
                #print "sensitive_value", sensitive_value
            else:
                sensitive_value = 0
            results.append([uid, uname, location, fansnum, statusnum, influence, sensitive_words, sensitive_value])
        else:
            results.append([uid, uname, location, fansnum, statusnum, influence])
def main():
    if RUN_TYPE:
        now_ts = time.time()-DAY # the previous day
        now_ts = datetime2ts('2016-03-27')
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2016-03-16'))
    now_ts = int(ts)
    print now_ts
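    # the per-day redis hash 'sensitive_<day_ts>' maps uid -> json dict of that day's sensitive-word counts; the rolling score fields live in ES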
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts # key to update
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts +"_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_"+str(del_month) # key to delete (one month old)

    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date

    iter_count = 0
    bulk_action = []

    mappings(ES_SENSITIVE_INDEX)
    total_number = r.hlen(sensitive_string)
    scan_cursor = 0
    print total_number
    """
    while 1:
        re_scan = r.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1] # dict of uid -> sensitive_words_dict (json string)
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids":uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid]) # json.loads
                    current_sensitive_score = 0
                    for k,v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            current_sensitive_score += v*sensitive_score_dict[str(tmp_stage)]
                    if item['found']: # a record for this uid already exists
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key, None)
                        revise_item['uid'] = uid
                        # newly updated sensitivity score
                        revise_item[update_sensitive_key] = current_sensitive_score
                        # newly updated sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly updated string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # differences from the previous day and from the week/month averages
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # updated week/month mean and variance
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)

                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index':{'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)

    print iter_count


    #######
    ####### update users that have not been processed yet (still missing the new per-day sensitive key)
    update_scan = scan(es, query={"query":{"filtered":{"filter":{"missing":{"field":update_sensitive_key}}}}}, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
    iter_count = 0
    bulk_action = []
    """
    while 1:
        try:
            tmp = update_scan.next()
            revise_item = tmp['_source']
            if del_sensitive_key in revise_item:
                revise_item.pop(del_sensitive_key, None)
            uid = revise_item['uid']
            # newly updated sensitivity score
            revise_item[update_sensitive_key] = 0
            # newly updated sensitive words
            revise_item[sensitive_dict_key] = json.dumps({})
            # newly updated string
            revise_item[sensitive_string_key] = ""
            # differences from the previous day and from the week/month averages
            revise_item['sensitive_day_change'] = 0 - revise_item.get(former_sensitive_key, 0)
            revise_item['sensitive_week_change'] = 0 - revise_item.get('sensitive_week_ave', 0)
            revise_item['sensitive_month_change'] = 0 - revise_item.get('sensitive_month_ave', 0)
            # updated week/month mean and variance
            revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
            revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)

            action = {'index':{'_id': uid}}
            bulk_action.extend([action, revise_item])
            iter_count += 1
            if iter_count % 1000 == 0:
                es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                bulk_action = []
                print iter_count
        except StopIteration:
            print "all done"
            if bulk_action:
                es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
        except Exception, r:
            print Exception, r

        if bulk_action:
            es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
Example No. 13
            sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'),
                                              DFA)
            if sensitive_words_dict:
                item['sensitive_words_string'] = "&".join(
                    sensitive_words_dict.keys())
                item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
            else:
                item['sensitive_words_string'] = ""
                item['sensitive_words_dict'] = json.dumps({})

            timestamp = item['timestamp']
            date = ts2datetime(timestamp)
            ts = datetime2ts(date)
            if sensitive_words_dict:
                print sensitive_words_dict.keys()[0]
                sensitive_count_string = r_cluster.hget(
                    'sensitive_' + str(ts), str(uid))
                if sensitive_count_string:  # redis may return an empty value
                    sensitive_count_dict = json.loads(sensitive_count_string)
                    for word in sensitive_words_dict.keys():
                        if word in sensitive_count_dict:
                            sensitive_count_dict[word] += sensitive_words_dict[
                                word]
                        else:
                            sensitive_count_dict[word] = sensitive_words_dict[
                                word]
                    r_cluster.hset('sensitive_' + str(ts), str(uid),
                                   json.dumps(sensitive_count_dict))
                else:
                    r_cluster.hset('sensitive_' + str(ts), str(uid),
                                   json.dumps(sensitive_words_dict))
Example No. 14
def main():
    if RUN_TYPE:
        now_ts = time.time() - DAY  # the previous day
        now_ts = datetime2ts('2016-03-27')
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2016-03-16'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts  # key to update
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts + "_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_" + str(del_month)  # key to delete (one month old)

    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date

    iter_count = 0
    bulk_action = []

    mappings(ES_SENSITIVE_INDEX)
    total_number = r.hlen(sensitive_string)
    scan_cursor = 0
    print total_number
    """
    while 1:
        re_scan = r.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1] # dict of uid -> sensitive_words_dict (json string)
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids":uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid]) # json.loads
                    current_sensitive_score = 0
                    for k,v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            current_sensitive_score += v*sensitive_score_dict[str(tmp_stage)]
                    if item['found']: # a record for this uid already exists
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            revise_item.pop(del_sensitive_key, None)
                        revise_item['uid'] = uid
                        # newly updated sensitivity score
                        revise_item[update_sensitive_key] = current_sensitive_score
                        # newly updated sensitive words
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # newly updated string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # differences from the previous day and from the week/month averages
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # updated week/month mean and variance
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)

                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['sensitive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index':{'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)

    print iter_count


    #######
    ####### update users that have not been processed yet (still missing the new per-day sensitive key)
    update_scan = scan(es, query={"query":{"filtered":{"filter":{"missing":{"field":update_sensitive_key}}}}}, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
    iter_count = 0
    bulk_action = []
    """
    while 1:
        try:
            tmp = update_scan.next()
            revise_item = tmp['_source']
            if del_sensitive_key in revise_item:
                revise_item.pop(del_sensitive_key, None)
            uid = revise_item['uid']
            # newly updated sensitivity score
            revise_item[update_sensitive_key] = 0
            # newly updated sensitive words
            revise_item[sensitive_dict_key] = json.dumps({})
            # newly updated string
            revise_item[sensitive_string_key] = ""
            # differences from the previous day and from the week/month averages
            revise_item['sensitive_day_change'] = 0 - revise_item.get(
                former_sensitive_key, 0)
            revise_item['sensitive_week_change'] = 0 - revise_item.get(
                'sensitive_week_ave', 0)
            revise_item['sensitive_month_change'] = 0 - revise_item.get(
                'sensitive_month_ave', 0)
            # updated week/month mean and variance
            revise_item['sensitive_week_ave'], revise_item[
                'sensitive_week_var'], revise_item[
                    'sensitive_week_sum'] = compute_week(revise_item, now_ts)
            revise_item['sensitive_month_ave'], revise_item[
                'sensitive_month_var'], revise_item[
                    'sensitive_month_sum'] = compute_month(
                        revise_item, now_ts)

            action = {'index': {'_id': uid}}
            bulk_action.extend([action, revise_item])
            iter_count += 1
            if iter_count % 1000 == 0:
                es.bulk(bulk_action,
                        index=ES_SENSITIVE_INDEX,
                        doc_type=DOCTYPE_SENSITIVE_INDEX)
                bulk_action = []
                print iter_count
        except StopIteration:
            print "all done"
            if bulk_action:
                es.bulk(bulk_action,
                        index=ES_SENSITIVE_INDEX,
                        doc_type=DOCTYPE_SENSITIVE_INDEX)
        except Exception, r:
            print Exception, r

        if bulk_action:
            es.bulk(bulk_action,
                    index=ES_SENSITIVE_INDEX,
                    doc_type=DOCTYPE_SENSITIVE_INDEX)
Example No. 15
def get_flow_information(uid_list):
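    # one-week roll-up per uid: hashtags and sensitive words from redis, a geo track from ip records, and top keywords from the flow_text ES index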
    results = {}      
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict}}
    iter_results = {} # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #test
    now_date_ts = test_ts
    for i in range(7,0,-1):
        ts = now_date_ts - DAY*i
        iter_date = ts2datetime(ts)
        flow_text_index_name = flow_text_index_name_pre + iter_date
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_'+str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_'+str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list)
        count = 0 
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'geo':{},'geo_track':[],'keywords':{}, 'sensitive':{}}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
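                # the stored value is an '&'-joined string, so its segment count is how many times this ip appeared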
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo = ip2city(ip)
                if geo:
                    #print 'geo:', geo
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
        
        #compute keywords:        
        try:
            text_results = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type, \
                                               body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size':MAX_VALUE}, _source=True, fields=['uid', 'keywords_dict'])['hits']['hits']
        except:
            text_results = {}
        for item in text_results:
            #print 'keywords item:', item
            uid = item['fields']['uid'][0]
            uid_keywords_dict = json.loads(item['fields']['keywords_dict'][0])
            for keywords in uid_keywords_dict:
                try:
                    iter_results[uid]['keywords'][keywords] += uid_keywords_dict[keywords]
                except:
                    iter_results[uid]['keywords'][keywords] = uid_keywords_dict[keywords]

       
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        for item in sensitive_word_dict:
            k = item
            v = sensitive_word_dict[k]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        #print 'geo_dict_keys:', geo_dict_keys
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        #print 'activity_geo:',  results[uid]['activity_geo']

        keywords_dict = iter_results[uid]['keywords']
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x:x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        
    return results
Example No. 16
def get_flow_information_v2(uid_list, all_user_keywords_dict):
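    # like get_flow_information, but also tracks today's sensitive words, school IPs and an aggregated geo field; keywords come in via all_user_keywords_dict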
    results = {}      
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {} # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK,0,-1):
        ts = now_date_ts - DAY*i
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_'+str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_'+str(ts), uid_list)
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list)
        count = 0 
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {'hashtag':{}, 'geo':{},'geo_track':[],'keywords':{}, 'sensitive':{}, 'school':{}}
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
               
    #get keywords top
    for uid in uid_list:
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #print 'sensitive_dict:', results[uid]['sensitive_dict']
        #print 'sensitive_string:', results[uid]['sensitive_string']
        #print 'sensitive:', results[uid]['sensitive']
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join([item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        #keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x:x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        #school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
        
    return results
Example No. 17
            item['keywords_string'] = keywords_string         # use to search

            sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
            if sensitive_words_dict:
                item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
                item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
            else:
                item['sensitive_words_string'] = ""
                item['sensitive_words_dict'] = json.dumps({})

            timestamp = item['timestamp']
            date = ts2datetime(timestamp)
            ts = datetime2ts(date)
            if sensitive_words_dict:
                print sensitive_words_dict.keys()[0]
                sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
                if sensitive_count_string: # redis may return an empty value
                    sensitive_count_dict = json.loads(sensitive_count_string)
                    for word in sensitive_words_dict.keys():
                        if word in sensitive_count_dict:
                            sensitive_count_dict[word] += sensitive_words_dict[word]
                        else:
                            sensitive_count_dict[word] = sensitive_words_dict[word]
                    r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
                else:
                    r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_words_dict))

            # decide whether this weibo belongs in a different (new) es index
            weibo_timestamp = item['timestamp']
            should_index_name_date = ts2datetime(weibo_timestamp)
            if should_index_name_date != now_index_name_date:
Example No. 18
import json

from time_utils import datetime2ts,ts2datetime
from global_utils import R_CLUSTER_FLOW3, R_CLUSTER_FLOW2, R_CLUSTER_FLOW1

uidlist = []
f = open("uid_list_0520.txt")
for line in f:
    uid = line.strip()
    uidlist.append(uid)
f.close()

data = []
dates = ["2016-05-14", "2016-05-15", "2016-05-16", "2016-05-17", "2016-05-18", "2016-05-19", "2016-05-20"]
tss = [datetime2ts(d) for d in dates]
for ts in tss:
   ns = "hashtag_" + str(ts)
   hashtag_list = R_CLUSTER_FLOW3.hmget(ns, uidlist)
   hashtag_list = [json.loads(h) if h else None for h in hashtag_list]
   uhlist = zip(uidlist, hashtag_list)
   uhtlist = []
   for uh in uhlist:
       uh = list(uh)
       uh.append(ts)
       uhtlist.append(uh)
   data.extend(uhtlist)

with open("hashtag_0521.txt", "w") as fw:
    for d in data:
        if d[1] is not None:
            fw.write("%s\n" % json.dumps(d))

at_data = []