def get_importance(uid, domain, topic):
    """Compute a weighted importance score for one user.

    Combines domain-label weights, topic-label weights, and the user's
    fan count plus total retweeted count read from the daily 'bci' index.

    Parameters:
        uid: user id, used as the ES document id
        domain: space-separated domain labels
        topic: space-separated topic labels
    Returns:
        the weighted score, or 0 when the ES document cannot be read.
    """
    # Sum the weight of every recognized domain label; unknown labels
    # contribute 0 (previously hidden behind a bare except). Renamed the
    # loop variables, which shadowed the parameters.
    domain_result = 0
    for domain_item in domain.split(' '):
        domain_result += domain_weight_dict.get(domain_item, 0)
    topic_result = 0
    for topic_item in topic.split(' '):
        topic_result += topic_weight_dict.get(topic_item, 0)
    # Read fansnum and the retweeted totals from yesterday's bci index.
    now_ts = time.time()
    date = ts2datetime(now_ts-3600*24)
    #test
    date = '2013-09-07'
    index_time = ''.join(date.split('-'))
    index_type = 'bci'
    try:
        es_result = es.get(index=index_time, doc_type=index_type, id=uid)['_source']
        fansnum = es_result['user_fansnum']
        retweetednum = es_result['origin_weibo_retweeted_total_number'] + es_result['retweeted_weibo_retweeted_total_number']
        result = importance_weight_dict['fansnum']*fansnum + importance_weight_dict['retweeted_num']*retweetednum + \
                 importance_weight_dict['domain']*domain_result + importance_weight_dict['topic']*topic_result
        return result
    except Exception:
        # Best-effort: missing document/field yields importance 0.
        # Narrowed from a bare except so SystemExit/KeyboardInterrupt escape.
        return 0
def get_influence(uid):
    """Return the 'user_index' influence value stored for *uid* in
    yesterday's daily bci index, or 0 when the lookup fails."""
    yesterday = ts2datetime(time.time() - 3600*24)
    index_name = ''.join(yesterday.split('-'))
    try:
        influence = es.get(index=index_name, id=uid, doc_type='bci')['_source']['user_index']
    except:
        return 0
    return influence
def _first_int_field(mget_item, field):
    # First value of a multi-get 'fields' entry, or 0 when the field is
    # missing/empty or not an int. The original indexed
    # item['fields'][field][0] directly, which raised an uncaught
    # KeyError for a found doc that lacked one of the requested fields.
    try:
        value = mget_item['fields'][field][0]
    except (KeyError, IndexError, TypeError):
        return 0
    return value if isinstance(value, int) else 0

def update_day_profile(uid_list):
    """Fetch per-user counters from the bci history index.

    Returns {uid: {'statusnum': int, 'fansnum': int, 'friendsnum': int}};
    any uid whose document (or individual field) is missing gets 0 for
    the affected counters.
    """
    try:
        bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={'ids':uid_list}, fields=['user_fansnum', 'weibo_month_sum', 'user_friendsnum'])['docs']
    except:
        # best-effort: a failed mget degrades to all-zero profiles
        bci_history_result = []
    result_dict = dict()
    # mget preserves request order, so docs align with uid_list by index.
    for iter_count, uid in enumerate(uid_list):
        try:
            bci_history_item = bci_history_result[iter_count]
        except IndexError:
            bci_history_item = {}
        if bci_history_item and bci_history_item.get('found') == True:
            statusnum = _first_int_field(bci_history_item, 'weibo_month_sum')
            fansnum = _first_int_field(bci_history_item, 'user_fansnum')
            friendsnum = _first_int_field(bci_history_item, 'user_friendsnum')
        else:
            statusnum = 0
            fansnum = 0
            friendsnum = 0
        result_dict[uid] = {'statusnum': statusnum, 'fansnum': fansnum, 'friendsnum': friendsnum}
    return result_dict
def search_sensitive_weibo(index_name):
    """Return the uids of users with at least one sensitive origin or
    retweeted weibo in the given daily bci index."""
    # Match documents where either sensitive counter is positive.
    sensitive_filter = {
        "bool": {
            "should": [
                {"range": {"s_retweeted_weibo_number": {"gt": 0}}},
                {"range": {"s_origin_weibo_number": {"gt": 0}}},
            ]
        }
    }
    query_body = {
        "query": {"filtered": {"filter": sensitive_filter}},
        "size": 10000000,
    }
    hits = es_cluster.search(index=index_name, doc_type="bci", body=query_body)['hits']['hits']
    return [hit['_source']['uid'] for hit in hits]
def get_influence(uid):
    """Return the stored 'user_index' influence value for *uid*, or 0
    when the daily bci document cannot be read."""
    now_date = ts2datetime(time.time() - 3600*24)
    # test: hard-coded date overrides the computed one
    now_date = '2013-09-07'
    index_name = now_date.replace('-', '')
    try:
        return es.get(index=index_name, id=uid, doc_type='bci')['_source']['user_index']
    except:
        return 0
def main(es): """ update all user in a day """ index_name = "20130903" #index_name = time.strftime("%Y%m%d", time.localtime(time.time()-86400)) bool = es.indices.exists(index=index_name) print bool if not bool: print "no index exist" sys.exit(0) user_rank = 0 bulk_action = [] n_range = range(0, 100000, 10000) tb = time.time() for left_range in n_range: result = search_rank(index_name, left_range, 10000) for item in result: update_info = {} user_rank += 1 update_info["user"] = item["_id"] update_info["rank"] = user_rank x = update_index_action(update_info, "rank", user_rank) bulk_action.extend((x[0], x[1])) if user_rank % 1000 == 0: while 1: try: es.bulk(bulk_action, index=index_name, doc_type="bci", timeout=30) bulk_action = [] break except Exception, r: es = ES_CLUSTER_FLOW1 print user_rank if user_rank % 10000 == 0: ts = time.time() print "%s : %s" % (user_rank, ts - tb) tb = ts
def main(es): """ update all user in a day """ index_name = "20130903" #index_name = time.strftime("%Y%m%d", time.localtime(time.time()-86400)) bool = es.indices.exists(index=index_name) print bool if not bool: print "no index exist" sys.exit(0) user_rank = 0 bulk_action = [] n_range = range(0,100000,10000) tb = time.time() for left_range in n_range: result = search_rank(index_name, left_range, 10000) for item in result: update_info = {} user_rank += 1 update_info["user"] = item["_id"] update_info["rank"] = user_rank x = update_index_action(update_info, "rank", user_rank) bulk_action.extend((x[0], x[1])) if user_rank % 1000 == 0: while 1: try: es.bulk(bulk_action, index=index_name, doc_type="bci", timeout=30) bulk_action = [] break except Exception, r: es = ES_CLUSTER_FLOW1 print user_rank if user_rank % 10000 == 0: ts = time.time() print "%s : %s" %(user_rank, ts - tb) tb = ts
def search_rank(index_name, start_point, size, index_type="bci"):
    """Return one page of users ordered by descending 'user_index'.

    start_point/size implement from/size paging; only hit metadata is
    returned (_source disabled).
    """
    page_query = {
        "query": {"match_all": {}},
        "sort": [{"user_index": {"order": "desc"}}],
        "from": start_point,
        "size": size,
    }
    response = es.search(index=index_name, doc_type=index_type, body=page_query, _source=False)
    return response['hits']['hits']
def get_influence(uid_list):
    """Look up influence documents for uid_list in yesterday's bci index.

    NOTE(review): this copy appears truncated -- 'result' is initialized
    and the mget is issued, but nothing is ever returned.
    """
    result = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(now_ts - DAY)
    else:
        # test mode: derive the date from the configured RUN_TEST_TIME
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)
    index_time = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    try:
        es_result = es.mget(index=index_time, doc_type=index_type, body={'ids': uid_list})['docs']
    except Exception, e:
        raise e
def get_influence(uid_list):
    """Look up 'user_index' fields for uid_list in yesterday's bci index
    (source disabled, only the user_index field requested).

    NOTE(review): this copy appears truncated -- 'result' is initialized
    and the mget is issued, but nothing is ever returned.
    """
    result = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(now_ts - DAY)
    else:
        # test mode: derive the date from the configured RUN_TEST_TIME
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)
    index_time = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    try:
        es_result = es.mget(index=index_time, doc_type=index_type, body={'ids': uid_list}, _source=False, fields=['user_index'])['docs']
    except Exception, e:
        raise e
def search_from_es(date): index_time = 'bci_' + date.replace('-', '') index_type = 'bci' print index_time query_body = { 'query':{ 'match_all':{} }, 'size':k, 'sort':[{'user_index':{'order':'desc'}}] } result = es_bci.search(index=index_time, doc_type=index_type, body=query_body, _source=False, fields=['user_index'])['hits']['hits'] user_set = [] user_set = [user_dict['_id'] for user_dict in result] return set(user_set), result
def search_sensitive_weibo(index_name):
    """Return the uids of the 2000 users ranked highest by
    's_origin_weibo_comment_top_number' in the given bci index."""
    top_comment_query = {
        "query": {"match_all": {}},
        "sort": {'s_origin_weibo_comment_top_number': {"order": "desc"}},
        "size": 2000,
    }
    hits = es_cluster.search(index=index_name, doc_type="bci", body=top_comment_query)['hits']['hits']
    return [hit['_source']['uid'] for hit in hits]
def search_top_k(index_name, top_k):
    """Return the uids of the *top_k* users with the highest
    'user_index' value in the given bci index."""
    ranking_query = {
        "query": {"match_all": {}},
        "size": top_k,
        "sort": [{"user_index": {"order": "desc"}}],
    }
    hits = es_cluster.search(index=index_name, doc_type="bci", body=ranking_query)["hits"]["hits"]
    return [hit['_source']['uid'] for hit in hits]
def get_evaluate_max(index_name): max_result = {} index_type = 'bci' evaluate_index = ['user_index'] for evaluate in evaluate_index: query_body = { 'query':{ 'match_all':{} }, 'size':1, 'sort':[{evaluate: {'order': 'desc'}}] } try: result = es_cluster.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits'] except Exception, e: raise e max_evaluate = result[0]['_source'][evaluate] max_result[evaluate] = max_evaluate
def search_rank(index_name, start_point, size, index_type="bci"):
    """Fetch one page of hits sorted by 'user_index' (descending).

    Returns only hit metadata (_source disabled); paging is controlled
    by start_point (offset) and size (page length).
    """
    body = dict(
        query={"match_all": {}},
        sort=[{"user_index": {"order": "desc"}}],
    )
    # 'from' is a Python keyword, so it cannot be a keyword argument above.
    body["from"] = start_point
    body["size"] = size
    return es.search(index=index_name, doc_type=index_type, body=body, _source=False)['hits']['hits']
def search_top_k(index_name, top_k):
    """Return uids of the top_k users, ranked by descending 'user_index'."""
    hits = es_cluster.search(
        index=index_name,
        doc_type="bci",
        body={
            "query": {"match_all": {}},
            "size": top_k,
            "sort": [{"user_index": {"order": "desc"}}],
        },
    )["hits"]["hits"]
    sensitive_uid = []
    for hit in hits:
        sensitive_uid.append(hit['_source']['uid'])
    return sensitive_uid
def search_from_es(date): index_time = 'bci_' + date.replace('-', '') index_type = 'bci' print index_time query_body = { 'query': { 'match_all': {} }, 'size': k, 'sort': [{ 'user_index': { 'order': 'desc' } }] } result = es_bci.search(index=index_time, doc_type=index_type, body=query_body, _source=False, fields=['user_index'])['hits']['hits'] user_set = [] user_set = [user_dict['_id'] for user_dict in result] return set(user_set), result
def search_sensitive_weibo(index_name):
    """Uids of users with any sensitive (origin or retweeted) weibo
    activity recorded in the given daily bci index."""
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        # a positive count in either sensitive counter matches
                        "should": [
                            {"range": {"s_retweeted_weibo_number": {"gt": 0}}},
                            {"range": {"s_origin_weibo_number": {"gt": 0}}},
                        ]
                    }
                }
            }
        },
        "size": 10000000,
    }
    matched = es_cluster.search(index=index_name, doc_type="bci", body=query_body)['hits']['hits']
    sensitive_uid = [item['_source']['uid'] for item in matched]
    return sensitive_uid
def get_attr_bci(uid_list):
    """Aggregate 7 days of bci statistics for uid_list.

    Accumulates cross-user weibo counters over the last 7 daily indexes,
    tracks each user's top retweeted/commented weibo, joins against the
    user_portrait index for normalized activeness/importance/influence,
    and returns
    ({'user_influence_list': json, 'total_weibo_number': n}, influence_dict).
    """
    results = []
    now_ts = time.time()
    now_date = ts2datetime(now_ts - 24*3600)
    ts = datetime2ts(now_date)
    #test
    ts = datetime2ts('2013-09-07')
    user_results = {} # {'uid':{'origin_max..':[], ''}}
    # Cross-user counters accumulated over the 7-day window.
    total_weibo_number = 0
    fans_number = 0
    origin_weibo_number = 0
    retweeted_weibo_number = 0
    origin_weibo_retweeted_total_number = 0
    origin_weibo_comment_total_number = 0
    retweeted_weibo_retweeted_total_number = 0
    retweeted_weibo_comment_total_number = 0
    origin_weibo_retweeted_top = 0
    origin_weibo_comment_top = 0
    retweeted_weibo_retweeted_top = 0
    retweeted_weibo_comment_top = 0
    influence_dict = {}
    # Walk the last 7 daily indexes (index name = yyyymmdd).
    for i in range(0, 7):
        timestamp = ts - i*24*3600
        date = ts2datetime(timestamp)
        hash_key = ''.join(date.split('-'))
        es_user_results = es_cluster.mget(index=hash_key, doc_type='bci', body={'ids':uid_list})['docs']
        for user_dict in es_user_results:
            try:
                user_item = user_dict['_source']
            except:
                # NOTE(review): 'next' here is a no-op expression, not
                # 'continue'; when _source is missing this falls through
                # and the following line raises NameError on user_item.
                next
            uid = user_item['user']
            total_weibo_number += user_item['origin_weibo_number']
            total_weibo_number += user_item['retweeted_weibo_number']
            # yuankun revise
            origin_weibo_number += user_item['origin_weibo_number']
            retweeted_weibo_number += user_item['retweeted_weibo_number']
            origin_weibo_retweeted_top += user_item['origin_weibo_retweeted_top_number']
            origin_weibo_comment_top += user_item['origin_weibo_comment_top_number']
            retweeted_weibo_retweeted_top += user_item['retweeted_weibo_retweeted_top_number']
            retweeted_weibo_comment_top += user_item['retweeted_weibo_comment_top_number']
            #print 'user_item:', user_item
            if uid in user_results:
                # Append today's [count, weibo_id] pair for each top metric;
                # the except arm backfills keys missing from the first day.
                try:
                    user_results[uid]['origin_weibo_retweeted_top'].append([user_item['origin_weibo_retweeted_top_number'], user_item['origin_weibo_top_retweeted_id']])
                    user_results[uid]['origin_weibo_comment_top'].append([user_item['origin_weibo_comment_top_number'], user_item['origin_weibo_top_comment_id']])
                    user_results[uid]['retweeted_weibo_retweeted_top'].append([user_item['retweeted_weibo_retweeted_top_number'], user_item['retweeted_weibo_top_retweeted_id']])
                    user_results[uid]['retweeted_weibo_comment_top'].append([user_item['retweeted_weibo_comment_top_number'], user_item['retweeted_weibo_top_comment_id']])
                except:
                    user_results[uid]['origin_weibo_retweeted_top'] = [[user_item['origin_weibo_retweeted_top_number'], user_item['origin_weibo_top_retweeted_id']]]
                    user_results[uid]['origin_weibo_comment_top'] = [[user_item['origin_weibo_comment_top_number'], user_item['origin_weibo_top_comment_id']]]
                    user_results[uid]['retweeted_weibo_retweeted_top'] = [[user_item['retweeted_weibo_retweeted_top_number'], user_item['retweeted_weibo_top_retweeted_id']]]
                    user_results[uid]['retweeted_weibo_comment_top'] = [[user_item['retweeted_weibo_comment_top_number'], user_item['retweeted_weibo_top_comment_id']]]
            else:
                #print 'user_item:', [[user_item['origin_weibo_retweeted_top_number'], user_item['origin_weibo_top_retweeted_id']]]
                # NOTE(review): each assignment below REPLACES the whole
                # dict, so only the last key survives the first day; the
                # try/except above later repairs the missing keys.
                user_results[uid] = {'origin_weibo_retweeted_top':[[user_item['origin_weibo_retweeted_top_number'], user_item['origin_weibo_top_retweeted_id']]]}
                user_results[uid] = {'origin_weibo_comment_top': [[user_item['origin_weibo_comment_top_number'], user_item['origin_weibo_top_comment_id']]]}
                user_results[uid] = {'retweeted_weibo_retweeted_top': [[user_item['retweeted_weibo_retweeted_top_number'], user_item['retweeted_weibo_top_retweeted_id']]]}
                user_results[uid] = {'retweeted_weibo_comment_top': [[user_item['retweeted_weibo_comment_top_number'], user_item['retweeted_weibo_top_comment_id']]]}
            # yuankun need
            #print 'fan_num:', user_item['user_fansnum'], type(user_item['user_fansnum']), type(fans_number)
            fans_number += int(user_item['user_fansnum'])
            origin_weibo_retweeted_total_number += user_item['origin_weibo_retweeted_total_number']
            origin_weibo_comment_total_number += user_item['origin_weibo_comment_total_number']
            retweeted_weibo_retweeted_total_number += user_item['retweeted_weibo_retweeted_total_number']
            retweeted_weibo_comment_total_number += user_item['retweeted_weibo_comment_total_number']
    user_portrait_result = es.mget(index='user_portrait', doc_type='user', body={'ids':uid_list})['docs']
    #print 'user_portrait_result:', user_portrait_result[0]
    # get activeness max & importance max & influence max to normalize
    evaluate_max_result = get_evaluate_max()
    for user_portrait in user_portrait_result:
        #print 'user_portrait:', user_portrait
        try:
            user_portrait_dict = user_portrait['_source']
            #print 'user_portrait_dict:', user_portrait_dict
            uname = user_portrait_dict['uname']
            importance = user_portrait_dict['importance']
            # log-scale normalization onto roughly [0, 100]
            normal_importance = math.log((importance / evaluate_max_result['importance']) * 9 + 1, 10) * 100
            activeness = user_portrait_dict['activeness']
            normal_activeness = math.log(activeness / evaluate_max_result['activeness'] * 9 + 1, 10) * 100
            influence = user_portrait_dict['influence']
            normal_influence = math.log(influence / evaluate_max_result['influence'] * 9 + 1, 10) * 100
        except:
            # missing portrait fields degrade to empty-string markers
            uname = ''
            normal_importance = ''
            normal_activeness = ''
            normal_influence = ''
        #print 'user_portrait_dict:', user_portrait_dict
        uid = user_portrait_dict['uid']
        user_item_dict = user_results[uid]
        # Highest [count, weibo_id] pair across the 7 days for each metric.
        origin_weibo_retweeted_top_item = sorted(user_item_dict['origin_weibo_retweeted_top'], key=lambda x:x[0], reverse=True)[0]
        origin_weibo_comment_top_item = sorted(user_item_dict['origin_weibo_comment_top'], key=lambda x:x[0], reverse=True)[0]
        retweeted_weibo_retweeted_top_item = sorted(user_item_dict['retweeted_weibo_retweeted_top'], key=lambda x:x[0], reverse=True)[0]
        retweeted_weibo_comment_top_item = sorted(user_item_dict['retweeted_weibo_comment_top'], key=lambda x:x[0], reverse=True)[0]
        results.append([uid, uname, normal_activeness, normal_importance, normal_influence, origin_weibo_retweeted_top_item ,\
                        origin_weibo_comment_top_item, retweeted_weibo_retweeted_top_item, \
                        retweeted_weibo_comment_top_item])
    #yuankun need
    # 7-day averages; NOTE(review): division by zero if a weibo counter
    # (or uid_list) is empty, and this is integer division under Python 2.
    influence_dict['origin_weibo_retweeted_average_number'] = origin_weibo_retweeted_total_number/origin_weibo_number/7
    influence_dict['origin_weibo_comment_average_number'] = origin_weibo_comment_total_number/origin_weibo_number/7
    influence_dict['retweeted_weibo_retweeted_average_number'] = retweeted_weibo_retweeted_total_number/retweeted_weibo_number/7
    influence_dict['retweeted_weibo_comment_average_number'] = retweeted_weibo_comment_total_number/retweeted_weibo_number/7
    influence_dict['origin_weibo_retweeted_top_number'] = origin_weibo_retweeted_top/len(uid_list)/7
    influence_dict['origin_weibo_comment_top_number'] = origin_weibo_comment_top/len(uid_list)/7
    influence_dict['retweeted_weibo_retweeted_top_number'] = retweeted_weibo_retweeted_top/len(uid_list)/7
    influence_dict['retweeted_weibo_comment_top_number'] = retweeted_weibo_comment_top/len(uid_list)/7
    influence_dict['fans_number'] = fans_number
    influence_dict['total_weibo_number'] = total_weibo_number
    #print 'results:', results
    return {'user_influence_list': json.dumps(results), 'total_weibo_number': total_weibo_number}, influence_dict
# NOTE(review): fragment -- the lines below are the tail of a search
# function whose 'def' precedes this chunk: collect the uid of every
# matched bci document.
    result = es_cluster.search(index=index_name, doc_type="bci", body=query_body)['hits']['hits']
    sensitive_uid = []
    for item in result:
        sensitive_uid.append(item['_source']['uid'])
    return sensitive_uid

if __name__ == '__main__':
    '''
    f = open('sensitive_uid_list.txt', 'wb')
    uid_list = search_sensitive_weibo('20130904')
    for uid in uid_list:
        f.write(str(uid) + '\n')
    f.close()
    '''
    # For each uid in the list file, mark its sensitive_user_portrait
    # document with type=1 when it carries sensitive words, else type=0.
    f = open('sensitive_uid_list.txt', 'rb')
    for line in f:
        uid = line.strip()
        try:
            result = es_cluster.get(index='sensitive_user_portrait', doc_type='user', id=uid)['_source']
        except:
            # uid has no portrait document; report it and move on
            print uid
            continue
        if result['sensitive_words_string']:
            es.update(index='sensitive_user_portrait', doc_type='user', id=uid, body={"doc":{"type":1}})
        else:
            es.update(index='sensitive_user_portrait', doc_type='user', id=uid, body={"doc":{"type":0}})
#write in version:15-12-08 #input: uid_list #output: {uid:influence, ...} def get_influence(uid_list): result = {} now_ts = time.time() #run_type if RUN_TYPE = 1: now_date = ts2datetime(now_ts - DAY) else: now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY) index_time = 'bci_' + ''.join(now_date.split('-')) index_type = 'bci' try: es_result = es.mget(index=index_time, doc_type=index_type, body={'ids': uid_list})['docs'] except Exception, e: raise e for es_item in es_result: uid = es_item['_id'] if es_item['found'] == True: result[uid] = es_item['_source']['user_index'] else: result[uid] = 0 return result #use to get user importance #wirte in version:15-12-08 #input: domain, topic, user_fansnum, fansnum_max for one user
# NOTE(review): fragment -- continuation of main(): finish processing
# one page of hits inside the paging loops.
            update_info["rank"] = user_rank
            x = update_index_action(update_info, "rank", user_rank)
            bulk_action.extend((x[0], x[1]))
            # Flush every 1000 users; retry forever, failing over to the
            # backup cluster on error.
            if user_rank % 1000 == 0:
                while 1:
                    try:
                        es.bulk(bulk_action, index=index_name, doc_type="bci", timeout=30)
                        bulk_action = []
                        break
                    except Exception, r:
                        es = ES_CLUSTER_FLOW1
                print user_rank
            # Progress/timing report every 10000 users.
            if user_rank % 10000 == 0:
                ts = time.time()
                print "%s : %s" % (user_rank, ts - tb)
                tb = ts
    # Write any leftover actions from the final partial batch.
    if bulk_action:
        es.bulk(bulk_action, index=index_name, doc_type="bci", timeout=30)
    print "finish !"

if __name__ == "__main__":
    main(es)
# NOTE(review): fragment -- continuation of a user-evaluation routine:
# fetch the day's top sensitive score plus per-uid bci/profile/history
# documents for later scoring.
    sensitive_string = "sensitive_score_" + tmp_ts
    query_sensitive_body = {
        "query":{
            "match_all":{}
        },
        "size":1,
        "sort":{sensitive_string:{"order":"desc"}}
    }
    try:
        top_sensitive_result = es_bci_history.search(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body=query_sensitive_body, _source=False, fields=[sensitive_string])['hits']['hits']
        top_sensitive = top_sensitive_result[0]['fields'][sensitive_string][0]
    except Exception, reason:
        print Exception, reason
        # fallback ceiling when no sensitive score is available
        top_sensitive = 400
    index_type = 'bci'
    # Batch reads; mget results align with uid_list order by index.
    user_bci_result = es_cluster.mget(index=index_name, doc_type=index_type, body={'ids':uid_list}, _source=True)['docs']
    user_profile_result = es_user_profile.mget(index='weibo_user', doc_type='user', body={'ids':uid_list}, _source=True)['docs']
    bci_history_result = es_bci_history.mget(index=bci_history_index_name, doc_type=bci_history_index_type, body={"ids":uid_list}, fields=['user_fansnum', 'weibo_month_sum'])['docs']
    sensitive_history_result = es_bci_history.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={'ids':uid_list}, fields=[sensitive_string], _source=False)['docs']
    max_evaluate_influ = get_evaluate_max(index_name)
    for i in range(0, len(uid_list)):
        uid = uid_list[i]
        bci_dict = user_bci_result[i]
        profile_dict = user_profile_result[i]
        bci_history_dict = bci_history_result[i]
        sensitive_history_dict = sensitive_history_result[i]
        #print sensitive_history_dict
        try:
            bci_source = bci_dict['_source']
        except:
            # document missing: downstream code must handle None
            bci_source = None
# NOTE(review): fragment -- continuation of main(): assign ranks to one
# page of hits inside the outer paging loop.
        for item in result:
            update_info = {}
            user_rank += 1
            update_info["user"] = item["_id"]
            update_info["rank"] = user_rank
            x = update_index_action(update_info, "rank", user_rank)
            bulk_action.extend((x[0], x[1]))
            # Flush every 1000 users, retrying with the backup cluster
            # on failure.
            if user_rank % 1000 == 0:
                while 1:
                    try:
                        es.bulk(bulk_action, index=index_name, doc_type="bci", timeout=30)
                        bulk_action = []
                        break
                    except Exception, r:
                        es = ES_CLUSTER_FLOW1
                print user_rank
            # Timing report every 10000 users.
            if user_rank % 10000 == 0:
                ts = time.time()
                print "%s : %s" %(user_rank, ts - tb)
                tb = ts
    # Flush the trailing partial batch.
    if bulk_action:
        es.bulk(bulk_action, index=index_name, doc_type="bci", timeout=30)
    print "finish !"

if __name__ == "__main__":
    main(es)