def cal_sensitive_words_work(item, sw_list):
    # Merge this weibo's sensitive-word hits into the per-day, per-user
    # JSON counter stored in the Redis hash 'sensitive_<yyyymmdd>'.
    uid = item['uid']
    ts = ts2datetime(item['timestamp']).replace('-', '')
    word_count = {}
    for w in sw_list:
        # searchWord returns each hit as a byte sequence; rebuild the word.
        word = "".join([chr(x) for x in w]).decode('utf-8')
        if word not in word_count:
            word_count[word] = 1
        else:
            word_count[word] += 1
    try:
        sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
        sensitive_count_dict = json.loads(sensitive_count_string)
        for word, count in word_count.iteritems():
            if word in sensitive_count_dict:
                sensitive_count_dict[word] += count
            else:
                sensitive_count_dict[word] = count
        r_cluster.hset('sensitive_' + str(ts), str(uid),
                       json.dumps(sensitive_count_dict))
    except:
        # No counts stored yet for this user/day (hget returned None).
        r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(word_count))
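# The read-modify-write pattern above is repeated by almost every function in
# this section. A minimal, self-contained sketch of it as a reusable helper is
# given below; `update_count_hash` and the localhost Redis client are
# hypothetical names used for illustration, not part of the original codebase.
import json

import redis

r = redis.StrictRedis(host='localhost', port=6379)  # assumed local instance

def update_count_hash(redis_conn, hash_key, field, new_counts):
    # Merge a {word: count} dict into the JSON blob stored at hash_key/field.
    stored = redis_conn.hget(hash_key, field)
    merged = json.loads(stored) if stored else {}
    for word, count in new_counts.items():
        merged[word] = merged.get(word, 0) + count
    redis_conn.hset(hash_key, field, json.dumps(merged))

# usage: update_count_hash(r, 'sensitive_20140101', '12345', {u'word': 2})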
def cal_text_work(item):
    # Extract #hashtag# marks from the weibo text and merge their counts
    # into the per-day, per-user Redis hash 'hashtag_<day_ts>'.
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    text = item['text']
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        # All hashtags are unicode from here on.
        hashtag_dict = dict()
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except KeyError:
                hashtag_dict[hashtag] = 1
        try:
            hashtag_count_string = r_cluster.hget('hashtag_' + str(ts), str(uid))
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag, count in hashtag_dict.iteritems():
                try:
                    hashtag_count_dict[hashtag] += count
                except KeyError:
                    hashtag_count_dict[hashtag] = count
            r_cluster.hset('hashtag_' + str(ts), str(uid),
                           json.dumps(hashtag_count_dict))
        except:
            r_cluster.hset('hashtag_' + str(ts), str(uid),
                           json.dumps(hashtag_dict))
def cal_text_sensitive(item):
    # Scan the weibo text for configured sensitive words (plain substring
    # match against SENSITIVE_WORD) and merge the hits into Redis.
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    # Each sensitive word is reported at most once per weibo, however many
    # times it occurs in the text.
    sensitive_result = [word for word in SENSITIVE_WORD if word in text]
    if sensitive_result:
        sensitive_dict = dict()
        for word in sensitive_result:
            try:
                sensitive_dict[word] += 1
            except KeyError:
                sensitive_dict[word] = 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word, count in sensitive_dict.iteritems():
                try:
                    sensitive_count_dict[word] += count
                except KeyError:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_' + str(ts), str(uid),
                           json.dumps(sensitive_count_dict))
        except:
            r_cluster.hset('sensitive_' + str(ts), str(uid),
                           json.dumps(sensitive_dict))
def save_activity(uid, ts, time_segment):
    key = str(ts)
    try:
        activity_count_dict = r_cluster.hget('activity_' + key, str(uid))
        activity_count_dict = json.loads(activity_count_dict)
        try:
            activity_count_dict[str(time_segment)] += 1
        except:
            activity_count_dict[str(time_segment)] = 1
        r_cluster.hset('activity_' + key, str(uid), json.dumps(activity_count_dict))
    except:
        r_cluster.hset('activity_' + key, str(uid), json.dumps({str(time_segment): 1}))
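# save_activity expects a precomputed time_segment, but how segments are
# derived is not shown in this section. A plausible sketch, assuming fixed
# 15-minute buckets within the day (an assumption -- the real project may use
# a different granularity via get_queue_index), would be:
def make_time_segment(timestamp, seconds_per_segment=900):
    # Seconds elapsed since midnight, floored to the start of its bucket.
    seconds_into_day = int(timestamp) % 86400
    return seconds_into_day - seconds_into_day % seconds_per_segment

# usage: save_activity(uid, ts, make_time_segment(item['timestamp']))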
def save_city(uid, ip, timestamp, sensitive):
    ts = ts2datetime(timestamp).replace('-', '')
    try:
        if sensitive:
            ip_count_string = r_cluster.hget('sensitive_ip_' + str(ts), str(uid))
        else:
            ip_count_string = r_cluster.hget('ip_' + str(ts), str(uid))
        ip_count_dict = json.loads(ip_count_string)
        try:
            ip_count_dict[str(ip)] += 1
        except:
            ip_count_dict[str(ip)] = 1
        if sensitive:
            r_cluster.hset('sensitive_ip_' + str(ts), str(uid), json.dumps(ip_count_dict))
        else:
            r_cluster.hset('ip_' + str(ts), str(uid), json.dumps(ip_count_dict))
    except:
        if sensitive:
            r_cluster.hset('sensitive_ip_' + str(ts), str(uid), json.dumps({str(ip): 1}))
        else:
            r_cluster.hset('ip_' + str(ts), str(uid), json.dumps({str(ip): 1}))
def save_at(uid, at_uid, timestamp, sensitive):
    ts = ts2datetime(timestamp).replace('-', '')
    try:
        if sensitive:
            ruid_count_string = r_cluster.hget('sensitive_at_' + str(ts), str(uid))
        else:
            ruid_count_string = r_cluster.hget('at_' + str(ts), str(uid))
        ruid_count_dict = json.loads(ruid_count_string)
        try:
            ruid_count_dict[str(at_uid)] += 1
        except:
            ruid_count_dict[str(at_uid)] = 1
        if sensitive:
            r_cluster.hset('sensitive_at_' + str(ts), str(uid), json.dumps(ruid_count_dict))
        else:
            r_cluster.hset('at_' + str(ts), str(uid), json.dumps(ruid_count_dict))
    except:
        if sensitive:
            r_cluster.hset('sensitive_at_' + str(ts), str(uid), json.dumps({str(at_uid): 1}))
        else:
            r_cluster.hset('at_' + str(ts), str(uid), json.dumps({str(at_uid): 1}))
def save_city(uid, ip, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    try:
        ip_count_string = r_cluster.hget('ip_' + str(ts), str(uid))
        ip_count_dict = json.loads(ip_count_string)
        try:
            ip_count_dict[str(ip)] += 1
        except:
            ip_count_dict[str(ip)] = 1
        r_cluster.hset('ip_' + str(ts), str(uid), json.dumps(ip_count_dict))
    except:
        r_cluster.hset('ip_' + str(ts), str(uid), json.dumps({str(ip): 1}))
def save_at(uid, at_uid, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    try:
        ruid_count_string = r_cluster.hget('at_' + str(ts), str(uid))
        ruid_count_dict = json.loads(ruid_count_string)
        try:
            ruid_count_dict[str(at_uid)] += 1
        except:
            ruid_count_dict[str(at_uid)] = 1
        r_cluster.hset('at_' + str(ts), str(uid), json.dumps(ruid_count_dict))
    except:
        r_cluster.hset('at_' + str(ts), str(uid), json.dumps({str(at_uid): 1}))
def save_city_timestamp(uid, ip, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    try:
        ip_timestamp_string = r_cluster.hget('new_ip_' + str(ts), str(uid))
        ip_timestamp_string_dict = json.loads(ip_timestamp_string)
        try:
            add_string = '&' + str(timestamp)
            ip_timestamp_string_dict[str(ip)] += add_string
        except:
            ip_timestamp_string_dict[str(ip)] = str(timestamp)
        r_cluster.hset('new_ip_' + str(ts), str(uid), json.dumps(ip_timestamp_string_dict))
    except:
        r_cluster.hset('new_ip_' + str(ts), str(uid), json.dumps({str(ip): str(timestamp)}))
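# save_city_timestamp stores each user's IP history as a '&'-joined string of
# UNIX timestamps ("ts1&ts2&..."). A minimal sketch of reading that format
# back, assuming the same r_cluster connection and key layout as above
# (`load_ip_timestamps` is a hypothetical helper name):
import json

def load_ip_timestamps(redis_conn, ts, uid):
    raw = redis_conn.hget('new_ip_' + str(ts), str(uid))
    if not raw:
        return {}
    ip_dict = json.loads(raw)
    # e.g. {'1.2.3.4': [1411052800, 1411052900], ...}
    return dict((ip, [int(t) for t in s.split('&')])
                for ip, s in ip_dict.items())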
def cal_hashtag_work(item, sensitive):
    # Like cal_text_work, but routes counts to 'sensitive_hashtag_<yyyymmdd>'
    # when the weibo contains sensitive words.
    text = item['text']
    uid = item['uid']
    ts = ts2datetime(item['timestamp']).replace('-', '')
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        hashtag_dict = {}
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except KeyError:
                hashtag_dict[hashtag] = 1
        hash_key = 'sensitive_hashtag_' if sensitive else 'hashtag_'
        try:
            hashtag_count_string = r_cluster.hget(hash_key + str(ts), str(uid))
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag, count in hashtag_dict.iteritems():
                try:
                    hashtag_count_dict[hashtag] += count
                except KeyError:
                    hashtag_count_dict[hashtag] = count
            r_cluster.hset(hash_key + str(ts), str(uid), json.dumps(hashtag_count_dict))
        except:
            r_cluster.hset(hash_key + str(ts), str(uid), json.dumps(hashtag_dict))
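# The #...# hashtag regex used by cal_text_work and cal_hashtag_work matches
# ASCII word characters plus several CJK ranges between paired '#' marks. A
# quick check of its behaviour (the sample text is made up):
import re

RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
print RE.findall(u'test #hello# and #世界# end')  # [u'hello', u'世界']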
def save_at(uid, at_uid, timestamp, sensitive): ts = ts2datetime(timestamp).replace('-','') key = str(uid) try: if sensitive: ruid_count_string = r_cluster.hget('sensitive_at_'+str(ts), str(uid)) else: ruid_count_string = r_cluster.hget('at_'+str(ts), str(uid)) ruid_count_dict = json.loads(ruid_count_string) try: ruid_count_dict[str(at_uid)] += 1 except: ruid_count_dict[str(at_uid)] = 1 if sensitive: r_cluster.hset('sensitive_at_'+str(ts), str(uid), json.dumps(ruid_count_dict)) else: r_cluster.hset('at_'+str(ts), str(uid), json.dumps(ruid_count_dict)) except: if sensitive: r_cluster.hset('sensitive_at_'+str(ts), str(uid), json.dumps({str(at_uid):1})) else: r_cluster.hset('at_'+str(ts), str(uid), json.dumps({str(at_uid):1}))
# Fragment: body of the per-document loop (see test() below for full context).
ts = datetime2ts(date)
if sensitive_words_dict:
    #print 'sensitive_words_dict...keys[0]...', sensitive_words_dict.keys()[0]
    sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
    if sensitive_count_string:  # guard: hget may return None
        sensitive_count_dict = json.loads(sensitive_count_string)
        for word in sensitive_words_dict.keys():
            if sensitive_count_dict.has_key(word):
                sensitive_count_dict[word] += sensitive_words_dict[word]
            else:
                sensitive_count_dict[word] = sensitive_words_dict[word]
        r_cluster.hset('sensitive_' + str(ts), str(uid),
                       json.dumps(sensitive_count_dict))
    else:
        r_cluster.hset('sensitive_' + str(ts), str(uid),
                       json.dumps(sensitive_words_dict))
# identify whether to map to a new es index
weibo_timestamp = item['timestamp']
#should_index_name_date = ts2datetime(weibo_timestamp)
#if should_index_name_date != now_index_name_date:
if action != [] and xdata != []:
    #index_name = index_name_pre + now_index_name_date
    if bulk_action:
        es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60)
def cal_propage_work(item, sensitive_words):
    cluster_redis = R_CLUSTER_FLOW1
    user = str(item['uid'])
    uid = str(item['uid'])
    followers_count = item['user_fansnum']
    friends_count = item.get("user_friendsnum", 0)
    cluster_redis.hset(user, 'user_fansnum', followers_count)
    cluster_redis.hset(user, 'user_friendsnum', friends_count)

    retweeted_uid = str(item['root_uid'])
    retweeted_mid = str(item['root_mid'])
    message_type = int(item['message_type'])
    mid = str(item['mid'])
    timestamp = item['timestamp']
    text = item['text']

    sw_list = searchWord(text.encode('utf-8'))
    sensitive_result = len(sw_list)
    if sensitive_result:
        date = ts2datetime(timestamp)
        ts = datetime2ts(date)
        word_count = {}
        for w in sw_list:
            word = "".join([chr(x) for x in w]).decode('utf-8')
            if word not in word_count:
                word_count[word] = 1
            else:
                word_count[word] += 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word, count in word_count.iteritems():
                if word in sensitive_count_dict:
                    sensitive_count_dict[word] += count
                else:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_count_dict))
        except:
            # No counts stored yet for this user/day.
            r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(word_count))

    if message_type == 1:  # origin weibo
        cluster_redis.sadd('user_set', user)
        if sensitive_result:
            cluster_redis.hset('s_' + user, mid + '_origin_weibo_timestamp', timestamp)
        else:
            cluster_redis.hset(user, mid + '_origin_weibo_timestamp', timestamp)

    elif message_type == 2:  # comment weibo
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_comment_weibo', retweeted_mid) or \
           cluster_redis.sismember('s_' + user + '_comment_weibo', retweeted_mid):
            return
        #RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        #nicknames = RE.findall(text)
        if not sensitive_result:
            cluster_redis.sadd(user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(user, 'comment_weibo', 1)
            if 1:  #if len(nicknames) == 0:
                cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1)
                cluster_redis.hincrby(retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset(retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """ disabled:
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby(str(_id), retweeted_mid + '_retweeted_weibo_comment', 1)
                    cluster_redis.hincrby(str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset(str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """
        else:
            cluster_redis.sadd('s_' + user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_' + user, 'comment_weibo', 1)
            if 1:  #if len(nicknames) == 0:
                cluster_redis.hincrby('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1)
                cluster_redis.hincrby('s_' + retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """ disabled:
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment', 1)
                    cluster_redis.hincrby('s_' + str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """

    elif message_type == 3:  # retweet
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_retweeted_weibo', retweeted_mid) or \
           cluster_redis.sismember('s_' + user + '_retweeted_weibo', retweeted_mid):
            return
        """ disabled:
        RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        nicknames = RE.findall(text)
        """
        if not sensitive_result:
            cluster_redis.sadd(user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset(user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' % queue_index, 1)
            cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1)
            """ disabled:
            if len(nicknames) != 0:
                for nick_id in nicknames:
                    _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id)
                    print _id
                    single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id, _id)
                    if _id:
                        cluster_redis.hincrby(str(_id), retweeted_mid + '_retweeted_weibo_retweeted', 1)
                        cluster_redis.hset(str(_id), 'retweeted_weibo_retweeted_timestamp', timestamp)
                        cluster_redis.hincrby(str(_id), 'retweeted_weibo_retweeted_timestamp_%s' % queue_index, 1)
            """
        else:
            cluster_redis.sadd('s_' + user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset('s_' + user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_' + retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' % queue_index, 1)
            cluster_redis.hincrby('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1)
def test(ft_type):
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()
    DFA = createWordTree()
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }

    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es, query=query_body, size=1000,
                                   index=index_name, doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']
                    ts = datetime2ts(date)

                    # add sentiment field to weibo
                    sentiment, keywords_list = triple_classifier(item)
                    # add keywords to weibo
                    keywords_dict, keywords_string = get_weibo_keywords(keywords_list)

                    # sensitive_words_dict
                    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})

                    # redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # guard: hget may return None
                            sensitive_count_dict = json.loads(sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if sensitive_count_dict.has_key(word):
                                    sensitive_count_dict[word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    # sensitive score
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]

                    # directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag
                    hashtag = ''
                    RE = re.compile(u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]')
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    # bulk update action
                    action = {'update': {'_id': _id}}
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }
                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1
                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            # use the index type chosen above, not the
                            # facebook one unconditionally
                            es.bulk(bulk_action, index=index_name,
                                    doc_type=index_type, timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            if bulk_action:
                es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=600)
        except Exception, e:
            # the es index for this date does not exist
            print e
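# test() combines elasticsearch.helpers.scan with partial bulk updates.
# Stripped to its skeleton below; `update_all` and `make_doc` are hypothetical
# names, and `es` is assumed to be an Elasticsearch client as elsewhere in
# this section:
from elasticsearch.helpers import scan

def update_all(es, index_name, doc_type, make_doc, batch_size=1000):
    # Stream every document, build a partial-update pair per hit, and flush
    # the accumulated actions in batches.
    bulk_action = []
    for hit in scan(es, query={'query': {'match_all': {}}},
                    index=index_name, doc_type=doc_type, size=batch_size):
        bulk_action.append({'update': {'_id': hit['_id']}})
        bulk_action.append({'doc': make_doc(hit['_source'])})
        if len(bulk_action) >= 2 * batch_size:
            es.bulk(bulk_action, index=index_name, doc_type=doc_type)
            bulk_action = []
    if bulk_action:
        es.bulk(bulk_action, index=index_name, doc_type=doc_type)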
# Fragment: body of a bulk-indexing loop that rolls the es index over by date.
item['sensitive_words_dict'] = json.dumps({})
timestamp = item['timestamp']
date = ts2datetime(timestamp)
ts = datetime2ts(date)
if sensitive_words_dict:
    print sensitive_words_dict.keys()[0]
    sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
    if sensitive_count_string:  # guard: hget may return None
        sensitive_count_dict = json.loads(sensitive_count_string)
        for word in sensitive_words_dict.keys():
            if sensitive_count_dict.has_key(word):
                sensitive_count_dict[word] += sensitive_words_dict[word]
            else:
                sensitive_count_dict[word] = sensitive_words_dict[word]
        r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_count_dict))
    else:
        r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_words_dict))

# identify whether to map to a new es index
weibo_timestamp = item['timestamp']
should_index_name_date = ts2datetime(weibo_timestamp)
if should_index_name_date != now_index_name_date:
    if action != [] and xdata != []:
        index_name = index_name_pre + now_index_name_date
        if bulk_action:
            es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60)
        bulk_action = []
        count = 0
        now_index_name_date = should_index_name_date
        index_name = index_name_pre + now_index_name_date
def cal_propage_work(item, sensitive_words):
    # Variant of cal_propage_work that keys the sensitive-word counters by a
    # yyyymmdd string instead of a day-start UNIX timestamp.
    cluster_redis = R_CLUSTER_FLOW1
    user = str(item['uid'])
    uid = str(item['uid'])
    followers_count = item['user_fansnum']
    friends_count = item.get("user_friendsnum", 0)
    cluster_redis.hset(user, 'user_fansnum', followers_count)
    cluster_redis.hset(user, 'user_friendsnum', friends_count)

    retweeted_uid = str(item['root_uid'])
    retweeted_mid = str(item['root_mid'])
    message_type = int(item['message_type'])
    mid = str(item['mid'])
    timestamp = item['timestamp']
    text = item['text']

    sw_list = searchWord(text.encode('utf-8'))
    sensitive_result = len(sw_list)
    if sensitive_result:
        ts = ts2datetime(timestamp).replace('-', '')
        word_count = {}
        for w in sw_list:
            word = "".join([chr(x) for x in w]).decode('utf-8')
            if word not in word_count:
                word_count[word] = 1
            else:
                word_count[word] += 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word, count in word_count.iteritems():
                if word in sensitive_count_dict:
                    sensitive_count_dict[word] += count
                else:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_count_dict))
        except:
            r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(word_count))

    if message_type == 1:  # origin weibo
        cluster_redis.sadd('user_set', user)
        if sensitive_result:
            cluster_redis.hset('s_' + user, mid + '_origin_weibo_timestamp', timestamp)
        else:
            cluster_redis.hset(user, mid + '_origin_weibo_timestamp', timestamp)

    elif message_type == 2:  # comment weibo
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_comment_weibo', retweeted_mid) or \
           cluster_redis.sismember('s_' + user + '_comment_weibo', retweeted_mid):
            return
        #RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        #nicknames = RE.findall(text)
        if not sensitive_result:
            cluster_redis.sadd(user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(user, 'comment_weibo', 1)
            if 1:  #if len(nicknames) == 0:
                cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1)
                cluster_redis.hincrby(retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset(retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """ disabled:
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby(str(_id), retweeted_mid + '_retweeted_weibo_comment', 1)
                    cluster_redis.hincrby(str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset(str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """
        else:
            cluster_redis.sadd('s_' + user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_' + user, 'comment_weibo', 1)
            if 1:  #if len(nicknames) == 0:
                cluster_redis.hincrby('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1)
                cluster_redis.hincrby('s_' + retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """ disabled:
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment', 1)
                    cluster_redis.hincrby('s_' + str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """

    elif message_type == 3:  # retweet
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_retweeted_weibo', retweeted_mid) or \
           cluster_redis.sismember('s_' + user + '_retweeted_weibo', retweeted_mid):
            return
        """ disabled:
        RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        nicknames = RE.findall(text)
        """
        if not sensitive_result:
            cluster_redis.sadd(user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset(user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' % queue_index, 1)
            cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1)
            """ disabled:
            if len(nicknames) != 0:
                for nick_id in nicknames:
                    _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id)
                    print _id
                    single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id, _id)
                    if _id:
                        cluster_redis.hincrby(str(_id), retweeted_mid + '_retweeted_weibo_retweeted', 1)
                        cluster_redis.hset(str(_id), 'retweeted_weibo_retweeted_timestamp', timestamp)
                        cluster_redis.hincrby(str(_id), 'retweeted_weibo_retweeted_timestamp_%s' % queue_index, 1)
            """
        else:
            cluster_redis.sadd('s_' + user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset('s_' + user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_' + retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' % queue_index, 1)
            cluster_redis.hincrby('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1)
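# Fields that cal_propage_work reads from `item`, collected from the code
# above; the values shown are made-up examples:
sample_item = {
    'uid': '1234567890',
    'user_fansnum': 100,
    'user_friendsnum': 50,        # optional, defaults to 0
    'root_uid': '987654321',      # author of the original weibo
    'root_mid': '3901234567890',  # mid of the original weibo
    'message_type': 2,            # 1 = origin, 2 = comment, 3 = retweet
    'mid': '3909876543210',
    'timestamp': 1411052800,
    'text': u'...',
}
# cal_propage_work(sample_item, sensitive_words) would then update both the
# per-user propagation hashes and the per-day sensitive-word counters.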