# Matches hashtags written as "#topic#" and captures the topic between the
# two '#' signs. Compiled once at import time instead of on every call.
_HASHTAG_RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)


def cal_text_work(item):
    """Accumulate the hashtag counts of one post into the per-user Redis counter.

    Hashtags written as ``#topic#`` are extracted from ``item['text']`` and
    counted. The counts are merged into the JSON-encoded dict stored in the
    Redis hash ``'hashtag_' + str(ts)`` under the field ``str(uid)``, where
    ``ts`` is ``datetime2ts(ts2datetime(item['timestamp']))``.

    Args:
        item: Mapping with the keys ``'uid'``, ``'timestamp'`` and ``'text'``.
            ``'text'`` may be a byte string (decoded as UTF-8, undecodable
            bytes are ignored) or already-decoded text.

    Returns:
        None. Redis is neither read nor written when the text has no hashtag.
    """
    uid = item['uid']
    # NOTE(review): presumably this round trip snaps the timestamp to the
    # start of its day so all posts of one day share a hash -- confirm
    # against ts2datetime/datetime2ts.
    ts = datetime2ts(ts2datetime(item['timestamp']))

    text = item['text']
    # On Python 2 ``bytes`` is ``str``, so this is the original check; on
    # Python 3 it leaves already-decoded text alone instead of failing.
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')

    hashtag_list = _HASHTAG_RE.findall(text)
    if not hashtag_list:
        return

    # Count the occurrences of each hashtag within this single post.
    hashtag_dict = {}
    for hashtag in hashtag_list:
        hashtag_dict[hashtag] = hashtag_dict.get(hashtag, 0) + 1

    redis_key = 'hashtag_' + str(ts)
    redis_field = str(uid)

    # Load whatever has been accumulated so far for this user. Only the
    # parse step is guarded: a failed write must not be "recovered" by
    # overwriting the stored history with this post's counts alone.
    stored_counts = None
    stored_string = r_cluster.hget(redis_key, redis_field)
    if stored_string:
        try:
            stored_counts = json.loads(stored_string)
        except (TypeError, ValueError):
            # Unreadable stored value: fall through and start over below.
            stored_counts = None

    if not isinstance(stored_counts, dict):
        # Nothing stored yet (or it was unusable): this post's counts
        # become the stored value.
        r_cluster.hset(redis_key, redis_field, json.dumps(hashtag_dict))
        return

    for hashtag, count in hashtag_dict.items():
        try:
            stored_counts[hashtag] += count
        except (KeyError, TypeError):
            # First sighting of this hashtag, or a stored value that is
            # not a number: start from this post's count.
            stored_counts[hashtag] = count
    r_cluster.hset(redis_key, redis_field, json.dumps(stored_counts))
item['sensitive_words_dict'] = json.dumps({}) timestamp = item['timestamp'] date = ts2datetime(timestamp) ts = datetime2ts(date) if sensitive_words_dict: print sensitive_words_dict.keys()[0] sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid)) if sensitive_count_string: #redis取空 sensitive_count_dict = json.loads(sensitive_count_string) for word in sensitive_words_dict.keys(): if sensitive_count_dict.has_key(word): sensitive_count_dict[word] += sensitive_words_dict[word] else: sensitive_count_dict[word] = sensitive_words_dict[word] r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict)) else: r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_words_dict)) #identify whether to mapping new es weibo_timestamp = item['timestamp'] should_index_name_date = ts2datetime(weibo_timestamp) if should_index_name_date != now_index_name_date: if action != [] and xdata != []: index_name = index_name_pre + now_index_name_date if bulk_action: es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60) bulk_action = [] count = 0 now_index_name_date = should_index_name_date index_name = index_name_pre + now_index_name_date
date = ts2datetime(timestamp) ts = datetime2ts(date) if sensitive_words_dict: print sensitive_words_dict.keys()[0] sensitive_count_string = r_cluster.hget( 'sensitive_' + str(ts), str(uid)) if sensitive_count_string: #redis取空 sensitive_count_dict = json.loads(sensitive_count_string) for word in sensitive_words_dict.keys(): if sensitive_count_dict.has_key(word): sensitive_count_dict[word] += sensitive_words_dict[ word] else: sensitive_count_dict[word] = sensitive_words_dict[ word] r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_count_dict)) else: r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_words_dict)) #identify whether to mapping new es weibo_timestamp = item['timestamp'] should_index_name_date = ts2datetime(weibo_timestamp) if should_index_name_date != now_index_name_date: if action != [] and xdata != []: index_name = index_name_pre + now_index_name_date if bulk_action: es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60)