def get_sensitive_user(timestamp, uid):
    # Sum the sensitive-word score over a user's posts (up to 50) for the given day.
    score = 0
    query_body = {'query': {'term': {'uid': uid}}, 'size': 50}
    index_name = flow_text_index_name_pre + ts2datetime(timestamp)
    search_results = es_flow_text.search(index=index_name,
                                         doc_type=flow_text_index_type,
                                         body=query_body)['hits']['hits']
    node = createWordTree()  # build the DFA word tree once, not per document
    for result in search_results:
        text = result['_source']['text'].encode('utf-8')
        sensitive_words_dict = searchWord(text, node)
        if sensitive_words_dict:
            for k, v in sensitive_words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", k)
                if tmp_stage:
                    score += v * sensitive_score_dict[str(tmp_stage)]
    return score
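# sensitive_score_dict maps a word's stage (as stored in the "sensitive_words"
# redis hash) to a weight; its real values live in the project config. A
# plausible, assumed shape, shown only to make the scoring above concrete:
#   sensitive_score_dict = {'1': 1, '2': 2, '3': 4, '4': 8, '5': 16}
# so score = sum(hit_count * stage_weight) over all matched words.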
def get_one_click_evaluation(task_detail):
    # Score a single text and collect the sensitive words found in it.
    # Returns [score, [word, ...]].
    results = []
    text = task_detail['text'].encode('utf-8')
    node = createWordTree()
    sensitive_words_dict = searchWord(text, node)
    if sensitive_words_dict:
        score = 0
        sensitive_words_list = []
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
                sensitive_words_list.append(k.decode('utf-8'))
        results.append(score)
        results.append(sensitive_words_list)
    else:
        results = [0, []]
    return results
def compute_sensitive(text):
    # Sensitive score for one piece of text: hit count times stage weight.
    score = 0
    node = createWordTree()
    sensitive_words_dict = searchWord(text.encode('utf-8'), node)
    if sensitive_words_dict:
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
    return score
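# createWordTree()/searchWord() are imported from elsewhere in this repo.
# A minimal sketch of the trie-based (DFA-style) matcher they appear to
# implement; the word source and node layout here are assumptions for
# illustration, not the actual implementation:
def create_word_tree(words):
    # Nested-dict trie; the 'end' key marks a complete sensitive word.
    root = {}
    for word in words:
        node = root
        for ch in word:
            node = node.setdefault(ch, {})
        node['end'] = word
    return root

def search_word(text, root):
    # Scan text once from every start offset, returning {word: hit_count},
    # the same shape searchWord() is used with above.
    hits = {}
    for i in range(len(text)):
        node = root
        j = i
        while j < len(text) and text[j] in node:
            node = node[text[j]]
            if 'end' in node:
                hits[node['end']] = hits.get(node['end'], 0) + 1
            j += 1
    return hits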
def sensitive_check(text):
    # Return the total number of sensitive-word hits, plus the two fields
    # used to store them on the document.
    DFA = createWordTree()
    item = {}
    count = 0
    sensitive_words_dict = searchWord(text, DFA)
    if sensitive_words_dict:
        item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
        item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
        for word in sensitive_words_dict:
            count += sensitive_words_dict[word]
    else:
        item['sensitive_words_string'] = ""
        item['sensitive_words_dict'] = json.dumps({})
    return count, item
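# A quick usage sketch for sensitive_check(); the words and counts here are
# illustrative, assuming the word tree matched "word1" twice and "word2" once:
#   count, item = sensitive_check(some_text)
#   count                            -> 3
#   item['sensitive_words_string']   -> 'word1&word2'
#   item['sensitive_words_dict']     -> '{"word1": 2, "word2": 1}'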
    return action, xdata


if __name__ == "__main__":
    """
    receive weibo
    """
    context = zmq.Context()
    receiver = context.socket(zmq.PULL)
    receiver.connect('tcp://%s:%s' % (ZMQ_VENT_HOST_FLOW1, ZMQ_VENT_PORT_FLOW5))

    controller = context.socket(zmq.SUB)
    controller.connect("tcp://%s:%s" % (ZMQ_VENT_HOST_FLOW1, ZMQ_CTRL_VENT_PORT_FLOW5))

    DFA = createWordTree()
    count = 0
    read_count = 0
    tb = time.time()
    ts = tb
    bulk_action = []
    now_date = ts2datetime(tb)
    index_name_pre = flow_text_index_name_pre
    index_type = flow_text_index_type
    now_index_name_date = ts2datetime(datetime2ts(start_date) - DAY)
    action = []
    xdata = []
    class_ts = time.time()
    while 1:
        item = receiver.recv_json()
def get_sensitive_user(timestamp, uid):
    # Same scoring as above, but against the es_xnr cluster; a failed
    # search (e.g. a missing daily index) yields an empty result set.
    score = 0
    query_body = {'query': {'term': {'uid': uid}}, 'size': 50}
    index_name = flow_text_index_name_pre + ts2datetime(timestamp)
    try:
        search_results = es_xnr.search(index=index_name,
                                       doc_type=flow_text_index_type,
                                       body=query_body)['hits']['hits']
    except Exception, e:
        search_results = []
    node = createWordTree()
    for result in search_results:
        text = result['_source']['text'].encode('utf-8')
        sensitive_words_dict = searchWord(text, node)
        if sensitive_words_dict:
            for k, v in sensitive_words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", k)
                if tmp_stage:
                    score += v * sensitive_score_dict[str(tmp_stage)]
    return score


if __name__ == '__main__':
    # '2017-10-15'
    # print get_sensitive_user(timestamp=1507996800, uid='100003271864059')
    print get_sensitive_info(timestamp=1507996800, mid='123124323',
    filter_keywords_set = set()
    for word in keywords_list:
        if word not in black_words:
            try:
                keywords_dict[word] += 1
            except KeyError:
                keywords_dict[word] = 1
            filter_keywords_set.add(word)
    keywords_string = '&'.join(list(filter_keywords_set))
    return keywords_dict, keywords_string


if __name__ == '__main__':
    DFA = createWordTree()
    count = 0
    read_count = 0
    bulk_action = []
    action = []
    xdata = []
    tb = time.time()
    ts = tb
    index_name_pre = facebook_flow_text_index_name_pre
    index_type = facebook_flow_text_index_type
    start_date = '2017-09-10'
    end_date = '2017-10-25'
    # end_date = '2017-09-11'
    start_ts = datetime2ts(start_date)
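# The try/except counting above is the common dict-counter idiom. A shorter
# equivalent sketch using collections.Counter; this is an illustration of
# the same technique, not what the module itself uses:
from collections import Counter

def count_keywords(keywords_list, black_words):
    # Drop blacklisted words, then count the remainder.
    filtered = [w for w in keywords_list if w not in black_words]
    keywords_dict = dict(Counter(filtered))
    keywords_string = '&'.join(set(filtered))
    return keywords_dict, keywords_string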
if __name__ == "__main__": """ receive weibo """ context = zmq.Context() receiver = context.socket(zmq.PULL) receiver.connect('tcp://%s:%s' %(ZMQ_VENT_HOST_FLOW1, ZMQ_VENT_PORT_FLOW1)) controller = context.socket(zmq.SUB) controller.connect("tcp://%s:%s" %(ZMQ_VENT_HOST_FLOW1, ZMQ_CTRL_VENT_PORT_FLOW1)) cluster_redis = R_CLUSTER_FLOW1 sensitive_words = createWordTree() count = 0 tb = time.time() ts = tb while 1: item = receiver.recv_json() if not item: continue if int(item['sp_type']) == 1: cal_propage_work(item, sensitive_words) count += 1 if count % 10000 == 0:
""" context = zmq.Context() receiver = context.socket(zmq.PULL) receiver.connect('tcp://%s:%s' % (ZMQ_VENT_HOST_FLOW1, ZMQ_VENT_PORT_FLOW4)) controller = context.socket(zmq.SUB) controller.connect("tcp://%s:%s" % (ZMQ_VENT_HOST_FLOW1, ZMQ_CTRL_VENT_PORT_FLOW4)) count = 0 read_count = 0 tb = time.time() ts = tb sensitive_words = createWordTree() monitor_user_list = get_track_task_user() print 'monitor_user_list:', monitor_user_list update_user_ts = time.time() bulk_action = [] while 1: ''' use to update user list by 15min ''' update_user_te = time.time() if (update_user_te - update_user_ts) % 900 == 0: print 'update track user list' monitor_user_list = get_track_task_user() item = receiver.recv_json() #test
def test(ft_type):
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()
    DFA = createWordTree()
    # hashtag pattern, compiled once outside the per-document loop
    RE = re.compile(u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]')
    # select docs that still lack keywords_string and have flag_ch >= -1
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }

    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es, query=query_body, size=1000,
                                   index=index_name, doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']

                    ts = datetime2ts(date)
                    # add sentiment field to weibo
                    sentiment, keywords_list = triple_classifier(item)
                    # add key words to weibo
                    keywords_dict, keywords_string = get_weibo_keywords(keywords_list)

                    # sensitive_words_dict
                    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})

                    # accumulate per-user, per-day sensitive-word counts in redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # existing counts: merge the new hits in
                            sensitive_count_dict = json.loads(sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if sensitive_count_dict.has_key(word):
                                    sensitive_count_dict[word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:  # redis has nothing yet for this user/day
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    # sensitive score
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]

                    # directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag
                    hashtag = ''
                    hashtag_list = RE.findall(text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    # action
                    action = {'update': {'_id': _id}}
                    # action_data
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }
                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1
                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            es.bulk(bulk_action, index=index_name,
                                    doc_type=index_type, timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            # flush any remaining partial batch for this index
            if bulk_action:
                es.bulk(bulk_action, index=index_name,
                        doc_type=index_type, timeout=600)
        except Exception, e:
            # ES document/index does not exist
            print e
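# For reference, the bulk_action list built above alternates update actions
# with partial docs, which is the shape elasticsearch-py's es.bulk() expects
# for update requests; the _id values and field contents here are
# illustrative:
# [
#     {'update': {'_id': 'fid_or_tid_1'}},
#     {'doc': {'sentiment': '1', 'sensitive': 4, ...}},
#     {'update': {'_id': 'fid_or_tid_2'}},
#     {'doc': {'sentiment': '0', 'sensitive': 0, ...}},
# ]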