def cal_propage_work(item):
    uid = item['uid']
    timestamp = item['timestamp']
    text = item['text']
    sw_list = searchWord(text.encode('utf-8'))
    sensitive = len(sw_list)

    #ip = item['geo']
    ip = item['send_ip']
    # attribute location
    if ip:
        save_city(uid, ip, timestamp, sensitive)

    # attribute activity
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    date = date.replace('-','')
    time_segment = (timestamp - ts) / Fifteenminutes
    save_activity(uid, date, time_segment, sensitive)

    # attribute mention
    at_uname_list = extract_uname(text)
    try:
        at_uname = at_uname_list[0]
        save_at(uid, at_uname, timestamp, sensitive)
    except IndexError:
        pass
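# --- NOTE: hypothetical, self-contained sketch; not part of the scraped project code ---
# The "attribute activity" step above buckets a post into one of 96 fifteen-minute slots
# per day.  ts2datetime/datetime2ts and Fifteenminutes normally come from the project's
# time utilities; the stand-ins below only illustrate the arithmetic.
import time

Fifteenminutes = 15 * 60  # 900 seconds per slot

def ts2datetime(ts):
    # unix timestamp -> 'YYYY-MM-DD' (local time)
    return time.strftime('%Y-%m-%d', time.localtime(ts))

def datetime2ts(date):
    # 'YYYY-MM-DD' -> unix timestamp at local midnight
    return int(time.mktime(time.strptime(date, '%Y-%m-%d')))

timestamp = int(time.time())
date = ts2datetime(timestamp)                              # e.g. '2017-10-15'
day_start = datetime2ts(date)                              # midnight of that day
time_segment = (timestamp - day_start) // Fifteenminutes   # slot index in 0..95
print(time_segment)
# --- end of sketch ---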
Example no. 2
def get_one_click_evaluation(task_detail):

    results = []
    text = task_detail['text'].encode('utf-8')

    node = createWordTree()
    #print 'text...',text
    sensitive_words_dict = searchWord(text, node)
    #print 'sensitive_words_dict..',sensitive_words_dict
    if sensitive_words_dict:
        score = 0
        sensitive_words_list = []

        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            #print 'tmp_stage..',tmp_stage
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
            sensitive_words_list.append(k.decode('utf-8'))
        results.append(score)
        results.append(sensitive_words_list)

    else:
        results = [0, []]

    return results
Example no. 3
def sensitive_process(text,timestamp):

    ## per-user sensitivity
    iter_results = {} # iter_results = {uid:{}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if S_TYPE != 'test':
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(S_DATE)

    for i in range(WEEK,0,-1):
        ts = now_date_ts - DAY*i
        sensitive_results = r_cluster_3.hmget('sensitive_'+str(ts), uid_list)
        count = 0
        for uid in uid_list:
            if uid not in iter_results:
                iter_results[uid] = {'sensitive': {}}
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}

            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][sensitive_word] += uid_sensitive_dict[sensitive_word]
                    except:
                        today_sensitive_results[uid][sensitive_word] = uid_sensitive_dict[sensitive_word]

        for uid in uid_list:
            results[uid] = {}   # NOTE: 'results' is defined and used outside this truncated snippet


    ## per-message sensitivity
    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
    if sensitive_words_dict:
        item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
        item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    else:
        item['sensitive_words_string'] = ""
        item['sensitive_words_dict'] = json.dumps({})
                    
    sensitive_words_dict = json.loads(item['sensitive_words_dict'])
    if sensitive_words_dict:
        score = 0
        for k,v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v*sensitive_score_dict[str(tmp_stage)]
        index_body['sensitive'] = score
Example no. 4
def compute_sensitive(text):
    score = 0
    node = createWordTree()
    sensitive_words_dict = searchWord(text.encode('utf-8'), node)
    if sensitive_words_dict:
        for k, v in sensitive_words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", k)
            if tmp_stage:
                score += v * sensitive_score_dict[str(tmp_stage)]
    return score
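# --- NOTE: hypothetical, simplified stand-ins; not the project's createWordTree/searchWord ---
# Every example here relies on a DFA/trie built by createWordTree and scanned by searchWord,
# which is expected to return a {sensitive_word: occurrence_count} dict.  The real
# implementation loads the word list (and the stage looked up via sensitive_score_dict)
# from project data files and is usually called on UTF-8 encoded bytes; this sketch
# works on unicode strings only.

def createWordTree(words):
    root = {}
    for word in words:
        node = root
        for ch in word:
            node = node.setdefault(ch, {})
        node['__end__'] = word            # mark the end of a complete word
    return root

def searchWord(text, node):
    hits = {}
    for start in range(len(text)):
        cur = node
        for pos in range(start, len(text)):
            cur = cur.get(text[pos])
            if cur is None:
                break
            if '__end__' in cur:          # a word from the tree ends at this position
                word = cur['__end__']
                hits[word] = hits.get(word, 0) + 1
    return hits

tree = createWordTree([u'foo', u'bar'])
print(searchWord(u'foo bar foo', tree))   # {u'foo': 2, u'bar': 1}
# --- end of sketch ---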
Example no. 5
def sensitive_check(text):
    DFA = createWordTree()
    item = {}
    count = 0
    sensitive_words_dict = searchWord(text, DFA)
    if sensitive_words_dict:
        item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
        item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    else:
        item['sensitive_words_string'] = ""
        item['sensitive_words_dict'] = json.dumps({})
    for i in sensitive_words_dict:
        count += sensitive_words_dict[i]
    return count, item
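# --- NOTE: hypothetical usage sketch for sensitive_check above, assuming the project's
# createWordTree/searchWord are importable ---
#   count, item = sensitive_check(u'some weibo text'.encode('utf-8'))
#   if count:
#       print(item['sensitive_words_string'])      # matched words joined by '&'
#       print(json.loads(item['sensitive_words_dict']))
# --- end of sketch ---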
def comment_uname2uid(item):
    direct_uid = None
    uid = item['uid']
    root_uid = item['root_uid']
    timestamp = item['timestamp']
    text = item['text']
    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
    sensitive = len(sensitive_words_dict)
    direct_uid = ''
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'回复@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
    comment_chains = RE.findall(text)
    if comment_chains != []:
        direct_uname = comment_chains[0]
        direct_uid = uname2uid(direct_uname)
    if direct_uid == '':
        direct_uid = root_uid
    
    save_comment(uid, direct_uid, timestamp, sensitive)
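# --- NOTE: hypothetical mini-demo of the reply-chain extraction in comment_uname2uid above;
# the character class is copied verbatim from the example (a real Python 2 source file would
# also need a "# -*- coding: utf-8 -*-" declaration) ---
import re

RE = re.compile(u'回复@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
text = u'回复@somebody:hello'
print(RE.findall(text))   # [u'somebody'], which uname2uid would then resolve to a uid
# --- end of sketch ---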
Example no. 7
def get_sensitive_user(timestamp, uid):
    
    score = 0

    query_body = {
        'query':{
            'term':{'uid':uid}
        },
        'size':50
    }
    
    index_name = flow_text_index_name_pre + ts2datetime(timestamp)

    search_results = es_flow_text.search(index=index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits']

    for result in search_results:

        text = result['_source']['text'].encode('utf-8')

        node = createWordTree()
        sensitive_words_dict = searchWord(text, node)

        if sensitive_words_dict:
            
            sensitive_words_list = []

            for k,v in sensitive_words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", k)
                if tmp_stage:
                    score += v*sensitive_score_dict[str(tmp_stage)]
    # debug output
    print '\n' * 5
    print 'score=============', score
    print '\n' * 5
    return score
def comment_uname2uid(item):
    direct_uid = None
    uid = item['uid']
    root_uid = item['root_uid']
    timestamp = item['timestamp']
    text = item['text']
    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
    sensitive = len(sensitive_words_dict)
    direct_uid = ''
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'回复@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):',
                    re.UNICODE)
    comment_chains = RE.findall(text)
    if comment_chains != []:
        direct_uname = comment_chains[0]
        direct_uid = uname2uid(direct_uname)
    if direct_uid == '':
        direct_uid = root_uid

    save_comment(uid, direct_uid, timestamp, sensitive)
Example no. 9
def cal_propage_work(item):
    uid = item['uid']
    timestamp = item['timestamp']
    text = item['text']
    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
    sensitive = len(sensitive_words_dict)

    #if sensitive:
    #    r.sadd('sensitive_user', uid) # set of users who posted sensitive weibo

    #ip = item['geo']
    ip = item['send_ip']
    # attribute location
    if ip:
        save_city(uid, ip, timestamp, sensitive)

    # attribute activity
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    time_segment = (timestamp - ts) / Fifteenminutes
    save_activity(uid, timestamp, time_segment, sensitive)

    # attribute mention
    at_uname_list = extract_uname(text)
    try:
        if at_uname_list:
            at_uname = at_uname_list[0]
            if at_uname != '':
                save_at(uid, at_uname, timestamp, sensitive)
    except:
        pass

    # hashtag
    hashtag_list = extract_hashtag(text)
    if hashtag_list:
        cal_hashtag_work(uid, hashtag_list, timestamp, sensitive)
def cal_propage_work(item):
    uid = item['uid']
    timestamp = item['timestamp']
    text = item['text']
    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
    sensitive = len(sensitive_words_dict)

    #if sensitive:
    #    r.sadd('sensitive_user', uid) # set of users who posted sensitive weibo

    #ip = item['geo']
    ip = item['send_ip']
    # attribute location
    if ip:
        save_city(uid, ip, timestamp, sensitive)

    """
    # attribute activity
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    time_segment = (timestamp - ts) / Fifteenminutes
    save_activity(uid, timestamp, time_segment, sensitive)

    # attribute mention
    at_uname_list = extract_uname(text)
    try:
        at_uname = at_uname_list[0]
        save_at(uid, at_uname, timestamp, sensitive)
    except:
        pass
    """

    # hashtag
    hashtag_list = extract_hashtag(text)
    if hashtag_list:
        cal_hashtag_work(uid, hashtag_list, timestamp, sensitive)
Example no. 11
def test(ft_type):
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()

    DFA = createWordTree()
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }
    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es,
                                   query=query_body,
                                   size=1000,
                                   index=index_name,
                                   doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']

                    ts = datetime2ts(date)
                    #add sentiment field to weibo

                    sentiment, keywords_list = triple_classifier(item)

                    #add key words to weibo
                    keywords_dict, keywords_string = get_weibo_keywords(
                        keywords_list)

                    #sensitive_words_dict
                    sensitive_words_dict = searchWord(
                        text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(
                            sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(
                            sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})

                    #redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget(
                            'sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # redis may have no entry for this uid yet
                            sensitive_count_dict = json.loads(
                                sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if sensitive_count_dict.has_key(word):
                                    sensitive_count_dict[
                                        word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[
                                        word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    #sensitive
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[
                                    str(tmp_stage)]

                    #directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(
                        text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag
                    hashtag = ''
                    RE = re.compile(
                        u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]'
                    )
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    #action
                    action = {'update': {'_id': _id}}

                    # action_data
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }

                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1

                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            es.bulk(bulk_action,
                                    index=index_name,
                                    doc_type=index_type,
                                    timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            if bulk_action:

                es.bulk(bulk_action,
                        index=index_name,
                        doc_type=index_type,
                        timeout=600)
        except Exception, e:  #es文档不存在
            print e
            continue 

        if int(item['sp_type']) == 1:
            read_count += 1
            text = item['text']
            uid = item['uid']

            #add sentiment field to weibo
            sentiment, keywords_list  = triple_classifier(item)
            item['sentiment'] = str(sentiment)
            #add key words to weibo
            keywords_dict, keywords_string = get_weibo_keywords(keywords_list)
            item['keywords_dict'] = json.dumps(keywords_dict) # use to compute
            item['keywords_string'] = keywords_string         # use to search

            sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
            if sensitive_words_dict:
                item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
                item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
            else:
                item['sensitive_words_string'] = ""
                item['sensitive_words_dict'] = json.dumps({})

            timestamp = item['timestamp']
            date = ts2datetime(timestamp)
            ts = datetime2ts(date)
            if sensitive_words_dict:
                print sensitive_words_dict.keys()[0]
                sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
                if sensitive_count_string: # redis may have no entry for this uid yet
                    sensitive_count_dict = json.loads(sensitive_count_string)
Example no. 13
def get_sensitive_user(timestamp, uid):
    score = 0
    query_body = {'query': {'term': {'uid': uid}}, 'size': 50}
    index_name = flow_text_index_name_pre + ts2datetime(timestamp)
    try:
        search_results = es_xnr.search(index=index_name,
                                       doc_type=flow_text_index_type,
                                       body=query_body)['hits']['hits']
    except Exception:
        search_results = []
    for result in search_results:
        text = result['_source']['text'].encode('utf-8')
        node = createWordTree()
        sensitive_words_dict = searchWord(text, node)
        if sensitive_words_dict:
            sensitive_words_list = []
            for k, v in sensitive_words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", k)
                if tmp_stage:
                    score += v * sensitive_score_dict[str(tmp_stage)]
    return score


if __name__ == '__main__':
    # '2017-10-15'
    # print get_sensitive_user(timestamp=1507996800, uid='100003271864059')
    print get_sensitive_info(timestamp=1507996800,
                             mid='123124323',
                             text=u"64和达赖太阳花")
Example no. 14
                        text = text_ch[0]
                        item['text'] = text_ch[0]
                except:
                    pass

                #add sentiment field to weibo
                sentiment, keywords_list = triple_classifier(item)
                item['sentiment'] = str(sentiment)
                #add key words to weibo
                keywords_dict, keywords_string = get_weibo_keywords(
                    keywords_list)
                item['keywords_dict'] = json.dumps(
                    keywords_dict)  # use to compute
                item['keywords_string'] = keywords_string  # use to search

                sensitive_words_dict = searchWord(
                    text.encode('utf-8', 'ignore'), DFA)
                if sensitive_words_dict:
                    item['sensitive_words_string'] = "&".join(
                        sensitive_words_dict.keys())
                    item['sensitive_words_dict'] = json.dumps(
                        sensitive_words_dict)
                else:
                    item['sensitive_words_string'] = ""
                    item['sensitive_words_dict'] = json.dumps({})

                timestamp = item['timestamp']
                date = ts2datetime(timestamp)
                ts = datetime2ts(date)
                if sensitive_words_dict:
                    #print 'sensitive_words_dict...keys[0]...',sensitive_words_dict.keys()[0]
                    sensitive_count_string = r_cluster.hget(
def cal_propage_work(item, sensitive_words):
    cluster_redis = R_CLUSTER_FLOW1
    user = str(item['uid'])
    followers_count = item['user_fansnum']
    friends_count = item.get("user_friendsnum", 0)
    cluster_redis.hset(user, 'user_fansnum', followers_count)
    cluster_redis.hset(user, 'user_friendsnum', friends_count)

    retweeted_uid = str(item['root_uid'])
    retweeted_mid = str(item['root_mid'])

    message_type = int(item['message_type'])
    mid = str(item['mid'])
    timestamp = item['timestamp']
    text = item['text']

    sw_list = searchWord(text.encode('utf-8'))
    sensitive_result = len(sw_list)
    if sensitive_result:
        date = ts2datetime(timestamp)
        ts = datetime2ts(date)
        map = {}
        for w in sw_list:
            word = "".join([chr(x) for x in w])
            word = word.decode('utf-8')
            print word
            map[word] = map.get(word, 0) + 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), user)
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word in map:
                count = map[word]
                if word in sensitive_count_dict:
                    sensitive_count_dict[word] += count
                else:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_'+str(ts), user, json.dumps(sensitive_count_dict))
        except:
            r_cluster.hset('sensitive_'+str(ts), user, json.dumps(map))

    if message_type == 1:
        cluster_redis.sadd('user_set', user)
        if sensitive_result:
            cluster_redis.hset('s_'+user, mid + '_origin_weibo_timestamp', timestamp)
        else:
            cluster_redis.hset(user, mid + '_origin_weibo_timestamp', timestamp)

    elif message_type == 2: # comment weibo
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_comment_weibo', retweeted_mid) or cluster_redis.sismember('s_' + user + '_comment_weibo', retweeted_mid):
            return

        #RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        #nicknames = RE.findall(text)

        if not sensitive_result:
            cluster_redis.sadd(user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(user, 'comment_weibo', 1)

            if 1:
            #if len(nicknames) == 0:
                cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1) 
                cluster_redis.hincrby(retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset(retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby(str(_id), retweeted_mid + '_retweeted_weibo_comment', 1) 
                    cluster_redis.hincrby(str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset(str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """
        else:
            cluster_redis.sadd('s_' + user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_'+user, 'comment_weibo', 1)

            if 1:
            #if len(nicknames) == 0:
                cluster_redis.hincrby('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1) 
                cluster_redis.hincrby('s_' + retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment', 1) 
                    cluster_redis.hincrby('s_' + str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """

    elif message_type == 3:
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_retweeted_weibo', retweeted_mid) or cluster_redis.sismember('s_' + user + '_retweeted_weibo', retweeted_mid):
            return
        """
        RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        nicknames = RE.findall(text)
        """
        if not sensitive_result:
            cluster_redis.sadd(user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset(user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp) 
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' % queue_index, 1)    
            cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1) 
            """
            if len(nicknames) != 0:
                for nick_id in nicknames:
                    _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id)
                    print _id
                    single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id, _id)
                    if _id:
                        cluster_redis.hincrby(str(_id), retweeted_mid+'_retweeted_weibo_retweeted', 1) 
                        cluster_redis.hset(str(_id), 'retweeted_weibo_retweeted_timestamp', timestamp)
                        cluster_redis.hincrby(str(_id), 'retweeted_weibo_retweeted_timestamp_%s' % queue_index, 1)
            """
        else:
            cluster_redis.sadd('s_' + user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset('s_' + user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_' +retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' %queue_index, 1)
            cluster_redis.hincrby('s_' +retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1)
            """
Example no. 16
        if (update_user_te - update_user_ts) % 900 == 0:
            print 'update track user list'
            monitor_user_list = get_track_task_user()

        item = receiver.recv_json()
        #test
        if not item:
            continue

        if item['sp_type'] == '1':
            read_count += 1
            #print 'item:', item
            if str(item['uid']) in monitor_user_list:
                text = item['text']
                # add sensitive field and sensitive_word field to weibo
                sw_list = searchWord(text.encode('utf-8'))
                sensitive = len(sw_list)
                if sensitive:
                    item['sensitive'] = 1
                    word_set = set()
                    for w in sw_list:
                        word = ''.join([chr(x) for x in w])
                        word = word.decode('utf-8')
                        word_set.add(word)
                    sensitive_word_string = '&'.join(list(word_set))
                    item['sensitive_word'] = sensitive_word_string
                else:
                    item['sensitive'] = 0
                # add sentiment field to weibo
                sentiment = get_sentiment_attribute(text)
                item['sentiment'] = sentiment
def cal_propage_work(item, sensitive_words):
    cluster_redis = R_CLUSTER_FLOW1
    user = str(item['uid'])
    uid = str(item['uid'])
    followers_count = item['user_fansnum']
    friends_count = item.get("user_friendsnum", 0)
    cluster_redis.hset(user, 'user_fansnum', followers_count)
    cluster_redis.hset(user, 'user_friendsnum', friends_count)

    retweeted_uid = str(item['root_uid'])
    retweeted_mid = str(item['root_mid'])

    message_type = int(item['message_type'])
    mid = str(item['mid'])
    timestamp = item['timestamp']
    text = item['text']

    sw_list = searchWord(text.encode('utf-8'))
    sensitive_result = len(sw_list)
    if sensitive_result:
        ts = ts2datetime(timestamp).replace('-','')
        map = {}
        for w in sw_list:
            word = "".join([chr(x) for x in w])
            word = word.decode('utf-8')
            map[word] = map.get(word, 0) + 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word in map:
                count = map[word]
                if word in sensitive_count_dict:
                    sensitive_count_dict[word] += count
                else:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
        except:
            r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(map))


    if message_type == 1:
        cluster_redis.sadd('user_set', user)
        if sensitive_result:
            cluster_redis.hset('s_'+user, mid + '_origin_weibo_timestamp', timestamp)
        else:
            cluster_redis.hset(user, mid + '_origin_weibo_timestamp', timestamp)

    elif message_type == 2: # comment weibo
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_comment_weibo', retweeted_mid) or cluster_redis.sismember('s_' + user + '_comment_weibo', retweeted_mid):
            return

        #RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        #nicknames = RE.findall(text)

        if not sensitive_result:
            cluster_redis.sadd(user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(user, 'comment_weibo', 1)

            if 1:
            #if len(nicknames) == 0:
                cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1) 
                cluster_redis.hincrby(retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset(retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby(str(_id), retweeted_mid + '_retweeted_weibo_comment', 1) 
                    cluster_redis.hincrby(str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset(str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """
        else:
            cluster_redis.sadd('s_' + user + '_comment_weibo', retweeted_mid)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_'+user, 'comment_weibo', 1)

            if 1:
            #if len(nicknames) == 0:
                cluster_redis.hincrby('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment', 1) 
                cluster_redis.hincrby('s_' + retweeted_uid, 'origin_weibo_comment_timestamp_%s' % queue_index, 1)
                cluster_redis.hset('s_' + retweeted_uid, retweeted_mid + '_origin_weibo_comment_timestamp', timestamp)
            """
            else:
                nick_id_ = nicknames[0]
                _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id_)
                print _id
                single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id_, _id)
                if _id:
                    cluster_redis.hincrby('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment', 1) 
                    cluster_redis.hincrby('s_' + str(_id), 'retweeted_weibo_comment_timestamp_%s' % queue_index, 1)
                    cluster_redis.hset('s_' + str(_id), retweeted_mid + '_retweeted_weibo_comment_timestamp', timestamp)
            """

    elif message_type == 3:
        cluster_redis.sadd('user_set', user)
        if cluster_redis.sismember(user + '_retweeted_weibo', retweeted_mid) or cluster_redis.sismember('s_' + user + '_retweeted_weibo', retweeted_mid):
            return
        """
        RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
        nicknames = RE.findall(text)
        """
        if not sensitive_result:
            cluster_redis.sadd(user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset(user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp) 
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby(retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' % queue_index, 1)    
            cluster_redis.hincrby(retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1) 
            """
            if len(nicknames) != 0:
                for nick_id in nicknames:
                    _id = single_redis.hget(NICK_UID_NAMESPACE, nick_id)
                    print _id
                    single_redis.hset(ACTIVE_NICK_UID_NAMESPACE, nick_id, _id)
                    if _id:
                        cluster_redis.hincrby(str(_id), retweeted_mid+'_retweeted_weibo_retweeted', 1) 
                        cluster_redis.hset(str(_id), 'retweeted_weibo_retweeted_timestamp', timestamp)
                        cluster_redis.hincrby(str(_id), 'retweeted_weibo_retweeted_timestamp_%s' % queue_index, 1)
            """
        else:
            cluster_redis.sadd('s_' + user + '_retweeted_weibo', retweeted_mid)
            cluster_redis.hset('s_' + user, retweeted_mid + '_retweeted_weibo_timestamp', timestamp)
            queue_index = get_queue_index(timestamp)
            cluster_redis.hincrby('s_' +retweeted_uid, 'origin_weibo_retweeted_timestamp_%s' %queue_index, 1)
            cluster_redis.hincrby('s_' +retweeted_uid, retweeted_mid + '_origin_weibo_retweeted', 1)
            """