# Ejemplo n.º 1  (scrape artifact; kept as a comment so the file parses)
# 0
def compute_group_inner(task_name, task_user, start_ts):
    #step1: get task_user in-monitor task user retweet relation from monitor_inner_r
    #step2: get task_user in-task user retweet relation
    #step3: compute every inner user be-retweet ratio in task
    #step4: save top5 to es--monitor_result, doc_type=task_name, _id='inner_'+date  e:'inner_2013-09-01'
    group_status = 0
    time_segment = 3600 * 24
    iter_time_segment = 900
    iter_ts = start_ts - time_segment
    inner_group_dict = {}
    user_count_dict = {}
    print 'group inner ask_user:'******'''
            if iter_ts >= start_ts:
                break
            '''
            key = 'inner_' + str(iter_ts)
            print 'iter_ts:', ts2date(iter_ts)
            inner_retweet_string = monitor_inner_r.hget(root_uid, key)
            print 'root_uid, key, inner_retweet_string:', root_uid, key, inner_retweet_string
            if inner_retweet_string:
                print 'yes'
                inner_retweet_dict = json.loads(inner_retweet_string)
            else:
                inner_retweet_dict = None
            if inner_retweet_dict:
                inner_group_dict[root_uid] = merge_dict(
                    inner_group_dict[root_uid], inner_retweet_dict)
            iter_ts += iter_time_segment
        user_inner_retweet_count = sum(inner_group_dict[root_uid].values())
        user_count_dict[root_uid] = user_inner_retweet_count
    all_be_retweet_count = sum(user_count_dict.values())
    if all_be_retweet_count == 0:
        group_status = 1
        return group_status
    sort_user_inner_retweet_count = sorted(user_count_dict.items(),
                                           key=lambda x: x[1],
                                           reverse=True)
    top5_user = sort_user_inner_retweet_count[:5]

    # timestamp: '2013-09-01'
    date = ts2datetime(start_ts - 24 * 3600)
    index_body = {'date': date}
    for rank in range(1, 6):
        key = 'top' + str(rank)
        index_body[key] = json.dumps(top5_user[rank - 1])
    key = 'inner_' + date
    # save inner-retweet graph by dict {root_uid1:{uid1:count1, uid2:count2}, ...}
    index_body['inner_graph'] = json.dumps(inner_group_dict)

    es.index(index=monitor_index_name,
             doc_type=task_name,
             id=key,
             body=index_body)
    group_status = 1
    return group_status
def compute_group_inner(task_name, task_user, start_ts):
    #step1: get task_user in-monitor task user retweet relation from monitor_inner_r
    #step2: get task_user in-task user retweet relation
    #step3: compute every inner user be-retweet ratio in task
    #step4: save top5 to es--monitor_result, doc_type=task_name, _id='inner_'+date  e:'inner_2013-09-01'
    group_status = 0
    time_segment = 3600*24
    iter_time_segment = 900
    iter_ts = start_ts - time_segment
    inner_group_dict = {}
    user_count_dict = {}
    print 'group inner ask_user:'******'''
            if iter_ts >= start_ts:
                break
            '''
            key = 'inner_' + str(iter_ts)
            print 'iter_ts:', ts2date(iter_ts)
            inner_retweet_string = monitor_inner_r.hget(root_uid, key)
            print 'root_uid, key, inner_retweet_string:', root_uid, key, inner_retweet_string
            if inner_retweet_string:
                print 'yes'
                inner_retweet_dict = json.loads(inner_retweet_string)
            else:
                inner_retweet_dict = None
            if inner_retweet_dict:
                inner_group_dict[root_uid] = merge_dict(inner_group_dict[root_uid], inner_retweet_dict)
            iter_ts += iter_time_segment
        user_inner_retweet_count = sum(inner_group_dict[root_uid].values())
        user_count_dict[root_uid] = user_inner_retweet_count
    all_be_retweet_count = sum(user_count_dict.values())
    if all_be_retweet_count==0:
        group_status = 1
        return group_status
    sort_user_inner_retweet_count = sorted(user_count_dict.items(), key=lambda x:x[1], reverse=True)
    top5_user = sort_user_inner_retweet_count[:5]

    # timestamp: '2013-09-01'
    date = ts2datetime(start_ts - 24*3600)
    index_body = {'date': date}
    for rank in range(1,6):
        key = 'top' + str(rank)
        index_body[key] = json.dumps(top5_user[rank-1])
    key = 'inner_' + date
    # save inner-retweet graph by dict {root_uid1:{uid1:count1, uid2:count2}, ...}
    index_body['inner_graph'] = json.dumps(inner_group_dict)
    
    es.index(index=monitor_index_name, doc_type=task_name, id=key, body=index_body)
    group_status = 1
    return group_status
def save_mid_result_one(task_name, sensitive_weibo_dict, geo_weibo_dict, sentiment_weibo_dict, hashtag_weibo_dict, sensitive_word_dict, start_ts):
    # Persist one monitoring cycle's intermediate results for a single-user
    # task. Each statistic dict is JSON-serialized; hashtag / sensitive_word
    # are stored only when non-empty. The document id is the cycle timestamp.
    insert_body = {
        'count': json.dumps(sensitive_weibo_dict),
        'geo': json.dumps(geo_weibo_dict),
        'sentiment': json.dumps(sentiment_weibo_dict),
    }
    if hashtag_weibo_dict != {}:
        insert_body['hashtag'] = json.dumps(hashtag_weibo_dict)
    if sensitive_word_dict != {}:
        insert_body['sensitive_word'] = json.dumps(sensitive_word_dict)
    insert_body['timestamp'] = start_ts  # mark the ts

    es.index(index=monitor_index_name, doc_type=task_name, id=start_ts, body=insert_body)
    return 1
def save_mid_result_group(task_name, sensitive_weibo_dict, geo_weibo_dict, sentiment_weibo_dict, hashtag_weibo_dict, sensitive_word_dict, start_ts):
    status = 0
    print 'start save result to es'
    insert_body = {}
    insert_body['count'] = json.dumps(sensitive_weibo_dict)
    insert_body['geo'] = json.dumps(geo_weibo_dict)
    insert_body['sentiment'] = json.dumps(sentiment_weibo_dict)
    if hashtag_weibo_dict != {}:
        insert_body['hashtag'] = json.dumps(hashtag_weibo_dict)
    if sensitive_word_dict != {}:
        insert_body['sensitive_word'] = json.dumps(sensitive_word_dict)
    insert_body['timestamp'] = start_ts
    # other attribute about monitor group should be add
    es.index(index=monitor_index_name, doc_type=task_name, id=start_ts, body=insert_body)
    status = 1
    print 'end save result'
    return status
def init_custom_attribute():
    # (Re)create the custom-attribute index from scratch: drop it if it
    # exists, build it with a '&'-splitting pattern analyzer so multi-valued
    # attribute_value fields tokenize on '&', then index a probe document.
    not_analyzed = {'type': 'string', 'index': 'not_analyzed'}
    index_info = {
        'settings': {
            'analysis': {
                'analyzer': {
                    'my_analyzer': {
                        'type': 'pattern',
                        'pattern': '&'
                    }
                }
            }
        },
        'mappings': {
            'attribute': {
                'properties': {
                    'attribute_name': dict(not_analyzed),
                    'attribute_value': {
                        'type': 'string',
                        'analyzer': 'my_analyzer'
                    },
                    'date': dict(not_analyzed),
                    'user': dict(not_analyzed)
                }
            }
        }
    }
    if es.indices.exists(index=attribute_index_name):
        es.indices.delete(index=attribute_index_name)
    es.indices.create(index=attribute_index_name, body=index_info, ignore=400)
    # test
    es.index(index=attribute_index_name,
             doc_type=attribute_index_type,
             id='test_tag',
             body={'attribute_name': 'test_tag',
                   'attribute_value': 'tag1&tag2',
                   'date': '2013-09-08',
                   'user': '******'})
# Ejemplo n.º 6  (scrape artifact; kept as a comment so the file parses)
# 0
def save_mid_result_one(task_name, sensitive_weibo_dict, geo_weibo_dict,
                        sentiment_weibo_dict, hashtag_weibo_dict,
                        sensitive_word_dict, start_ts):
    # Save one cycle's mid results for a single-user task. All statistic
    # dicts are stored JSON-encoded; the optional hashtag / sensitive_word
    # entries are written only when their dicts carry data.
    insert_body = {}
    insert_body['count'] = json.dumps(sensitive_weibo_dict)
    insert_body['geo'] = json.dumps(geo_weibo_dict)
    insert_body['sentiment'] = json.dumps(sentiment_weibo_dict)
    if hashtag_weibo_dict != {}:
        insert_body['hashtag'] = json.dumps(hashtag_weibo_dict)
    if sensitive_word_dict != {}:
        insert_body['sensitive_word'] = json.dumps(sensitive_word_dict)
    # mark the ts; it doubles as the ES document id below
    insert_body['timestamp'] = start_ts
    es.index(index=monitor_index_name, doc_type=task_name, id=start_ts,
             body=insert_body)
    status = 1
    return status
# Ejemplo n.º 7  (scrape artifact; kept as a comment so the file parses)
# 0
def save_mid_result_group(task_name, sensitive_weibo_dict, geo_weibo_dict,
                          sentiment_weibo_dict, hashtag_weibo_dict,
                          sensitive_word_dict, start_ts):
    status = 0
    print 'start save result to es'
    insert_body = {}
    insert_body['count'] = json.dumps(sensitive_weibo_dict)
    insert_body['geo'] = json.dumps(geo_weibo_dict)
    insert_body['sentiment'] = json.dumps(sentiment_weibo_dict)
    if hashtag_weibo_dict != {}:
        insert_body['hashtag'] = json.dumps(hashtag_weibo_dict)
    if sensitive_word_dict != {}:
        insert_body['sensitive_word'] = json.dumps(sensitive_word_dict)
    insert_body['timestamp'] = start_ts
    # other attribute about monitor group should be add
    es.index(index=monitor_index_name,
             doc_type=task_name,
             id=start_ts,
             body=insert_body)
    status = 1
    print 'end save result'
    return status
def main():
    # Create the monitor_user_text index (ignore=400 tolerates "already
    # exists") with a '&'-splitting analyzer for the multi-valued hashtag and
    # sensitive_word fields, then write a probe document.
    keyword = {'type': 'string', 'index': 'not_analyzed'}
    properties = {
        'text': dict(keyword),
        'mid': dict(keyword),
        'ip': dict(keyword),
        'timestamp': {'type': 'long'},
        'sentiment': dict(keyword),
        'geo': dict(keyword),
        'message_type': dict(keyword),
        'uid': dict(keyword),
        'hashtag': {'type': 'string', 'analyzer': 'my_analyzer'},
        'sensitive_word': {'type': 'string', 'analyzer': 'my_analyzer'}
    }
    index_info = {
        'settings': {
            'analysis': {
                'analyzer': {
                    'my_analyzer': {
                        'type': 'pattern',
                        'pattern': '&'
                    }
                }
            }
        },
        'mappings': {
            'text': {
                'properties': properties
            }
        }
    }
    es.indices.create(index='monitor_user_text', body=index_info, ignore=400)
    es.index(index='monitor_user_text', doc_type='text', id='test',
             body={'uid': 'test'})
def main():
    # Build the monitor_user_text index mapping programmatically: most string
    # fields are stored verbatim (not_analyzed); hashtag and sensitive_word
    # use a pattern analyzer that splits on '&'; timestamp is a long.
    properties = {}
    for field in ('text', 'mid', 'ip', 'sentiment', 'geo',
                  'message_type', 'uid'):
        properties[field] = {'type': 'string', 'index': 'not_analyzed'}
    for field in ('hashtag', 'sensitive_word'):
        properties[field] = {'type': 'string', 'analyzer': 'my_analyzer'}
    properties['timestamp'] = {'type': 'long'}
    index_info = {
        'settings': {
            'analysis': {
                'analyzer': {
                    'my_analyzer': {
                        'type': 'pattern',
                        'pattern': '&'
                    }
                }
            }
        },
        'mappings': {
            'text': {
                'properties': properties
            }
        }
    }
    # ignore=400 keeps this idempotent when the index already exists
    es.indices.create(index='monitor_user_text', body=index_info, ignore=400)
    es.index(index='monitor_user_text', doc_type='text', id='test',
             body={'uid': 'test'})