Example #1
def save_ws_results_es(topic, ts, during, n_limit, province, city, weibos):

    #mappings_event_geo_province_weibos()
    #index_name = index_event_geo_province_weibos
    #index_type = type_event_geo_province_weibos

    #mappings_event_analysis_results(topic)
    index_name = index_event_analysis_results
    index_type = type_event_analysis_results

    item = {}

    item['en_name'] = topic
    item['end_ts'] = ts
    item['range'] = during
    item['limit'] = n_limit
    item['province'] = province
    item['city'] = city
    item['weibo'] = json.dumps(weibos)

    id = topic + '_' + ts

    try:
        item_exist = weibo_es.get(index=index_name, doc_type=index_type,
                                  id=id)['_source']
        weibo_es.update(index=index_name,
                        doc_type=index_type,
                        id=id,
                        body={'doc': item})
    except Exception, e:
        weibo_es.index(index=index_name, doc_type=index_type, id=id, body=item)
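The try/except sequence above (get, then update, falling back to index) is an upsert: update the document if it exists, otherwise create it. As a minimal sketch, assuming the same weibo_es client and index/type constants, the update API's doc_as_upsert flag collapses this into a single call; the function name below is made up for illustration:

def save_ws_results_es_upsert(topic, ts, during, n_limit, province, city, weibos):
    # hypothetical one-round-trip variant of save_ws_results_es above
    item = {
        'en_name': topic,
        'end_ts': ts,
        'range': during,
        'limit': n_limit,
        'province': province,
        'city': city,
        'weibo': json.dumps(weibos),
    }
    weibo_es.update(index=index_event_analysis_results,
                    doc_type=type_event_analysis_results,
                    id=topic + '_' + ts,
                    body={'doc': item, 'doc_as_upsert': True})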
Example #2
def save_long_gexf(topic, identifyDate, identifyWindow, identifyGexf):
    index_name = topic + '_gexffile'

    get_graph_mappings(index_name)

    bulk_action = []
    #action = {"index":{"_id":999}}
    source = json.dumps(identifyGexf)
    action = {
        #"index":{"_id":999},
        #"_source":{
        "name": str(identifyDate) + str(identifyWindow),
        "gexf": source,
        "date": str(identifyDate),
        "window": identifyWindow,
        #}
    }
    bulk_action.extend([
        action,
    ])
    print bulk_action
    auto_id = [
        str(i) for i in str(identifyDate) + str(identifyWindow) if i.isdigit()
    ]
    auto_id = ''.join(auto_id)
    #es.bulk(bulk_action, index=index_name, doc_type='text', timeout=600)
    es.index(index=index_name, doc_type='text', id=auto_id, body=action)
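The bulk call is commented out above in favour of a single es.index per document. If several gexf documents had to be written in one round trip, a sketch using elasticsearch.helpers.bulk (assuming the same es client and json import as in the example above; the helper name and the gexf_items argument are made up) might look like this:

from elasticsearch import helpers

def save_many_gexf(topic, gexf_items):
    # gexf_items: assumed list of (identifyDate, identifyWindow, identifyGexf) tuples
    index_name = topic + '_gexffile'
    get_graph_mappings(index_name)
    actions = []
    for identifyDate, identifyWindow, identifyGexf in gexf_items:
        auto_id = ''.join(c for c in str(identifyDate) + str(identifyWindow) if c.isdigit())
        actions.append({
            '_index': index_name,
            '_type': 'text',
            '_id': auto_id,
            '_source': {
                'name': str(identifyDate) + str(identifyWindow),
                'gexf': json.dumps(identifyGexf),
                'date': str(identifyDate),
                'window': identifyWindow,
            },
        })
    helpers.bulk(es, actions)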
def save_rt_results_es(topic, repost_list):

    #mappings_event_geo_city_repost()
    #index_name = index_event_geo_city_repost
    #index_type = type_event_geo_city_repost

    #mappings_event_analysis_results(topic)
    index_name = index_event_analysis_results
    index_type = type_event_analysis_results

    item = {}

    for location in repost_list:

        item['en_name'] = topic
        item['original'] = location['original']
        item['mid'] = location['mid']
        item['timestamp'] = location['ts']
        item['origin_location'] = location['origin_location']
        item['repost_location'] = location['repost_location']
        id = location['mid']
        try:
            item_exist = weibo_es.get(index=index_name,
                                      doc_type=index_type,
                                      id=id)['_source']
            weibo_es.update(index=index_name,
                            doc_type=index_type,
                            id=id,
                            body={'doc': item})
        except Exception, e:
            weibo_es.index(index=index_name,
                           doc_type=index_type,
                           id=id,
                           body=item)
Example #4
def save_rt_results_es(calc,
                       topic,
                       results,
                       during,
                       klimit=TOP_KEYWORDS_LIMIT,
                       wlimit=TOP_WEIBOS_LIMIT):

    #mappings_event_analysis_results(topic)
    index_name = index_event_analysis_results
    index_type = type_event_analysis_results

    if calc == 'count':  # results: {time_period: {sentiment1: count1, sentiment2: count2, ...}, ...}

        #mappings_event_sentiment_count()
        #index_name = index_event_sentiment_count
        #index_type = type_event_sentiment_count

        item = {}

        for time, sen_dict in results.iteritems():
            id = topic + '_' + time
            for sentiment, count in sen_dict.iteritems():
                item['en_name'] = topic
                item['end_ts'] = time
                item['range'] = during
                item['sentiment'] = sentiment
                item['count'] = count

                try:
                    item_exist = weibo_es.get(index=index_name,
                                              doc_type=index_type,
                                              id=id)['_source']
                    weibo_es.update(index=index_name,
                                    doc_type=index_type,
                                    id=id,
                                    body={'doc': item})
                except Exception, e:
                    # raise e
                    weibo_es.index(index=index_name,
                                   doc_type=index_type,
                                   id=id,
                                   body=item)
Example #5
def save_results_es(topic, language_results):

    #mappings_event_analysis_results(topic)
    index_name = index_event_analysis_results
    index_type = type_event_analysis_results

    id = topic

    try:
        item_exist = weibo_es.get(index=index_name, doc_type=index_type,
                                  id=id)['_source']
        weibo_es.update(index=index_name,
                        doc_type=index_type,
                        id=id,
                        body={'doc': {
                            'language_results': language_results
                        }})
    except Exception, e:
        weibo_es.index(index=index_name,
                       doc_type=index_type,
                       id=id,
                       body={'language_results': language_results})
def save_first_nodes_es(topic,
                        date,
                        windowsize,
                        uid,
                        timestamp,
                        user_info,
                        weibo_info,
                        user_domain='other'):

    #mappings_event_network_first_user()
    #index_name = index_event_network_first_user
    #index_type = type_event_network_first_user

    #mappings_event_analysis_results(topic)
    index_name = index_event_analysis_results
    index_type = type_event_analysis_results

    item = {}

    item['en_name'] = topic
    item['date'] = date
    item['windowsize'] = windowsize
    item['uid'] = uid
    item['timestamp'] = timestamp
    item['user_info'] = json.dumps(user_info)
    item['weibo_info'] = json.dumps(weibo_info)
    item['user_domain'] = user_domain

    id = uid

    try:
        item_exist = weibo_es.get(index=index_name, doc_type=index_type,
                                  id=id)['_source']
        weibo_es.update(index=index_name,
                        doc_type=index_type,
                        id=id,
                        body={'doc': item})
    except:
        weibo_es.index(index=index_name, doc_type=index_type, id=id, body=item)
Example #8
def save_rt_results_es(topic, results, during, first_item):

    #mappings_event_geo_city_topic_count()
    #index_name = index_event_geo_city_topic_count
    #index_type = type_event_geo_city_topic_count

    mappings_event_analysis_results(topic)
    index_name = index_event_analysis_results
    index_type = type_event_analysis_results

    item = {}

    for mtype, time_geo in results.iteritems(
    ):  # results: {'message_type': [timestamp, {'province': [(province, count), ...], 'city': [(city, count), ...]}]}
        item['en_name'] = topic
        item['end_ts'] = time_geo[0]
        item['range'] = during
        item['mtype'] = mtype
        item['ccount'] = time_geo[1]
        item['first_item'] = first_item

        id = topic + '_' + str(time_geo[0])

        try:
            item_exist = weibo_es.get(index=index_name,
                                      doc_type=index_type,
                                      id=id)['_source']
            weibo_es.update(index=index_name,
                            doc_type=index_type,
                            id=id,
                            body={'doc': item})
        except Exception, e:
            weibo_es.index(index=index_name,
                           doc_type=index_type,
                           id=id,
                           body=item)
                item['range'] = during
                item['mtype'] = mtype
                item['limit'] = klimit
                item['kcount'] = json.dumps(keyword_dict)

                try:
                    item_exist = weibo_es.get(index=index_name,
                                              doc_type=index_type,
                                              id=id)['_source']
                    weibo_es.update(index=index_name,
                                    doc_type=index_type,
                                    id=id,
                                    body={'doc': item})
                except Exception, e:
                    weibo_es.index(index=index_name,
                                   doc_type=index_type,
                                   id=id,
                                   body=item)

    elif calc == 'weibo':
        #mappings_event_time_weibo()
        #index_name = index_event_time_weibo
        #index_type = type_event_time_weibo

        item = {}
        for time, mtype_dict in results.iteritems():
            id = topic + '_' + time
            for mtype, weibo in mtype_dict.iteritems():
                item['en_name'] = topic
                item['end_ts'] = time
                item['range'] = during
                item['mtype'] = mtype
Example #10
def weibo_comments_list(taskid,
                        start_ts,
                        over_ts,
                        weibo_list,
                        cluster_num=-1,
                        cluster_eva_min_size=default_cluster_eva_min_size,
                        vsm=default_vsm,
                        calculation_label=1):  # weibo_list: the Weibo posts read in

    params = {"taskid": taskid, "cluster_num": cluster_num, "cluster_eva_min_size": cluster_eva_min_size, \
            "vsm": vsm, "calculation_label": calculation_label}

    task_result_file = os.path.join(RESULT_WEIBO_FOLDER, taskid)
    if os.path.exists(task_result_file) and calculation_label == 0:
        # load the result set from an existing data file
        with open(task_result_file) as dump_file:
            dump_dict = json.loads(dump_file.read())
            ratio_results = dump_dict["ratio"]
            sentiratio_results = dump_dict["sentiratio"]
            before_filter_count = dump_dict["before_filter_count"]
            after_filter_count = dump_dict["after_filter_count"]

        return json.dumps({"ratio": ratio_results, "sentiratio": sentiratio_results, \
                "before_filter_count": before_filter_count, "after_filter_count": after_filter_count})

    comments = weibo_list
    print 'weibo_list:', len(comments)
    logfile = os.path.join(LOG_WEIBO_FOLDER, taskid + '.log')
    cal_results = weibo_calculation(comments, logfile=logfile, cluster_num=cluster_num, \
            cluster_eva_min_size=int(cluster_eva_min_size), version=vsm)
    features = cal_results['cluster_infos']['features']
    item_infos = cal_results['item_infos']

    cluster_ratio = dict()
    senti_ratio = dict()
    sentiment_results = dict()
    cluster_results = dict()
    rub_results = []

    # number of texts before filtering
    before_filter_count = len(item_infos)
    # number of texts after filtering
    after_filter_count = 0

    download_items = []
    for comment in item_infos:
        #print comment["clusterid"]
        download_item = {}
        #comment = item_infos[comment]
        download_item["id"] = comment["id"]
        download_item["text"] = comment["text"]
        download_item["clusterid"] = comment["clusterid"]
        download_item["ad_label"] = comment["ad_label"]
        download_item["comment"] = comment["comment"]
        download_item["datetime"] = comment["datetime"]
        download_item["retweeted"] = comment["retweeted"]
        download_item["uid"] = comment["uid"]
        # download_item["same_from"] = comment["same_from"]
        download_items.append(download_item)
        if ('clusterid'
                in comment) and (comment['clusterid'][:8] != 'nonsense'):
            clusterid = comment['clusterid']

            try:
                cluster_ratio[clusterid] += 1
            except KeyError:
                cluster_ratio[clusterid] = 1
            try:
                cluster_results[clusterid].append(comment)
            except KeyError:
                cluster_results[clusterid] = [comment]

        # if ('sentiment' in comment) and (comment['sentiment'] in emotions_vk_v1) and ('clusterid' in comment) \
        #         and (comment['clusterid'][:8] != 'nonsense'):
        #     sentiment = comment['sentiment']

        #     try:
        #         senti_ratio[sentiment] += 1
        #     except KeyError:
        #         senti_ratio[sentiment] = 1
        #     try:
        #         sentiment_results[sentiment].append(comment)
        #     except KeyError:
        #         sentiment_results[sentiment] = [comment]

        #     after_filter_count += 1

        if comment['clusterid'][:8] == 'nonsense':
            rub_results.append(comment)

    ratio_results = dict()
    ratio_total_count = sum(cluster_ratio.values())
    for clusterid, ratio in cluster_ratio.iteritems():
        if clusterid in features:
            feature = features[clusterid]
            if feature and len(feature):
                ratio_results[','.join(
                    feature[:3])] = float(ratio) / float(ratio_total_count)

    #jln0825 the sentiment-related output is no longer needed
    # sentiratio_results = dict()
    # sentiratio_total_count = sum(senti_ratio.values())
    # for sentiment, ratio in senti_ratio.iteritems():
    #     if sentiment in emotions_vk_v1:
    #         label = emotions_vk_v1[sentiment]
    #         if label and len(label):
    #             sentiratio_results[label] = float(ratio) / float(sentiratio_total_count)

    # # deduplicate within each sentiment category
    # sentiment_dump_dict = dict()
    # for sentiment, contents in sentiment_results.iteritems():
    #     dump_dict = dict()
    #     for comment in contents:
    #         same_from_sentiment = comment["same_from_sentiment"]
    #         try:
    #             dump_dict[same_from_sentiment].append(comment)
    #         except KeyError:
    #             dump_dict[same_from_sentiment] = [comment]
    #     sentiment_dump_dict[sentiment] = dump_dict

    # deduplicate within each sub-opinion cluster
    cluster_dump_dict = dict()
    for clusterid, contents in cluster_results.iteritems():
        #print clusterid
        if clusterid in features:
            feature = features[clusterid]
            if feature and len(feature):
                dump_dict = dict()
                for comment in contents:
                    same_from_cluster = comment["same_from"]
                    try:
                        dump_dict[same_from_cluster].append(comment)
                    except KeyError:
                        dump_dict[same_from_cluster] = [comment]
                    cluster_dump_dict[clusterid] = dump_dict

    #task = taskid.split('_')
    for key in features.keys():
        print features[key], type(features[key])
        keys = ('_').join(features[key])
        #index_body={'name':task[0],'start_ts':task[1],'end_ts':task[2],'ratio':json.dumps(ratio_results),'cluster':json.dumps(key),'features':json.dumps(features),'keys':keys,'cluster_dump_dict':json.dumps(cluster_dump_dict[key])}
        index_body = {
            'name': taskid,
            'start_ts': start_ts,
            'end_ts': over_ts,
            'ratio': json.dumps(ratio_results),
            'cluster': json.dumps(key),
            'features': json.dumps(features),
            'keys': keys,
            'cluster_dump_dict': json.dumps(cluster_dump_dict[key])
        }
        #print index_body
        #print subopinion_index_type,subopinion_index_name
        #jln  0907
        weibo_es.index(index=subopinion_index_name,
                       doc_type=subopinion_index_type,
                       id=key,
                       body=index_body)

    return json.dumps({
        "features": features,
        "ratio": ratio_results,
        "cluster_dump_dict": cluster_dump_dict
    })  # features: the mapping between keywords and clusters
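When calculation_label == 0, the function above expects a cached JSON file under RESULT_WEIBO_FOLDER/taskid; the code that writes such a file only appears commented out in the next example. A minimal sketch of a matching writer, assuming the same RESULT_WEIBO_FOLDER constant and os/json imports (the helper name is made up):

def dump_weibo_task_result(taskid, ratio_results, sentiratio_results,
                           before_filter_count, after_filter_count):
    # hypothetical helper: write the cache file that the calculation_label == 0
    # branch of weibo_comments_list reads back
    task_result_file = os.path.join(RESULT_WEIBO_FOLDER, taskid)
    with open(task_result_file, 'w') as dump_file:
        dump_file.write(json.dumps({
            'ratio': ratio_results,
            'sentiratio': sentiratio_results,
            'before_filter_count': before_filter_count,
            'after_filter_count': after_filter_count,
        }))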
Example #11
def news_comments_list(taskid,
                       start_ts,
                       over_ts,
                       weibo_list,
                       cluster_num=-1,
                       cluster_eva_min_size=default_cluster_eva_min_size,
                       vsm=default_vsm,
                       calculation_label=1):  # weibo_list: the Weibo posts read in
    """计算饼图数据,并将饼图数据和去重后的推荐文本写到文件
    taskid = request.args.get('taskid', default_task_id)
    cluster_num = request.args.get('cluster_num', default_cluster_num) #若无此参数,取-1;否则取用户设定值
    if cluster_num == default_cluster_num:
        cluster_num = -1
    cluster_eva_min_size = request.args.get('cluster_eva_min_size', default_cluster_eva_min_size)
    vsm = request.args.get('vsm', default_vsm)
    calculation_label = int(request.args.get('calcu', 1)) # 默认进行重新计算, 0表示从从已有结果数据文件加载数据
    """
    params = {"taskid": taskid, "cluster_num": cluster_num, "cluster_eva_min_size": cluster_eva_min_size, \
            "vsm": vsm, "calculation_label": calculation_label}

    comments = weibo_list
    logfile = os.path.join(LOG_FOLDER, taskid + '.log')

    cal_results = comments_calculation_v2(comments, logfile=logfile, cluster_num=cluster_num, \
            cluster_eva_min_size=int(cluster_eva_min_size), version=vsm)
    #print cal_results
    features = cal_results['cluster_infos']['features']
    item_infos = cal_results['item_infos']
    cluster_ratio = dict()
    senti_ratio = dict()
    sentiment_results = dict()
    cluster_results = dict()
    rub_results = []

    # number of texts before filtering
    before_filter_count = len(item_infos)
    # number of texts after filtering
    after_filter_count = 0

    download_items = []
    for comment in item_infos:
        # print comment
        download_item = {}
        download_item["id"] = comment["id"]
        download_item["title"] = comment["title"]
        download_item["text"] = comment["text"]
        # download_item["timestamp"] = comment["timestamp"]
        download_item["datetime"] = comment["datetime"]
        download_item["clusterid"] = comment["clusterid"]
        download_item["sentiment"] = comment["sentiment"]
        download_item["ad_label"] = comment["ad_label"]
        if (comment["clusterid"][:8] != 'nonsense') and (comment["clusterid"]
                                                         != 'other'):
            download_item["duplicate"] = comment["duplicate"]
            download_item["same_from"] = comment["same_from"]
        download_items.append(download_item)
        if ('clusterid'
                in comment) and (comment['clusterid'][:8] != 'nonsense'):
            clusterid = comment['clusterid']

            try:
                cluster_ratio[clusterid] += 1
            except KeyError:
                cluster_ratio[clusterid] = 1
            try:
                cluster_results[clusterid].append(comment)
            except KeyError:
                cluster_results[clusterid] = [comment]

        if ('sentiment' in comment) and (comment['sentiment'] in emotions_vk_v1) and ('clusterid' in comment) \
                and (comment['clusterid'][:8] != 'nonsense'):
            sentiment = comment['sentiment']

            try:
                senti_ratio[sentiment] += 1
            except KeyError:
                senti_ratio[sentiment] = 1
            try:
                sentiment_results[sentiment].append(comment)
            except KeyError:
                sentiment_results[sentiment] = [comment]

            after_filter_count += 1

        if comment['clusterid'][:8] == 'nonsense':
            rub_results.append(comment)

    ratio_results = dict()
    ratio_total_count = sum(cluster_ratio.values())
    for clusterid, ratio in cluster_ratio.iteritems():
        if clusterid in features:
            feature = features[clusterid]
            if feature and len(feature):
                ratio_results[','.join(
                    feature[:3])] = float(ratio) / float(ratio_total_count)

    sentiratio_results = dict()
    sentiratio_total_count = sum(senti_ratio.values())
    for sentiment, ratio in senti_ratio.iteritems():
        if sentiment in emotions_vk_v1:
            label = emotions_vk_v1[sentiment]
            if label and len(label):
                sentiratio_results[label] = float(ratio) / float(
                    sentiratio_total_count)

    # deduplicate within each sentiment category
    sentiment_dump_dict = dict()
    for sentiment, contents in sentiment_results.iteritems():
        dump_dict = dict()
        for comment in contents:
            same_from_sentiment = comment["same_from_sentiment"]
            try:
                dump_dict[same_from_sentiment].append(comment)
            except KeyError:
                dump_dict[same_from_sentiment] = [comment]
        sentiment_dump_dict[sentiment] = dump_dict

    # deduplicate within each sub-opinion cluster
    cluster_dump_dict = dict()
    for clusterid, contents in cluster_results.iteritems():
        if clusterid in features:
            feature = features[clusterid]
            if feature and len(feature):
                dump_dict = dict()
                for comment in contents:
                    same_from_cluster = comment["same_from"]
                    try:
                        dump_dict[same_from_cluster].append(comment)
                    except KeyError:
                        dump_dict[same_from_cluster] = [comment]
                    for k, v in dump_dict.iteritems():
                        sort_dump_dict = sorted(v,
                                                key=lambda x: x['weight'],
                                                reverse=True)
                    cluster_dump_dict[clusterid] = sort_dump_dict

    # dump_file = open(task_result_file+'_news', 'w')
    # dump_file.write(json.dumps({"params": params, "features":features, "senti_dump_dict":sentiment_dump_dict, \
    #         "cluster_dump_dict":cluster_dump_dict, "ratio":ratio_results, "sentiratio": sentiratio_results, \
    #         "before_filter_count": before_filter_count, "after_filter_count": after_filter_count}))
    # dump_file.close()
    # new_file = open(task_result_file+'_news_2','w')
    # print task_result_file+'2'  # all the weibo posts
    # for i in xrange(0,len(download_items)):
    #     new_file.write(json.dumps(download_items[i])+'\n')
    # new_file.close
    #task = taskid.split('_')
    #index_body={'name':task[0],'start_ts':task[1],'end_ts':task[2],'features':json.dumps(features),'cluster_dump_dict':json.dumps(cluster_dump_dict)}

    index_body = {
        'name': taskid,
        'start_ts': start_ts,
        'end_ts': over_ts,
        'features': json.dumps(features),
        'cluster_dump_dict': json.dumps(cluster_dump_dict)
    }
    weibo_es.index(index=topics_river_index_name,
                   doc_type=topics_river_index_type,
                   id=taskid,
                   body=index_body)

    return json.dumps({
        "features": features,
        "cluster_dump_dict": cluster_dump_dict
    })
def compute_topic_task():
    print time.time()
    while True:
        #print r.rpop(topic_queue_name)
        task = r.rpop('event_portrait_task')
        #if not task:
        #   break
        if task:
            continue
        else:
            # task = json.loads(task)
            task = ['雾霾', 'type', '1480003100', '1480176000', '1483500427743']
            topic = task[0]  #['name']
            #en_name = task['en_name']
            start_ts = int(task[2])  #timestamp
            end_ts = int(task[3])  #timestamp
            submit_ts = int(task[4])
            try:
                keywords = task['keywords']
            except:
                keywords = ''
            #comput_status = task['status']

            task_id = 'event-' + str(start_ts) + '-' + str(end_ts) + '-' + str(
                submit_ts)
            en_name = task_id
            t1 = time.time()
            exist_flag = exist(task_id)
            #keywords=keywords.split('&')
            get_topic_weibo(topic, task_id, start_ts, end_ts, keywords)
            print exist_flag
            if exist_flag:
                #start compute
                #try:
                weibo_counts, uid_counts = counts(start_ts, end_ts, topic,
                                                  en_name, keywords)
                count_fre(en_name,
                          start_ts=start_ts,
                          over_ts=end_ts,
                          news_limit=NEWS_LIMIT,
                          weibo_limit=MAX_LANGUAGE_WEIBO)

                weibo_es.index(index='topics',
                               doc_type='text',
                               id=task_id,
                               body={
                                   'name': topic,
                                   'start_ts': start_ts,
                                   'end_ts': end_ts,
                                   'submit_ts': submit_ts,
                                   'comput_status': 0,
                                   'en_name': task_id
                               })
                weibo_es.update(index=topic_index_name,
                                doc_type=topic_index_type,
                                id=task_id,
                                body={
                                    'doc': {
                                        'comput_status': -1,
                                        'weibo_counts': weibo_counts,
                                        'uid_counts': uid_counts
                                    }
                                })
                print 'finish change status'
                #geo

                repost_search(en_name, start_ts, end_ts)
                weibo_es.update(index=topic_index_name,
                                doc_type=topic_index_type,
                                id=task_id,
                                body={'doc': {
                                    'comput_status': -2
                                }})
                print 'finish geo_1 analyze'
                cityTopic(en_name, start_ts, end_ts)
                weibo_es.update(index=topic_index_name,
                                doc_type=topic_index_type,
                                id=task_id,
                                body={'doc': {
                                    'comput_status': -3
                                }})
                print 'finish geo analyze'
                #language
                count_fre(en_name,
                          start_ts=start_ts,
                          over_ts=end_ts,
                          news_limit=NEWS_LIMIT,
                          weibo_limit=MAX_LANGUAGE_WEIBO)
                weibo_es.update(index=topic_index_name,
                                doc_type=topic_index_type,
                                id=task_id,
                                body={'doc': {
                                    'comput_status': -4
                                }})
                print 'finish language analyze'
                #time
                propagateCronTopic(en_name, start_ts, end_ts)
                weibo_es.update(index=topic_index_name,
                                doc_type=topic_index_type,
                                id=task_id,
                                body={'doc': {
                                    'comput_status': -5
                                }})
                print 'finish time analyze'

                #network
                compute_network(en_name, start_ts, end_ts)
                weibo_es.update(index=topic_index_name,
                                doc_type=topic_index_type,
                                id=task_id,
                                body={'doc': {
                                    'comput_status': -6
                                }})
                print 'finish network analyze'

                #sentiment
                sentimentTopic(en_name, start_ts=start_ts, over_ts=end_ts)
                print 'finish sentiment analyze'
                #finish compute

                print weibo_es.update(index=topic_index_name,
                                      doc_type=topic_index_type,
                                      id=task_id,
                                      body={
                                          'doc': {
                                              'comput_status': 1,
                                              'finish_ts': int(time.time())
                                          }
                                      })
                save_to_es(task_id, start_ts, end_ts, submit_ts, weibo_counts,
                           uid_counts)
                print 'finish change status done'
            break
        t2 = time.time() - t1
        print task_id, t2
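The loop above pops tasks from the 'event_portrait_task' Redis list; the json.loads call is commented out and a hard-coded test task is used instead. Assuming tasks are JSON-encoded lists in the same field order as that test task, a producer sketch (the function name is made up) could be:

def push_event_portrait_task(topic, start_ts, end_ts, submit_ts):
    # hypothetical producer for the worker loop above
    # field order follows the hard-coded test task: [name, type, start_ts, end_ts, submit_ts]
    task = [topic, 'type', str(start_ts), str(end_ts), str(submit_ts)]
    r.lpush('event_portrait_task', json.dumps(task))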
def compute_topic_task():

    create_task()

    index_name = index_manage_event_analysis
    index_type = type_manage_event_analysis

    index_name_results = index_event_analysis_results
    index_type_results = type_event_analysis_results
    '''
    while  True:
        #print r.rpop(topic_queue_name)

        task_detail = r_event_analysis.rpop(task_event_analysis)

        #if not task_detail:
        #    break

        if  task_detail:

            break

        else:
    
            task_detail = json.loads(task_detail)
            topic = task_detail[0]
            en_name = task_detail[1]
            start_ts = task_detail[2]
            end_ts = task_detail[3]
            #keywords = task_detail[4]
            #event_value_finish = task['event_value_finish']
            #mappings_event_analysis_results(en_name)
          
            print 'start scan!!'
            while 1:
                es_result = weibo_es.get(index=index_name, doc_type=index_type, id=en_name)["_source"]
                if int(es_result["scan_text_finish"]) == 2:
                    break  # exit this loop and continue with the steps below
                else:
                    time.sleep(60)   # wait for the scan to finish (int(es_result["scan_text_finish"]) == 2)
            
            t1=time.time()
    '''
    t1 = time.time()
    '''
    topic = '天津老太摆射击摊被判刑' #'毛泽东诞辰纪念日'
    en_name = 'tian_jin_lao_tai_she_ji_qiang_bei_pan_xing' #"mao_ze_dong_dan_chen_ji_nian_ri"
    start_ts = 1482768502 #1482681600
    end_ts = 1483455435 #1483113600
    must_keywords = ["射击","判刑"] #['毛泽东']
    should_keywords = ["天津","老太"] #['诞辰','纪念日']
    #submit_time = time.time()
    submit_user = '******'
    '''

    topic = '毛泽东诞辰纪念日'
    en_name = "mao_ze_dong_dan_chen_ji_nian_ri"
    start_ts = 1482681600
    end_ts = 1483113600
    must_keywords = ['毛泽东']
    should_keywords = ['诞辰', '纪念日']
    #submit_time = time.time()
    submit_user = '******'

    #start computes

    weibo_es.update(index=index_name,
                    doc_type=index_type,
                    id=en_name,
                    body={'doc': {
                        'event_value_finish': 1
                    }})

    #try:
    #weibo_counts,uid_counts=counts(start_ts,end_ts,topic,en_name,keywords)
    weibo_counts, uid_counts = counts_aggs(en_name, start_ts, end_ts)
    #weibo_es.index(index='topics',doc_type='text',id=en_name,body={'name':topic,'start_ts':start_ts,'end_ts':end_ts,'submit_ts':submit_ts,'comput_status':0,'en_name':en_name})
    #weibo_es.update(index=index_name,doc_type=index_type,id=en_name,body={'doc':{'event_value_finish':-1,'weibo_counts':weibo_counts,'uid_counts':uid_counts}})
    print 'finish change status'

    item = {}
    item['topic'] = topic
    item['en_name'] = en_name
    item['start_time'] = start_ts
    item['stop_time'] = end_ts
    item['weibo_counts'] = weibo_counts
    item['uid_counts'] = uid_counts
    item['must_keywords'] = must_keywords
    item['should_keywords'] = should_keywords
    item['submit_user'] = submit_user
    #item['submit_time'] = submit_time

    weibo_es.index(index=index_name_results,
                   doc_type=index_type_results,
                   id=en_name,
                   body=item)

    #time
    time_results = propagateCronTopic(en_name, start_ts, end_ts)
    #{'during': ,'count':{},'kcount':{},'weibo':{}}
    time_results = json.dumps(time_results)

    #weibo_es.update(index=index_name,doc_type=index_type,id=en_name,body={'doc':{'event_value_finish':-2}})
    print 'finish time analyze'

    #geo
    sort_ts_attr, repost_list = repost_search(en_name, start_ts, end_ts)
    # for each weibo, obtain the repost text, mid, topic, timestamp, original location and repost location
    # each entry of repost_list: {original:xx, mid:xx, topic:xx, ts:xx, origin_location:xx, repost_location:xx}

    #weibo_es.update(index=index_name,doc_type=index_type,id=en_name,body={'doc':{'event_value_finish':-}})
    print 'finish geo_1 analyze'
    geo_cityTopic_results = cityTopic(en_name, start_ts, end_ts)

    # {'geo_weibos':{},'geo_cityCount':{}}
    geo_results = {
        'sort_ts_attr': sort_ts_attr,
        'repost_list': repost_list,
        'geo_cityTopic_results': geo_cityTopic_results
    }
    geo_results = json.dumps(geo_results)
    id = en_name
    try:
        item_exist = weibo_es.get(index=index_name, doc_type=index_type,
                                  id=id)['_source']
        weibo_es.update(index=index_name_results,
                        doc_type=index_type_results,
                        id=id,
                        body={'doc': {
                            'geo_results': geo_results
                        }})
    except Exception, e:
        weibo_es.index(index=index_name_results,
                       doc_type=index_type_results,
                       id=id,
                       body={'geo_results': geo_results})
Example #14
def compute_network(topic, start_ts, end_ts):
    '''
    topics = _topic_not_calc() # topics=[{id:x,module:x,status:x,topic:x,start:x,end:x,db_date:x}]
    '''
    '''
    topic_status_info = db.session.query(TopicStatus).filter(TopicStatus.topic==topic ,\
                                                             TopicStatus.start==start_ts ,\
                                                             TopicStatus.end==end_ts ,\
                                                             TopicStatus.module=='identify' ,\
                                                             TopicStatus.status==-1).first()
    if topic_status_info:
        #topic = topics[0] # compute only one topic per pass----as a buffer, one topic every n time units
        print 'topic_id', topic_status_info.id
        start_ts = topic_status_info.start
        end_ts = topic_status_info.end
        db_date = topic_status_info.db_date
        topicname = topic
        _update_topic_status2Computing(topicname, start_ts, end_ts, db_date)
        print 'update_status'
        topic_id = acquire_topic_id(topicname, start_ts, end_ts) # re-fetch the id because ids in TopicStatus are auto-incremented; after the update it is no longer the original one
        windowsize = (end_ts - start_ts) / Day # determine the size of the time window
        date = ts2datetime(end_ts)
        '''

    # changed here: read the topic's pinyin name (which is also the table name) from the ES index
    network_results = {}

    if True:
        print end_ts, type(end_ts)
        #topicname = topic
        date = ts2datetime(end_ts)
        windowsize = (end_ts - start_ts) / Day  # determine the size of the time window
        topic_pinyin_name = topic
        # print 'start topic_name_transfer'   # convert the Chinese topic name to pinyin, e.g. 奥运会 > aoyunhui
        # topic_pinyin_name = weibo_TopicNameTransfer(topicname, start_ts, end_ts)
        # print topic_pinyin_name

        print 'start compute first_nodes'
        #start_date = ts2datetime(start_ts) # used to compute the first user
        first_node_results = get_first_node(topic_pinyin_name, start_ts,
                                            end_ts, windowsize, date)
        print 'end compute first_nodes'

        network_results['first_node_results'] = first_node_results

        print 'start make network'
        max_size = MAX_SIZE
        attribute_add = True
        g, gg, new_attribute_dict = make_network(topic_pinyin_name, date,
                                                 windowsize, max_size,
                                                 attribute_add)
        #print g,gg,new_attribute_dict

        network_results['new_attribute_dict'] = new_attribute_dict

        print 'write gexf file'
        #real_topic_id = acquire_real_topic_id(topicname, start_ts, end_ts)
        real_topic_id = topic_pinyin_name
        if not real_topic_id:
            print 'the topic not exist'
            return None
        key = str(real_topic_id) + '_' + str(date) + '_' + str(windowsize)
        print 'gexf_file:', str(GRAPH_PATH) + str(key) + '_g_graph.gexf'
        #fh = open(str(GRAPH_PATH) + str(key) + '_g_graph.gexf', 'w+')
        #fh.close()
        #fh = open(str(GRAPH_PATH) + str(key) + '_gg_graph.gexf', 'w+')
        #fh.close()
        nx.write_gexf(g, str(GRAPH_PATH) + str(key) + '_g_graph.gexf')
        nx.write_gexf(gg, str(GRAPH_PATH) + str(key) + '_gg_graph.gexf')
        #nx.write_gexf(ds_dg, str(GRAPH_PATH) + str(key) + '_ds_dg_graph.gexf')
        #nx.write_gexf(ds_udg, str(GRAPH_PATH) + str(key) + '_ds_udg_graph.gexf')
        # to be changed: SSDB is no longer used here
        #save_attribute_dict(new_attribute_dict, 'g')
        #save_attribute_dict(ds_new_attribute_dict, 'ds_g')
        print 'end make network'

        print 'start PageRank'
        all_uid_pr, data_dict, sorted_uids = pagerank_rank(
            TOPK, date, windowsize, topic_pinyin_name)
        network_results['pagerank'] = {}
        network_results['pagerank']['all_uid_pr'] = all_uid_pr
        network_results['pagerank']['sorted_uids'] = sorted_uids
        print 'len(all_uid_pr):', len(all_uid_pr)
        print 'end PageRank'

        print 'start make network graph'
        #topic_id = int(topic_id)
        windowsize = int(windowsize)
        if not topic_pinyin_name:  # to be removed
            gexf = ''
        else:
            gexf= make_network_graph(date, topic_pinyin_name, windowsize, all_uid_pr, data_dict,sorted_uids,\
                new_attribute_dict)
            #gexf = json.dumps(gexf)
        print 'save gexf'
        #print '*************************'*10
        #print gexf
        #print '*************************'*10
        long_gexf = save_gexf_results(topic_pinyin_name, date, windowsize,
                                      gexf, gexf_type)

        network_results['long_gexf'] = long_gexf

        print 'start fu_tr'
        maker_results, pusher_results = get_interval_count(
            topic_pinyin_name, date, windowsize)
        print 'update_topic_end'
        #db_date = date
        #_update_topic_status2Completed(topic_pinyin_name, start_ts, end_ts, db_date)
        network_results['maker_results'] = maker_results
        network_results['pusher_results'] = pusher_results

        index_name = index_event_analysis_results
        index_type = type_event_analysis_results

        network_results = json.dumps(network_results)

        id = topic

        try:
            tem_exist = weibo_es.get(index=index_name,
                                     doc_type=index_type,
                                     id=id)['_source']
            weibo_es.update(index=index_name,
                            doc_type=index_type,
                            id=id,
                            body={'doc': {
                                'network_results': network_results
                            }})
        except Exception, e:
            weibo_es.index(index=index_name,
                           doc_type=index_type,
                           id=id,
                           body={'network_results': network_results})

        print 'network_results save done!!'

        print 'all done!'
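Not shown above: the gexf files that compute_network writes with nx.write_gexf can be read back with networkx for a quick check. A small sketch that reuses the same key construction, assuming the same GRAPH_PATH constant and networkx import (the helper name is made up):

def load_topic_graphs(topic_pinyin_name, date, windowsize):
    # hypothetical helper: reload the two graphs written by compute_network
    key = str(topic_pinyin_name) + '_' + str(date) + '_' + str(windowsize)
    g = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_g_graph.gexf')
    gg = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_gg_graph.gexf')
    return g, gg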