Code example #1
def create_task_list(given_ts):
    # 1. search tasks from manage_sensing_task
    # 2. push each task onto the redis list (key 'task_name')

    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    now_ts = datehour2ts(ts2datehour(time.time() - 3600))
    print_log = "&".join([file_path, "start", ts2date(now_ts)])
    print print_log
    #ts = ts - 3600

    query_body = {"query": {"match_all": {}}}

    search_results = es.search(index=index_sensing,
                               doc_type=type_sensing,
                               body=query_body)['hits']['hits']

    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']
            task = []
            task.append(item['task_name'])  # task_name
            task.append(json.loads(item['social_sensors']))  # social sensors
            #task.append(now_ts)
            task.append(given_ts)
            r.lpush('task_name', json.dumps(task))
            count += 1

    print count
    print_log = "&".join([file_path, "end", ts2date(time.time())])
    print print_log
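These snippets lean on a few time helpers (ts2datehour, datehour2ts, ts2date, datetime2ts) imported from project utilities that are not shown here. A minimal sketch of what they are assumed to do, namely rounding a Unix timestamp down to the hour and converting between timestamps and date strings:

import time

def ts2datehour(ts):
    # assumed: format a Unix timestamp as an hour-resolution string
    return time.strftime('%Y-%m-%d %H:00:00', time.localtime(ts))

def datehour2ts(datehour):
    # assumed: parse the hour-resolution string back into a Unix timestamp
    return int(time.mktime(time.strptime(datehour, '%Y-%m-%d %H:00:00')))

def ts2date(ts):
    # assumed: full date-time string, used here only for log messages
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))

def datetime2ts(date):
    # assumed: parse a 'YYYY-MM-DD' date string into a Unix timestamp
    return int(time.mktime(time.strptime(date, '%Y-%m-%d')))

Under these assumptions, datehour2ts(ts2datehour(t)) truncates t to the start of its hour, which is how the examples normalize now_ts and current_ts.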
Code example #2
def create_task():
    ts = time.time()
    if RUN_TYPE:
        current_ts = datehour2ts(ts2datehour(ts))
    else:
        current_ts = 1482861600
    query_body = {
        "query": {
            "term":{"finish":"0"}
        },
        "size":10000
    }

    results = es_prediction.search(index=index_manage_prediction_task,\
            doc_type=type_manage_prediction_task, body=query_body)["hits"]["hits"]
    for item in results:
        print item
        task_name = item["_source"]["pinyin_task_name"]
        stop_time = item["_source"]["stop_time"]
        print stop_time, current_ts
        if stop_time < current_ts:
            es_prediction.update(index=index_manage_prediction_task,\
                    doc_type=type_manage_prediction_task, id=task_name,  body={"doc":{"macro_trendline_finish":"1", "finish": "1"}})
        else:
            r_trendline.lpush(task_trendline, task_name)
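The handoff between create_task() and the worker in code example #9 is a plain Redis list: lpush on one side, rpop on the other, which together behave as a FIFO queue. A standalone sketch of that pattern, assuming a local Redis instance and a hypothetical key name:

import redis

r_trendline = redis.StrictRedis(host='localhost', port=6379, db=0)
task_trendline = 'task_trendline'  # hypothetical queue key

r_trendline.lpush(task_trendline, 'task_a')
r_trendline.lpush(task_trendline, 'task_b')
print r_trendline.rpop(task_trendline)  # prints 'task_a': the oldest item comes out first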
Code example #3
def create_task():
    query_body = {
        "query": {
            "bool":{
                "must":[
                    {"term": {"finish": "0"}}
                ]
            }
        },
        "size": 10000
    }

    es_results = es_prediction.search(index=index_manage_interfere_task, doc_type=type_manage_interfere_task,\
            body=query_body)["hits"]["hits"]

    task_list = []
    if int(RUN_TYPE) == 1:
        current_ts = datehour2ts(ts2datehour(time.time()))
    else:
        current_ts = 1482681600 + 18*3600
    for item in es_results:
        tmp = []
        task_detail = item["_source"]
        task_name = task_detail['pinyin_task_name']
        update_time = task_detail["update_time"]
        sti_during = task_detail["stimulation_during"]
        stop_time =  task_detail["stop_time"]
        if RUN_TYPE == 1:
            # mark tasks already past their stop time as finished (cf. code example #2)
            if stop_time < current_ts:
                es_prediction.update(index=index_manage_interfere_task, doc_type=type_manage_interfere_task,\
                    id=task_name, body={"doc":{"finish": "1"}})
        tmp.append(task_name)
        tmp.append(task_detail["stop_time"])
        tmp.append(task_detail["scan_text_finish"])
        tmp.append(current_ts)
        if current_ts - update_time >= sti_during:
            r_stimulation.lpush(task_stimulation, json.dumps(tmp))

            # update: processing status
            es_prediction.update(index=index_manage_interfere_task,doc_type=type_manage_interfere_task,\
                id=task_name, body={"doc":{"stimulation_processing_status":"1"}})


    return True
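The consumer side of the stimulation queue is not shown in this example; it would rpop the JSON-encoded list and unpack the four fields pushed above. A minimal hypothetical sketch, reusing the same r_stimulation connection and task_stimulation key:

import json

raw = r_stimulation.rpop(task_stimulation)
if raw:
    task_name, stop_time, scan_text_finish, current_ts = json.loads(raw)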
Code example #4
File: task_list.py  Project: lxueqian/GroupCode
def create_task_list():
    # 1. search tasks from manage_sensing_task
    # 2. push each task onto the redis list (key 'task_name')

    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE)
    else:
        now_ts = datehour2ts(ts2datehour(time.time() - 3600))

    print_log = " ".join([file_path, "--start:"])
    print print_log

    query_body = {"query": {"match_all": {}}}

    search_results = es.search(index=index_manage_sensing,
                               doc_type=type_manage_sensing,
                               body=query_body)['hits']['hits']

    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']

            task = []
            task.append(item['task_name'])  # task_name
            try:
                # social_sensors may be stored as a JSON string...
                task.append(json.loads(
                    item['social_sensors']))  # social sensors
            except (TypeError, ValueError):
                # ...or already be a decoded list
                task.append(item['social_sensors'])  # social sensors
            task.append(now_ts)

            r.lpush("task_name", json.dumps(task))
            count += 1

    print 'task_count_sum:', count
Code example #5
File: dispose_data.py  Project: yuanhuiru/xnr2
def dispose_data(task_name, current_ts):
    es_result = es_prediction.get(index=index_manage_prediction_task,
                                  doc_type=type_manage_prediction_task,
                                  id=task_name)["_source"]
    macro_during = es_result['macro_during']
    start_ts = datehour2ts(ts2datehour(es_result["submit_time"]))
    task_start_ts = start_ts
    end_ts = datehour2ts(ts2datehour(es_result["stop_time"]))

    index_micro = "micro_prediction_" + task_name
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "range": {
                        "update_time": {
                            "lte": current_ts
                        }
                    }
                }
            }
        },
        "size": 10000,
        "sort": {
            "update_time": {
                "order": "asc"
            }
        }
    }
    micro_results = es_prediction.search(index=index_micro,
                                         doc_type="micro_task",
                                         body=query_body)["hits"]["hits"]
    total_list = []

    for item in micro_results:
        total_list.append(item["_source"]["total_count"])
    # weibo volume within each time slot

    total_len = (end_ts - start_ts) / macro_during
    times = int(macro_during) / 3600
    lenth = len(total_list) / times
    adjust_list = []
    time_list = []
    count = 0
    i = 0
    for item in total_list:
        count += item
        i += 1
        if i % times == 0:
            if start_ts <= current_ts:
                adjust_list.append(count)
                count = 0
                time_list.append(start_ts)
            else:
                break
        start_ts += 3600

    # overall time axis for the trend line
    total_time_list = []
    for i in range(total_len):
        total_time_list.append(task_start_ts + i * macro_during)

    left_time = list(set(total_time_list) - set(time_list))
    left_time = sorted(left_time)

    return adjust_list, total_len, time_list, left_time
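To make the bucketing above concrete: the micro results are hourly, so with macro_during = 10800 (three hours) times = 3 and every three consecutive total_count values are summed into one point of adjust_list, while time_list records the starting hour of each completed bucket. The same aggregation in isolation, with assumed input values:

total_list = [10, 20, 30, 40, 50, 60]  # hourly counts (assumed)
times = 3                              # hours per macro bucket
adjust_list, count = [], 0
for i, item in enumerate(total_list, 1):
    count += item
    if i % times == 0:
        adjust_list.append(count)
        count = 0
print adjust_list                      # [60, 150]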
Code example #6
def trendline_list(mid, total_value, mid_ts):
    if RUN_TYPE:
        ts = time.time()
    else:
        ts = datetime2ts("2016-11-20")
    """
    index_list = []
    for i in range(diffusion_time):
        index_list.append("flow_text_"+ts2datetime(ts-i*24*3600))

    result = dict()
    for iter_index in index_list:
        if not es.indices.exists(index=iter_index):
            continue
        try:
            result = es.get(index=iter_index, doc_type="text", id=mid)["_source"]
            break
        except:
            pass

    if not result:
        return []
    """

    nn = 24 * 3600 / diffusion_time_interval  # number of intervals per day
    current_list = []
    rising_list = []
    falling_list = []
    exist_time_list = []
    total_time_list = []

    timestamp = mid_ts
    start_ts = mid_ts
    timestamp = datehour2ts(ts2datehour(timestamp))
    for i in range(diffusion_time * nn):
        total_time_list.append(timestamp + i * diffusion_time_interval)

    # if diffusion has lasted more than 5 days, return the time list accumulated so far
    if 1:
        while 1:
            query_body = {
                "query": {
                    "bool": {
                        "must": [{
                            "term": {
                                "root_mid": mid
                            }
                        }, {
                            "range": {
                                "timestamp": {
                                    "gte": timestamp,
                                    "lt": timestamp + diffusion_time_interval
                                }
                            }
                        }]
                    }
                }
            }
            index_name = "flow_text_" + ts2datetime(timestamp)
            count = es.count(index=index_name,
                             doc_type="text",
                             body=query_body)["count"]
            current_list.append(count)
            exist_time_list.append(timestamp)
            timestamp += diffusion_time_interval
            if timestamp >= ts:
                break

    left_set = set(total_time_list) - set(exist_time_list)
    left_list = sorted(list(left_set), reverse=False)

    max_value = max(current_list)
    index_exist = len(current_list)
    value = current_list

    expected_value = total_value * 0.8 / (0.2 * nn * diffusion_time)
    if expected_value <= max_value:
        top_value = (max_value + total_value) / 2
    else:
        top_value = expected_value

    # weibo prediction
    k = 5
    h = 0.5
    peak = spd(value, h, k)
    flag = judge(peak, value)
    if len(flag) == 2:
        paras = getTwoBeauties(value, flag[0], flag[1])
        paras[-1] = diffusion_time * nn
        series = bassTwoPeaks(paras)
    else:
        paras = getSingleBeauty(value)
        paras[-1] = diffusion_time * nn
        series = bassOnePeak(paras)

    # predicted peak position
    predict_climax = series.index(max(series))

    if predict_climax > index_exist:
        predict_climax_left = predict_climax - len(current_list)
        rise_trend, fall_trend = get_trend(left_list, predict_climax_left,
                                           value[-1], top_value)
        true_climax = exist_time_list[0] + (
            exist_time_list[1] - exist_time_list[0]) * predict_climax
    else:
        top_value = value[-1]
        rise_trend, fall_trend = get_trend(left_list, 0, value[-1], 1)
        true_climax = exist_time_list[value.index(max(value))]
        top_value = max(value)

    results = dict()
    results["climax"] = [true_climax, top_value]
    results["rise_trend"] = rise_trend
    results["fall_trend"] = fall_trend
    new_list = []
    for i in range(len(exist_time_list)):
        new_list.append([exist_time_list[i], value[i]])
    results["exist_trend"] = new_list

    return results
Code example #7
File: cron_compute_hot.py  Project: zhhhzhang/xnr1
def compute_recommend_subopnion(task_detail):

    print 'Starting analysis computation......'

    task_id = task_detail['task_id'].strip('"')

    keywords_string = task_detail['keywords_string']

    keywords_list = keywords_string.split('&')  ## split the keyword string on '&' to get a list

    xnr_user_no = task_detail['xnr_user_no']
    mid = task_detail['mid']

    query_item = 'keywords_string'
    nest_query_list = []
    for keyword in keywords_list:
        nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
    '''
    ## focus on users followed by the current virtual persona
    if S_TYPE == 'test':
        # followers_list = get_result['followers_list']
        # nest_query_list.append({'terms':followers_list})
        print 'all users'
    else:
        get_result = es.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,\
        id=xnr_user_no)['_source']
        followers_list = get_result['followers_list']
        nest_query_list.append({'terms':followers_list})
    '''

    if S_TYPE == 'test':
        create_time = datetime2ts(S_DATE)
    else:
        create_time = datehour2ts(ts2datehour(time.time() - 3600))

    #get_flow_text_index_list(create_time)

    #index_name_list_list = get_flow_text_index_list(now_timestamp)
    index_name_list = get_flow_text_index_list(create_time)
    print 'index_name_list::', index_name_list
    es_results = es_flow_text.search(index=index_name_list,doc_type='text',\
                    body={'query':{'bool':{'must':nest_query_list}},'size':MAX_SEARCH_SIZE})['hits']['hits']

    weibo_list = []  ## input to content recommendation and sub-opinion analysis

    if es_results:
        for item in es_results:
            item = item['_source']
            weibo = item['text']
            weibo_list.append(weibo)

    ## content recommendation

    ## get the list of recommended sentences
    print 'weibo_list::::::', weibo_list
    print 'Starting content recommendation computation......'
    if weibo_list:
        content_results = summary_main(weibo_list)
    else:
        content_results = []

    print 'Saving content recommendation results......'

    mark = save_content_recommendation_results(xnr_user_no, mid,
                                               task_id.encode('utf-8'),
                                               content_results)
    print 'mark_content:::', mark
    if mark == False:
        print 'Error while saving content recommendation results; pushing the task back onto the queue'
        add_task_2_queue(keyword_task_queue_name, task_detail)
    else:
        print 'Finished saving content recommendation results......'

    ## sub-opinion analysis
    '''
    Input:
    weibo_data: list of weibo texts, [weibo1,weibo2,...]
    k_cluster: number of sub-topics (default 5)
    Output:
    opinion_name: dict of sub-topic names, {topic1:name1,topic2:name2,...}
    word_result: keywords of each sub-topic, {topic1:[w1,w2,...],topic2:[w1,w2,...],...}
    text_list: texts of each sub-topic, {topic1:[text1,text2,...],topic2:[text1,text2,..],..}
    '''

    print 'Starting sub-opinion computation......'
    if weibo_list:
        opinion_name, word_result, text_list = opinion_main(weibo_list,
                                                            k_cluster=5)
        sub_opinion_results = dict()

        for topic, text in text_list.iteritems():

            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]

    else:
        sub_opinion_results = {}

    print 'Saving sub-opinion results......'
    mark = save_subopnion_results(xnr_user_no, mid, task_id,
                                  sub_opinion_results)
    print 'mark_opinion:::', mark
    if mark == False:
        print 'Error while saving sub-opinion results; pushing the task back onto the queue'
        add_task_2_queue(keyword_task_queue_name, task_detail)
    else:
        print 'Finished saving sub-opinion results......'
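For reference, the nest_query_list built above expands into a bool/must query that requires every keyword to match the keywords_string field via a wildcard. With two assumed keywords, the search body sent to es_flow_text would look like:

query_body = {
    'query': {
        'bool': {
            'must': [
                {'wildcard': {'keywords_string': '*keyword1*'}},
                {'wildcard': {'keywords_string': '*keyword2*'}},
            ]
        }
    },
    'size': 1000,  # stands in for MAX_SEARCH_SIZE, whose real value is project config
}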
Code example #8
        source = item["news_source"]  # news source
        date = item["news_date"].strip()  # news date
        #print url, title, content, source, date
        count += 1
        print count
    except:
        continue

    # indexing code starts here
    index_dict = dict()
    index_dict["url"] = url
    index_dict["title"] = title
    index_dict["content"] = content
    index_dict["source"] = source
    try:
        index_dict["timestamp"] = datehour2ts(date)
    except:
        index_dict["timestamp"] = datetime2ts(date)
    bulk_action.extend([{"index": {"_id": url}}, index_dict])
    index_count += 1

    if index_count != 0 and index_count % 100 == 0:
        es.bulk(bulk_action, index="news", doc_type="text")
        bulk_action = []
        print "finish index: ", index_count

if bulk_action:
    es.bulk(bulk_action, index="news", doc_type="text")
print "total index: ", index_count

f.close()
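The bulk_action list alternates an action line and a document body, which is the format the Elasticsearch bulk API expects; flushing every 100 documents keeps each bulk request small. A minimal sketch of a single action pair as built above, with placeholder field values:

index_dict = {"url": "http://example.com/1", "title": "t", "content": "c",
              "source": "s", "timestamp": 1482861600}
bulk_action = [{"index": {"_id": index_dict["url"]}}, index_dict]
# es.bulk(bulk_action, index="news", doc_type="text") would index this one document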
Code example #9
def task_list():
    create_task()
    if RUN_TYPE:
        current_ts = datehour2ts(ts2datehour(time.time()))
    else:
        current_ts = 1482861600
    while 1:
        task_detail = r_trendline.rpop(task_trendline)
        print task_detail
        if not task_detail:
            break

        task_name = task_detail
        while 1:
            micro_index = "micro_prediction_" + task_name
            es_exist = es_prediction.exists(index=micro_index,
                                            doc_type="micro_task",
                                            id=current_ts)
            if not es_exist:
                time.sleep(60)
            else:
                break

        # obtain time series
        value, total_len, time_list, left_list = dispose_data(
            task_name, current_ts)

        # macro prediction result
        try:
            es_macro_result = es_prediction.get(index=index_macro_feature_result,\
                doc_type=type_macro_feature_result,id=task_name)["_source"]
            prediction_total_value = es_macro_result["predict_weibo_value"]
            top_value = prediction_total_value * 0.8 / (0.2 * total_len)
        except:
            top_value = 0

        # known maximum value and its position
        max_exist = max(value)
        index_exist = len(value)

        if top_value < max_exist:
            top_value = 2 * max_exist

        # weibo prediction
        k = 5
        h = 0.5
        peak = spd(value, h, k)
        flag = judge(peak, value)
        if len(flag) == 2:
            print("Two peaks:")
            paras = getTwoBeauties(value, flag[0], flag[1])
            paras[-1] = total_len
            series = bassTwoPeaks(paras)
        else:
            print("Single peak:")
            paras = getSingleBeauty(value)
            paras[-1] = total_len
            series = bassOnePeak(paras)

        # predicted peak position
        predict_climax = series.index(max(series))

        if predict_climax > index_exist:
            predict_climax_left = predict_climax - len(value)
            # remaining trend curve: climax position, start/end values, maximum value
            rise_trend, fall_trend = get_trend(left_list, predict_climax_left,
                                               value[-1], top_value)
            true_climax = time_list[0] + (time_list[1] -
                                          time_list[0]) * predict_climax
        else:
            top_value = value[-1]
            rise_trend, fall_trend = get_trend(left_list, 0, value[-1], 1)
            true_climax = time_list[value.index(max(value))]

        results = dict()
        results["climax"] = [true_climax, top_value]
        results["rise_trend"] = rise_trend
        results["fall_trend"] = fall_trend
        new_list = []
        for i in range(len(time_list)):
            new_list.append([time_list[i], value[i]])
        results["exist_trend"] = new_list
        r_trendline.set("trendline_" + task_name, json.dumps(results))
        print results
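The final index-to-timestamp conversion assumes time_list is evenly spaced, so the predicted climax index is scaled by the bucket width. A worked example with hourly buckets and assumed values:

time_list = [1482861600, 1482865200]  # assumed: two bucket start times, 3600 s apart
predict_climax = 5
true_climax = time_list[0] + (time_list[1] - time_list[0]) * predict_climax
print true_climax                     # 1482879600, i.e. five hours after the first bucket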