Esempio n. 1
0
def func_write_into_weekly_index_new_released(line_list, doc_type, index='short-video-weekly'):
    """Bulk-write weekly short-video stat records into an Elasticsearch index.

    Each record is enriched with a fetch timestamp (epoch ms) and
    ``weekly_net_inc_*`` fields copied from the raw counters, then sent
    to ES with the bulk API in batches of 500.

    Args:
        line_list: iterable of dicts; each must contain 'play_count',
            'comment_count', 'favorite_count', 'url' and 'platform'.
            'repost_count' is optional and defaults to 0.
        doc_type: Elasticsearch document type for the bulk request.
        index: target index name.
    """
    count = 0
    bulk_all_body = ''  # accumulated NDJSON payload for the next bulk call
    for line in line_list:
        count += 1
        # Current wall-clock time in epoch milliseconds.
        timestamp = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
        line.update({
            'timestamp': timestamp,
            'weekly_cal_base': 'accumulate',
            'weekly_net_inc_favorite_count': line['favorite_count'],
            'weekly_net_inc_comment_count': line['comment_count'],
            'weekly_net_inc_play_count': line['play_count'],
            # 'repost_count' is absent on some platforms; default to 0
            # instead of swallowing everything with a bare except.
            'weekly_net_inc_repost_count': line.get('repost_count', 0),
        })

        # Doc id is derived from platform + url so re-runs upsert
        # instead of duplicating documents.
        doc_id = cal_doc_id(line['platform'], url=line['url'],
                            doc_id_type='all-time-url', data_dict=line)
        bulk_head = '{"index": {"_id":"%s"}}' % doc_id
        data_str = json.dumps(line, ensure_ascii=False)
        bulk_all_body += bulk_head + '\n' + data_str + '\n'

        if count % 500 == 0:
            eror_dic = es.bulk(index=index, doc_type=doc_type,
                               body=bulk_all_body, request_timeout=200)
            if eror_dic['errors'] is True:
                print(eror_dic['items'])
                # BUGFIX: print the payload that actually failed; the
                # original cleared the buffer first, always printing ''.
                print(bulk_all_body)
            bulk_all_body = ''
            print(count)

    # Flush the remainder that did not fill a whole batch.
    if bulk_all_body != '':
        eror_dic = es.bulk(body=bulk_all_body,
                           index=index,
                           doc_type=doc_type,
                           request_timeout=200)
        if eror_dic['errors'] is True:
            print(eror_dic)
def func_write_into_alltime_index_new_released(line_list,
                                               doc_type,
                                               index='short-video-all-time-url'
                                               ):
    """Bulk-index short-video records into the all-time-url ES index.

    Records are sent via the ES bulk API in batches of 1000; the doc id
    is derived from platform + url by ``cal_doc_id``.

    NOTE(review): everything after the final flush's ``print(count)``
    references names (``releaser``, ``data_dic``) that are never defined
    in this function — it looks like a fragment of a different script
    fused into this body by a bad paste. Verify against the original
    source before relying on that tail.
    """
    count = 0
    bulk_all_body = ''  # accumulated NDJSON payload for the next bulk call
    re_list = []
    for line in line_list:
        count = count + 1
        re_list.append(line)
        # if 'video_id' in line.keys():
        #     line.pop('video_id')
        url = line['url']

        platform = line['platform']
        #print(platform)
        # if platform == "腾讯新闻":
        #     line.pop("data_source")
        #     line["releaserUrl"] = line["playcnt_url"]
        doc_id = cal_doc_id(platform,
                            url=url,
                            doc_id_type='all-time-url',
                            data_dict=line)
        # print(doc_id)
        # Bulk action line followed by the document source (NDJSON).
        bulk_head = '{"index": {"_id":"%s"}}' % doc_id
        data_str = json.dumps(line, ensure_ascii=False)

        bulk_one_body = bulk_head + '\n' + data_str + '\n'
        #
        bulk_all_body += bulk_one_body
        if count % 1000 == 0:

            eror_dic = es.bulk(index=index,
                               doc_type=doc_type,
                               body=bulk_all_body,
                               request_timeout=200)
            bulk_all_body = ''
            if eror_dic['errors'] is True:
                print(eror_dic['items'])
                # NOTE(review): bulk_all_body was cleared three lines up,
                # so this always prints an empty string — likely meant to
                # print the failed payload.
                print(bulk_all_body)
        # NOTE(review): prints on every iteration, not only after a
        # flush — possibly meant to be one indent level deeper.
        print(count)

    # Flush whatever is left after the last full batch of 1000.
    if bulk_all_body != '':
        eror_dic = es.bulk(body=bulk_all_body,
                           index=index,
                           doc_type=doc_type,
                           request_timeout=200)
        if eror_dic['errors'] is True:
            print(eror_dic)

        print(count)
        # releaserUrl = func_search_reUrl_from_target_index(platform, releaser)
        releaserUrl = 1  # hard-coded stand-in; the real lookup above is commented out
        if releaserUrl != None:
            re_list = []
            # Fetch raw crawler docs for this platform/releaser inside a
            # fixed epoch-ms release-time window, fetched after a fixed
            # cutoff — presumably a one-off backfill query; confirm dates.
            search_body = {
                "query": {
                    "bool": {
                        "filter": [
                            {"term": {"platform.keyword": platform}}, {"term": {"releaser.keyword": releaser}},
                            {"range": {"release_time": {"gte": 1546272000000, "lt": 1554048000000}}},
                            {"range": {"fetch_time": {"gte": 1556150400000}}}
                        ]
                    }
                }
            }

            scan_re = scan(client=es, index='crawler-data-raw', doc_type='doc',
                           query=search_body, scroll='3m')
            for one_scan in scan_re:
                # Column order of the row built below: releaser, platform,
                # title, url, play_count, favorite_count, comment_count,
                # duration, release_time (formatted).
                "发布者,平台,标题,url,播放量,点赞量,评论量,时长,发布时间"
                data_dic[cal_doc_id(platform, url=one_scan["_source"]["url"], doc_id_type='all-time-url')]=[one_scan["_source"]["releaser"],one_scan["_source"]["platform"],one_scan["_source"]["title"],one_scan["_source"]["url"],one_scan["_source"]["play_count"],one_scan["_source"]["favorite_count"],one_scan["_source"]["comment_count"],one_scan["_source"]["duration"],datetime.datetime.fromtimestamp(one_scan["_source"]["release_time"]/1000).strftime('%Y-%m-%d %H:%M:%S')]
# Collect the per-document rows gathered above and dump them to CSV.
# dict.values() preserves insertion order, so this is identical to the
# previous key-iterate-and-index loop.
data_lis = list(data_dic.values())
print(len(data_dic))

data = pd.DataFrame(data_lis)
# NOTE(review): "ansi" is a Windows-only codec alias and raises
# LookupError on other platforms — confirm the target OS.
data.to_csv('./%s.csv' % "无锡台内容数据需求2", encoding="ansi")


Esempio n. 4
0
                    "bool": {
                        "filter": [
                            {"term": {"platform.keyword": platform}},
                            # {"term": {"releaser.keyword": releaser}},
                                {"term": {"releaser_id_str": doc_id}},
                            {"range": {"release_time": {"gte": re_s_t, "lt": re_e_t}}},
                          {"range": {"fetch_time": {"gte": re_s_t}}}
                        ]
                    }
                }
            }

#             #scan_re = scan(client=es, index='crawler-data-raw', doc_type='doc',query=search_body, scroll='3m')
            scan_re = scan(client=es, index='short-video-all-time-url', doc_type='all-time-url',query=search_body, scroll='3m')
            for one_scan in scan_re:
                doc_id = cal_doc_id(one_scan["_source"]["platform"], url=one_scan["_source"]["url"], doc_id_type='all-time-url', data_dict=one_scan["_source"])
                find_exist = {
                    "query": {
                        "bool": {
                            "filter": [
                                {"term": {"_id": doc_id}}
                            ]
                        }
                    }
                }
                search_re = es.search(index='short-video-weekly', doc_type=weekly_doc_type_name,
                                      body=find_exist)
                if search_re['hits']['total'] == 0:
                    re_list.append(one_scan['_source'])
                else:
                    count_has += 1