def scrapy_list(cookie, category__name, timesample):
    """Fetch one page of Baidu news for *category__name* and persist items.

    Args:
        cookie: cookie dict; must contain 'BAIDUID' (used as the 'mid' param).
        category__name: Baidu channel name, sent to the API and stored as tag.
        timesample: 'display_time' paging cursor.
    """
    list_postdata = {
        'from': 'news_webapp',
        'pd': 'webapp',
        'os': 'android',
        'mid': cookie['BAIDUID'],
        'ver': 6,
        'category_name': category__name,
        'action': 0,
        'display_time': timesample,
        'wf': 0,
    }
    payload = public.request_data(LISTAPI, params=list_postdata, cookie=cookie)
    # Guard the envelope: a response without 'data' used to raise
    # AttributeError on the chained .get('news').
    for item in (payload.get('data') or {}).get('news', []):
        save_data = {
            '_id': public.format_id(item.get('title')),
            'title': item.get('title'),
            # 'url_webp' may be absent on individual entries; fall back to ''.
            'imageurls': [img.get('url_webp', '')
                          for img in item.get('imageurls') or []],
            'tag': category__name,
            'timestamp': int(time.time()),
            'from': '百度新闻',
            'detail': scrapy_detail(item.get('nid'), category__name),
        }
        # Persist only articles whose detail has more than one element.
        if save_data['detail'] and len(save_data['detail']) > 1:
            public.save_data(save_data, DB_NAME)
def scrapy_list(type_info):
    """Fetch one page of the Toutiao news feed and persist items with details.

    Args:
        type_info: dict with 'tag' (API channel id) and 'name' (display tag).
    """
    ascp = public.get_ASCP()
    form = {
        'ac': 'wap',
        'format': 'json_raw',
        'min_behot_time': int(time.time()),
        'as': ascp[0],
        'enable_stick': 'false',
        'tag': type_info.get('tag'),
        'cp': ascp[1],
    }
    listing = public.request_data(
        LIST_API, cookie=COOKIE, header=HEADER, params=form)
    for item in listing.get('data', []):
        # 'image_list' can be absent/None; mapping over it used to raise
        # TypeError. Keep at most the first three image URLs.
        images = [img.get('url') for img in item.get('image_list') or []][:3]
        save_data = {
            '_id': public.format_id(item.get('title')),
            'title': item.get('title'),
            'imageurls': images,
            'tag': type_info.get('name'),
            'timestamp': int(time.time()),
            'from': '今日头条',
            'detail': scrapy_detail(item.get('source_url')),
        }
        if save_data.get('detail'):
            public.save_data(save_data, DB_NAME)
def init():
    """Crawl 20 pages of Toutiao video listings and persist playable videos.

    The play URL comes back base64-encoded from the detail API and is decoded
    before saving. Items whose detail payload lacks the expected
    data/video_list/video_1/main_url chain are skipped instead of raising
    AttributeError as the original chained .get() calls did.
    """
    for _ in range(20):
        ascp = public.get_ASCP()
        form = {
            'ac': 'wap',
            'format': 'json_raw',
            'min_behot_time': int(time.time()),
            'as': ascp[0],
            'enable_stick': 'false',
            'tag': 'video',
            'cp': ascp[1],
        }
        listing = public.request_data(
            LIST_API, cookie=COOKIE, header=HEADER, params=form)
        for item in listing.get('data', []):
            r, s = public.get_rs(item.get('video_id'))
            detail = public.request_data(
                DETAIL_API + item.get('video_id'),
                params={'r': r, 's': s},
                header=HEADER)
            main_url = (((detail.get('data') or {}).get('video_list') or {})
                        .get('video_1') or {}).get('main_url')
            if not main_url:
                # No playable stream in this response; skip the item.
                continue
            video = {
                '_id': public.format_id(item.get('title')),
                'title': item.get('title'),
                'from': '今日头条',
                'pic': item.get('large_image_url'),
                # The API returns the URL base64-encoded.
                'url': base64.b64decode(main_url.encode('utf-8')).decode('utf-8'),
                'timestamp': int(time.time()),
            }
            public.save_data(video, DB_NAME)
def init():
    """Crawl 20 pages of the Sohu video listing and persist each entry."""
    for lc in range(20):
        res = public.request_data(API + str(lc))
        # Guard the envelope: a response without 'data' used to raise
        # AttributeError on the chained .get('videoList').
        for item in (res.get('data') or {}).get('videoList', []):
            video = {
                '_id': public.format_id(item.get('title')),
                'title': item.get('title'),
                'from': '搜狐视频',
                'pic': item.get('tvPic'),
                'url': item.get('playUrl'),
                'timestamp': int(time.time()),
            }
            public.save_data(video, DB_NAME)
def init():
    """Follow 20 pages of Pear Video's cursor-style feed, saving each clip.

    The module-level API cursor is rebound to the response's 'nextUrl' so the
    following iteration fetches the next page.
    """
    global API
    for _ in range(20):
        res = public.request_data(API)
        for item in res.get('contList', []):
            sources = item.get('videos') or []
            if not sources:
                # No playable source on this entry; the original's [0] access
                # raised TypeError/IndexError here.
                continue
            video = {
                '_id': public.format_id(item.get('name')),
                'title': item.get('name'),
                'from': '梨视频',
                'pic': item.get('pic'),
                'url': sources[0].get('url'),
                'timestamp': int(time.time()),
            }
            public.save_data(video, DB_NAME)
        # Advance the cursor once per non-empty page (the original rebound it
        # inside the item loop, redundantly, to the same value each time).
        if res.get('contList'):
            API = res.get('nextUrl')
def scrapy_list(type_info, count):
    """Fetch one page of Tencent news for the given channel and persist items.

    Args:
        type_info: dict with 'tag' (API channel id) and 'name' (display tag).
        count: page number forwarded to the list API.
    """
    query = {'channel': type_info.get('tag'), 'page': count}
    response = public.request_data(LIST_API, params=query, header=HEADER)
    entries = response.get('data', [])
    if not entries:
        return
    for entry in entries:
        record = {
            '_id': public.format_id(entry.get('title')),
            'title': entry.get('title'),
            'imageurls': entry.get('img_urls'),
            'tag': type_info.get('name'),
            'timestamp': int(time.time()),
            'from': '腾讯新闻',
            'detail': scrapy_detail(entry.get('id')),
        }
        # Skip articles whose detail fetch came back empty.
        if record.get('detail'):
            public.save_data(record, DB_NAME)
def scrapy_list(nextUrl, type_info):
    """Scrape one page of Pengpai news and return the next page's cursor URL.

    Args:
        nextUrl: cursor URL from a previous call, or a falsy value to start
            from the channel's first page.
        type_info: dict with 'tag' (API channel id) and 'name' (display tag).

    Returns:
        The response's 'nextUrl' field, for the caller to pass back in.
    """
    url = nextUrl if nextUrl else LIST_API + type_info['tag']
    data = public.request_data(url)
    for entry in data.get('contList', []):
        record = {
            '_id': public.format_id(entry.get('name')),
            'title': entry.get('name'),
            'imageurls': [entry.get('pic')],
            'tag': type_info['name'],
            'timestamp': int(time.time()),
            'from': '澎湃新闻',
            'detail': scrapy_detail(entry['contId']),
        }
        # Skip articles whose detail fetch came back empty.
        if record.get('detail'):
            public.save_data(record, DB_NAME)
    return data.get('nextUrl')
def init():
    """Follow 20 pages of the Pengpai video feed, resolving each play URL.

    The play URL is scraped from the detail page's HTML (href of the first
    element with class 'm'); items whose page lacks it are skipped instead of
    raising IndexError. The module-level LIST_API cursor is advanced to the
    response's 'nextUrl' for the following page.
    """
    global LIST_API
    for _ in range(20):
        res = public.request_data(LIST_API, params=FORM)
        for item in res.get('contList', []):
            html = public.request_html(DETAIL_API,
                                       params={'contid': item['contId']})
            links = etree.HTML(html).xpath("//*[@class='m']/@href")
            if not links:
                # Detail page has no playable link; skip rather than crash.
                continue
            video = {
                '_id': public.format_id(item.get('name')),
                'title': item.get('name'),
                'from': '澎湃视频',
                'pic': item.get('pic'),
                'url': links[0],
                'timestamp': int(time.time()),
            }
            public.save_data(video, DB_NAME)
        # Advance the cursor once per non-empty page (the original rebound it
        # inside the item loop, redundantly, to the same value each time).
        if res.get('contList'):
            LIST_API = res.get('nextUrl')
def init():
    """Crawl 20 pages per channel of Baidu video and persist playable items."""
    for channel in CHANNELS:
        for page in range(20):
            listing = public.request_data(
                LIST_API, params={'channel': channel, 'page': page})
            for item in listing.get('videos', []):
                # The detail id is the basename of the item URL without its
                # extension, e.g. '.../abc123.htm' -> 'abc123'.
                vid = item.get('url').split('/')[-1].split('.')[0]
                detail = public.request_data(DETAIL_API, params={'id': vid})
                mp4 = (((detail.get('data') or {}).get('main_video') or {})
                       .get('source') or {}).get('mp4')
                if not mp4:
                    # Incomplete detail payload; the original's chained
                    # .get() calls raised AttributeError here.
                    continue
                video = {
                    '_id': public.format_id(item.get('title')),
                    'title': item.get('title'),
                    'from': '百度视频',
                    'pic': item.get('imgh_url'),
                    'url': mp4,
                    'timestamp': int(time.time()),
                }
                public.save_data(video, DB_NAME)
def init():
    """Crawl 20 pages of the Sina video feed, resolving each play URL."""
    for page in range(20):
        form = {
            'cateid': 'uM',
            'mod': 'mpvideo',
            'action': 1,
            'up': page,
            'down': 0,
            'did': '41ea3bde383860b6',
            'imei': '',
            'length': 13,
            'net_type': 2,
            'ad': {
                "originfrom": "huawei-q",
                "imei": "868384032192527",
                "channel": "news_ent",
                "osVersion": "9",
                "deviceModel": "HWI-AL00",
                "platform": "android",
                "from": "fastapp"
            },
            'app_type': 124,
            'cre': 'tianyi',
            'merge': 3,
            'statics': 1,
            'ldid': '',
            'uid': ''
        }
        for item in public.request_data(LIST_API, params=form).get('data', []):
            detail = public.request_data(DETAIL_API,
                                         params={'docUrl': item.get('surl')})
            # The play URL sits four levels deep; any missing level made the
            # original's chained .get()/[0] calls raise. Skip such items.
            try:
                url = (detail['data']['videosModule'][0]['data'][0]
                       ['videoInfo']['url'])
            except (KeyError, IndexError, TypeError):
                continue
            video = {
                '_id': public.format_id(item.get('title')),
                'title': item.get('title'),
                'from': '新浪视频',
                'pic': item.get('thumb'),
                'url': url,
                'timestamp': int(time.time()),
            }
            public.save_data(video, DB_NAME)
def scrapy_list(type_info, count):
    """Fetch one page of the Sina news feed and persist thumbnailed items.

    Args:
        type_info: dict with 'cateid', 'mod' (API routing) and
            'category__name' (display tag stored on each record).
        count: pull cursor forwarded as 'up'.
    """
    form = {
        'cateid': type_info['cateid'],
        'mod': type_info['mod'],
        'action': 2,
        'up': count,
        'down': 0,
        'did': '41ea3bde383860b6',
        'imei': '',
        'length': 13,
        'net_type': 2,
        'ad': {
            "originfrom": "huawei-q",
            "imei": "868384032192527",
            "channel": "news_ent",
            "osVersion": "9",
            "deviceModel": "HWI-AL00",
            "platform": "android",
            "from": "fastapp"
        },
        'app_type': 124,
        'cre': 'tianyi',
        'merge': 3,
        'statics': 1,
        'ldid': '',
        'uid': ''
    }
    for entry in public.request_data(LIST_API, params=form).get('data', []):
        thumbs = entry.get('thumbs')
        # Only keep items that carry thumbnails.
        if not thumbs:
            continue
        record = {
            '_id': public.format_id(entry.get('title')),
            'title': entry.get('title'),
            'imageurls': thumbs[:3],
            'tag': type_info.get('category__name'),
            'timestamp': int(time.time()),
            'from': '新浪新闻',
            'detail': scrapy_detail(entry.get('url')),
        }
        # Skip articles whose detail fetch came back empty.
        if record.get('detail'):
            public.save_data(record, DB_NAME)
def init():
    """Crawl 20 pages per channel of Haotu how-to videos and persist them."""
    for channel_id in CHANNEL_IDS:
        for _ in range(20):
            form = {
                'app': 'howto_a',
                'udid': '41ea3bde383860b6',
                'channel_id': channel_id,
                'timestamp': int(time.time())
            }
            listing = public.request_data(LIST_API, params=form)
            for item in listing.get('contents', []):
                meta = item.get('video') or {}
                bitrates = public.request_data(
                    DETAIL_API, params={'id': item.get('id')}).get('bitrates')
                if not bitrates:
                    # No stream variants returned; the original's [-1]
                    # access raised TypeError/IndexError here.
                    continue
                video = {
                    '_id': public.format_id(meta.get('name')),
                    'title': meta.get('name'),
                    'from': '好兔视频',
                    'pic': meta.get('share_img'),
                    # Last entry appears to be the preferred variant —
                    # TODO confirm ordering against the API.
                    'url': bitrates[-1].get('uri'),
                    'timestamp': int(time.time()),
                }
                public.save_data(video, DB_NAME)
def scrapy_list(type_info, count):
    """Fetch one page of the iFeng news feed and persist items with thumbnails.

    Args:
        type_info: dict with 'tag' (channel id for the API) and 'name'
            (display tag stored on each record).
        count: pull cursor forwarded as 'pullNum'.
    """
    form = {
        'st': '',
        'df': '',
        'pullNum': count,
        'loginid': '',
        'os': 'android',
        'city': '',
        'screen': '',
        'nw': '',
        'deviceid': '',
        'gv': '5.7.3',
        'publishid': '',
        'uid': '868384032192527',
        'lastDoc': '',
        'province': '',
        'av': 0,
        'district': '',
        'proid': 'ifengnews',
        'action': 'default',
        'id': type_info.get('tag'),
        'sn': '',
        'vt': 5,
    }
    pages = public.request_data(LIST_API, params=form)
    # The API wraps the feed in a list; an empty response used to raise
    # IndexError on the unconditional [0].
    data = pages[0] if pages else None
    if not data:
        return
    # 'item' may be missing; iterating None raised TypeError in the original.
    for entry in data.get('item') or []:
        if entry.get('thumbnail') and entry.get('id'):
            record = {
                '_id': public.format_id(entry.get('title')),
                'title': entry.get('title'),
                'imageurls': [entry.get('thumbnail')],
                'tag': type_info.get('name'),
                'timestamp': int(time.time()),
                'from': '凤凰新闻',
                'detail': scrapy_detail(entry.get('id')),
            }
            if record.get('detail'):
                public.save_data(record, DB_NAME)
def init():
    """Crawl 20 pages of Kanjian videos, resolving each item's play URL."""
    for page_num in range(20):
        form = {'rtype': 0, 'pageNum': page_num, 'pageSize': 10}
        listing = public.request_data(
            LIST_API, params=form, header=HEADER, method='POST')
        # Guard the 'page' envelope; the original's chained .get() raised
        # AttributeError when it was missing.
        for item in (listing.get('page') or {}).get('result', []):
            detail = public.request_data(
                DETAIL_API, params={'videoId': item['rid']},
                header=HEADER, method='POST')
            sources = detail.get('upyunVideos')
            if not sources:
                # No stream list returned; the original's [0] access raised
                # TypeError/IndexError here.
                continue
            video = {
                '_id': public.format_id(item.get('title')),
                'title': item.get('title'),
                'from': '看鉴视频',
                'pic': item.get('image'),
                'url': sources[0].get('playurl'),
                'timestamp': int(time.time()),
            }
            public.save_data(video, DB_NAME)