def init():
    # 今日头条视频: pull 20 pages of the video feed, resolve each item's
    # real mp4 address and store it.
    for i in range(20):
        ascp = public.get_ASCP()
        form = {
            'ac': 'wap',
            'format': 'json_raw',
            'min_behot_time': int(time.time()),
            'as': ascp[0],
            'enable_stick': 'false',
            'tag': 'video',
            'cp': ascp[1],
        }
        for item in public.request_data(LIST_API,
                                        cookie=COOKIE,
                                        header=HEADER,
                                        params=form).get('data', []):
            video = {}
            video['_id'] = public.format_id(item.get('title'))
            video['title'] = item.get('title')
            video['from'] = '今日头条'
            video['pic'] = item.get('large_image_url')
            # r/s sign the video_id; the detail API returns the playable
            # address base64-encoded in main_url.
            r, s = public.get_rs(item.get('video_id'))
            video['url'] = base64.b64decode(
                public.request_data(DETAIL_API + item.get('video_id'),
                                    params={'r': r, 's': s},
                                    header=HEADER)
                .get('data').get('video_list').get('video_1')
                .get('main_url').encode('utf-8')).decode('utf-8')
            video['timestamp'] = int(time.time())
            public.save_data(video, DB_NAME)
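# Note: get_ASCP() and get_rs() come from the shared public helper module
# (not shown here). Judging by how they are used, get_ASCP() computes
# Toutiao's time-derived anti-crawler 'as'/'cp' list-page signature, and
# get_rs() signs a video_id into the 'r'/'s' pair the detail API checks.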
def scrapy_list(type_info):
    ascp = public.get_ASCP()
    form = {
        'ac': 'wap',
        'format': 'json_raw',
        'min_behot_time': int(time.time()),
        'as': ascp[0],
        'enable_stick': 'false',
        'tag': type_info.get('tag'),
        'cp': ascp[1],
    }
    for item in public.request_data(LIST_API,
                                    cookie=COOKIE,
                                    header=HEADER,
                                    params=form).get('data', []):
        save_data = {
            '_id': public.format_id(item.get('title')),
            'title': item.get('title'),
            # Keep at most three cover images; guard against a missing list.
            'imageurls': [x.get('url') for x in item.get('image_list') or []][:3],
            'tag': type_info.get('name'),
            'timestamp': int(time.time()),
            'from': '今日头条',
            'detail': scrapy_detail(item.get('source_url')),
        }
        if save_data.get('detail'):
            public.save_data(save_data, DB_NAME)
def scrapy_detail(article_id):
    try:
        # 'content' arrives as a JSON string; default to '[]' so json.loads
        # still succeeds when the field is missing.
        data = json.loads(
            public.request_data(DETAIL_API + '?id=' + article_id,
                                header=HEADER).get('data').get('content', '[]'))
        detail = []
        for item in data:
            content = {}
            if item.get('type') == 'cnt_article':
                content['type'] = 'text'
                content['data'] = item.get('desc')
            elif item.get('type') == 'img_url':
                content['type'] = 'image'
                img = item.get('img')
                if img:
                    # Prefer the largest rendition, falling back to smaller ones.
                    content['data'] = ((img.get('imgurl1000') or {}).get('imgurl')
                                       or (img.get('imgurl640') or {}).get('imgurl')
                                       or (img.get('imgurl0') or {}).get('imgurl'))
                elif item.get('img_url'):
                    content['data'] = item.get('img_url')
            detail.append(content)
        return detail
    except Exception as e:
        print(e, article_id)
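# Note: every scrapy_detail in this project returns the article body as a
# list of {'type': 'text' | 'image', 'data': ...} blocks; callers only save
# an item when that list is non-empty.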
def scrapy_list(cookie, category_name, timesample):
    list_postdata = {
        'from': 'news_webapp',
        'pd': 'webapp',
        'os': 'android',
        'mid': cookie['BAIDUID'],
        'ver': 6,
        'category_name': category_name,
        'action': 0,
        'display_time': timesample,
        'wf': 0,
    }
    data = public.request_data(LISTAPI, params=list_postdata,
                               cookie=cookie).get('data').get('news', [])
    for item in data:
        save_data = {
            '_id': public.format_id(item.get('title')),
            'title': item.get('title'),
            'imageurls': [x.get('url_webp', '') for x in item.get('imageurls', [])],
            'tag': category_name,
            'timestamp': int(time.time()),
            'from': '百度新闻',
            'detail': scrapy_detail(item.get('nid'), category_name),
        }
        # Require a detail body with more than one block before saving.
        if save_data.get('detail') and len(save_data.get('detail')) > 1:
            public.save_data(save_data, DB_NAME)
def scrapy_list(nextUrl, type_info):
    # First call hits the list API for the tag; later calls follow the
    # nextUrl returned by the previous response.
    if nextUrl:
        data = public.request_data(nextUrl)
    else:
        data = public.request_data(LIST_API + type_info['tag'])
    for item in data.get('contList', []):
        save_data = {
            '_id': public.format_id(item.get('name')),
            'title': item.get('name'),
            'imageurls': [item.get('pic')],
            'tag': type_info['name'],
            'timestamp': int(time.time()),
            'from': '澎湃新闻',
            'detail': scrapy_detail(item['contId']),
        }
        if save_data.get('detail'):
            public.save_data(save_data, DB_NAME)
    return data.get('nextUrl')
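# Usage sketch for the paginated list above (TYPE_INFO and the page count
# are illustrative, not part of this file): feed each returned nextUrl back
# in until the feed runs out.
#
# next_url = None
# for _ in range(20):
#     next_url = scrapy_list(next_url, TYPE_INFO)
#     if not next_url:
#         break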
def init():
    for channel in CHANNELS:
        for i in range(20):
            for item in public.request_data(LIST_API,
                                            params={
                                                'channel': channel,
                                                'page': i
                                            }).get('videos', []):
                video = {}
                video['_id'] = public.format_id(item.get('title'))
                video['title'] = item.get('title')
                video['from'] = '百度视频'
                video['pic'] = item.get('imgh_url')
                # The page URL's basename (without extension) is the video id
                # the detail API expects.
                video['url'] = public.request_data(
                    DETAIL_API,
                    params={
                        'id': item.get('url').split('/')[-1].split('.')[0]
                    }).get('data').get('main_video').get('source').get('mp4')
                video['timestamp'] = int(time.time())
                public.save_data(video, DB_NAME)
def init():
    for i in range(20):
        form = {
            'cateid': 'uM',
            'mod': 'mpvideo',
            'action': 1,
            'up': i,
            'down': 0,
            'did': '41ea3bde383860b6',
            'imei': '',
            'length': 13,
            'net_type': 2,
            # Device fingerprint the feed API expects alongside the paging
            # fields above.
            'ad': {
                "originfrom": "huawei-q",
                "imei": "868384032192527",
                "channel": "news_ent",
                "osVersion": "9",
                "deviceModel": "HWI-AL00",
                "platform": "android",
                "from": "fastapp"
            },
            'app_type': 124,
            'cre': 'tianyi',
            'merge': 3,
            'statics': 1,
            'ldid': '',
            'uid': ''
        }
        for item in public.request_data(LIST_API, params=form).get('data', []):
            video = {}
            video['_id'] = public.format_id(item.get('title'))
            video['title'] = item.get('title')
            video['from'] = '新浪视频'
            video['pic'] = item.get('thumb')
            # Resolve the playable address through the detail API.
            video['url'] = public.request_data(
                DETAIL_API,
                params={'docUrl': item.get('surl')}
            ).get('data').get('videosModule')[0].get('data')[0].get(
                'videoInfo').get('url')
            video['timestamp'] = int(time.time())
            public.save_data(video, DB_NAME)
def init():
    for lc in range(20):
        for item in public.request_data(API + str(lc)).get('data').get(
                'videoList', []):
            video = {}
            video['_id'] = public.format_id(item.get('title'))
            video['title'] = item.get('title')
            video['from'] = '搜狐视频'
            video['pic'] = item.get('tvPic')
            # Sohu exposes the playable address directly in the list payload.
            video['url'] = item.get('playUrl')
            video['timestamp'] = int(time.time())
            public.save_data(video, DB_NAME)
def init():
    for i in range(20):
        form = {'rtype': 0, 'pageNum': i, 'pageSize': 10}
        for item in public.request_data(LIST_API,
                                        params=form,
                                        header=HEADER,
                                        method='POST').get('page').get(
                                            'result', []):
            video = {}
            video['_id'] = public.format_id(item.get('title'))
            video['title'] = item.get('title')
            video['from'] = '看鉴视频'
            video['pic'] = item.get('image')
            # The detail API lists CDN renditions; take the first entry.
            video['url'] = public.request_data(
                DETAIL_API,
                params={'videoId': item['rid']},
                header=HEADER,
                method='POST').get('upyunVideos')[0].get('playurl')
            video['timestamp'] = int(time.time())
            public.save_data(video, DB_NAME)
def init():
    for channel_id in CHANNEL_IDS:
        for i in range(20):
            form = {
                'app': 'howto_a',
                'udid': '41ea3bde383860b6',
                'channel_id': channel_id,
                'timestamp': int(time.time())
            }
            for item in public.request_data(LIST_API,
                                            params=form).get('contents', []):
                video = {}
                video['_id'] = public.format_id(item.get('video').get('name'))
                video['title'] = item.get('video').get('name')
                video['from'] = '好兔视频'
                video['pic'] = item.get('video').get('share_img')
                # Take the last rendition in the bitrates list (presumably
                # the highest quality).
                video['url'] = public.request_data(
                    DETAIL_API,
                    params={'id': item.get('id')}).get('bitrates')[-1].get('uri')
                video['timestamp'] = int(time.time())
                public.save_data(video, DB_NAME)
def init():
    global API
    for i in range(20):
        res = public.request_data(API)
        for item in res.get('contList', []):
            video = {}
            video['_id'] = public.format_id(item.get('name'))
            video['title'] = item.get('name')
            video['from'] = '梨视频'
            video['pic'] = item.get('pic')
            video['url'] = item.get('videos')[0].get('url')
            video['timestamp'] = int(time.time())
            public.save_data(video, DB_NAME)
        # Advance the module-level API URL to the next page for the next pass.
        API = res.get('nextUrl')
def scrapy_detail(nid, tag):
    # tag is accepted for parity with the caller but is unused here.
    try:
        detail_postdata = {
            'cuid': '',
            'nids': nid,
            'wf': 1,
            'remote_device_type': 1,
            'os_type': 1,
            'screen_size_width': 1080,
            'screen_size_height': 1920,
        }
        data = public.request_data(
            DETAILAPI, params=detail_postdata).get('data').get('news')[0]
        return list(map(format_detail, data.get('content')))
    except Exception as e:
        print(e, nid)
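# format_detail is defined elsewhere in this project and not shown here.
# A plausible sketch, assuming each Baidu content block carries a 'type' of
# 'text' or 'image' and that image blocks expose the address under
# 'data' -> 'original' -> 'url' (both field names are assumptions), mapping
# into the {'type', 'data'} shape the other detail scrapers produce:
#
# def format_detail(block):
#     if block.get('type') == 'image':
#         return {
#             'type': 'image',
#             'data': ((block.get('data') or {}).get('original') or {}).get('url'),
#         }
#     return {'type': 'text', 'data': block.get('data')}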
def scrapy_list(type_info, count):
    form = {'channel': type_info.get('tag'), 'page': count}
    data = public.request_data(LIST_API, params=form,
                               header=HEADER).get('data', [])
    if data:
        for item in data:
            save_data = {
                '_id': public.format_id(item.get('title')),
                'title': item.get('title'),
                'imageurls': item.get('img_urls'),
                'tag': type_info.get('name'),
                'timestamp': int(time.time()),
                'from': '腾讯新闻',
                'detail': scrapy_detail(item.get('id')),
            }
            if save_data.get('detail'):
                public.save_data(save_data, DB_NAME)
def init():
    global LIST_API
    for i in range(20):
        res = public.request_data(LIST_API, params=FORM)
        for item in res.get('contList', []):
            video = {}
            video['_id'] = public.format_id(item.get('name'))
            video['title'] = item.get('name')
            video['from'] = '澎湃视频'
            video['pic'] = item.get('pic')
            # The playable address sits in the href of the element with
            # class "m" on the detail page.
            video['url'] = etree.HTML(
                public.request_html(DETAIL_API,
                                    params={'contid': item['contId']})
            ).xpath("//*[@class='m']/@href")[0]
            video['timestamp'] = int(time.time())
            public.save_data(video, DB_NAME)
        # Follow the next page URL on the next iteration.
        LIST_API = res.get('nextUrl')
def scrapy_detail(aid):
    try:
        form = {
            'st': '',
            'df': 'androidphone',
            'loginid': '',
            'os': 'android',
            'city': '',
            'screen': '1080x1794',
            'nw': 'wifi',
            'deviceid': '',
            'publishid': '5286',
            'gv': '5.7.2',
            'uid': '868384032192527',
            'province': '',
            'av': '5.7.2',
            'proid': 'ifengnews',
            'district': '',
            'limit': 5,
            'from': 'xiaomi',
            'sn': '',
            'aid': aid,
            'vt': 5,
        }
        res = public.request_data(DETAIL_API,
                                  params=form).get('body').get('text')
        # The body is an HTML fragment; walk every element and collect text
        # nodes and inline images in document order.
        data = etree.HTML(res)
        detail = []
        for item in data.xpath('//*'):
            if item.xpath('text()'):
                detail.append({'type': 'text', 'data': item.xpath('text()')[0]})
            if item.xpath('img/@src'):
                detail.append({'type': 'image', 'data': item.xpath('img/@src')[0]})
        return detail
    except Exception as e:
        print(e, aid)
def scrapy_list(type_info, count):
    form = {
        'cateid': type_info['cateid'],
        'mod': type_info['mod'],
        'action': 2,
        'up': count,
        'down': 0,
        'did': '41ea3bde383860b6',
        'imei': '',
        'length': 13,
        'net_type': 2,
        'ad': {
            "originfrom": "huawei-q",
            "imei": "868384032192527",
            "channel": "news_ent",
            "osVersion": "9",
            "deviceModel": "HWI-AL00",
            "platform": "android",
            "from": "fastapp"
        },
        'app_type': 124,
        'cre': 'tianyi',
        'merge': 3,
        'statics': 1,
        'ldid': '',
        'uid': ''
    }
    for item in public.request_data(LIST_API, params=form).get('data', []):
        # Skip items without cover thumbnails.
        if item.get('thumbs'):
            save_data = {
                '_id': public.format_id(item.get('title')),
                'title': item.get('title'),
                'imageurls': item.get('thumbs')[:3],
                'tag': type_info.get('category__name'),
                'timestamp': int(time.time()),
                'from': '新浪新闻',
                'detail': scrapy_detail(item.get('url')),
            }
            if save_data.get('detail'):
                public.save_data(save_data, DB_NAME)
def scrapy_list(type_info, count):
    form = {
        'st': '',
        'df': '',
        'pullNum': count,
        'loginid': '',
        'os': 'android',
        'city': '',
        'screen': '',
        'nw': '',
        'deviceid': '',
        'gv': '5.7.3',
        'publishid': '',
        'uid': '868384032192527',
        'lastDoc': '',
        'province': '',
        'av': 0,
        'district': '',
        'proid': 'ifengnews',
        'action': 'default',
        'id': type_info.get('tag'),
        'sn': '',
        'vt': 5,
    }
    # The list endpoint returns a JSON array; the first element holds the feed.
    data = public.request_data(LIST_API, params=form)[0]
    if data:
        for item in data.get('item', []):
            # Only keep entries that have both a thumbnail and an article id.
            if item.get('thumbnail') and item.get('id'):
                save_data = {
                    '_id': public.format_id(item.get('title')),
                    'title': item.get('title'),
                    'imageurls': [item.get('thumbnail')],
                    'tag': type_info.get('name'),
                    'timestamp': int(time.time()),
                    'from': '凤凰新闻',
                    'detail': scrapy_detail(item.get('id')),
                }
                if save_data.get('detail'):
                    public.save_data(save_data, DB_NAME)
def scrapy_detail(source_url):
    try:
        res = public.request_data(DETAIL_API + source_url + 'info/',
                                  cookie=COOKIE,
                                  header=HEADER).get('data').get('content')
        # The content field is an HTML fragment; collect text nodes and
        # inline images in document order.
        data = etree.HTML(res)
        detail = []
        for item in data.xpath('//*/div[1]/*'):
            if item.xpath('text()'):
                detail.append({'type': 'text', 'data': item.xpath('text()')[0]})
            if item.xpath('img/@src'):
                detail.append({'type': 'image', 'data': item.xpath('img/@src')[0]})
        return detail
    except Exception as e:
        print(e, source_url)
def scrapy_detail(doc_url):
    try:
        data = public.request_data(DETAIL_API + doc_url).get('data')
        if data.get('pics'):
            pics = [x.get('data').get('pic') for x in data.get('pics')]
            # The article body marks image positions with <!--{IMG_n}-->
            # placeholders; split on them and interleave one picture after
            # each text fragment except the last.
            text = re.split(r'<!--{IMG_\d}-->',
                            data.get('content').replace('<br/>', ''))
            content = []
            for i, item in enumerate(text):
                content.append({'type': 'text', 'data': item})
                if i < len(pics):
                    content.append({'type': 'image', 'data': pics[i]})
            return content
        elif data.get('picsModule'):
            # Pure picture stories: alternate caption and image.
            content = []
            for item in data.get('picsModule')[0].get('data'):
                content.append({'type': 'text', 'data': item.get('alt')})
                content.append({'type': 'image', 'data': item.get('pic')})
            return content
    except Exception as e:
        print(e, doc_url)
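# Worked example of the placeholder interleave above (values illustrative):
# with content 'a<!--{IMG_0}-->b<!--{IMG_1}-->c' and pics == [p0, p1],
# re.split yields ['a', 'b', 'c'] and the loop produces
# [text 'a', image p0, text 'b', image p1, text 'c'].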