def handler(cls, url):
    """Grab the page at *url*, pull vid/appKey out of inline JS, and build
    one HandlerOutput with an Option per progressive resource.

    Raises Error.ERROR_HANDLER (message prefixed with cls.NAME) on any failure.
    """
    try:
        html = abstract_grab(url)
        vid = re.search('var vid = "(.*?)";', html, flags=re.S).group(1)
        app_key = re.search(
            'var modeServerAppKey = "(.*?)";', html, flags=re.S).group(1)
        payload = json.loads(
            abstract_grab(cls.RESOURCE_API % (vid, app_key)))['data']
        result = HandlerOutput(
            video_info=HandlerOutput.VideoInfo(
                title=payload['title'],
                cover=payload['cover'],
            ),
        )
        for entry in payload['resource']['progressive']:
            option = HandlerOutput.Option(
                quality=entry['profile'],
                urls=[HandlerOutput.Url(entry['url'])],
            )
            result.options.append(option)
    except Exception as err:
        raise Error.ERROR_HANDLER(debug_message=cls.NAME + ',' + str(err))
    return HandlerAdapter([result])
def handler(cls, url):
    """Scrape media_id/title/cover from the page, then fetch the segment map
    from RESOURCE_API and build one Option (with per-segment Urls) per quality.

    Raises Error.ERROR_HANDLER (message prefixed with cls.NAME) on any failure.
    """
    try:
        html = abstract_grab(url)
        media_id = re.search('"media_id": (.*?),', html, flags=re.S).group(1)
        title = re.search('"title": "(.*?)",', html, flags=re.S).group(1)
        cover = re.search('"cover": "(.*?)",', html, flags=re.S).group(1)
        segs_by_quality = json.loads(
            abstract_grab(cls.RESOURCE_API % media_id))['msg']['segs']
        result = HandlerOutput(
            video_info=HandlerOutput.VideoInfo(
                title=title,
                cover=cover,
            ),
        )
        for quality, segs in segs_by_quality.items():
            option = HandlerOutput.Option(urls=[], quality=quality)
            result.options.append(option)
            for seg in segs:
                option.urls.append(HandlerOutput.Url(
                    url=seg['url'],
                    index=seg['number'],
                ))
    except Exception as err:
        raise Error.ERROR_HANDLER(debug_message=cls.NAME + ',' + str(err))
    return HandlerAdapter([result])
def get_item_info(sub_uri):
    """Scrape one AOC monitor product page into a spec dict.

    Rows in '#box_3' without a <th> continue the previous key; repeated keys
    have their values joined with newlines. Images come from '#box_2'.
    """
    uri = '%s%s' % (AOC_MONITOR_HOST, sub_uri)
    soup = BeautifulSoup(abstract_grab(uri, phone_agent=False), 'html.parser')
    # First image in box_2 is not a product shot, so drop it.
    image_list = [img['src'] for img in soup.find(id='box_2').find_all('img')][1:]
    result = dict()
    last_key = None
    for row in soup.find(id='box_3').find_all('tr'):
        try:
            last_key = slim_str(get_text_in_span(row.find('th')))
        except Exception:
            pass  # row has no usable <th>: value belongs to the previous key
        key = last_key
        value = get_text_in_span(row.find('td'))
        if result.get(key):
            value = result[key] + '\n' + value
        result[key] = slim_str(value)
    result['图片列表'] = image_list
    result['产品来源'] = sub_uri
    return result
def get_item_info(sub_uri):
    """Scrape one LG product page: model name, spec items and gallery images.

    Gallery image URLs come from the inline 'groupModelInfo' JSON blob.
    """
    uri = '%s%s' % (LG_HOST, sub_uri)
    html = abstract_grab(uri, phone_agent=False)
    soup = BeautifulSoup(html, 'html.parser')
    gallery = json.loads(
        re.search('groupModelInfo = (.*?);', html, flags=0).group(1)
    )['basicInfo']['galleryImg']
    image_list = [entry['data-zoom-image'] for entry in gallery]
    result = dict()
    result['ProductName'] = slim_str(
        soup.find(class_='improve-info-model').text)
    for spec_block in soup.find_all(class_='specItem'):
        for entry in spec_block.find_all('li'):
            key = entry.find(class_='title').text
            result[key] = entry.find(class_='value').text
    result['images'] = image_list
    return result
def handler(cls, url):
    """Collect every embedded QQ video referenced in an article page.

    One HandlerOutput per vid, all sharing the article title/cover; the
    playback options are resolved through VideoQQ.get_video_link.
    Raises Error.ERROR_HANDLER on any failure.
    """
    try:
        html = abstract_grab(url)
        vids = re.findall(';vid=(.*?)">', html, flags=re.S)
        title = "《" + re.search(
            'msg_title = "(.*?)";', html, flags=re.S).group(1) + "》"
        cover = re.search('msg_cdn_url = "(.*?)";', html, flags=re.S).group(1)
        results = [
            HandlerOutput(
                video_info=HandlerOutput.VideoInfo(
                    title=title + ' 文章内视频%s' % index,
                    cover=cover,
                ),
                options=VideoQQ.get_video_link(vid),
            )
            for index, vid in enumerate(vids)
        ]
    except Exception as err:
        raise Error.ERROR_HANDLER(debug_message=cls.NAME + ',' + str(err))
    return HandlerAdapter(results)
def get_item_list(index, sub_category_id):
    """List all products on one LG category page and scrape each detail page.

    Detail fetches are retried forever (3s back-off) on any exception.
    """
    uri = (
        '%s/us/category/filter.lg?'
        'sort=&'
        'page=%s&'
        'pagePosition=1&'
        'categoryId=CT10000030&'
        'subCategoryId=%s&'
        'status=ACTIVE&'
        'grouping=Y' % (LG_HOST, index, sub_category_id)
    )
    soup = BeautifulSoup(abstract_grab(uri, phone_agent=False), 'html.parser')
    result_list = []
    for node in soup.find_all('p', class_='model-name redot'):
        href = node.find('a')['href']
        print(href)
        # Retry indefinitely: transient scrape failures are expected.
        while True:
            try:
                item_info = get_item_info(href)
                break
            except Exception:
                sleep(3)
                print(href, '出错重爬')
        result_list.append(item_info)
    return result_list
def get_item_info(sub_uri):
    """Scrape a Sennheiser product page: name, definition-list specs, images."""
    uri = '%s%s' % (SENNHEISER_HOST, sub_uri)
    soup = BeautifulSoup(abstract_grab(uri, phone_agent=False), 'html.parser')
    name = soup.find(class_='product-stage__headline').text
    slider = soup.find(id='product_stage_main_slider')
    image_list = [img['data-srcset'] for img in slider.find_all('img')]
    result = dict()
    result['产品名称'] = slim_str(name)
    for row in soup.find_all('li', class_='definitions__list__row'):
        result[slim_str(row.find('dt').text)] = slim_str(row.find('dd').text)
    result['图片列表'] = image_list
    return result
def get_item_info(index):
    """Scrape one AOC product page (by numeric id) into a spec dict."""
    uri = '%s/product/%s.html' % (AOC_MONITOR_HOST, index)
    soup = BeautifulSoup(abstract_grab(uri, phone_agent=False), 'html.parser')
    carousel = soup.find(id='J_Focus').find(class_='carousel-inner')
    image_list = [img['src'] for img in carousel.find_all('img')]
    result = dict()
    result['产品ID'] = index
    table = soup.find(class_='cpBox').find('table')
    for row in table.find_all('tr', class_='proParaRow'):
        key = row.find(class_='proParaName').text
        result[key] = row.find(class_='proParaValue').text
    result['图片列表'] = image_list
    return result
def get_video_link(cls, vid):
    """Resolve playable URLs for a QQ video id across several definitions.

    For each definition ('shd'/'hd'/'sd') the info API returns JSONP; the
    JSON payload is sliced out between the first '=' and the trailing ')'.
    Segmented clips (item['cl']['fc'] truthy) need one extra key request per
    segment; single-file clips carry their vkey in the first response.
    Returns a list of HandlerOutput.Option, one per stream variant found.
    NOTE(review): options accumulate across all three definitions — if the
    API returns the same variant list each time, duplicates result; confirm
    against a live response.
    """
    definitions = ['shd', 'hd', 'sd']
    options = []
    for defn in definitions:
        data = abstract_grab(cls.VIDEO_INFO_API % (vid, defn))
        # Strip the JSONP wrapper: keep what's between '=' and the final ')'.
        data = json.loads(data[data.index('=') + 1:-1])
        # print(data)
        # Map format id ('fs') -> format descriptor (quality name/id).
        qualities = dict()
        for item in data['fl']['fi']:
            qualities[item['fs']] = item
        for item in data['vl']['vi']:
            print(item)
            option = HandlerOutput.Option(quality=qualities[item['fs']]['cname'], urls=[])
            options.append(option)
            url_prefix = item['ul']['ui'][0]['url']
            if item['cl']['fc']:
                # Multi-segment clip: request a vkey for every segment.
                for seg in item['cl']['ci']:
                    keyid = seg['keyid']
                    # keyid like 'xxx.10N' maps to segment file 'xxx.pN.mp4'.
                    filename = keyid.replace('.10', '.p', 1) + '.mp4'
                    # NOTE: rebinding `data` here is safe — it is re-fetched
                    # at the top of each outer definition iteration.
                    data = abstract_grab(cls.SEG_VIDEO_API % (
                        vid, qualities[item['fs']]['id'], filename))
                    data = json.loads(data[data.index('=') + 1:-1])
                    option.urls.append(
                        HandlerOutput.Url(
                            url='%s%s?vkey=%s' % (url_prefix, filename, data['key']),
                            index=seg['idx'] - 1,
                        )
                    )
            else:
                # Single-file clip: vkey is already in the info response.
                fn = item['fn']
                fvkey = item['fvkey']
                option.urls.append(
                    HandlerOutput.Url('%s%s?vkey=%s' % (url_prefix, fn, fvkey))
                )
    return options
def __init__(self, url):
    """Fetch an article, mirror its lazy-loaded images to storage via
    qn_manager, rewrite their src attributes, and keep the resulting DOM
    on self.content.
    """
    aid = get_random_string(length=6)
    crt_time = datetime.datetime.now().timestamp()
    content = Bs(abstract_grab(url), 'html.parser').find(id='img-content')
    for index, image in enumerate(content.find_all('img')):
        if not image.has_attr('data-src'):
            continue
        key = 'alib/%s/%s/%s' % (aid, crt_time, index)
        qn_manager.upload_url(image['data-src'], key)
        image['src'] = qn_manager.get_resource_url(key)
        del image['data-src']
    self.content = content
def get_item_info(sub_uri):
    """Scrape a Crucial product spec tab into a {label: value} dict.

    Each <li> under '#tab-1' is expected to read 'Label: value'. Rows
    without a colon are skipped — previously `str.find` returned -1 for
    them, producing a garbage entry (key = text minus its last character,
    value = the entire text).
    """
    uri = '%s%s' % (CRUCIAL_HOST, sub_uri)
    html = abstract_grab(uri, phone_agent=False)
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find(id='tab-1')
    result = dict()
    for item in table.find_all('li'):
        # partition splits at the FIRST colon, matching the old find(':').
        key, sep, value = item.text.partition(':')
        if not sep:
            continue  # no 'Label: value' structure on this row
        result[key] = value
    return result
def get_item_info(item_id):
    """Scrape a Colorful product detail page into a spec dict + image list."""
    uri = '%s/product_show.aspx?mid=102&id=%s' % (COLORFUL_HOST, item_id)
    soup = BeautifulSoup(abstract_grab(uri, phone_agent=False), 'html.parser')
    result = dict()
    for row in soup.find(class_='table-bordered').find_all('tr'):
        cells = row.find_all('td')
        result[cells[0].text] = cells[1].text
    result['图片列表'] = [
        COLORFUL_HOST + slide['data-thumb']
        for slide in soup.find_all(class_='slide')
    ]
    return result
def handler(cls, url):
    """Extract every <video> tag (src + poster) from an article page.

    One HandlerOutput per video, titled with the page keywords plus an
    index suffix. Raises Error.ERROR_HANDLER if the page scrape fails.
    """
    try:
        html = abstract_grab(url)
        videos = re.findall(
            '<video src="(.*?)".*?poster="(.*?)"', html, flags=re.S)
        matched = re.search('name="keywords" content="(.*?)"', html, flags=re.S)
        title = "《" + matched.group(1) + "》"
    except Exception as err:
        raise Error.ERROR_HANDLER(debug_message=cls.NAME + ',' + str(err))
    results = []
    for index, (src, poster) in enumerate(videos):
        results.append(HandlerOutput(
            one_url=src,
            video_info=HandlerOutput.VideoInfo(
                cover=poster,
                title=title + ' 文章内视频%s' % index,
            ),
        ))
    return HandlerAdapter(results)
def get_item_info(sub_uri):
    """Scrape a Dell product page through its embedded DataModel JSON blob."""
    uri = '%s%s' % (DELL_HOST, sub_uri)
    html = abstract_grab(uri, phone_agent=False)
    image_list = list(re.findall(
        '<img class="carImg".*?data-blzsrc="(.*?)" alt', html, flags=re.S))
    data_json = re.search(
        'Dell.Services.DataModel = (.*)', html, flags=0).group(1)
    # The captured line carries trailing junk after the JSON object; trim
    # back to the last closing brace before parsing.
    while data_json[-1] != '}':
        data_json = data_json[:-1]
    stack = json.loads(data_json)['Stacks'][0]
    result = dict()
    result['ProductName'] = stack['Stack']['Title']['Value']
    result['DellPrice'] = stack['Stack']['Pricing']['DellPrice']['Value']
    result['MarketValue'] = stack['Stack']['Pricing']['MarketValue']['Value']
    for variant in stack['Specs']['TechSpecs']:
        result[variant['Label']] = variant['Value']
    sections = stack['Specs']['TechSpecSectionContent'][
        'FullTechSpecsSectionGroups']
    for section in sections:
        for row in section['TechSpecSectionGroupRows']:
            # Each row can hold up to two label/value cells.
            for slot in ('TechSpecSectionItem1', 'TechSpecSectionItem2'):
                if slot in row:
                    result[row[slot]['Label']] = row[slot]['Value']
    result['images'] = image_list
    return result
def handler(cls, url):
    """Parse title, play address and cover out of a video share page.

    Raises Error.ERROR_HANDLER if any of the three patterns fails to match.
    """
    try:
        html = abstract_grab(url)
        title = re.search('<p class="desc">(.*?)</p>', html, flags=re.S).group(1)
        video_url = re.search('playAddr: "(.*?)",', html, flags=re.S).group(1)
        cover = re.search('cover: "(.*?)"', html, flags=re.S).group(1)
    except Exception as err:
        raise Error.ERROR_HANDLER(debug_message=cls.NAME + ',' + str(err))
    info = HandlerOutput.VideoInfo(title=title, cover=cover)
    return HandlerAdapter([HandlerOutput(video_info=info, one_url=video_url)])
def handler(cls, url):
    """Extract a single video (srcUrl), its <h1> title and poster image.

    Raises Error.ERROR_HANDLER with a debug message on any scrape failure.
    """
    try:
        html = abstract_grab(url)
        video_url_regex = 'srcUrl="(.*?)",'
        video_url = re.search(video_url_regex, html, flags=re.S).group(1)
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find('h1').get_text()
        cover = soup.find(id='poster').find('img').get('src')
    except Exception as err:
        # Pass the message as debug_message= for consistency with every
        # other handler in this file (it was positional only here).
        raise Error.ERROR_HANDLER(debug_message=cls.NAME + ',' + str(err))
    result = HandlerOutput(
        video_info=HandlerOutput.VideoInfo(
            title=title,
            cover=cover,
        ),
        one_url=video_url,
    )
    return HandlerAdapter([result])
def get_item_list(index):
    """Fetch one page of the Colorful listing API and scrape each item.

    Each detail fetch is retried forever (3s back-off) on any exception.
    """
    uri = '%s/tools/colorful_data.ashx?action=proList&mid=102&Category=null&' \
          'typecategory=null&page_size=12&page_index=%s' % (COLORFUL_HOST, index)
    items = json.loads(abstract_grab(uri, phone_agent=False))
    result_list = []
    for item in items:
        entry = dict()
        entry['产品名称'] = item['title']
        entry['产品ID'] = item['id']
        while True:
            try:
                item_info = get_item_info(item['id'])
                break
            except Exception:
                sleep(3)
                print(item['id'], '出错重爬')
        entry.update(item_info)
        result_list.append(entry)
    return result_list
def get_item_list(index):
    """Scrape one Dell listing page and fetch details for every deal link.

    Each detail fetch is retried forever (3s back-off) on any exception.
    """
    uri = '%s/en-us/shop/monitors-monitor-accessories/ar/4009?appliedRefinements=%s' % (
        DELL_HOST, index)
    html = abstract_grab(uri, phone_agent=False)
    hrefs = re.findall(
        '<a .*? data-testid="SnPDealsItem" .*? href="(.*?)"', html, flags=re.S)
    result_list = []
    for href in hrefs:
        print(href)
        while True:
            try:
                result_list.append(get_item_info(href))
                break
            except Exception:
                sleep(3)
                print(href, '出错重爬')
    return result_list
def get_item_info(uri):
    """Scrape a Lenovo product detail page into a spec dict + image list."""
    soup = BeautifulSoup(abstract_grab(uri, phone_agent=False), 'html.parser')
    result = dict()
    result['产品名称'] = soup.find(id='span_product_name').text
    for row in soup.find_all(class_='item_row'):
        key = row.find('div').text
        value = row.find(class_='col_values').text
        if key:  # rows with an empty label are skipped
            result[key] = value
    gallery = soup.find(id='detail_playPicture_list')
    result['图片列表'] = [img['src'] for img in gallery.find_all('img')]
    return result
def get_item_list(index):
    """Fetch one page of Lenovo's search API and scrape each product detail.

    Each detail fetch is retried forever (3s back-off) on any exception.
    """
    uri = '%s/search/v2?shopid=1&cat=293&page=%s&pageSize=20' % (
        SEARCH_LENOVO_HOST, index)
    items = json.loads(abstract_grab(uri, phone_agent=False))['items']
    result_list = []
    for item in items:
        href = item['pcDetailUrl']
        print(href)
        while True:
            try:
                result_list.append(get_item_info(href))
                break
            except Exception:
                sleep(3)
                print(href, '出错重爬')
    return result_list
def handler(cls, url):
    """Read the 'vid' query parameter from *url*, then build options from
    the JSON info API.

    Raises Error.ERROR_HANDLER on a missing parameter or a bad API response.
    """
    try:
        query = parse.parse_qs(parse.urlparse(url).query)
        vid = query['vid'][0]
        data = json.loads(abstract_grab(cls.INFO_API_URL % vid))
        result = HandlerOutput(
            video_info=HandlerOutput.VideoInfo(
                title=data['title'],
                cover=data['coverForDetail']
            )
        )
        for item in data['playInfo']:
            option = HandlerOutput.Option(
                quality=item['name'] + '(' + item['type'] + ')',
                urls=[HandlerOutput.Url(url=item['url'])]
            )
            result.options.append(option)
    except Exception as err:
        raise Error.ERROR_HANDLER(debug_message=cls.NAME + ',' + str(err))
    return HandlerAdapter([result])
def get_item_list(index):
    """Scrape one Crucial memory listing page; attach a thumbnail to each item.

    Each detail fetch is retried forever (3s back-off) on any exception.
    """
    uri = '%s/memory?page=%s' % (CRUCIAL_HOST, index)
    soup = BeautifulSoup(abstract_grab(uri, phone_agent=False), 'html.parser')
    listing = soup.find(id='product_list_region')
    result = []
    for card in listing.find_all('div', class_='field-content image-border'):
        href = card.find('a')['href']
        while True:
            try:
                item_info = get_item_info(href)
                break
            except Exception:
                sleep(3)
                print(href, '出错重爬')
        item_info['图片链接'] = card.find('img')['src']
        result.append(item_info)
    return result
def handler(cls, url):
    """Parse the inline VIDEO_INFO JSON and the canonical link to derive the
    QQ vid, then resolve playback options through VideoQQ.get_video_link.

    Raises Error.ERROR_HANDLER on any failure.
    """
    try:
        html = abstract_grab(url)
        vid_link = re.search('<link rel="canonical" href="(.*?).html"',
                             html, flags=re.S).group(1)
        raw_info = re.search('var VIDEO_INFO = (.*?)\n</script>',
                             html, flags=re.S).group(1)
        data = json.loads(raw_info)
        # The vid is the last path component of the canonical URL.
        vid = vid_link[vid_link.rfind('/') + 1:]
        result = HandlerOutput(
            video_info=HandlerOutput.VideoInfo(
                title=data['title'],
                cover=data['pic_640_360'],
            ),
            options=VideoQQ.get_video_link(vid),
        )
    except Exception as err:
        raise Error.ERROR_HANDLER(debug_message=cls.NAME + ',' + str(err))
    return HandlerAdapter([result])
def get_item_list(index):
    """Scrape one AOC monitor listing page and fetch details for each product.

    Each detail fetch is retried forever (3s back-off) on any exception.
    """
    uri = '%s/product/xianshiqi?p=%s' % (AOC_MONITOR_HOST, index)
    soup = BeautifulSoup(abstract_grab(uri, phone_agent=False), 'html.parser')
    product_list = soup.find(class_='product-list')
    result_list = []
    for entry in product_list.find_all('li', class_='yuan'):
        href = entry.find('a')['href']
        print(href)
        while True:
            try:
                result_list.append(get_item_info(href))
                break
            except Exception:
                sleep(3)
                print(href, '出错重爬')
    return result_list
def get_item_list(sub_uri):
    """Scrape a Sennheiser category page and fetch details for each teaser link.

    Each detail fetch is retried forever (3s back-off) on any exception.
    """
    uri = '%s%s' % (SENNHEISER_HOST, sub_uri)
    # The page embeds escaped markup; stripping every backslash turns the
    # escaped '\n' into a literal 'n', which the regex below accounts for.
    html = abstract_grab(uri, phone_agent=False).replace('\\', '')
    link_list = re.findall(
        'product-teaser__image\'>n<a href="(.*?)">', html, flags=re.S)
    result_list = []
    for href in link_list:
        print(href)
        while True:
            try:
                result_list.append(get_item_info(href))
                break
            except Exception:
                sleep(3)
                print(href, '出错重爬')
    return result_list