def most_popular_page(self, response: HtmlResponse):
    description_list = response.css('div.descriptionContainer')
    for item in description_list:
        title = item.css('a::text').extract_first()
        sub_link = item.css('a::attr(href)').extract_first()
        channel_url = response.urljoin(sub_link)
        self.logger.warning('get channel: {0}, link: {1}'.format(
            title, channel_url))
        yield scrapy.Request(channel_url, callback=self.channel_page_see_all)
    # determine whether a next page exists
    next_page_li = response.css('li.page.next.wl-page')
    if next_page_li:
        next_page_sub_link = next_page_li.css('a::attr(href)').extract_first()
        page_number = int(next_page_sub_link.split('page=')[1])
        page_number_start = self.settings.get('PAGE_NUMBER_START')
        page_number_end = self.settings.get('PAGE_NUMBER_END')
        if page_number_end is not None:
            if page_number_start < page_number <= page_number_end:
                next_page_url = response.urljoin(next_page_sub_link)
                self.logger.warning(
                    'has next page, url is: {0}'.format(next_page_url))
                yield scrapy.Request(next_page_url,
                                     callback=self.most_popular_page)
            else:
                self.logger.warning(
                    'has next page, but it is beyond the configured page range')
        else:
            next_page_url = response.urljoin(next_page_sub_link)
            self.logger.warning(
                'has next page, url is: {0}'.format(next_page_url))
            yield scrapy.Request(next_page_url,
                                 callback=self.most_popular_page)
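# A minimal sketch (values hypothetical) of the two custom settings the bounds
# check above reads from the crawler settings; with PAGE_NUMBER_END = None
# every next page is followed. In the project's settings.py:
PAGE_NUMBER_START = 1   # follow a next page only while start < page <= end
PAGE_NUMBER_END = 50    # set to None to remove the upper bound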
def parse_task(self, response: HtmlResponse, subsection='empty'):
    # Source
    task_name = response.css(
        'table.viewingtable div.componentboxheader::text'
    ).extract_first().strip()
    source = TaskSourceItem()
    source['name'] = f'{task_name} (problems.ru)'
    source['url'] = response.url

    content = response.css('table.viewingtable .componentboxcontents')

    # Themes
    info = content.css('table.problemdetailscaptiontable')
    themes = [
        theme.strip() for theme in info.css(
            '.problemdetailssubject .problemdetailssubjecttablecell '
            'a.componentboxlink::text').extract()
    ]

    # Grades
    _, grades = info.css('.problemdetailsdifficulty nobr::text').extract()
    grades = [int(n) for n in re.findall(r'\d+', grades)]

    # Task
    task_dict, image_urls, tex_used = self.extract_task(content, response)

    yield ParseResultItem(
        source=source,
        themes=themes,
        grades=grades,
        task=task_dict,
        section=SECTION,
        subsection=subsection,
        image_urls=image_urls,
        tex_used=tex_used,
    )
def video_page(self, response: HtmlResponse):
    video_title = response.css('h1.title').css('span::text').get()
    video_channel = response.css('div.video-actions-container').css(
        'div.usernameWrap.clearfix').css('a::text').get()
    js = response.css('div.video-wrapper').css('#player').css('script').get()
    data_video_id = response.css('div.video-wrapper').css(
        '#player::attr(data-video-id)').get()
    prepare_js = js.split('<script type="text/javascript">')[1].split(
        'loadScriptUniqueId')[0]
    exec_js = '{0}\nqualityItems_{1};'.format(prepare_js, data_video_id)
    js_result = js2py.eval_js(exec_js)  # type: js2py.base.JsObjectWrapper
    quality_items = js_result.to_list()  # type: list
    quality = quality_items[-1]['text'].split('p')[0]
    if int(quality) >= 720:
        video_url = quality_items[-1]['url']
        self.logger.info('parse [%s] success, url: %s', video_title,
                         video_url)
        if self.settings.get('ENABLE_SQL'):
            result = self.data_base.select_all_by_title_my_follow(video_title)
            if len(result) != 0:
                for line in result:
                    self.logger.error('has duplicate record: %s', line)
            else:
                self.data_base.save_my_follow(video_title, video_channel,
                                              video_url, response.url)
        yield PornhubItem(file_urls=video_url,
                          file_name=video_title,
                          file_channel=video_channel)
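# A toy sketch of the js2py round trip used above (payload hypothetical): the
# page script assigns qualityItems_<id>, the spider appends a bare reference
# to that variable, and eval_js hands back its value for conversion to a list.
import js2py

toy_js = "var qualityItems_42 = [{text: '480p', url: 'u1'}, {text: '720p', url: 'u2'}];"
items = js2py.eval_js('{0}\nqualityItems_42;'.format(toy_js)).to_list()
print(items[-1]['text'])  # 720p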
def parse_region(self, response: HtmlResponse):
    """Parse regions.

    Nordbayern -> Frankenjura Nord
    Example: https://www.frankenjura.com/klettern/region/2
    """
    item = SectorItem()
    item["name"] = response.meta["region_name"]
    item["fk_sector"] = response.meta["parent"]
    item["source"] = response.url
    item["description"] = response.css(
        'div[class="location-head"]+p ::text').get()
    yield item

    region = item.django_model.objects.get(**item)
    sub_regions = response.css('div[class="column"]').css('a[href*="region"]')
    for sub_region in sub_regions:
        meta = {
            "sub_region_name": sub_region.css("::text").get(),
            "parent": region,
        }
        yield response.follow(sub_region, self.parse_sub_region, meta=meta)
def _create_product_data_dictionary(
    self,
    response: HtmlResponse,
    name: str,
    brand: Optional[str] = None,
    model_number: Optional[str] = None,
    upc: Optional[str] = None,
    data: Optional[Dict] = None,
) -> Dict:
    breadcrumbs = response.css(
        'ul.nav.breadcrumb '
        '> li[itemtype="http://data-vocabulary.org/Breadcrumb"] '
        '> a[itemprop="url"] '
        '> span[itemprop="title"]::text').getall()
    item = product_data_item_loader \
        .ProductDataItemLoader(response=response) \
        .add_language_data(
            response=response,
            brand=brand,
            images=response.css(
                'meta[property="og:image"]::attr(content)'
            ).extract(),
            name=name,
            url=response.url,
            breadcrumbs=breadcrumbs,
        ).add_sku(sku=upc) \
        .add_upc(response=response, upc=upc) \
        .add_store_id(store_id=self.store_id) \
        .add_sold_by(sold_by=self.sold_by) \
        .add_version(version=self.version) \
        .load_item()
    return item.get_dictionary()
def parse_video_page(self, response: HtmlResponse):
    self.logger.warning(
        'Start parsing real video page: {0}'.format(response.url))
    title = response.css('#viewvideo-title::text').extract_first().strip()
    author = response.css('a[href*="uprofile.php"]').css(
        'span::text').extract_first().strip()
    # Some videos share the same title and author; only the viewkey in the
    # URL tells them apart.
    view_key = response.url.split('viewkey=')[1].split('&')[0]
    # A '/' in the title would be treated as a directory separator when the
    # file is saved, so strip it out.
    if '/' in title:
        title = title.replace('/', '')
    encrypted_url = response.css('video').extract_first().split(
        'strencode("')[1].split('"))')[0]
    first_encrypted = encrypted_url.split('"')[0]
    second_encrypted = encrypted_url.split('"')[2]
    video_link = ParseRealUrl.get_url(first_encrypted, second_encrypted)
    if video_link:
        # Normalize URLs like http://185.38.13.130//mp43/2998... that carry
        # a doubled slash in the path.
        video_link_list = video_link.split('//')
        real_video_link = video_link_list[0] + '//' + \
            video_link_list[1] + '/' + video_link_list[2]
        self.logger.warning('Got download link, queueing for download')
        down_file_name = title + '-' + author + '-' + view_key
        yield DownloadVideoItem(file_urls=real_video_link,
                                file_name=down_file_name)
        self.logger.warning('Queued for download, updating database')
        yield UpdateMovieLinkItem(movie_page_url=response.url,
                                  movie_real_url=real_video_link)
    else:
        self.logger.warning(
            'Failed to get video download URL, page: {0}'.format(
                response.url))
def model_page(self, response: HtmlResponse):
    video_sum_element = response.css('div.showingInfo').css('span.totalSpan')
    # Some stars do not show a total video count.
    page_number = 1
    if video_sum_element:
        video_sum = video_sum_element.css('::text').get()
        sum_number = int(video_sum)
        page_number = math.ceil(sum_number / 40)
    # A URL containing 'page' means all videos are already loaded; a single
    # page (page_number == 1) needs no reload. Start parsing either way.
    if 'page' in response.url or page_number == 1:
        li_list = response.css('div.videoUList').css('ul').css('li')
        for li_tag in li_list:  # type: SelectorList
            a_tag = li_tag.css('span.title').css('a')
            video_title = a_tag.css('::text').get()
            video_url = a_tag.css('::attr(href)').get()
            real_url = 'https://www.pornhubpremium.com' + video_url
            self.logger.info('send [%s], url: %s', video_title, video_url)
            yield scrapy.Request(real_url, callback=self.video_page,
                                 priority=100)
    else:
        # No 'page' in the URL and more than one page: request the last page
        # so that every video is loaded.
        new_link = '{0}?page={1}'.format(response.url, page_number)
        yield scrapy.Request(new_link, callback=self.model_page, priority=10)
def parse(self, response):
    driver = WebDriver()
    driver.get(response.url)
    content = driver.page_source.encode('utf-8')
    response = HtmlResponse(response.url, encoding='utf-8', body=content)
    page_str = response.css(
        ".dw_page .p_box .p_wp .p_in >span::text")[0].extract()
    page_num = int(page_str[1:page_str.find("页", 1, len(page_str))])
    if page_num - 1 > 0:
        url_lists = []
        # Walk the remaining pages by driving the page-jump input.
        for i in range(page_num - 1):
            js = "document.getElementById('jump_page').value=%d" % (i + 2)
            driver.execute_script(js)
            driver.find_element_by_css_selector(".p_in .og_but").click()
            content = driver.page_source.encode('utf-8')
            sub_response = HtmlResponse(url=response.url, encoding='utf-8',
                                        body=content)
            main = sub_response.css("div#resultList div.el")
            urls = main.css(".t2 a::attr(href)").extract()
            url_lists.append(urls)
        for index in range(len(url_lists)):
            for url in url_lists[index]:
                try:
                    for obj in self.__parse_by_webdriver(url):
                        yield obj
                except Exception as error:
                    print("[" + str(index) + "] spider error: {0}".format(error))
                    continue
def _create_product_dictionary(
    self,
    response: HtmlResponse,
    data: Optional[Dict] = None,
) -> product.Product:
    try:
        upc = (universal_product_code.UniversalProductCode(
            upc=data.get('ProductId').replace('_', ''))).value
    except:
        # TODO: Log issue and return nothing.
        return None

    title1 = response.css(
        'meta[property="og:title"]::attr(content)'
    ).extract()[0].split('|')[0]
    title2 = response.css('title::text').get()
    name = title1 or title2
    if not name:
        pass  # TODO: Log error and return none.
    elif name in ('Grocery Product', 'Produit épicerie en ligne'):
        pass  # TODO: Log error and return none.

    brand = data.get('BrandName')
    if not brand:
        pass  # TODO: Log error and return none.

    item_loader = product_item_loader.ProductItemLoader(
        response=response
    ).add_name(
        response=response,
        name=name,  # TODO: What about if it's None?
        language=self.language,
    ).add_brand(
        response=response,
        brand=brand,  # TODO: What about if it's None?
        language=self.language,
    ).add_upc(response=response, upc=upc) \
        .add_product_data_dictionary(
            product_data_dictionary=self._create_product_data_dictionary(
                response=response,
                data=data,
                name=name,
                brand=brand,
                upc=upc,
            ),
        ).add_offer_dictionary(
            offer_dictionary=self._create_offer_dictionary(
                response=response,
                data=data,
            ),
        ).add_store_dictionary(
            store_dictionary=self._create_store_dictionary(
                response=response,
            ),
        ).add_supported_language(language=self.language)

    return item_loader.load_item()
def get_image_url(cls, response: HtmlResponse) -> Union[str, None]:
    """Extract image url from html response."""
    image_p = response.css("p > img")
    image_figure = response.css("figure > img")
    image_selectors = image_p if image_p else image_figure
    images_re = image_selectors.re(r'src="(http.*?)\"')
    images = [img for img in images_re if img.split(".")[-1] != "svg"]
    sorted_by_length = sorted(images, key=len, reverse=True)
    return sorted_by_length[0] if sorted_by_length else None
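# A hedged usage sketch (the RecipeSpider class name is hypothetical): feed a
# hand-built HtmlResponse through get_image_url; the .svg icon is filtered out
# and the longest remaining src wins.
from scrapy.http import HtmlResponse

body = (b'<p><img src="http://example.com/pics/hero-shot.jpg">'
        b'<img src="http://example.com/icon.svg"></p>')
resp = HtmlResponse(url='http://example.com', body=body)
print(RecipeSpider.get_image_url(resp))  # http://example.com/pics/hero-shot.jpg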
def video_parse(self, response: HtmlResponse, category):
    title = response.css('h2.title.big::text').get()
    for item in response.css('ul.video-downloads-buttons').css('li'):
        if '1080p' in item.css('a::text').get().strip():
            link = item.css('a::attr(href)').get()
            req_cookie = response.request.headers.get('Cookie').decode()
            resp_cookie = response.headers.get(
                'Set-Cookie').decode().split(';')[0]
            yield ArtPornItem(name=title,
                              link=link,
                              category=category,
                              cookie='{0};{1}'.format(req_cookie, resp_cookie))
def video_parse(self, response: HtmlResponse, category):
    link = response.urljoin(response.css(
        "a.full_download_link[onclick*='mp43000']::attr(href)").get())
    title = ''
    for i in response.css('div.title_bar::text').getall():
        i = i.strip()
        if i:
            title = i
            break
    if link != 'http://www.hotwiferio.com/members/':
        yield HotItem(name=title, link=link, category=category)
def categories_parse(self, response: HtmlResponse, category):
    next_url_list = response.css('a.button.prev::attr(href)').getall()
    if next_url_list:
        # The selector can match two pager links; prefer the second when
        # two are present, otherwise fall back to the only one.
        next_url = next_url_list[1] if len(next_url_list) > 1 \
            else next_url_list[0]
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.categories_parse,
                             cb_kwargs={'category': category})
    for item in response.css('div.thumb-video.cf').css(
            'a.thumb-video-link::attr(href)').getall():
        yield scrapy.Request(url=item,
                             callback=self.video_parse,
                             cb_kwargs={'category': category})
def _create_product_dictionary(
    self,
    response: HtmlResponse,
    data: Optional[Dict] = None,
) -> product.Product:
    try:
        upc = (universal_product_code.UniversalProductCode(
            upc=response.css('span[itemprop="sku"]::text').get())).value
    except Exception as exception:
        logging.exception(msg='Unable to get UPC.', exc_info=exception)
        return None

    name1 = response.css(
        "div.product-info.item-addToCart > a.invisible-text::text").extract()
    name2 = response.css('title::text').extract()[0].split('|')[0]
    name = name1 or name2
    if not name:
        pass  # TODO: Log error and return none.

    brand = response.css('div[itemtype="http://schema.org/Product"] '
                         '> span[itemprop="brand"]::text').extract()

    item_loader = product_item_loader.ProductItemLoader(
        response=response
    ).add_name(
        response=response,
        name=name,
        language=self.language,
    ).add_brand(
        response=response,
        brand=brand,
        language=self.language,
    ).add_upc(response=response, upc=upc) \
        .add_product_data_dictionary(
            product_data_dictionary=self._create_product_data_dictionary(
                response=response,
                data=data,
                name=name,
                brand=brand,
                upc=upc,
            ),
        ).add_offer_dictionary(
            offer_dictionary=self._create_offer_dictionary(
                response=response,
                data=data,
            ),
        ).add_store_dictionary(
            store_dictionary=self._create_store_dictionary(
                response=response,
            ),
        ).add_supported_language(language=self.language)

    return item_loader.load_item()
def _create_offer_dictionary(
    self,
    response: HtmlResponse,
    data: Dict,
) -> Dict:
    offers = response.css('div[itemprop="offers"]')
    if len(offers) == 0:
        pass  # TODO: Throw error.

    offer_objects = []
    for o in offers:
        price = o.css('span[itemprop="price"]::text').get()
        valid_through = o.css('span[itemprop="validThrough"]::text').get()
        offer_objects.append(
            offer.Offer(price=float(price), valid_until=valid_through))

    # Sort ascending by price so the first offer is the sale price.
    offer_objects.sort(key=lambda x: x.price)
    amount = offer_objects[0].price
    valid_until = offer_objects[0].valid_until  # TODO: Add valid until.

    item = offer_item_loader.OfferItemLoader(response=response) \
        .add_store_id(store_id=self.store_id) \
        .add_sold_by(sold_by=self.sold_by) \
        .add_amount(amount=str(amount)) \
        .add_currency(currency=curreny.Currency.CAD.value) \
        .add_availability(
            availability=availability.Availability.IN_STOCK.value,
        ).add_condition(condition=condition.Condition.NEW.value) \
        .load_item()
    return item.get_dictionary()
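# A minimal sketch of why the ascending sort above yields the sale price: with
# a regular and a discounted offer present, index 0 is the cheaper one. The
# offer.Offer keyword signature is taken from the loop above; the date value
# is hypothetical.
regular = offer.Offer(price=9.99, valid_until=None)
sale = offer.Offer(price=7.49, valid_until='2024-01-01')
cheapest = sorted([regular, sale], key=lambda x: x.price)[0]
print(cheapest.price)  # 7.49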
def get_comment_links(response: HtmlResponse, spider: ColgSpider):
    urls = []
    last_page_urls = response.css('.tps a:last-child').xpath('@href').extract()
    for last_page_url in last_page_urls:
        urls.extend(
            get_commentlistpage_urls_near_endof_lastpage(last_page_url,
                                                         spider))
    return urls
def _find_json_data(self, response: HtmlResponse) -> Optional[Dict]:
    css_path = ("div.product-details.js-ga-productdetails > "
                "div.relative::attr(data-product)")
    product_data = response.css(css_path).extract()
    if not product_data:
        logging.error('Unable to load JSON data.')  # TODO: Log URL.
        return None

    try:
        return json.loads(product_data[0])
    except json.JSONDecodeError:
        pass

    # try:
    #     return ast.literal_eval(product_data[0])
    # except:
    #     pass

    # The attribute sometimes uses single quotes; swap them and retry.
    try:
        data = product_data[0].replace("'", '"')
        return json.loads(data)
    except json.JSONDecodeError:
        logging.error('Unable to load JSON data.')
        return None
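# A hedged sketch of the ast.literal_eval fallback hinted at in the commented
# block above: it parses Python-style single-quoted dicts directly, so it
# survives apostrophes inside values that the naive quote swap would corrupt.
import ast

raw = "{'sku': '123', 'name': \"5' wooden shelf\"}"
print(ast.literal_eval(raw))  # {'sku': '123', 'name': "5' wooden shelf"}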
def parse_category(self, response: HtmlResponse):
    """List category and traverse product pages."""
    products_query = response.css(
        "section#bc-sf-filter-products > div.product-grid-item")
    if not products_query:
        raise IgnoreRequest('Product items not found')
    self.logger.info(f'parse product_categories len: {len(products_query)}')
    for pdp in products_query:
        item_loader = ProductLoader(item=UrgeItem(), selector=pdp)
        item_loader.add_css('product_name',
                            'div.product-text > p.title::text')
        item_loader.add_css('product_brand',
                            'div.product-text > h2.vendor.h5::text')
        # Get the regular product price through an OR (,) of selectors.
        item_loader.add_css(
            'product_price',
            'div.product-text p.price s::text , span[itemprop="price"]::text')
        item_loader.add_css(
            'product_sale_price',
            'div.product-text p.sale span[itemprop="price"]::text')
        if 'href' in pdp.css('a').attrib:
            product_url = pdp.css('a').attrib['href']
            yield response.follow(product_url,
                                  callback=self.product_page,
                                  meta={'item': item_loader.load_item()})
def _parse_stars(response: HtmlResponse):
    """Parse count of stars given for walls and routes."""
    stars_selector = response.css("img[class*=stars]")
    if stars_selector:
        return int(stars_selector[0].attrib["class"].split("stars")[1])
    return 0
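# A small sketch of the class-name convention _parse_stars relies on (markup
# hypothetical): the star count is embedded in the img class attribute itself.
from scrapy.http import HtmlResponse

body = b'<html><body><img class="stars3" src="s.png"></body></html>'
print(_parse_stars(HtmlResponse(url='http://example.com', body=body)))  # 3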
def parse(self, response: HtmlResponse):
    list_channel = response.css('div.listchannel')
    for item in list_channel:
        link = item.css('a::attr(href)').extract_first()
        title = item.css('a::attr(title)').extract_first()
        self.logger.warning('Found video: {0}'.format(title))
        yield scrapy.Request(url=link, callback=self.real_video_parse)
def parse_wall(self, response: HtmlResponse):
    """Parse walls.

    ... -> Region Wattendorf -> Falkenwand
    Example: https://www.frankenjura.com/klettern/poi/21
    """
    item = SectorItem()
    item["name"] = response.meta["wall_name"]
    item["fk_sector"] = response.meta["parent"]
    item["source"] = response.url
    item["internal_rating"] = _parse_stars(response)
    item["max_height_in_m"] = _parse_wall_max_height(response)
    item["rain_protected"] = _parse_rain_protected(response)
    item["child_friendly"] = _parse_child_friendly(response)
    item["description"] = _parse_wall_description(response)
    item["approach"] = _parse_wall_approach(response)
    item["approach_road"] = _parse_wall_approach_road(response)
    item["fk_orientation"] = _parse_orientation(response)
    item["latitude"], item["longitude"] = _parse_lat_lon(response)
    yield item

    wall = item.django_model.objects.get(name=item["name"],
                                         fk_sector=item["fk_sector"])
    routes = response.css('div[class="poi-link-container"]').css("a")
    for route in routes:
        meta = {"route_name": route.css("::text").get(), "parent": wall}
        yield response.follow(route, self.parse_route, meta=meta)
def ajax_model_page(self, response: HtmlResponse):
    model_info_list = response.css('li.pcVideoListItem')
    for item in model_info_list:  # type: SelectorList
        video_url = item.css('span.title').css('a::attr(href)').get()
        yield scrapy.Request(response.urljoin(video_url),
                             callback=self.video_page,
                             priority=100)
def parse(self, response: HtmlResponse): """Parse webpage to extract important recipe information""" if response.css(".wprm-recipe-ingredients-container"): data = { "name": response.css(".title::text").get(), "source_id": self.sid, "url": response.url, "image": self.get_image_url(response), "ingredients": self.get_ingredients(response), } if all(val is not None for val in data.values()): resp = requests.post(self.endpoint, json=data) if resp.status_code == 400: raise CloseSpider("Recipe already exists") for anchor_tag in response.css(".nav-previous a"): yield response.follow(anchor_tag, callback=self.parse)
def _retrieve_table_field(
    response: HtmlResponse  # pylint: disable=C0330
) -> Generator[Selector, None, None]:
    """Yield table cells one at a time."""
    for field in response.css('.apexir_WORKSHEET_DATA td'):
        yield field
def channel_page(self, response: HtmlResponse):
    video_css = response.css('span.title')
    for item in video_css:
        video_sub_link = item.css('a::attr(href)').extract_first()
        video_url = response.urljoin(video_sub_link)
        self.logger.warning(
            'send to parse real video, url is: {0}'.format(video_url))
        yield scrapy.Request(video_url, callback=self.video_page)
def _parse_documents(response: HtmlResponse):
    """Parse principal's documents."""
    data = response.meta['data']
    data['exhibit_url'] = response.css(
        'td[headers=DOCLINK] ::attr(href)').extract_first()
    yield data
def _parse_child_friendly(response: HtmlResponse):
    """Parse optional element 'child friendly'."""
    selector = response.css('th:contains("Kinder") + td ::text')
    if selector:
        value = selector.get()
        # Compare lowercase against lowercase; the original checked for
        # "Ungeeignet" in a lowercased string, which could never match.
        return "ungeeignet" not in value.lower()
    return False
def _parse_rain_protected(response: HtmlResponse):
    """Parse optional element 'rain protected'."""
    selector = response.css('th:contains("Regensicher") + td ::text')
    if selector:
        value = selector.get()
        return value.lower() != "nein"
    return False
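# A quick sketch of the :contains() pseudo-class both helpers above lean on
# (markup hypothetical): it selects elements whose text includes the given
# substring, here the label cell adjacent to the value we want.
from parsel import Selector

sel = Selector(text='<table><tr><th>Regensicher</th><td>ja</td></tr></table>')
print(sel.css('th:contains("Regensicher") + td ::text').get())  # ja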
def parse_next_link(self, response: HtmlResponse) -> Optional[str]:
    next_page_tag = response.css(
        'a[href*="?category=long&viewtype=basic"]')
    next_link = None
    for item in next_page_tag:
        if item.css('a::text').extract_first() == '»':
            ori_link = item.css('a::attr(href)').extract_first()
            next_link = response.urljoin(ori_link)
    return next_link
def porn_star_page(self, response: HtmlResponse):
    # Star listing pages need no explicit page number: requesting page=2
    # directly would not show the videos from both pages.
    li_list = response.css('div.videoUList').css('ul').css('li')
    for li_tag in li_list:  # type: SelectorList
        video_url = li_tag.css('span.title').css('a::attr(href)').get()
        yield scrapy.Request(response.urljoin(video_url),
                             callback=self.video_page,
                             priority=100)
    # Check for a next button; on the last page the page_next element
    # does not exist.
    page_element = response.css('div.pagination3')
    if page_element:
        next_element = page_element.css('li.page_next')
        if next_element:
            next_url = next_element.css('a::attr(href)').get()
            yield scrapy.Request(response.urljoin(next_url),
                                 callback=self.porn_star_page,
                                 priority=10)