def get_businesses(self, response: scrapy.http.Response):
    """Extract hidden form state (view-instance key, fragment ids, node ids)
    from the search page and POST the search form.

    Yields a FormRequest handled by ``self.pagination``.
    """
    print(response.css('h1 span span::text').get())
    # The session token is the "id" query parameter of the current page URL.
    parsed = urlparse.urlparse(response.url)
    self.token = parse_qs(parsed.query)['id'][0]
    # Two inline <script> blocks carry server-generated state that must be
    # echoed back in the form POST.
    script = response.xpath('//script[contains(., "viewInstanceKey:")]/text()').get()
    script2 = response.xpath('//script[contains(., "containerNodeId:")]/text()').get()
    searchOperator = response.css('.registerItemSearch-tabs-criteriaAndButtons-criteria-itemNameSearchOperatorBox-itemNameSearchOperatorSelector::attr(id)').get()
    advance = response.css('.Attribute-Advanced::attr(id)').get()
    # viewInstanceKey:'...' — take the quoted value after the colon.
    vi_pattern = r"viewInstanceKey:('[A-Za-z0-9_\./\\-]*')"
    vi_match = re.search(vi_pattern, script)
    vikey = vi_match.group().split(':')[1].strip("'")
    # guid:<digits> — numeric fragment id.
    guid_pattern = r'guid:([0-9]*)'
    guid_match = re.search(guid_pattern,script)
    guid = guid_match.group().split(':')[1]
    # containerNodeId:'...' — fragment node id.
    frag_node_pattern = "containerNodeId:'([A-Za-z0-9]*)'"
    frag_match = re.search(frag_node_pattern, script2)
    frag_node = frag_match.group(1)
    # Assumes the button id carries a 4-character prefix before the node id
    # — TODO confirm against the live page markup.
    node = response.css('.appSearchButton::attr(id)').get()[4:]
    self.data[f'{searchOperator}-ItemNameSearchOperator'] = 'StartsWith'
    self.data[f'{advance}-Advanced'] = 'N'
    self.data['_VIKEY_'] = vikey
    self.data['_CBHTMLFRAGID_'] = str(guid)
    self.data['_CBHTMLFRAGNODEID_'] = str(frag_node)
    self.data['_CBNODE_'] = node
    yield scrapy.http.FormRequest(url=f'https://www.mtsosfilings.gov/mtsos-master/viewInstance/update.html?id={self.token}', formdata=self.data, headers=self.headers, callback=self.pagination)
def parseCity(self, response: scrapy.http.Response):
    """Walk a TripAdvisor city attractions listing.

    Follows each attraction whose title does not match the tour-set
    pattern, then paginates while fewer than 100 items have been ranked.
    Example: https://www.tripadvisor.in/Attractions-g186338-Activities-London_England.html#FILTERED_LIST
    """
    tour_set_re = re.compile(".+([0-9]+).*")
    listings = response.css(
        'div.attraction_list.attraction_list_short > div.attraction_element > div > div > div > div > div.listing_title'
    )
    for listing in listings:
        title = listing.css('a::text').extract_first()
        if tour_set_re.match(title):
            continue
        attraction_url = response.urljoin(listing.css('a::attr(href)').extract_first())
        response.meta['rank'] += 1
        yield response.follow(url=attraction_url,
                              callback=self.parseAttractionsPage,
                              meta=response.meta)
    next_link = response.css(
        'div.al_border.deckTools.btm > div > div.unified.pagination > a.nav.next.rndBtn.ui_button.primary.taLnk::attr(href)'
    )
    if next_link:
        next_url = response.urljoin(next_link.extract_first())
        self.log("nextpage: " + next_url)
        if response.meta['rank'] < 100:
            yield response.follow(next_url, callback=self.parseCity, meta=response.meta)
def parseCityAttractionsListPage(self, response: scrapy.http.Response):
    """Parse one page of a city's attraction list.

    Example page: https://www.viator.com/Mumbai/d953
    """
    print(
        'PARSING ATTRACTION LIST ####################################################################################'
    )
    print(response.url)
    self.incrementRequestCount()
    for anchor in response.css('div.ptm *> h2 > a'):
        point_url = anchor.css('::attr(href)').extract_first().strip()
        point_name = anchor.css('::text').extract_first().strip()
        # Carry country/city context forward and attach the point name.
        carried_meta = {
            'countryName': response.meta['countryName'],
            'cityName': response.meta['cityName'],
            'pointName': point_name,
        }
        yield response.follow(point_url,
                              callback=self.parseAttractionsPage,
                              meta=carried_meta)
    next_href = response.css(
        'div.ptm > div:nth-child(1) > div:nth-child(2) > p > a:last-child::attr(href)'
    ).extract_first()
    if next_href:
        yield response.follow(next_href,
                              callback=self.parseCityAttractionsListPage,
                              meta=response.meta)
def parse_listing(self, response: scrapy.http.Response):
    """Scrape one job-listing page into a flat dict of text fields."""
    listing = {
        'url': response.url,
        'expire': response.xpath('//a[@class="expire"]/span/text()').extract_first(),
        'job-title': response.css('span#main-job-title *::text').extract(),
        'main': response.css('div#main-lang-block *::text').extract(),
        'job-details': response.css('div.jobdetails *::text').extract(),
    }
    return listing
def parse(self, response: scrapy.http.Response):
    """Follow every landing-page link, then every next-page link."""
    # Landing pages are linked from each result row's heading.
    for landing_link in response.css('.document-row > h3 > a'):
        yield response.follow(landing_link, self.parse_landing_page)
    # Pagination: recurse into the next page of results.
    for pager_link in response.css('.next > a'):
        yield response.follow(pager_link, self.parse)
def parse_posts_list(self, response: scrapy.http.Response):
    """Follow thread links (paths starting with "/p") and every pagination link."""
    for thread_href in response.css("#posts a::attr(href)"):
        if thread_href.get().startswith("/p"):
            yield response.follow(thread_href, self.parse_thread)
    for page_href in response.css(".pagination a::attr(href)"):
        yield response.follow(page_href, self.parse_posts_list)
def form_title_colors_brand(response: scrapy.http.Response):
    """Build the product title from brand, name and (optional) color.

    Returns a (title, colors, brand) tuple; colors is "" when no color
    text is present on the page.
    """
    header = response.css("div.brand-and-name.j-product-title")
    brand = header.css("span.brand::text").get()
    name = header.css("span.name::text").get()
    colors = response.css("div.color.j-color-name-container").css(
        "span.color::text").get()
    if not colors:
        return f"{brand} / {name}", "", brand
    return f"{brand} / {name}, {colors}", colors, brand
def parse(self, response: scrapy.http.Response):
    """Parse a comic detail page into a ManhuaItem and crawl each volume.

    Yields the populated item first, then one Request per volume
    (callback ``self.parse_vol``; the comic id and volume key travel in
    ``request.meta``).
    """
    comic = ManhuaItem()
    # The last path segment of the page URL serves as the comic id.
    comic['comic_id'] = response.url.split('/')[-1]
    comic['url'] = response.urljoin(response.url)
    comic['thumbnail'] = response.urljoin(
        response.css('div.cover img::attr(src)').get())
    comic['title'] = response.css('td.comic-titles::text').get()
    comic['original_title'] = response.css(
        'td.comic-original-titles::text').get()
    comic['author'] = response.css('ul.creators li a::text').getall()
    comic['summary'] = response.css('p.comic_story::text').get()
    comic['state'] = response.css('a.comic-pub-state::text').get()
    comic['duration'] = '-'.join(
        response.css('td.pub-duration a::text').getall())
    comic['vols'] = dict()
    res = []
    vol_urls = response.css(
        'ol.links-of-books.num_div li a::attr(href)').getall()
    for i, url in enumerate(vol_urls, start=1):
        # Record the absolute volume URL under a stable "vol-<n>" key ...
        comic['vols']['vol-%s' % i] = response.urljoin(url)
        # ... and queue a request for that volume, tagged so parse_vol can
        # associate its pages with this comic/volume.
        request = scrapy.Request(response.urljoin(url), callback=self.parse_vol)
        request.meta['comic_id'] = comic['comic_id']
        request.meta['id'] = 'vol-%s' % i
        res.append(request)
    # Yield the item only after the loop so 'vols' is fully populated.
    yield comic
    for request in res:
        yield request
def parseCountryAttractionsListPage(self, response: scrapy.http.Response):
    """Parse one page of a country's attraction list.

    Example page: https://www.viator.com/Netherlands/d60
    """
    self.incrementRequestCount()
    attraction_links = response.css('div.ptm *> h2 > a::attr(href)').extract()
    for link in attraction_links:
        yield response.follow(link, callback=self.parseAttractionsPage)
    next_href = response.css(
        'div.ptm > div:nth-child(1) > div:nth-child(2) > p > a:last-child::attr(href)'
    ).extract_first()
    if next_href:
        yield response.follow(next_href,
                              callback=self.parseCountryAttractionsListPage)
def parse_item_page(self, response: scrapy.http.Response):
    """Build a HouzzProductItem from a product page, keeping at most two images."""
    thumbs = response.css('.alt-images__thumb img::attr(src)').getall()
    main_image = response.css('.view-product-image-print::attr(src)').get()
    all_images = generate_image_urls(thumbs, main_image)
    product = HouzzProductItem()
    product['url'] = response.url
    product['title'] = response.css('.view-product-title::text').get()
    product['keywords'] = response.css('.product-keywords__word::text').getall()
    product['images'] = all_images[:2]
    yield product
def form_price(response: scrapy.http.Response) -> item.Price:
    """Extract current/original prices and compute a sale tag when discounted."""
    current_raw: str = response.css("span.final-cost::text").get()
    original_raw: str = response.css("del.c-text-base::text").get()
    current = price_to_float(current_raw)
    if original_raw is None:
        # No crossed-out price: the item is not on sale.
        original = current
        sale_tag = ""
    else:
        original = price_to_float(original_raw)
        discount = 1 - current / original
        sale_tag = f"Скидка {int(100 * discount)}%"
    return item.Price(current=current,
                      original=original,
                      sale_tag=sale_tag)
def parse(self, response: scrapy.http.Response):
    """Collect article URLs for every non-supplement issue and save them.

    For each issue link found on the page, fetches the issue's JSON
    listing, walks its numbered collections in ascending order, and
    gathers every articleUrl. The full list is written to
    "article_list.txt", one URL per line.

    Fixes over the previous version: the HTTP response and the output
    file are closed via context managers, the bare ``except`` is
    narrowed to ``ValueError``, and the builtin name ``file`` is no
    longer shadowed.
    """
    issue_list = []
    for li_item in response.css("section.authors-list ol li"):
        issue_list += li_item.css("a::attr(href)").extract()
    # Skip supplement issues.
    issue_list = [x for x in issue_list if "suppl" not in x]

    article_list = []
    for url in issue_list:
        json_url = str(url) + "/JSON"
        # Context manager closes the HTTP connection deterministically.
        with urllib.request.urlopen(json_url) as res:
            d = json.loads(res.read().decode("utf-8"))
        # Top-level numeric keys are collection indices; non-numeric keys
        # are metadata and are ignored.
        collection_index = []
        for key in d.keys():
            try:
                collection_index.append(int(key))
            except ValueError:
                pass
        for idx in sorted(collection_index):
            for article in d[str(idx)]['articles']:
                article_list.append(article['articleUrl'])

    # "with" guarantees the file is flushed and closed even on error.
    with open("article_list.txt", "w") as out:
        out.write("\n".join(article_list))
def parseCountryPage(self, response: scrapy.http.Response):
    """Emit a CountryListing for one country page, then crawl its cities.

    Example page: https://www.viator.com/India/d723-ttd
    Honors the module-level skipNonRequired/processedRequiredCountries
    filter to skip countries that are not wanted.
    """
    self.incrementRequestCount()
    # Second breadcrumb entry holds the country name (first is presumably
    # the site root — TODO confirm).
    breadcrumbs = response.css('div.crumbler *> span::text').extract()
    countryName = breadcrumbs[1].strip()
    countryListing = CountryListing(crawler=self.name,
                                    sourceURL=response.url,
                                    crawlTimestamp=getCurrentTime(),
                                    countryName=countryName)
    yield countryListing.jsonify()
    if skipNonRequired:
        if processName(countryName) not in processedRequiredCountries:
            # do not process this country's cities
            print('Skipping country: ', countryName)
            return
    # URL tail looks like "d723-ttd": take the first dash-segment and strip
    # the leading 'd' to obtain the numeric country id.
    countryId = response.url.split('/')[-1].split('-')[0][1:]
    cityListingURL = 'https://www.viator.com/pascities.jspa?country={}'.format(
        countryId)
    yield response.follow(cityListingURL,
                          callback=self.parseCountryCities,
                          meta={'countryName': countryName})
def form_product(response: scrapy.http.Response) -> item.Product:
    """Assemble the full item.Product for one product-card page."""
    page_url = response.request.url
    rpc = url_to_article(page_url)
    title, colors, brand = form_title_colors_brand(response)
    price = form_price(response)
    assets = form_assets(response)
    metadata = form_metadata(response)
    return item.Product(
        RPC=rpc,
        url=page_url,
        title=title,
        marketing_tags=response.css("li.about-advantages-item::text").getall(),
        brand=brand,
        section=response.meta["section"],
        price_data=price,
        stock=item.Stock(
            in_stock=True,  # didn't found any info on site
            count=-1,  # didn't found any info on site
        ),
        assets=assets,
        metadata=metadata,
        variants=len(colors.split()))
def parse_docs(self, response: scrapy.http.Response):
    """Yield a single record listing every PDF link found on the page."""
    absolute_links = (response.urljoin(href.extract())
                      for href in response.css('a::attr(href)'))
    pdf_links: List[str] = [link for link in absolute_links
                            if link.endswith('.pdf')]
    yield {'from': response.url, 'file_urls': pdf_links}
def form_assets(response: scrapy.http.Response):
    """Collect image assets: zoomed main image, carousel set, and 360° frames."""
    main_image = response.css("img.MagicZoomFullSizeImage::attr(src)").get()
    carousel_images = response.css("a.j-carousel-image::attr(href)").getall()
    base_3d = response.css(
        "div.j-3d-container.three-d-container::attr(data-path)").get()
    frames_3d = []
    if base_3d:
        # Frames live at <base>/<i>.jpg on a protocol-relative path.
        frames_3d = [
            ''.join(['http:', base_3d, '/', str(i), '.jpg'])
            for i in range(1, VIEW360_IMAGES_COUNT + 1)
        ]
    return item.Assets(main_image=main_image,
                       set_images=carousel_images,
                       view360=frames_3d,
                       video=[])
def parse(self, response: scrapy.http.Response):
    """Follow every country link in the country dropdown menu.

    Example page: https://www.viator.com/Amsterdam/d525-ttd
    """
    menu = response.css(
        '#countryMenuBox > div.menu-dropdown-box.small > div > div:nth-child(1)'
    )
    for country_href in menu.css('a::attr(durl)').extract():
        yield response.follow(country_href, callback=self.parseCountryPage)
def parse(self, response: scrapy.http.Response, **kwargs):
    """Fill race_data (carried in request meta) with the race name, surface
    flags and distance parsed from the race header.

    NOTE(review): 'dl.racedata diary_snap_cut span::text' selects a
    <diary_snap_cut> element, not a class — confirm the missing dot is
    intentional.
    NOTE(review): when the distance regex matches, distance_match is never
    used — the success branch may be missing; confirm against callers.
    """
    race_name = response.css('dl.racedata h1::text').get()
    # Condition tokens are separated by non-breaking spaces, optionally
    # with a slash between them.
    race_conditions = re.split(
        r'\xa0/?\xa0',
        response.css('dl.racedata diary_snap_cut span::text').get())
    race_data = response.meta['race_data']
    race_data['race_name'] = race_name
    race_data['race_condition'] = {
        'has_turf': '芝' in race_conditions[0],
        'has_dirt': 'ダ' in race_conditions[0],
        'has_obstacle': '障' in race_conditions[0],
    }
    # First condition token ends in the distance, e.g. "芝1600m";
    # leading zeros are stripped by 0*.
    if (distance_match := re.match(r'\D*\s*0*(?P<distance>\d+)m',
                                   race_conditions[0])) is None:
        race_data['race_distance'] = None
def parse(self, response: scrapy.http.Response):
    """Seed scraped-index state, follow the featured article, then the list."""
    self._scraped_indexes = self._scraped_in_past
    # Featured article: image link inside the first .main-news block.
    featured = response.css('.main-news')[0]
    featured_path = featured.xpath('div/div/a/@href').extract_first()
    yield from self._yield_request(featured_path)
    # Remaining articles come from the regular list.
    yield from self._yield_requests_from_response(response)
def get_links(response: scrapy.http.Response) -> List[Tuple[str or None, str]]:
    """
    Collect (title, url) pairs for every anchor carrying an href.

    :param response: A scrapy.http.Response that contains the page
    :return: A list of tuple (title, url), passed through filter_links
    """
    pairs = []
    for anchor in response.css('a[href]'):
        title = anchor.xpath('.//text()').get()
        pairs.append((title, anchor.attrib['href']))
    return filter_links(pairs)
def __call__(self, to_provide, response: scrapy.http.Response, spider: scrapy.Spider):
    """Provide Price and/or Html instances for the requested dependency set."""
    assert isinstance(spider, scrapy.Spider)
    provided: List[Any] = []
    if Price in to_provide:
        provided.append(Price(response.css(".price::text").get()))
    if Html in to_provide:
        provided.append(Html("Price Html!"))
    return provided
def __call__(self, to_provide, response: scrapy.http.Response, settings: Settings):
    """Provide Name and/or Html instances for the requested dependency set."""
    assert isinstance(settings, Settings)
    provided: List[Any] = []
    if Name in to_provide:
        provided.append(Name(response.css(".name::text").get()))
    if Html in to_provide:
        provided.append(Html("Name Html!"))
    return provided
def parse_landing_page(self, response: scrapy.http.Response):
    """For each publication on a landing page, infer and follow the content-API JSON URL."""
    for _publication in response.css('.publication'):
        # Rewrite the path component to its /api/content equivalent.
        url_parts = list(urlsplit(response.url))
        url_parts[2] = "/api/content%s" % url_parts[2]
        api_json_url = urlunsplit(url_parts)
        yield response.follow(api_json_url, self.parse_content_api_json)
def parseReviewsPage(self, response: scrapy.http.Response):
    """Emit one Review item from a review page.

    The numeric rating is recovered from the bubble-rating CSS class
    (e.g. a class ending in "_45" → 4.5) and then normalised via
    scaleRating.
    """
    box = response.css('div.rating.reviewItemInline')
    content = response.css(
        'div.entry > p.partial_entry::text').extract_first()
    ratingDate = box.css('span::attr(title)').extract_first()
    # The class suffix after the last '_' encodes the rating ×10.
    rating = float(
        box.css('span::attr(class)').extract_first().split('_')[-1]) / 10
    rating = scaleRating(rating, 1, 5)
    yield Review(crawler=self.name,
                 sourceURL=response.url,
                 crawlTimestamp=getCurrentTime(),
                 countryName=response.meta['countryName'],
                 cityName=response.meta['cityName'],
                 pointName=response.meta['pointName'],
                 content=content,
                 rating=rating,
                 date=ratingDate).jsonify()
def get_next_vimeo_overview_page(self, response: scrapy.http.Response):
    """
    Follow the "next"-button at the bottom of the vimeo-user's
    overview page, when one is present.
    """
    next_url = response.css(
        '#pagination > ol > li.pagination_next a::attr(href)').get()
    if next_url is not None:
        yield response.follow(next_url, self.parse)
def parse_page(self, response: scrapy.http.Response):
    """Yield an ImageItem for one comic page, carrying ids from request meta."""
    src = response.css(
        'div#all div.text-center img.img-fluid::attr(src)').get()
    page_item = ImageItem()
    page_item['comic_id'] = response.meta['comic_id']
    page_item['vol_id'] = response.meta['vol_id']
    page_item['page'] = response.meta['page']
    page_item['url'] = response.urljoin(src)
    yield page_item
def parse(self, response: scrapy.http.Response):
    """Scrape an article page: abstract sections, body text, and references.

    Yields one dict with a key per abstract section label, plus ``url``,
    ``article_text``, and a ``references`` mapping of reference text to href.
    """
    url = response.url
    abstract = response.css("article.abstract")
    num_abstract_sections = len(abstract.css("p span::text"))
    # Each <span> holds a section label; the trailing two characters
    # (presumably ": " — TODO confirm) are stripped and the label lowercased
    # to form the dict key; the paragraph text is the value.
    out = dict([(abstract.css("p span::text")[i].extract()[:-2].lower(),
                 abstract.css("p::text")[i].extract())
                for i in range(num_abstract_sections)])
    out.update(url=url)
    main_article = response.css("article.main-article")
    items = main_article.xpath("./*/text()").extract()
    # Join all direct-child text nodes with newlines.
    whole_string = ""
    for i in items:
        whole_string += i
        whole_string += "\n"
    # Remove bracketed citation markers, including ones split across newlines.
    whole_string = re.sub(
        r"(\[\n(.*?)\])|(\[(.*?)\n\])|(\[\n(.*?)\n\])|(\[(.*?)\])", "",
        whole_string)
    saved_string = whole_string  # temp
    whole_string = re.sub(r"\n", " ", whole_string)  # temp
    out.update(article_text=whole_string)
    references = {}
    reference_text = []
    reference_url = []
    ol = response.xpath('//div[@class="footnotes"]/ol')
    for span in ol.xpath('./li/span'):
        reference_text.append(span.xpath('./text()').extract())
        try:
            reference_url.append(
                span.xpath('./a[@target="_blank"]/@href').extract())
        except:
            # NOTE(review): .extract() does not normally raise; this guard
            # looks purely defensive — confirm whether it is still needed.
            reference_url.append("")
    # Pair each reference text with its URL (both stringified lists).
    for index in range(len(reference_text)):
        references[str(reference_text[index])] = str(reference_url[index])
    out.update(references=references)
    self.logger.warning('Out: %s', out)
    yield out
def parse(self, response: scrapy.http.Response, **kwargs):
    """Walk a catalogue listing: follow each product card, then the next page.

    The breadcrumb section list travels in meta for downstream parsing.
    """
    section: list = response.css("ul.bread-crumbs").css(
        "span::text").getall()
    for card in response.css('div.dtList.i-dtList.j-card-item'):
        card_href = card.css(
            "a.ref_goods_n_p.j-open-full-product-card::attr(href)").get()
        yield common_request(url=response.urljoin(card_href),
                             callback=self.parse_product_card,
                             meta={'section': section})
        if self.small_sample:
            # Sample mode: stop after the first product card.
            break
    next_href = response.css("a.pagination-next::attr(href)").get()
    if next_href is not None:
        yield common_request(url=response.urljoin(next_href),
                             callback=self.parse,
                             meta={'section': section})
def parseX(self, response: scrapy.http.Response):
    """Paginate through a skyscanner things-to-do listing, following each item.

    Example: https://www.trip.skyscanner.com/bangkok/things-to-do
    Stops paginating once 100 items have been ranked.
    """
    for href in response.css('div.items_list *> h2 > a::attr(href)').extract():
        self.log("visiting: " + href)
        response.meta['rank'] += 1
        yield response.follow(href,
                              callback=self.parseAttractionsPage,
                              meta=response.meta)
    next_href = response.css(
        'div.items_list > div:nth-child(2) > ul > li.next.next_page > a::attr(href)'
    ).extract_first()
    if next_href:
        self.log("nextpage: " + next_href)
        if response.meta['rank'] < 100:
            yield response.follow(next_href,
                                  callback=self.parseX,
                                  meta=response.meta)
def parseCountryCities(self, response: scrapy.http.Response):
    """Follow each city link on a country's city listing page.

    Example page: https://www.viator.com/pascities.jspa?country=723
    """
    self.incrementRequestCount()
    city_hrefs = response.css(
        'div.unit.size-pas-cities *> a::attr(durl)').extract()
    for city_href in city_hrefs:
        yield response.follow(city_href,
                              callback=self.parseCityPage,
                              meta=response.meta)