def parse_subtopic_triangle(self, response: scrapy.http.Response):
    # Gathers all subtopics from https://www.walter-fendt.de/html5/mde/tl/tl_start_de.htm
    triangle_subtopics = response.xpath('/html/body/ul/li/a/@href').getall()
    for subtopic_url in triangle_subtopics:
        subtopic_url = response.urljoin(subtopic_url)
        yield scrapy.Request(url=subtopic_url, callback=self.parse)
def parse(self, response: scrapy.http.Response): """ Parses content from a html page response. """ listings = response.xpath('//li[@class="result-row"]') for listing in listings: # Relative matching date = listing.xpath( './/*[@class="result-date"]/@datetime').extract_first() url = listing.xpath( './/a[@class="result-title hdrlnk"]/@href').extract_first() title = listing.xpath( './/a[@class="result-title hdrlnk"]/text()').extract_first() yield scrapy.Request(url, callback=self.parse_listing, meta=dict(date=date, url=url, title=title)) # Move to the next page of data. next_page_url = response.xpath( '//*[@class="button next"]/@href').extract_first() if next_page_url: # url must be absolute. abs_next_page_url = response.urljoin(next_page_url) yield scrapy.Request(url=abs_next_page_url, callback=self.parse)
def parse(self, response: scrapy.http.Response):
    findform = response.xpath("//form[@name='form1']")
    form = self.build_form(findform)
    if "kirjaamo" not in form:
        raise ValueError("kirjaamo not found")
    if not isinstance(form["kirjaamo"], list):
        raise ValueError("kirjaamo is not a list")
    method = findform.xpath("./@method").get()
    action = response.urljoin(findform.xpath("./@action").get())
    alist = form["kirjaamo"]
    del form["kirjaamo"]
    for param in alist:
        val = param["value"]
        if val == "":
            continue
        # Copy the base form data so each request gets its own dict.
        fdata = dict(form)
        fdata["kirjaamo"] = val
        yield scrapy.FormRequest(
            action,
            method=method,
            formdata=fdata,
            meta={
                "name": param["name"],
                "dont_cache": True,
            },
            callback=self.parse_search_result,
        )
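# Note: build_form is not shown in this listing; the following is only a minimal sketch of
# what such a helper might look like, assuming it collects <input> values and <select>
# options from the located form. The shape of the returned dict (plain strings for inputs,
# a list of {"name", "value"} dicts for selects) is an assumption made to match how the
# parse() method above uses form["kirjaamo"].
def build_form(self, findform):
    form = {}
    # Plain inputs become name -> value entries.
    for inp in findform.xpath(".//input[@name]"):
        form[inp.xpath("./@name").get()] = inp.xpath("./@value").get() or ""
    # Select elements become name -> list of option dicts.
    for sel in findform.xpath(".//select[@name]"):
        name = sel.xpath("./@name").get()
        form[name] = [
            {"name": opt.xpath("./text()").get(default="").strip(),
             "value": opt.xpath("./@value").get(default="")}
            for opt in sel.xpath("./option")
        ]
    return form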
def parse_apollonian_subtopic(self, response: scrapy.http.Response):
    # Gathers variant-URLs to crawl from https://www.walter-fendt.de/html5/mde/apolloniosproblem_de.htm
    apollonios_subtopics = response.xpath('//table/tbody/tr/td/a/@href').getall()
    for apollo_url in apollonios_subtopics:
        apollo_url = response.urljoin(apollo_url)
        yield scrapy.Request(url=apollo_url, callback=self.parse)
def parse_docs(self, response: scrapy.http.Response):
    pdfs: List[str] = []
    for url in response.css('a::attr(href)'):
        full = response.urljoin(url.extract())
        if full.endswith('.pdf'):
            pdfs.append(full)
    yield {'from': response.url, 'file_urls': pdfs}
def parse(self, response: scrapy.http.Response): """ Get list of tunes """ u: SplitResult = urlsplit(response.url) q: dict = dict(queryparse(u.query)) for tune in response.xpath( "//div[@id='result']/table/tr/th[@colspan='6']/../../tr[@class]" ): artist = "".join(tune.xpath("./td[2]//text()").getall()).strip() title = "".join(tune.xpath("./td[1]//text()").getall()).strip() link = tune.xpath("./td[1]/a/@href").get().strip() fileformat = "".join( tune.xpath("./td[3]//text()").getall()).strip().lower() # Download tune yield scrapy.Request( response.urljoin(link), callback=self.download_mod, meta={ "tune": { "id": q['view'], "artist": artist, "title": title, "format": fileformat, } }, )
def crawl_next_url_level(self, diff_set, response: scrapy.http.Response, url_depth: int):
    # To fetch all sub-pages of the website, we must first grab all the unique URLs from the navigation sidebar.
    # The sidebar has several levels, e.g.:
    #   Welt              1st-level links (= url_depth 1)
    #     Afrika          2nd-level links (= url_depth 2)
    #       Afrika ( - D) 3rd-level links (= url_depth 3)
    #         Ägypten     4th-level links (= url_depth 4)
    # Each of these layers has its own .html page and only shows its child links once you navigate
    # through the parent node. For example, to see "landkarten_aegypten.html", we need to be on the
    # "Afrika ( - D)" level of the sidebar.
    if not diff_set.issubset(self.navigation_urls):
        self.navigation_urls.update(diff_set)
        if len(diff_set) > 0:
            # print("Found", len(diff_set), "new URLs to crawl on url_depth =", url_depth)
            for diff_item in diff_set:
                # print(diff_item)
                temp_url = response.urljoin(diff_item)
                if url_depth == 1:
                    yield scrapy.Request(url=temp_url, callback=self.get_navigation_urls_second_level)
                elif url_depth == 2:
                    yield scrapy.Request(url=temp_url, callback=self.get_navigation_urls_third_level)
                elif url_depth == 3:
                    yield scrapy.Request(url=temp_url, callback=self.get_navigation_urls_fourth_level)
def parse_page(self, response: scrapy.http.Response) -> Iterator[Issue]:
    pattern = re.compile(r".*-b\.pdf")
    links = response.xpath("//a[@href]")
    for link in links:
        if pattern.match(link.attrib["href"]):
            item = Issue(
                cover=response.urljoin(
                    link.xpath("./../../..").xpath(".//img").attrib["src"]),
                file=response.urljoin(link.attrib["href"]),
            )
            text = response.css("h2::text").get()
            if text:
                item["text"] = text
            yield item
def parse_motherboard(self, response: scrapy.http.Response):
    query = dict(queryparse(urlsplit(response.url).query))
    current_page = int(query['page'])
    data = json.loads(response.body)
    for memmodule in data['results']:
        if 'url' in memmodule:
            memmodule['url'] = response.urljoin(memmodule['url'])
        # Drop fields we never use, plus anything that came back as null.
        remove_keys = [
            'stock',
            'priceRange',
            'availableForPickup',
        ]
        for k, v in memmodule.items():
            if v is None:
                remove_keys.append(k)
        for k in remove_keys:
            if k in memmodule:
                del memmodule[k]
        yield Memory({
            '_manufacturer': self.manufacturer,
            '_model': response.meta['model'],
            'memory': memmodule,
        })
    if current_page == 0 and data['pagination']['numberOfPages'] > 1:
        for pnum in range(1, data['pagination']['numberOfPages']):
            query['page'] = str(pnum)
            # Call the same page with an increased page number.
            yield scrapy.Request(
                response.urljoin("?" + urlencode(query)),
                callback=self.parse_motherboard,
                meta={
                    'model': response.meta['model'],
                },
            )
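# A small standalone illustration (not part of the spider) of how the pagination request
# above is built: urljoin with a bare query string replaces the query of the current page
# URL, so only the "page" parameter changes. The example URL below is made up.
from urllib.parse import urlencode, urljoin, urlsplit, parse_qsl

page_url = "https://example.com/api/memory/search?model=Z690&page=0"
query = dict(parse_qsl(urlsplit(page_url).query))
query['page'] = "1"
next_url = urljoin(page_url, "?" + urlencode(query))
print(next_url)  # https://example.com/api/memory/search?model=Z690&page=1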
def parse_page(self, response: scrapy.http.Response):
    image_url = response.css('div#all div.text-center img.img-fluid::attr(src)').get()
    image_url = response.urljoin(image_url)
    image = ImageItem()
    image['comic_id'] = response.meta['comic_id']
    image['vol_id'] = response.meta['vol_id']
    image['page'] = response.meta['page']
    image['url'] = image_url
    yield image
def parse_topic_overview(self, response: scrapy.http.Response):
    # Each topic (e.g. "Bruchzahlen / Bruchrechnen") holds a list of sub-topics that are either
    #   - individual .htm pages with explanations about a specific topic,
    #   - eLearning exercises, or
    #   - "Aufgabengeneratoren" inside a .xls file.
    topic_urls = response.xpath('/html/body/table/tr/td/a/@href').getall()
    # print("Topic URLs:", topic_urls)
    # print("Number of topic_urls in this section:", len(topic_urls))
    url_set = set()
    # xls_set = set()
    for url in topic_urls:
        if url.endswith('.htm') or url.endswith('.html'):
            # topics that consist of illustrations or explanations are found inside individual .htm documents
            current_url = response.urljoin(url)
            url_set.add(current_url)
        # if url.endswith('.xls'):
        #     # there are currently 3 links to .xls files, which are "Aufgabengeneratoren",
        #     # e.g. on this topic overview: http://www.zum.de/dwu/umamgl.htm
        #     # If we really wanted to handle the 3 .xls links, we would need an additional xls-specific parse method
        #     xls_set.add(url)
        #     self.debug_xls_set.add(url)
        elif url.startswith("javascript"):
            # in some sections there are topics that lead to a javascript href, e.g.
            #   "javascript:infowin('infodep/i-lingleich.htm');"
            # we have to extract the .htm link from that string to parse it; the opening ' is our delimiter
            js_regex = re.compile(r"([^']*\.htm)")
            js_match = js_regex.search(url)
            if js_match:
                js_url = js_match.group()
                # url_set.add(js_url)
                current_url = response.urljoin(js_url)
                url_set.add(current_url)
    # print("debug XLS set length:", len(self.debug_xls_set))
    # print(self.debug_xls_set)
    for url in url_set:
        # only yield a scrapy Request if the url hasn't been parsed yet; this should help with duplicate links
        # that are found across different topics
        if url not in self.parsed_urls:
            yield scrapy.Request(url=url, callback=self.parse)
            self.parsed_urls.add(url)
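# A quick standalone check (not part of the spider) of the javascript-href extraction used
# above: the character class [^']* cannot cross a quote, so search() finds only the relative
# .htm path inside the infowin(...) call.
import re

js_href = "javascript:infowin('infodep/i-lingleich.htm');"
match = re.search(r"([^']*\.htm)", js_href)
print(match.group())  # infodep/i-lingleich.htm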
def parse_search_result(self, response: scrapy.http.Response):
    tbl = response.xpath("//table[@class='table table-striped table-hover table-bordered']")
    for rowidx, row in enumerate(tbl.xpath("./tr")):
        if rowidx == 0:
            # Skip the header row.
            continue
        obj = {}
        for idx, col in enumerate(row.xpath("./td")):
            if idx == 0:
                rawdate = "".join(col.xpath("./text()").getall()).strip()
                rawdate = ' '.join(rawdate.split())
                rem = re.split(r"^(\d+)\s+/(\d+) (\d+)\.(\d+)\.(\d+)$", rawdate)[1:]
                rem.pop()
                vhnum, vhyear, pday, pmonth, pyear = rem
                obj["date"] = (f"{vhyear}-{vhnum.zfill(3)}__"
                               f"{pyear}-{pmonth.zfill(2)}-{pday.zfill(2)}")
            elif idx == 1:
                for link in col.xpath("./a"):
                    txt = link.xpath("./text()").get().strip()
                    url = response.urljoin(link.xpath("./@href").get())
                    if txt == '0 kpl':
                        continue
                    if 'title' not in obj:
                        obj["title"] = txt
                        obj["link"] = url
                    else:
                        obj["attach"] = url
        dirpath = os.path.join(self.name, )
        if "attach" in obj:
            yield scrapy.Request(
                obj["attach"],
                meta={
                    "name": response.meta["name"],
                    "id": obj["date"],
                },
                callback=self.parse_attachments,
            )
        yield scrapy.Request(
            obj["link"],
            meta={
                "name": response.meta["name"],
                "id": obj["date"],
            },
            callback=self.dl_doc,
        )
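# A standalone illustration (not from the spider) of the date handling above. The sample
# string is made up but matches the anchored pattern: re.split() with capturing groups
# returns ['', g1, ..., g5, ''], so slicing off the first element and popping the last one
# leaves exactly the five captured fields.
import re

rawdate = "123 /2021 5.6.2021"  # hypothetical "<number> /<year> <day>.<month>.<year>" cell
rem = re.split(r"^(\d+)\s+/(\d+) (\d+)\.(\d+)\.(\d+)$", rawdate)[1:]
rem.pop()
vhnum, vhyear, pday, pmonth, pyear = rem
print(f"{vhyear}-{vhnum.zfill(3)}__{pyear}-{pmonth.zfill(2)}-{pday.zfill(2)}")
# 2021-123__2021-06-05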
def parse( self, response: scrapy.http.Response ) -> typing.Generator[scrapy.Request, None, None]: """Find all the cases.""" for case_url in response.xpath('//table[@class="cases"]/tbody/tr/td/a/@href'): url = response.urljoin(case_url.extract()) yield scrapy.Request( url=url, callback=self.parse_case, dont_filter=True, )
def parse(self, response: scrapy.http.Response, **kwargs):
    section: list = response.css("ul.bread-crumbs").css("span::text").getall()
    for product in response.css('div.dtList.i-dtList.j-card-item'):
        product_ref = product.css(
            "a.ref_goods_n_p.j-open-full-product-card::attr(href)").get()
        prod_card_url = response.urljoin(product_ref)
        yield common_request(url=prod_card_url,
                             callback=self.parse_product_card,
                             meta={'section': section})
        if self.small_sample:
            break
    next_page_ref = response.css("a.pagination-next::attr(href)").get()
    if next_page_ref is not None:
        next_page_url = response.urljoin(next_page_ref)
        yield common_request(url=next_page_url,
                             callback=self.parse,
                             meta={'section': section})
def parse_section_overview(self, response: scrapy.http.Response):
    # Each section (e.g. "Mathematik Teilgebiete") holds a list of individual topic categories (e.g. "Kreislehre")
    section_urls = response.xpath('/html/body/table/tr/td/a/@href').getall()
    section_urls.sort()
    # print(section_urls)
    # print("Section URLs: ", len(section_urls))
    for url in section_urls:
        current_url = response.urljoin(url)
        yield scrapy.Request(url=current_url, callback=self.parse_topic_overview)
def parse(self, response: scrapy.http.Response):
    hrefs = response.css('div.tours > a::attr(href)').extract()
    attractionNumber = 1
    for href in hrefs:
        href = response.urljoin(href)
        self.log("visiting: " + href)
        # Copy the mapping entry so each request carries its own rank instead of
        # all requests sharing (and overwriting) the same dict.
        meta = dict(urlToCityAndCountryMapping[response.url])
        meta['rank'] = attractionNumber
        yield response.follow(href, callback=self.parseAttractionsPage, meta=meta)
        attractionNumber += 1
def parse_main(
    self, response: scrapy.http.Response
) -> Union[Iterator[Issue], scrapy.http.Request]:
    links = (
        response.css("font.hdr b")[-1].xpath("../../../../../../*")[-1]
        .xpath('.//td[@valign="top"]').xpath(".//a[not(@hidden)][@href]"))
    for link in links:
        href = link.attrib["href"]
        if href.endswith(".pdf") or href.endswith(".djvu"):
            yield Issue(file=response.urljoin(href),
                        text=link.css("::text").get())
        else:
            yield response.follow(url=href, callback=self.parse_page)
def getReviews(self, response: scrapy.http.Response):
    self.log("review method called")
    reviewCount = 0
    reviewsUrl = response.css('div.quote.isNew > a::attr(href)').extract()
    for url in reviewsUrl:
        url = response.urljoin(url)
        self.log("review url: " + url)
        yield scrapy.Request(url, callback=self.parseReviewsPage, meta=response.meta)
        reviewCount += 1
    nextPageLink = response.css(
        'div.collapsedReviewsList > div > div > a::attr(href)').extract()
    if len(nextPageLink) == 2:
        newPageUrl = response.urljoin(nextPageLink[1])
        if reviewCount < 25:
            yield scrapy.Request(url=newPageUrl, callback=self.getReviews, meta=response.meta)
def parse_vol(self, response: scrapy.http.Response):
    vol = VolItem()
    vol['vol_id'] = response.meta['id']
    vol['comic_id'] = response.meta['comic_id']
    vol['images'] = response.css('select#page-selector')[0].css(
        'option::attr(value)').getall()
    vol['images'] = [response.urljoin(url) for url in vol['images']]
    yield vol
    for i, url in enumerate(vol['images'], start=1):
        request = scrapy.Request(url, callback=self.parse_page)
        request.meta['comic_id'] = vol['comic_id']
        request.meta['vol_id'] = vol['vol_id']
        request.meta['page'] = i
        yield request
def parse_vacancies_links(self, response: scrapy.http.Response) -> scrapy.http.Request:
    """
    This method gets the links of the vacancies listed in the response,
    requests each of them and calls the ``parse_vacancies_contents`` method
    on the result to parse its data.

    :param response: Scraped response of the listing page
    :return: Request for parsing the contents of each listed vacancy
    """
    # self.logger.info('Processing listing page: %s', response.url)
    for href in response.xpath(
            "//section[@class='c-jobsearchpage__content']"
            "//div[@class='c-jobcarousel__slider--title']"
            "//a/@href").getall():
        yield scrapy.Request(response.urljoin(href), self.parse_vacancies_contents)
def parse_attachments(self, response: scrapy.http.Response):
    for link in response.xpath("//a"):
        href = link.xpath("./@href").get()
        q = dict(queryparse(urlsplit(href).query))
        if not q:
            continue
        if ('doctype' in q) or ('docid' in q):
            yield scrapy.Request(
                response.urljoin(href),
                meta={
                    "name": response.meta["name"],
                    "id": response.meta["id"],
                },
                callback=self.dl_doc,
            )
def parse_search_result(self, response: scrapy.http.Response):
    for link in response.xpath("//a"):
        href = link.xpath("./@href").get()
        q = dict(queryparse(urlsplit(href).query))
        if not q:
            continue
        if 'bid' in q:
            yield scrapy.Request(
                response.urljoin(href),
                meta={
                    "name": response.meta["name"],
                    "dont_cache": True,
                },
                callback=self.parse_bid,
            )
def parse_series(self, response: scrapy.http.Response):
    """
    Series-specific CPU list such as Atom CPUs

    :param response:
    :return:
    """
    # Find the "Products Home > Product Specifications > Processors" breadcrumb
    if response.xpath("//a[contains(@class, 'hidden-crumb-xs')]/text()").get().strip() != "Processors":
        raise scrapy.exceptions.CloseSpider("Processors not found in crumb")
    for link in response.xpath("//tr/td/a/@href"):
        if link.root.find("/products/") == -1:
            self.logger.error("product not found from link, skipping")
            continue
        yield scrapy.Request(response.urljoin(link.root), callback=self.parse_specs)
def parse_topic_overview(self, response: scrapy.http.Response): """ Looks for individual topics within the overview and yields the URL to the main parse()-method. :param response: the current 'url' from start_urls :return: scrapy.Request Scrapy Contracts: @url https://www.walter-fendt.de/html5/phde/ @returns requests 50 """ # the different topics are within tables: response.xpath('//table[@class="Gebiet"]') topic_urls = response.xpath('//td[@class="App"]/a/@href').getall() for topic_url in topic_urls: topic_url = response.urljoin(topic_url) yield scrapy.Request(url=topic_url, callback=self.parse)
def parse_database(
    self, response: scrapy.http.Response
) -> typing.Generator[scrapy.Request, None, None]:
    """Find all the years in each database."""
    for year_url in response.xpath(
            '//div[@class="year-specific-options year-options"]/ul/li/h5'):
        url = response.urljoin(year_url.xpath("./a/@href").extract()[0])
        text = year_url.xpath("./a/text()").extract()[0]
        yield scrapy.Request(
            url=url,
            callback=self.parse_years,
            meta={
                DATABASE_KEY: response.meta[DATABASE_KEY],
                YEAR_KEY: text,
            },
            dont_filter=True,
        )
def parse_index_page(self, resp: scrapy.http.Response):
    ensure_response_200(resp)
    names = resp.xpath('//ul[@class = "name-list"]/li/a/@href').extract()
    for name_url in names:
        yield NameUrl(url=resp.urljoin(name_url))
    page_count = extract_page_count(resp)
    curr_page = resp.meta['curr_page']
    if curr_page < page_count:
        curr_letter = resp.meta['letter']
        yield Request(
            dir_url_for_letter(curr_letter, curr_page + 1),
            callback=self.parse_index_page,
            meta={
                'letter': curr_letter,
                'curr_page': curr_page + 1
            },
        )
def parse(
    self, response: scrapy.http.Response
) -> typing.Generator[scrapy.Request, None, None]:
    """Find all the databases."""
    for db_url in response.xpath('//div[@class="card"]/ul/li'):
        relative_url = db_url.xpath("./a/@href").extract()
        if not relative_url:
            continue
        url = response.urljoin(relative_url[0])
        text = db_url.xpath("./a/text()").extract()[0]
        yield scrapy.Request(
            url=url,
            callback=self.parse_database,
            meta={
                DATABASE_KEY: text,
            },
            dont_filter=True,
        )
def parse_comments(
        self, response: scrapy.http.Response) -> Generator[dict, None, None]:
    """Takes in a response from a comment thread page (e.g.
    https://beta4v.mydramalist.com/v1/threads?&c=title&t=9025&page=1)
    and yields the parsed comment data. Also yields a request for the next
    comment page, if more comments exist.
    """
    data = json.loads(response.body)
    show_id = parse_qs(response.url)['t'][0]
    data['show_id'] = show_id
    data['url'] = response.url
    yield data
    if data['has_more']:
        # The page number is the last query parameter, so bump it by one.
        parts = response.url.split('=')
        parts[-1] = str(int(parts[-1]) + 1)
        next_url = '='.join(parts)
        yield scrapy.Request(response.urljoin(next_url), callback=self.parse_comments)
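# A standalone sanity check (not part of the spider) of the next-page URL construction
# above: because "page" is the final query parameter, splitting on '=' and incrementing
# the last piece advances the page number. The example URL mirrors the one in the docstring.
url = "https://beta4v.mydramalist.com/v1/threads?&c=title&t=9025&page=1"
parts = url.split('=')
parts[-1] = str(int(parts[-1]) + 1)
print('='.join(parts))
# https://beta4v.mydramalist.com/v1/threads?&c=title&t=9025&page=2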
def parse(self, response: scrapy.http.Response):
    # Build a template dict whose keys are the column headers of the board table,
    # so each body row can be read positionally into the same keys.
    boarddata_tpl = {}
    for header in response.xpath("//table[@class='display']/thead/tr/th"):
        name = header.xpath("text()").get().strip()
        boarddata_tpl[name] = None
    for board in response.xpath("//table[@class='display']/tbody/tr"):
        tmp = {}
        for idx, key in enumerate(boarddata_tpl):
            info = "".join(board.xpath(f"./td[{idx + 1}]//text()").getall()).strip()
            tmp[key] = info
        yield scrapy.Request(
            response.urljoin(f"/en/products/motherboard/{tmp['Motherboard']}"),
            callback=self.parse_motherboard,
        )
def parse_case(
    self, response: scrapy.http.Response
) -> typing.Generator[scrapy.Request, None, None]:
    """Find all the documents in the case."""
    item_full_text_div = response.xpath('//div[@class="itemFullText"]')[0]
    case_name = item_full_text_div.xpath('./h2/text()')[0].extract().strip()
    case_number = response.xpath('//h1[@class="itemTitle"]/text()')[0].extract().strip()
    for paragraph in item_full_text_div.xpath('./p'):
        paragraph_text = paragraph.xpath('string(.)').extract()[0]
        link = paragraph.xpath('./a')
        if not link:
            continue
        document_url = link[-1].xpath('./@href')[0]
        url = response.urljoin(document_url.extract())
        date = paragraph_text.split()[0]
        try:
            parse(date)
        except Exception:
            # The paragraph does not start with a parseable date, so skip it.
            continue
        entity_name = ''
        entity_class = ''
        braces = re.findall(r"\(.*?\)", paragraph_text)
        if braces:
            brace_text = braces[0].replace("(", "").replace(")", "")
            braces_split = [x.strip() for x in brace_text.split("-")]
            entity_name = braces_split[-1]
            if len(braces_split) > 1:
                entity_class = braces_split[0]
        document_name = link[-1].xpath('./text()').extract()
        yield scrapy.Request(
            url=url,
            callback=self.parse_document,
            meta={
                CASE_NAME: case_name,
                CASE_NUMBER: case_number,
                ENTITY_NAME: entity_name,
                ENTITY_CLASS: entity_class,
                DOCUMENT_NAME: document_name,
                DATE: date,
            },
        )
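# A standalone illustration (not part of the spider) of the entity extraction above,
# using a made-up paragraph. The first parenthesised group is split on "-"; the last
# piece is treated as the entity name and, if present, the first piece as its class.
import re

paragraph_text = "01.02.2020 Decision issued (Respondent - Example Ltd)"
braces = re.findall(r"\(.*?\)", paragraph_text)
brace_text = braces[0].replace("(", "").replace(")", "")
braces_split = [x.strip() for x in brace_text.split("-")]
print(braces_split[-1])  # Example Ltd   (entity name)
print(braces_split[0])   # Respondent    (entity class)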