def parse(self, response: HtmlResponse, **kwargs):
    """Fan out news-list requests per (date, category, language) combination.

    For every date/category pair, requests the regular site in each language
    and additionally the simplified-Chinese TuniS mirror (forced to 'chi').
    """
    base_url = 'https://www.news.gov.hk/jsp/NewsArticle.jsp'
    tunis_url = 'https://sc.news.gov.hk/TuniS/www.news.gov.hk/jsp/NewsArticle.jsp'
    categories = [
        'finance', 'school_work', 'health', 'environment', 'law_order',
        'infrastructure', 'admin', 'city_life', 'record'
    ]
    languages = ['eng', 'chi']
    query = {
        'language': 'chi',
        'category': 'finance',
        'date': '',
    }
    for date in get_date():
        # The original code broke out of the category loop before yielding
        # anything for this date, which is the same as skipping the date.
        if date == '202102':
            continue
        for category in categories:
            query['date'] = date
            query['category'] = category
            for language in languages:
                query['language'] = language
                yield response.follow(url=base_url + '?' + urlencode(query),
                                      callback=self.get_news_list)
            # The TuniS mirror is only requested in simplified Chinese.
            query['language'] = 'chi'
            yield response.follow(url=tunis_url + '?' + urlencode(query),
                                  callback=self.get_news_list)
def parse_list_of_tasks(self, response: HtmlResponse, max_number=0, next_number=0, step=5, subsection: str = ''):
    """Follow every task link on this listing page, then paginate.

    Recurses onto the next page (via the 'start' query parameter) while
    `next_number` has not yet reached `max_number`.
    """
    links = response.css('.problemsmallnumber .componentboxlink::attr(href)').extract()
    # One partial suffices for all task links on this page.
    task_callback = partial(self.parse_task, subsection=subsection)
    for link in links:
        yield response.follow(response.urljoin(link), callback=task_callback)
    if next_number < max_number:
        next_page = set_get_parameter(response.url, 'start', next_number)
        page_callback = partial(self.parse_list_of_tasks,
                                max_number=max_number,
                                next_number=next_number + step,
                                subsection=subsection)
        yield response.follow(next_page, callback=page_callback)
def parse(self, response: HtmlResponse):
    """Queue comment-parsing requests for article links and comment links."""
    article_links = get_article_links(response)
    for link in article_links:
        # Comments on the first page of each article.
        yield response.follow(link, parse_comment)
    comment_links = get_comment_links(response, self)
    for link in comment_links:
        yield response.follow(link, parse_comment)
def parse_region(self, response: HtmlResponse):
    """Parse regions.

    Nordbayern -> Frankenjura Nord

    Example: https://www.frankenjura.com/klettern/region/2
    """
    item = SectorItem()
    item["name"] = response.meta["region_name"]
    item["fk_sector"] = response.meta["parent"]
    item["source"] = response.url
    description_css = 'div[class="location-head"]+p ::text'
    item["description"] = response.css(description_css).get()
    yield item

    # Fetch the freshly stored sector so children can reference it.
    region = item.django_model.objects.get(**item)
    sub_region_links = response.css('div[class="column"]').css('a[href*="region"]')
    for link in sub_region_links:
        yield response.follow(
            link,
            self.parse_sub_region,
            meta={"sub_region_name": link.css("::text").get(), "parent": region},
        )
def get_list(self, response: HtmlResponse):
    """Extract every product-detail URL embedded in the page's JSON blob and
    re-request it once per configured language host.

    Fix: the host pattern previously used unescaped dots (``www.*?com``),
    which could match unintended spans such as ``wwwXcom``; the dots are now
    escaped so only a literal ``www.….com`` host is replaced. The pattern is
    also compiled once, outside the loops.
    """
    # Matches the "www.<anything>.com" host portion of a product URL.
    host_pattern = re.compile(r'www\..*?\.com')
    for url in re.findall(r'(?<=productDetailUrl":"//).*?(?=")', response.text):
        for language in self.languages:
            # Swap the host for the language-specific one; presumably
            # self.languages holds full hostnames — confirm against spider config.
            yield response.follow(url=host_pattern.sub('https://' + language, url),
                                  callback=self.get_item)
def parse_wall(self, response: HtmlResponse):
    """Parse walls.

    ... -> Region Wattendorf -> Falkenwand

    Example: https://www.frankenjura.com/klettern/poi/21
    """
    item = SectorItem()
    item["name"] = response.meta["wall_name"]
    item["fk_sector"] = response.meta["parent"]
    item["source"] = response.url
    # All single-value wall attributes share the same parser signature,
    # so populate them through one dispatch table (order preserved).
    field_parsers = {
        "internal_rating": _parse_stars,
        "max_height_in_m": _parse_wall_max_height,
        "rain_protected": _parse_rain_protected,
        "child_friendly": _parse_child_friendly,
        "description": _parse_wall_description,
        "approach": _parse_wall_approach,
        "approach_road": _parse_wall_approach_road,
        "fk_orientation": _parse_orientation,
    }
    for field, parser in field_parsers.items():
        item[field] = parser(response)
    item["latitude"], item["longitude"] = _parse_lat_lon(response)
    yield item

    # Look the wall up again so routes can point at the stored record.
    wall = item.django_model.objects.get(name=item["name"],
                                         fk_sector=item["fk_sector"])
    for route_link in response.css('div[class="poi-link-container"]').css("a"):
        meta = {"route_name": route_link.css("::text").get(), "parent": wall}
        yield response.follow(route_link, self.parse_route, meta=meta)
def parse_category(self, response: HtmlResponse) -> HtmlResponse:
    """
    List category and traverse product pages.

    Fix: iterate the matched product cells directly. The previous
    ``products_query.css('div.product-grid-item')`` applied a *descendant*
    selector to elements that already carry that class — an element does not
    match as its own descendant, so only (unusual) nested cells were found.
    """
    products_query = response.css(
        "section#bc-sf-filter-products > div.product-grid-item")
    if not products_query:
        raise IgnoreRequest('Product items not found')
    self.logger.info(
        f'parse product_categories len: {len(products_query)}')
    for pdp in products_query:
        item_loader = ProductLoader(item=UrgeItem(), selector=pdp)
        item_loader.add_css('product_name',
                            'div.product-text > p.title::text')
        item_loader.add_css('product_brand',
                            'div.product-text > h2.vendor.h5::text')
        # get regular product price through OR (,).
        item_loader.add_css(
            'product_price',
            'div.product-text p.price s::text , span[itemprop="price"]::text'
        )
        item_loader.add_css(
            'product_sale_price',
            'div.product-text p.sale span[itemprop="price"]::text')
        anchor = pdp.css('a')
        if 'href' in anchor.attrib:
            yield response.follow(anchor.attrib['href'],
                                  callback=self.product_page,
                                  meta={'item': item_loader.load_item()})
def parse(self, response: HtmlResponse, **kwargs):
    """Read Lenovo model identifiers from a local list file and request each
    model's product page.

    Fix: the file was opened with mode ``'r+'`` (read/write — requires write
    permission and fails on read-only mounts) although it is only read;
    opened read-only now.
    """
    url = 'https://pcsupport.lenovo.com/us/zh/products/'
    with open('Khala/spider_params/lenovo/lenovo.txt', 'r') as model_list:
        for model in model_list:
            # Each line holds one model id; strip only the trailing newline.
            model = model.rstrip('\n')
            yield response.follow(url=url + model,
                                  callback=self.follow_index,
                                  meta={'model': model})
def get_news_list(self, response: HtmlResponse):
    """Follow every <generateHtmlPath> entry found in the response body,
    keeping requests on the same host family (TuniS mirror vs. main site)."""
    if 'sc.news.gov.hk' in response.url:
        base = 'https://sc.news.gov.hk/TuniS/www.news.gov.hk'
    else:
        base = 'https://www.news.gov.hk'
    paths = re.findall(r'(?<=<generateHtmlPath>).*?(?=</generateHtmlPath>)',
                       response.text)
    for path in paths:
        yield response.follow(url=base + path, callback=self.get_news)
def get_contents_list(self, response: HtmlResponse):
    """For every (language, content) pair, request the solution page.

    Fixes: the local variable was misspelled ``meat`` (now ``meta``), and the
    language file was opened with ``'r+'`` (read/write) although it is only
    read.
    """
    meta = response.meta
    contents_list = response.json().get('list')
    with open('Khala/spider_params/lenovo/language.txt', 'r') as languages:
        for language in languages:
            language = language.rstrip('\n')
            for contents in contents_list:
                url = (f'https://pcsupport.lenovo.com/us/{language}/products/'
                       f'{meta["model"]}/solutions/{contents["docid"]}')
                yield response.follow(url=url, callback=self.out_item)
def get_count(self, response: HtmlResponse):
    """Read the total document count and POST one paged result-list request
    per 50-item page."""
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }
    tree = etree.HTML(response.text)
    total = int(tree.xpath('string(//span[@id="documentCount"])'))
    # Replace the query string with the ajax endpoint path.
    # NOTE(review): 'resultViewCahnge' looks misspelled but is kept verbatim —
    # it is the server's actual path.
    ajax_url = re.sub(r'\?.*', 'resultViewCahnge/resultListAjax', response.url)
    for page in range(1, total // 50 + 2):
        body = ('data=langFilterDisabled%3Dtrue_COMMA_sortByField%3D'
                'Relevance_COMMA_'
                f'pageNumber%3D{page}_COMMA_itemsPerPage%3D50')
        yield response.follow(headers=headers,
                              url=ajax_url,
                              method='post',
                              body=body,
                              callback=self.get_pdf_list)
def parse_table_of_context(self, response: HtmlResponse):
    """Map each theme link to its task count and kick off paginated listing
    requests, one per subsection."""
    box = response.css('ul.componentboxlist')
    labels = box.css('a::text').extract()
    hrefs = box.css('a::attr(href)').extract()
    # theme label -> [href, task count]; count filled in below.
    themes = {label.strip(): [href, 0] for label, href in zip(labels, hrefs)}
    texts = box[0].css('::text').extract()
    for index, raw in enumerate(texts):
        stripped = raw.strip()
        if stripped in themes:
            # The text node right after a theme label looks like "(N ...)";
            # drop the opening bracket and parse the number.
            count_text = texts[index + 1]
            themes[stripped][1] = int(count_text.strip().split()[0][1:])
    for subsection, (href, count) in themes.items():
        first_page = set_get_parameter(response.urljoin(href), 'start', 0)
        callback = partial(self.parse_list_of_tasks,
                           max_number=count,
                           next_number=5,
                           subsection=subsection)
        yield response.follow(first_page, callback=callback)
def parse(self, response: HtmlResponse):
    """Parse webpage to extract important recipe information"""
    if response.css(".wprm-recipe-ingredients-container"):
        payload = {
            "name": response.css(".title::text").get(),
            "source_id": self.sid,
            "url": response.url,
            "image": self.get_image_url(response),
            "ingredients": self.get_ingredients(response),
        }
        # Only submit complete records.
        if all(value is not None for value in payload.values()):
            # NOTE(review): synchronous requests.post blocks the reactor;
            # kept as-is to preserve behavior.
            resp = requests.post(self.endpoint, json=payload)
            if resp.status_code == 400:
                raise CloseSpider("Recipe already exists")
    for anchor_tag in response.css(".nav-previous a"):
        yield response.follow(anchor_tag, callback=self.parse)
def parse(self, response: HtmlResponse, **kwargs):
    """Crawl up to 60 listing pages for every category in the local
    categories.json mapping (category id -> category name)."""
    # Key order matters: urlencode() emits parameters in insertion order.
    query = {
        "trafficChannel": "main",
        "catName": "t-shirts",
        "CatId": "100003127",
        "ltype": "wholesale",
        "SortType": "default",
        "page": "1"
    }
    with open('Khala/spider_params/aliexpress/categories.json', 'r') as file:
        categories = json.load(file)
    for cat_id, cat_name in categories.items():
        listing_url = f'https://www.aliexpress.com/category/{cat_id}/{cat_name}.html'
        query['CatId'] = cat_id
        query['catName'] = cat_name
        for page in range(1, 61):
            query['page'] = str(page)
            yield response.follow(url=listing_url + '?' + urlencode(query),
                                  callback=self.get_list)
def parse(self, response: HtmlResponse):
    """Scrape book info boxes from a Douban tag listing and follow the next
    page.

    Fixes: ``isinstance`` replaces ``type(x) == str`` checks; ``infos[-1]``
    replaces ``infos[len(infos) - 1]``; the price extraction no longer
    crashes with AttributeError when the info div has no text node (falls
    back to an empty string).
    """
    self.start_index += 20
    next_index = self.start_index
    for box in response.xpath('//div[@class="info"]'):
        item = MyProjectItem()
        title = box.xpath('./h2/a//text()').get()
        description = box.xpath("./p/text()").get()
        rate = box.xpath(
            './div[@class="star clearfix"]/span[2]/text()').get()
        item['title'] = (title.replace('\n', '').strip()
                         if isinstance(title, str) else '')
        item['rate'] = (rate.replace('\n', '').strip()
                        if isinstance(rate, str) else '0')
        item['description'] = (description.replace('\n', '').strip()
                               if isinstance(description, str) else '')
        # Price is the last '/'-separated field of the info line.
        info_text = box.xpath('./div[1]/text()').get() or ''
        item['price'] = info_text.split('/')[-1].replace('\n', '').strip()
        yield item
    yield response.follow(
        'https://book.douban.com/tag/编程?start=%d&type=T' % next_index,
        self.parse)
def follow_index(self, response: HtmlResponse):
    """Extract the product Guid / ParentGuids from the page and request the
    product-contents-list API; bail out silently when either is missing."""
    meta = response.meta
    api_url = 'https://pcsupport.lenovo.com/us/zh/api/v4/contents/productcontentslist'
    # Key order matters: urlencode() emits parameters in insertion order.
    params = {
        "pids": "",
        "top": "0",
        "types": "MSH,KB,Forum.KB,TS,PS,LPDT",
        "countries": "us",
        "language": "zh"
    }
    text = response.text
    guid_match = re.search(r'(?<=Guid":").*?(?=")', text)
    parents_match = re.search(r'(?<=ParentGuids":\[").*?(?="])', text)
    if guid_match is None or parents_match is None:
        # Page does not carry the expected identifiers — nothing to follow.
        return
    parent_guids = parents_match.group(0).replace('"', '')
    params['pids'] = guid_match.group(0) + ',' + parent_guids
    yield response.follow(url=api_url + '?' + urlencode(params),
                          callback=self.get_contents_list,
                          meta=meta)
def parse_sub_region(self, response: HtmlResponse):
    """Parse sub regions.

    ... -> Frankenjura Nord -> Region Wattendorf

    Example: https://www.frankenjura.com/klettern/region/8
    """
    item = SectorItem()
    item["name"] = response.meta["sub_region_name"]
    item["fk_sector"] = response.meta["parent"]
    item["source"] = response.url
    yield item

    # Re-read the stored sector so wall items can point at it.
    sub_region = item.django_model.objects.get(**item)
    for wall_link in response.css("td").css("a"):
        yield response.follow(
            wall_link,
            self.parse_wall,
            meta={"wall_name": wall_link.css("::text").get(),
                  "parent": sub_region},
        )
def parse(self, response: HtmlResponse):
    """Parse main page https://www.frankenjura.com/klettern/panorama.

    Region Nordbayern

    Creates main sector 'Frankenjura' and hook it in the database.
    """
    # Root hook-in: 'Frankenjura' hangs under the 'Germany' sector.
    country = Sector.objects.get(name="Germany")
    base_item = SectorItem()
    base_item["name"] = "Frankenjura"
    base_item["fk_sector"] = country
    yield base_item

    frankenjura = base_item.django_model.objects.get(**base_item)
    for region_link in response.css('div[class="column"]').css('a[href*="region"]'):
        yield response.follow(
            region_link,
            self.parse_region,
            meta={"region_name": region_link.css("::text").get(),
                  "parent": frankenjura},
        )
def parse(self, response: HtmlResponse):
    """Follow every article link on the page with parse_article."""
    for link in get_article_links(response):
        yield response.follow(link, self.parse_article)
def parse(self, response: HtmlResponse, **kwargs):
    """Follow every link in the letter-index box, disabling the language
    filter on each request."""
    tree = etree.HTML(response.text)
    hrefs = tree.xpath("//div[@class='letterBox clearfix']//a/@href")
    for href in hrefs:
        yield response.follow(url=href + '?langFilterDisabled=true',
                              callback=self.get_count)