def parse(self, response: Response):
    """Parse a book-list page: yield one BookItem plus one detail-page
    Request per book, then follow the pagination link.

    Fixes: removed a stray debug print; guarded the pagination href,
    which is None on page variants without a last-<li> link (the
    original crashed on `None.find`).
    """
    if response.status != 200:
        return
    lis = response.css('.all-img-list li')
    for li in lis:
        item = BookItem()
        item['book_id'] = uuid.uuid4().hex
        a = li.xpath('./div[1]/a')
        item['book_url'] = a.xpath('./@href').get()
        item['book_cover'] = a.xpath('./img/@src').get()
        item['book_name'] = li.xpath('./div[2]/h4//text()').get()
        item['author'], *item['tags'] = li.css(
            '.author a::text').extract()
        item['summary'] = li.css('.intro::text').get()
        # Request the book's detail page; book_id ties the detail data
        # back to this item downstream.
        yield Request('https:' + item['book_url'],
                      callback=self.parse_info,
                      priority=1,
                      meta={'book_id': item['book_id']})
        yield item
    # Next page: the last <li> holds either a real href or a
    # "javascript:" pseudo-link on the final page — and may be missing.
    next_url = response.css('.lbf-pagination-item-list ').xpath(
        './li[last()]/a/@href').get()
    if next_url and 'javascript' not in next_url:
        yield Request('https:' + next_url, priority=100)
def parse(self, response: Response):
    """Category-page dispatcher.

    When the 4-star filter is present, hand the page to `star_4`;
    otherwise follow every left-nav browse link, building search URLs
    from the link's `node` or `rh` query parameter.
    """
    log('parse')
    if response.css('#leftNav a > i.a-star-medium-4'):
        log('go star')
        yield Request(response.url, self.star_4, dont_filter=True,
                      errback=self.errors('star_4'))
        return
    nav_links = response.css('div.left_nav.browseBox a::attr(href)').getall()
    for nav_link in nav_links:
        node = get_query_val(nav_link, 'node')
        if node:
            yield Request(f'{amazon_url}/s?node={node}', self.parse,
                          errback=self.errors('parse'))
            continue
        rh = get_query_val(nav_link, 'rh')
        if rh:
            yield Request(f'{amazon_url}/s?rh={rh}', self.parse,
                          errback=self.errors('parse'))
def _wikidata_info(response: Response):
    """Scrape selected Wikidata statements for a laureate and yield an
    NWinnerItem merged with the winner dict carried in response.meta.
    """
    # Wikidata property ids to extract; 'link' marks values rendered as
    # an <a> element rather than bare text.
    property_codes = [{
        'name': 'date_of_birth',
        'code': 'P569'
    }, {
        'name': 'date_of_death',
        'code': 'P570'
    }, {
        'name': 'place_of_birth',
        'code': 'P19',
        'link': True
    }, {
        'name': 'place_of_death',
        'code': 'P20',
        'link': True
    }, {
        'name': 'gender',
        'code': 'P21',
        'link': True
    }]
    winner_info = {}
    for prop in property_codes:
        if prop.get('link'):
            sel = response.css(f'#{prop["code"]} .wikibase-snakview-value'
                               f' a::text')
        else:
            sel = response.css(f'#{prop["code"]} .wikibase-snakview-value'
                               f'::text')
        if sel:
            if not sel.get():
                # NOTE(review): logs the miss but still stores the (None)
                # value below — presumably intentional; confirm.
                print(f'No field {prop["name"]} for {response.url}')
            winner_info[prop['name']] = sel.extract_first()
    yield NWinnerItem(**winner_info, **response.meta['winner'])
def single_category(self, response: Response):
    """Walk one category's result list sorted by review rank.

    First forces the `s=review-rank` sort order by re-requesting the
    same page, then yields one product-detail request per result and
    follows pagination up to `max_page`.
    """
    log('single')
    if get_query_val(response.url, 's', '') != 'review-rank':
        # Not sorted yet: re-request this URL with the sort parameter.
        yield Request(update_query(response.url, s='review-rank'),
                      self.single_category,
                      dont_filter=True,
                      errback=self.errors('single_category'))
    else:
        links = response.css(
            'div.s-result-list.s-search-results.sg-row > div[data-asin] h2 > a::attr(href)'
        ).getall()
        for link in links:
            # Result links contain ".../dp/<ASIN>/..."; rebuild a
            # canonical /dp/<ASIN> URL for dedup-friendly requests.
            dirs = link.split('/')
            dp = dirs[dirs.index('dp') + 1]
            url = f'{amazon_url}/dp/{dp}'
            yield Request(url,
                          self.single_parse,
                          errback=self.errors('single_parse'))
        page = int(get_query_val(response.url, 'page', 1))
        if page < max_page:
            next_link = response.css(
                'ul.a-pagination > li.a-last > a::attr(href)').get()
            if next_link:
                # Strip sort/filter/page params so the next request goes
                # through the review-rank branch again.
                yield Request(remove_query(amazon_url + next_link, 's',
                                           'rh', 'page'),
                              self.single_category,
                              errback=self.errors('single_category'))
def parse_sku(self, response: Response):
    """Parse one colour variant into a SKU (HKD price) and schedule the
    sibling colour variants for crawling."""
    # Queue the other colour variants of this product first.
    for swatch in response.css('div.color-picker__swatches a'):
        yield Request(self.base_url + swatch.attrib['href'],
                      callback=self.parse_sku)
    raw_price = response.css('div.product-info-panel__price::text').get()
    price_hkd = raw_price.strip('HKD').replace(',', '')
    price = {
        'hkd': float(price_hkd),
    }
    # The last JSON-LD block on the page carries the product data.
    ld_blocks = response.css(
        'script[type="application/ld+json"]::text').getall()
    product_data = json.loads(ld_blocks[-1])
    sku_code = product_data['sku']
    yield SKU(self.brand_name, '', '', sku_code, '', response.url, price,
              '', [], [])
def parse(self, response: Response) -> Iterable[Union[Request, Mapping]]:
    """Crawl index pages, follow promo links recursively, and emit one
    recipe mapping when the page holds a fully-resolved recipe."""
    yield from self.follow_pages(response)
    for promo in response.css("a.promo::attr(href)"):
        yield response.follow(promo.get(), callback=self.parse)
    recipe = response.css("div.recipe-main-info")
    if not recipe:
        return
    ingredients = [
        self._get_ingredient(response, entry)
        for entry in recipe.css("li.recipe-ingredients__list-item")
    ]
    # Only emit the recipe once every ingredient resolved to a URL.
    if any(entry["url"] is None for entry in ingredients):
        return
    name_parts = recipe.css(".chef__name *::text").getall()
    chef_name = name_parts[-1] if name_parts else None
    yield {
        "title": recipe.css("h1::text").get(),
        "url": response.url,
        "chef_name": chef_name,
        "ingredients": ingredients,
        "image_urls": recipe.css(
            ".recipe-media__image img::attr(src)").getall(),
    }
def parse(self, response: Response):
    """Parse a book-list page: yield one BookItem and one catalog
    Request per book, then follow pagination.

    Fixes: the original created a SINGLE BookItem before the loop and
    mutated/yielded the same object for every book, so all yielded
    references pointed at the last book's data; also guarded the
    pagination href, which can be None.
    """
    if response.status != 200:
        return
    lis = response.css('.all-img-list li')
    for li in lis:
        # Fresh item per book — do not share one instance across yields.
        items = BookItem()
        items['book_id'] = uuid.uuid4().hex
        items['book_url'] = li.xpath('./div/a/@href').get()
        items['book_name'] = li.xpath('./div/h4//text()').get()
        items['author'], *items['tags'] = li.css(
            '.author a::text').extract()
        items['description'] = li.xpath('./div/p[2]//text()').get()
        items['img'] = li.css('.book-img-box img::attr("src")').get()
        # Request the chapter catalog; book_id links it to this item.
        yield Request('https://' + items['book_url'] + '#Catalog',
                      callback=self.parse_info,
                      priority=100,
                      meta={'book_id': items['book_id']})
        yield items
    # Next-page link (absent or "javascript:" on the last page).
    next_url = response.css('.lbf-pagination-item-list').xpath(
        './li[last()]/a/@href').get()
    if next_url and 'javascript' not in next_url:
        # Higher priority values are downloaded first.
        yield Request('https:' + next_url, priority=1)
def parse_video_page(self, response: Response):
    """Extract uploader/title/aid from a bilibili video page, then call
    the archive-stat API for view statistics.

    Fix: the Origin header literal was "https: // www.bilibili.com"
    (stray spaces), which is not a valid Origin value.
    """
    item = VideoItem()
    item['up_name'] = response.css('.name a::text').extract_first()
    item['title'] = response.css('.tit::text').extract_first()
    # The numeric aid is the URL tail: .../video/av<aid>/
    aid = response.url.replace("https://www.bilibili.com/video/av", "")
    item['aid'] = aid.replace("/", "")
    # Millisecond timestamp used as the API cache-buster parameter.
    timestamp = int(round(time.time() * 1000))
    header = {
        "Host": "api.bilibili.com",
        "Origin": "https://www.bilibili.com",
        "Referer": f"https://www.bilibili.com/video/av{item['aid']}/",
        "USER-AGENT":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept': '*/*',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }
    yield Request(
        url=
        f"http://api.bilibili.com/archive_stat/stat?callback=&aid={item['aid']}&type=json&_={timestamp}",
        dont_filter=False,
        headers=header,
        callback=self.parse,
        # Pass the partially-filled item on to the next callback.
        meta={"item": item})
def parse(self, response: Response):
    """Parse the proxy-list table, de-obfuscating each row's IP and port.

    The site hides the IP inside an inline <script>: the first half is
    stored reversed, the second half is base64 packed as \\xNN hex
    escapes; the port is offset by a page-level number stored in a
    hidden div's data-* attribute.
    """
    # Page-level number used below to reconstruct the real port.
    data_num = response.css('html').re_first(
        r'<div style="display:none" data-[a-zA-Z]*="(\d+)"')
    for row in response.css('table.table-hover tbody tr'):
        loader = ItemLoader(item=IPItem(), selector=row)
        loader.add_value('source', 'dbproxy')
        loader.add_css('remark', 'td:nth-child(4) div::text')
        loader.add_css('protocol', 'td:nth-child(5)::text')
        # Decrypt the IP: first part is stored reversed in a
        # '<digits>'.split(...) call inside the script.
        script_elem = row.css('td:nth-child(1) script::text')
        ip_first_part = script_elem.re_first(r'\'([\d\.]*)\'\.split')
        ip_first_part = ''.join(reversed(ip_first_part))
        # Second part: \xNN escapes -> hex -> base64 text -> plaintext.
        hex_list = script_elem.re(r'\\x([A-Za-z0-9]{2})')
        b64_string = bytearray.fromhex(''.join(hex_list)).decode()
        ip_second_part = base64.b64decode(b64_string).decode()
        loader.add_value('ip', ip_first_part + ip_second_part)
        # Decrypt the port: scraped value plus the page-level offset.
        raw_port = script_elem.re_first(r'var pp = \((\d+) -')
        loader.add_value('port', int(raw_port) + int(data_num))
        yield loader.load_item()
def parse(self, response: Response):
    """Parse a book-list page: yield one BookItem and one detail Request
    per book, then follow pagination.

    Fix: the pagination href can be None (no <a> on some last-page
    variants), which crashed the original's `None.find` call.
    """
    if response.status != 200:
        return
    lis = response.css('.all-img-list li')
    for li in lis:
        item = BookItem()
        item['book_id'] = uuid.uuid4().hex
        # li is a Selector — navigate with xpath/css only.
        a = li.xpath('./div[1]/a')
        item['book_url'] = a.xpath('./@href').get()
        item['book_cover'] = a.xpath('./img/@src').get()
        item['book_name'] = li.xpath('./div[2]/h4//text()').get()
        item['author'], *item['tags'] = li.css(
            '.author a::text').extract()
        item['summary'] = li.css('.intro::text').get()
        # Request the book's content page.
        # NOTE(review): 'https://' + href produces 'https:////...' if
        # the href is protocol-relative like the pagination link below
        # — confirm which form the site serves.
        yield Request('https://' + item['book_url'],
                      callback=self.parse_info,
                      priority=10,
                      meta={'book_id': item['book_id']})
        yield item
    # Next-page link ("javascript:..." on the final page).
    next_url = response.css('.lbf-pagination-item-list').xpath(
        './li[last()]/a/@href').get()
    if next_url and 'javascript' not in next_url:
        # Higher priority values are downloaded first.
        yield Request('https:' + next_url, priority=100)
def follow_pages(self, response: Response) -> Iterable[Request]:
    """Follow the A-Z index links and the pagination links, feeding each
    page back into self.parse."""
    letter_links = response.css(".az-keyboard ul li a::attr(href)")
    for letter in letter_links:
        yield response.follow(letter.get(), callback=self.parse)  # type: ignore
    page_links = response.css("ul.pagination__list li a::attr(href)")
    for page in page_links:
        yield response.follow(page.get(), callback=self.parse)  # type: ignore
def parse_pcexpect(self, response: Response):
    """Parse the computer-prediction page for one race.

    Fills predict_patterns, predict_confidence and per-racer
    predict_mark on the item from response.meta, then requests the
    result page. Fix: regexes are now raw strings (non-raw "\\d" is a
    deprecated escape sequence on modern Python).
    """
    item = response.meta["item"]
    focuses = response.css(".numberSet2_row").xpath("string()").getall()
    # Collapse every run of whitespace inside each pattern string.
    focuses = ["".join(x.split()) for x in focuses]
    item["predict_patterns"] = focuses
    # Confidence level is encoded in a CSS class like "is-lv<N>".
    item["predict_confidence"] = response.css(
        ".state2 .state2_lv::attr('class')").re(r"is-lv(\d)")[0]
    marks = response.css(
        ".table1 .is-fs12 tr:first-child td:first-child").getall()
    for i, mark in enumerate(marks):
        # Each mark cell embeds an icon image icon_mark1_<n>.png.
        match = re.search(r"icon_mark1_(\d+)\.png", mark)
        if match:
            item["racers"][i]["predict_mark"] = match[1]
        else:
            item["racers"][i]["predict_mark"] = None
    # Continue to the race-result page (6th tab).
    url = self.get_url(
        response.css("ul.tab3_tabs li:nth-child(6) a::attr('href')").get())
    yield scrapy.Request(
        url=url,
        callback=self.parse_result,
        meta={"item": item},
    )
def parse_odds3t(self, response: Response):
    """Parse trifecta odds into item["trifecta"].

    Maps each "a-b-c" finishing pattern (permutations of lanes 1-6) to
    its float odds; non-numeric odds cells fall back to 0. Fixes: the
    bare `except` is narrowed to conversion errors, and the docstring is
    moved to the top (the original's was unreachable as a docstring).
    """

    def convert(odds):
        # Only silence conversion failures, not arbitrary errors.
        try:
            return float(odds)
        except (TypeError, ValueError):
            return 0

    item = response.meta["item"]
    odds = response.css("td.oddsPoint::text").getall()
    patterns = transpose(permutations(range(1, 7), 3), 20)
    item["trifecta"] = {
        "-".join(map(str, p)): convert(o)
        for p, o in sorted(zip(patterns, odds), key=lambda x: x[0])
    }
    # Continue to the just-before-race info page (3rd tab).
    url = self.get_url(
        response.css("ul.tab3_tabs li:nth-child(3) a::attr('href')").get())
    yield scrapy.Request(
        url=url,
        callback=self.parse_beforeinfo,
        meta={"item": item},
    )
def handle_page(self, response: Response) -> TorrentFileItem:
    """Emit a download request for every torrent attachment on a forum
    page — both direct attachment links and attachment-ad pages."""
    torrent_hrefs = response.css(
        'a[href^="forum.php?mod=attachment"]:contains("torrent")::attr(href)'
    ).extract()
    ad_page_hrefs = response.css(
        'a[href^="imc_attachad-ad.html"]:contains("torrent")::attr(href)'
    ).extract()
    if not torrent_hrefs and not ad_page_hrefs:
        return
    for href in torrent_hrefs:
        # Resolve the relative attachment URL against the page URL.
        req = DownloadRequest(url=response.urljoin(href),
                              callback=self.handle_item)
        req.meta['from_url'] = response.url
        yield req
    aid_pattern = re.compile(r'aid=(\w+)')
    for href in ad_page_hrefs:
        found = aid_pattern.search(href)
        if not found:
            continue
        attachment_id = found.group(1)
        req = DownloadRequest(
            url=response.urljoin('forum.php?mod=attachment&aid=%s' %
                                 attachment_id),
            callback=self.handle_item,
            dont_filter=True)
        req.meta['from_url'] = response.url
        yield req
def _country(response: Response):
    """Pair each country heading (<h3>) with the <ol> list that
    immediately follows it, yielding (country_name, ol_selector)."""
    headings = response.css('h3')
    lists = response.css('h3+ol')
    for heading, winners in zip(headings, lists):
        name = heading.css('span.mw-headline::text').get()
        if name:
            yield name, winners
def parse_sku(self, response: Response):
    """Build a minimal SKU from the product code and the EUR price."""
    product_code = response.css('span.infoProduct__button::text').get()
    raw_price = response.css('span.pdpData__price::text').get()
    # Drop currency symbol and thousands separators.
    eur_value = raw_price.strip().strip('€ ').replace(',', '')
    yield SKU(self.brand_name, '', '', product_code, '', response.url,
              {'eur': float(eur_value)}, '', [], [])
def parse_sku(self, response: Response):
    """Parse an IWC product page into a SKU (CNY price).

    The four spec sections (case / movement / strap / dial) live in
    ul[data-toggle-id="showDetailsN"] lists and differ only by toggle
    id, attribute name and whether inner spaces are removed, so
    extraction is factored into a local helper.
    """

    def collect(toggle_id, attr_name, drop_spaces=False):
        # Append one attrs entry per non-empty <li> in the given list.
        values = [
            item.strip() for item in response.css(
                f'ul[data-toggle-id="{toggle_id}"] li::text').getall()
            if len(item.strip())
        ]
        for value in values:
            value = value.strip().replace('\n', '')
            if drop_spaces:
                value = value.replace(' ', '')
            attrs.append({'name': attr_name, 'value': value})

    attrs = []
    code = response.css('h2.iwc-buying-options-reference::text').get()
    name = response.css('h3.iwc-buying-options-title::text').get().strip()
    description = '<br>'.join(
        response.css(
            'ul[data-toggle-id="showDetails1"] li.iwc-product-detail-item::text'
        ).getall())
    # Price comes from the tracking JSON on the buy button.
    tracking_product = json.loads(
        response.css('button[data-tracking-product]').
        attrib['data-tracking-product'])
    price = {'cny': float(tracking_product['price'])}
    image_elements = response.css('div.rcms_productPageThumbnails')
    image_urls = [item.attrib['data-src'] for item in image_elements]
    image_urls = [self.base_url + url for url in image_urls]
    # Strip the ".transform..." suffix to get the original-size asset.
    image_urls = [re.sub(r'\.transform.+', '', url) for url in image_urls]
    collect('showDetails0', '表壳')                    # case
    collect('showDetails2', '机芯', drop_spaces=True)  # movement
    collect('showDetails3', '表带')                    # strap
    collect('showDetails4', '表盘')                    # dial
    sku = SKU(self.brand_name, '', '', code, name, response.url, price,
              description, image_urls, attrs)
    yield sku
def parse_sku(self, response: Response):
    """Build a minimal SKU from the model/fabric/colour code and the EUR
    price."""
    model_code = response.css(
        'div.itemInfo-modelfabricolor span.value::text').get()
    raw_price = response.css('span.price-sales::text').get()
    # European format "1.234,56" -> "1234.56".
    eur_value = raw_price.strip('€ ').replace('.', '').replace(',', '.')
    yield SKU(self.brand_name, '', '', model_code, '', response.url,
              {'eur': float(eur_value)}, '', [], [])
def parse_sku(self, response: Response):
    """Yield a SKU with the USD price; the price dict stays empty when
    no price is displayed.

    Fix: the original referenced `price` before assignment (NameError)
    whenever the price node was missing or empty; it now defaults to an
    empty dict, matching the sibling EUR parser.
    """
    code = response.css('div.product-number div::text').get()
    price = {}
    price_usd = response.css('span.price-sales::text').get()
    if price_usd is not None and len(price_usd) > 1:
        price_usd = price_usd.strip('$').replace(',', '')
        price = {
            'usd': float(price_usd),
        }
    sku = SKU(self.brand_name, '', '', code, '', response.url, price, '',
              {}, {})
    yield sku
def parse(self, response: Response):
    """Race-index page: follow every stadium link for the day, then step
    back to the previous day.

    Fix: the previous-day link can be absent (oldest page), in which
    case the original passed None into get_url; it now stops instead.
    """
    params = parse_query_params(response.url)
    stadium_urls = response.css("td.is-alignL a::attr('href')").re(
        r".*raceindex.*")
    self.logger.info(f"{params['hd']}: {len(stadium_urls)} stadiums")
    for stadium_url in stadium_urls:
        yield scrapy.Request(url=self.get_url(stadium_url),
                             callback=self.parse_stadium)
    # Continue to the previous day, if a back-link exists.
    url = response.css("li.title2_navsLeft a::attr('href')").get()
    if url:
        yield scrapy.Request(url=self.get_url(url), callback=self.parse)
def handle(self, response: Response) -> Result:
    """Extract price and availability from a product page.

    Fix: the add-to-cart button can be absent (sold-out/variant pages);
    the original's `.get().lower()` raised AttributeError then. A
    missing button now means unavailable.
    """
    price = (
        response.css("div.priceView-hero-price")
        .css(".priceView-customer-price")
        .css("span[aria-hidden=true]::text")
        .get()
    )
    button_text = response.css("button.add-to-cart-button::text").get()
    availability = (
        button_text is not None and button_text.lower() == "add to cart"
    )
    return Result(url=response.url, price=price, availability=availability)
def parse_sku(self, response: Response):
    """Yield a SKU carrying the HKD price and the reference number taken
    from the first entry of the product-details list."""
    raw_price = response.css('span.sales::text').get()
    hkd_value = raw_price.strip().strip('HK$').replace(',', '')
    price = {
        'hkd': float(hkd_value)
    }
    details = response.css('ul.list.fs-s.ff-light li::text').getall()
    # First detail entry holds the reference number after a label.
    reference = details[0].strip().strip('參考編號').strip().strip(':').strip()
    yield SKU(self.brand_name, '', '', reference, '', response.url, price,
              '', [], [])
def parse_sku(self, response: Response):
    """Parse one colour variant of a product into a full SKU (CNY price,
    colour/size/spec attrs, carousel images), and schedule requests for
    the other colour variants."""
    attrs = []
    image_urls = []
    name = response.css('h1.product-purchase_name::text').get()
    price_cny = response.css('span.product-purchase_price::text').get().strip('¥').replace(',', '')
    price = {
        'cny': float(price_cny),
    }
    color = response.css('li[data-type="colour"] span.product-purchase_selected::text').get()
    attrs.append({
        'name': '颜色',
        'value': color,
    })
    # Crawl the other colour variants of this product.
    other_color_urls = [self.base_url + item.attrib['href'] for item in response.css('li[data-type="colour"] div.product-purchase_options-labels a')]
    for url in other_color_urls:
        yield Request(url, callback=self.parse_sku)
    description = response.css('div.accordion-tab_content p::text').get()
    attrs_in_page = response.css('div.accordion-tab_sub-item li::text').getall()
    for attr in attrs_in_page:
        # Spec lines are "name:value"; lines without a colon go under
        # the generic name "参数" (parameter).
        parts = attr.split(':', 1)
        n = '参数'
        v = parts[0]
        if len(parts) > 1:
            n = parts[0]
            v = parts[1]
        attrs.append({
            'name': n,
            'value': v,
        })
    if response.css('span[data-label="选择尺码"]').get() is not None:
        # A size selector is present: record every available size.
        sizes = response.css('li[data-type="size"] div.product-purchase_options label::text').getall()
        attrs.append({
            'name': '尺码',
            'value': ','.join(sizes)
        })
    image_elements = response.css('div.product-carousel_item noscript picture img')
    # Image srcs appear protocol-relative ("//host/..."), hence the
    # https prefix — confirm against a live page.
    image_urls = ['https://' + (item.attrib['src'].strip('//')) for item in image_elements]
    code = response.css('p.accordion-tab_item-number::text').get().strip("商品 ")
    sku = SKU(self.brand_name, '', '', code, name, response.url, price, description, image_urls, attrs)
    yield sku
def parse_sku(self, response: Response):
    """Yield a SKU when a product ID is present; skip the page
    otherwise."""
    product_id = response.css('span[itemprop="productID"]::text').get()
    if product_id is None:
        return
    raw_price = response.css('span.price-sales::text').get()
    # European format "1.234,56" -> "1234.56".
    usd_value = raw_price.strip('$ ').replace('.', '').replace(',', '.')
    yield SKU(self.brand_name, '', '', product_id, '', '',
              {'usd': float(usd_value)}, '', {}, {})
def handle(self, response: Response) -> Result:
    """Report price and availability for a product page; price stays
    "UNKNOWN" when the item is out of stock."""
    in_stock = response.css("div[id=outOfStock]").get() is None
    price = "UNKNOWN"
    if in_stock:
        # Prefer the new-item buybox price, then the non-buybox fallback.
        price = response.css("span[id=price_inside_buybox]::text").get()
        if not price:
            price = response.css("div[id=buyNew_noncbb]").css(
                "span::text").get()
        if isinstance(price, str):
            price = price.strip()
    return Result(url=response.url, price=price, availability=in_stock)
def single_parse(self, response: Response):
    """Parse one product-detail page into an AmazonItem.

    Retries the same URL when no title rendered (likely a blocked/bot
    page), then extracts title, review count, description, seller and
    categories; either yields the product directly or first visits the
    seller page for the shop address.

    Fix: the retry used `return Request(...)` inside a generator — a
    generator's return value is discarded, so the retry never ran. It is
    now yielded.
    """
    self.debug(response)
    if not response.xpath('//h1[@id="title"]/*/text()').get('').strip():
        # No title — re-request the page (dont_filter bypasses dedup).
        yield Request(response.url,
                      self.single_parse,
                      dont_filter=True,
                      errback=self.errors('single_parse'))
        return
    log('product')
    now = dt.now(timezone('Asia/Tokyo'))
    product = AmazonItem()
    product['time'] = now.strftime('%Y-%m-%dT%H-%M-%S')
    product['title'] = response.xpath('//h1[@id="title"]/*/text()').get(
        '').strip()
    product['url'] = response.url
    review = response.css('span#acrCustomerReviewText::text').get('')
    # Drops the last 4 characters — presumably a counting suffix in the
    # review text; confirm against a live page.
    product['review_num'] = review[0:-4] if review else 0
    product['description'] = '\n'.join([
        x.strip() for x in response.css(
            '#feature-bullets > ul > li *::text').getall()
        if x.strip() not in ('', 'モデル番号を入力してください', 'これが適合するか確認:')
    ])
    seller = response.css('a#sellerProfileTriggerId')
    if seller:
        # Third-party seller with a profile link.
        shop_name = seller.css('*::text').get('')
        seller_id = get_query_val(seller.attrib['href'], 'seller')
        shop_url = f'{amazon_url}/sp?seller={seller_id}' if seller_id else ''
    elif response.xpath('//*[@id="merchant-info"]/a'):
        # Sold directly by Amazon.
        shop_name = 'Amazon.co.jp'
        shop_url = 'https://www.amazon.co.jp/gp/help/customer/display.html?nodeId=202008070'
    else:
        shop_name = '-'
        shop_url = ''
    product['shop_name'] = shop_name
    product['shop_url'] = shop_url
    product['categories'] = ' > '.join([
        el.get().strip() for el in response.css(
            '#wayfinding-breadcrumbs_feature_div > ul > li > span > a::text'
        )
    ])
    if shop_url == 'https://www.amazon.co.jp/gp/help/customer/display.html?nodeId=202008070':
        # Amazon itself: address is fixed, no extra request needed.
        product['shop_address'] = '〒153-0064 東京都目黒区下目黒1-8-1 日本'
        yield product
    elif shop_url:
        # Fetch the seller profile page to fill in shop_address.
        yield Request(shop_url,
                      self.shop_parse,
                      meta={'product': product},
                      dont_filter=True,
                      errback=self.errors('single_parse', response.url))
    else:
        product['shop_address'] = '---'
        yield product
def parse_sku(self, response: Response):
    """Yield a SKU with the EUR price and the model code.

    Fix: `str.strip('Modello: ')` removes any of those CHARACTERS from
    both ends, so a code beginning or ending with e.g. 'M', 'o' or 'l'
    would be silently truncated. The literal label prefix is removed
    instead.
    """
    attrs = []
    # European format "1.234,56" -> "1234.56".
    price_eur = response.css('div.itemPrice span.value::text').get().replace(
        '.', '').replace(',', '.')
    price = {
        'eur': float(price_eur)
    }
    raw_code = response.css('div.item-mfc span.item-mfc-value::text').get()
    prefix = 'Modello: '
    code = raw_code[len(prefix):] if raw_code.startswith(prefix) else raw_code
    code = code.strip()
    sku = SKU(self.brand_name, '', '', code, '', response.url, price, '',
              [], [])
    yield sku
def parse_sku(self, response: Response):
    """Yield a SKU with the EUR price; the price dict stays empty when
    no price is displayed."""
    model_code = response.css('div.product-number div::text').get()
    price = {}
    raw_price = response.css('span.price-sales::text').get()
    if raw_price is not None and len(raw_price) > 1:
        # Decimal comma -> decimal point.
        eur_value = raw_price.strip('€ ').replace(',', '.')
        price = {
            'eur': float(eur_value),
        }
    yield SKU(self.brand_name, '', '', model_code, '', response.url,
              price, '', [], {})
def parse_sku(self, response: Response):
    """Yield a SKU with the HKD price and the product ID.

    Fix: `str.strip('Product ID: ')` removes any of those CHARACTERS
    from both ends, so a product ID starting or ending with e.g. 'P',
    'D' or 'r' would be silently truncated. The literal label prefix is
    removed instead.
    """
    attrs = []
    price_hkd = response.css(
        'div.itemPrice span.value::text').get().replace(',', '')
    price = {'hkd': float(price_hkd)}
    raw_code = response.css('div.item-mfc span.item-mfc-value::text').get()
    prefix = 'Product ID: '
    code = raw_code[len(prefix):] if raw_code.startswith(prefix) else raw_code
    code = code.strip()
    sku = SKU(self.brand_name, '', '', code, '', response.url, price, '',
              [], [])
    yield sku
def parse_sku(self, response: Response):
    """Yield a minimal SKU with the reference code and the USD price
    taken from the buy button's tracking JSON."""
    reference = response.css('h2.iwc-buying-options-reference::text').get()
    tracking_json = response.css(
        'button[data-tracking-product]').attrib['data-tracking-product']
    tracking = json.loads(tracking_json)
    yield SKU(self.brand_name, '', '', reference, '', response.url,
              {'usd': float(tracking['price'])}, '', [], [])