import re

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class QidianSpider(RedisCrawlSpider):
    name = 'qidian'
    allowed_domains = ['qidian.com']
    start_urls = ['https://www.qidian.com/all']
    rules = [
        Rule(LinkExtractor(restrict_css=('.all-img-list .book-img-box a',)),
             callback='parse_profile_page', follow=True),
        Rule(LinkExtractor(restrict_css=('.lbf-pagination-item-list .lbf-pagination-next',)),
             follow=True),
    ]

    def parse_profile_page(self, response):
        # self.logger.debug('Parse Profile Page. URL : %s' % response.url)
        book = BookItem()  # project item class, defined in the project's items module
        name = response.css('.book-information .book-info h1 em::text').extract_first()
        url = response.url
        author = response.css('.book-information .book-info .writer::text').extract_first()
        tag = response.xpath(
            'string(//div[contains(@class,"book-information")]'
            '/div[contains(@class,"book-info")]/p[@class="tag"])').extract_first()
        tag = re.sub(r'\s+', ' ', tag)
        words = response.css('.book-information .book-info p em::text').extract_first()
        chapters = response.css('.j_catalog_block a i span::text').extract_first()
        comments = response.css('.j_discussion_block a i span::text').extract_first()

        book['name'] = name
        book['url'] = url
        book['author'] = author
        book['tag'] = tag
        book['words'] = words
        book['chapters'] = chapters
        book['comments'] = comments
        yield book
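
# A minimal sketch of the BookItem this spider fills in; the field names are
# taken from the assignments above, while the real class would live in the
# project's items module.
import scrapy


class BookItem(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
    author = scrapy.Field()
    tag = scrapy.Field()
    words = scrapy.Field()
    chapters = scrapy.Field()
    comments = scrapy.Field()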
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class LiepinSpider(CrawlSpider):
    name = 'liepin'
    allowed_domains = ['liepin.com']
    start_urls = ['https://www.liepin.com/zhaopin?key=python']
    rules = (
        # Detail-page URLs
        Rule(LinkExtractor(
            allow=r"https://www.liepin.com/job/\d+\.shtml.*",
            restrict_xpaths=['//ul[@class="sojob-list"]//a']),
            follow=False, callback="parse_detail"),
        # Pagination URLs
        Rule(LinkExtractor(
            allow=r"/zhaopin/.+curPage=\d+",
            restrict_xpaths='//div[@class="pagerbar"]//a'),
            follow=True),
    )

    def parse_detail(self, response):
        print(response.url)
        title = response.css('.title-info h1::text').get()
        company = response.css('.title-info h3::text').get()
        city_lst = response.css('.basic-infor span::text').getall()
        city = ''.join(city_lst).strip()
        edu = response.css('.job-qualifications span:nth-child(1)::text').get()
        work = response.css('.job-qualifications span:nth-child(2)::text').get()
        desc_lst = response.css('.content-word::text').getall()
        desc = ''.join(desc_lst).strip()
        item = ZhaopinItem(title=title, company=company, city=city,
                           edu=edu, work=work, desc=desc)  # project item class
        yield item
from datetime import datetime

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Project helpers assumed in scope: getBrands, itemHandler, categoryHandler,
# mainHandler, my_selenium_request_processor, cleanString, filterBrands,
# allowed_shipping_list and the CouponsItem item class.


class IsracardSpider(CrawlSpider):
    name = 'isracard'
    undetectable = False
    wait = False
    allowed_domains = ['benefits.isracard.co.il']
    start_urls = ['https://benefits.isracard.co.il/']
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('onclick',), process_value=itemHandler),
             callback='parse', process_request=my_selenium_request_processor,
             follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('onclick',), process_value=categoryHandler),
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('onclick',), process_value=mainHandler),
             callback='parse', process_request=my_selenium_request_processor,
             follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(IsracardSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        isValid = response.css('.benefit-details-txt').extract_first() is not None
        if isValid:
            description = cleanString(response.css("div.benefit-details-txt").extract())
            title = cleanString(response.css("div.benefit-info h1::text").extract_first())
            yield CouponsItem(
                Title=title,
                supplier='996',
                brand=filterBrands(title, self.brands),
                JoinUrl=response.url,
                Description=description,
                ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                DoorToDoorShipping=any(ext in (description + title)
                                       for ext in allowed_shipping_list),
                cyclerun=self.cycleid,
            )
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class BlogSpider(RedisCrawlSpider):
    name = 'blog'
    # allowed_domains = ['blog.com']
    # start_urls = ['http://blog.com/']
    redis_key = 'blog:start_urls'

    page_link = LinkExtractor(restrict_xpaths='//li[@class="SG_pgnext"]/a')
    content_link = LinkExtractor(restrict_xpaths='//span[@class="atc_title"]/a')
    rules = [
        Rule(page_link, follow=True),
        Rule(content_link, callback='parse_content'),
    ]

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list. list() matters on
        # Python 3: a bare filter object would be exhausted after the
        # offsite middleware's first scan.
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(BlogSpider, self).__init__(*args, **kwargs)

    def parse_content(self, response):
        item = Blog()  # project item class
        url = response.url
        title = response.xpath(
            '//h2[@class="titName SG_txta"]/text()').extract()[0].strip()
        item['url'] = url
        item['title'] = title
        yield item
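
# Because BlogSpider is a RedisCrawlSpider, it idles until a start URL is
# pushed onto its 'blog:start_urls' list. A seeding sketch with the redis-py
# client; the host, port and seed URL are assumptions:
import redis

r = redis.Redis(host='localhost', port=6379)
r.lpush('blog:start_urls', 'http://blog.example.com/articlelist.html')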
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class RecipeCrawlSpider0(CrawlSpider):
    name = 'gshow'
    allowed_domains = ['gshow.globo.com']
    start_urls = ['https://gshow.globo.com/receitas-gshow/']
    rules = (
        Rule(LinkExtractor(restrict_xpaths=(
            "//div[@class='load-more gui-color-primary-bg']",)),
            callback='parse_category', follow=True),
        Rule(LinkExtractor(allow=("/receitas/",)),
             callback='parse_item', follow=False),
    )

    def parse_category(self, response):
        yield Request(response.url)

    def parse_item(self, response):
        # DefaultItemLoader and Recipe are project classes.
        item = DefaultItemLoader(item=Recipe(), response=response)
        item.add_xpath('dateModified', "//time[@itemprop='dateModified']/@datetime")
        item.add_xpath('datePublished', "//time[@itemprop='datePublished']/@datetime")
        item.add_xpath('description', "//meta[@name='description']/@content")
        item.add_xpath('image', "//meta[@itemprop='image']/@content")
        item.add_xpath('language', "//html/@lang")
        item.add_xpath('name', "//meta[@name='title']/@content")
        item.add_xpath('recipeIngredient', "//li[@itemprop='recipeIngredient']/text()")
        item.add_xpath('recipeInstructions', "//li[@itemprop='recipeInstructions']/text()")
        item.add_xpath('url', "//link[@rel='canonical']/@href")
        return item.load_item()
import os
import random
import time

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class MstxSpider(CrawlSpider):
    name = 'meishichina'
    allowed_domains = ['home.meishichina.com']
    start_urls = ['http://home.meishichina.com/recipe-type.html']
    rules = (
        Rule(LinkExtractor(allow=r'http://home.meishichina.com/recipe/\w+/$'),
             follow=True),
        Rule(LinkExtractor(allow=r'http://home.meishichina.com/recipe/\w+/page/\d+/$'),
             follow=True),
        Rule(LinkExtractor(allow=r'http://home.meishichina.com/recipe-\d+.html$'),
             callback='save_page'),
    )

    def save_page(self, response):
        name = response.xpath(".//*[@id='recipe_title']/text()").extract()[0]
        print(name)
        cwd = os.path.join(os.getcwd(), 'data', 'meishichina')
        if not os.path.exists(cwd):
            os.makedirs(cwd)
        with open(os.path.join(cwd, name + '.html'), 'wb') as f:
            f.write(response.body)
        time.sleep(random.randint(0, 2))
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule


class MovieSpiders(CrawlSpider):
    name = "doubanmoive"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["http://movie.douban.com/top250"]

    rules = [
        Rule(LinkExtractor(allow=r'http://movie.douban.com/top250\?start=\d+.*')),
        Rule(LinkExtractor(allow=r'http://movie.douban.com/subject/\d+'),
             callback="parse_item"),
    ]

    def parse_item(self, response):
        sel = Selector(response)
        item = DoubanmoiveItem()  # project item class
        item['name'] = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
        item['score'] = sel.xpath(
            '//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
        item['director'] = sel.xpath('//*[@id="info"]/span[1]/a/text()').extract()
        item['classification'] = sel.xpath('//span[@property="v:genre"]/text()').extract()
        item['actor'] = sel.xpath('//*[@id="info"]/span[3]/a[1]/text()').extract()
        return item
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule


class BookSpider(CrawlSpider):
    name = 'book'
    start_urls = ['https://book.douban.com/top250?icn=index-book250-all']
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@class = "pl2"]/a'),
             callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths='//span[@class = "next"]/a'),
             follow=True),
    )

    def parse_item(self, response):
        loader = ItemLoader(item=Douban250Item(), response=response)  # project item class
        loader.add_xpath('BookName', '//h1/span/text()')
        loader.add_xpath(
            'Author',
            '//div[@id="info"]/span[contains(text(),"作者:")]/following-sibling::a[1]/text()')
        loader.add_xpath(
            'Press',
            '//div[@id="info"]/span[contains(text(),"出版社:")]/following::text()[1]')
        loader.add_xpath(
            'Time',
            '//div[@id="info"]/span[contains(text(),"出版年:")]/following::text()[1]')
        loader.add_xpath(
            'Price',
            '//div[@id="info"]/span[contains(text(),"定价:")]/following::text()[1]')
        loader.add_xpath('Score', '//*[contains(@class,"rating_num")]/text()')
        return loader.load_item()
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ArticleSpider(CrawlSpider):
    name = 'articles'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'),
             callback='parse_items', follow=True,
             cb_kwargs={'is_article': True}),
        Rule(LinkExtractor(allow=r'.*'),
             callback='parse_items',
             cb_kwargs={'is_article': False}),
    ]

    def parse_items(self, response, is_article):
        url = response.url
        print('URL is: {}'.format(url))
        title = response.css('h1::text').extract_first()
        if is_article:
            text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
            lastUpdated = response.css('li#footer-info-lastmod::text').extract_first()
            lastUpdated = lastUpdated.replace('This page was last edited on ', '')
            print('Title is: {} '.format(title))
            print('Text is: {}'.format(text))
            print('Last updated: {}'.format(lastUpdated))
        else:
            print('This is not an article: {}'.format(title))
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule


class TripAdvisoryReviews(CrawlSpider):
    name = "TripAdvisoryReviews"
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'CLOSESPIDER_PAGECOUNT': 100,
    }
    start_urls = [
        'https://www.tripadvisor.cl/Hotels-g294292-Los_Lagos_Region-Hotels.html'
    ]
    # Delay between each request Scrapy makes to the site
    download_delay = 1
    allowed_domains = ['tripadvisor.cl']
    rules = (
        # Hotel-list pagination
        Rule(LinkExtractor(allow=r'-oa\d+-'), follow=True),
        # Hotel detail pages
        Rule(LinkExtractor(
            allow=r'/Hotel_Review-',
            restrict_xpaths=[
                '//div[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]'
            ]),
            follow=True),
        # Hotel-review pagination
        Rule(LinkExtractor(allow=r'-or\d+-'), follow=True),
        # Review details on reviewer profiles
        Rule(LinkExtractor(
            allow=r'/Profile/',
            restrict_xpaths=['//div[@data-test-target="reviews-tab"]']),
            follow=True, callback='parse_opinion'),
    )

    def parse_opinion(self, response):
        # Each user has several reviews, so iterate over all of them
        sel = Selector(response)
        opiniones = sel.xpath('//div[@id="content"]/div/div')
        autor = sel.xpath('//h1/span/text()').get()
        for opinion in opiniones:
            item = ItemLoader(Opinion(), opinion)  # project item class
            item.add_value('autor', autor)
            item.add_xpath('titulo', './/div[@class="_3IEJ3tAK _2K4zZcBv"]/text()')
            # div[@title] => divs that carry a title attribute
            item.add_xpath(
                'hotel',
                './/div[contains(@class, "ui_card section")]//div[@title]/text()')
            item.add_xpath(
                'contenido', './/q/text()',
                MapCompose(lambda i: i.replace('\n', '').replace('\r', '')))
            item.add_xpath(
                'calificacion',
                './/div[contains(@class, "ui_card section")]//a/div/span[contains(@class, "ui_bubble_rating")]/@class',
                MapCompose(lambda i: i.split('_')[-1]))
            yield item.load_item()
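
# A sketch of the Opinion item the loader above assumes: the field names are
# taken from the add_value/add_xpath calls, while the output processors are
# assumptions (without them, ItemLoader fields stay lists).
import scrapy
from scrapy.loader.processors import Join, TakeFirst


class Opinion(scrapy.Item):
    autor = scrapy.Field(output_processor=TakeFirst())
    titulo = scrapy.Field(output_processor=TakeFirst())
    hotel = scrapy.Field(output_processor=TakeFirst())
    contenido = scrapy.Field(output_processor=Join())
    calificacion = scrapy.Field(output_processor=TakeFirst())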
import re

from bs4 import BeautifulSoup
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.response import get_base_url


class CommentarySpider(CrawlSpider):
    name = 'CommentarySpider'
    allowed_domains = ['wallstreetcn.com']
    start_urls = [
        'http://wallstreetcn.com/news?status=published&type=news&order=-created_at&limit=30&page=1',
    ]
    rules = [
        Rule(LxmlLinkExtractor(allow=(r"page=\d+",))),
        Rule(LxmlLinkExtractor(allow=(r"node/\d+",)), follow=True,
             callback='parse_commentary'),
    ]

    def parse_commentary(self, response):
        sel = response.selector
        item = CommentaryItem()  # project item class

        # uri
        item['uri'] = get_base_url(response)
        print('Download from uri: %s' % item['uri'])
        # log.msg('Download from uri: %s' % item['uri'])

        # title
        _ = sel.xpath('//h1[@class="article-title"]/text()')
        item['title'] = '' if not _ else _[0].extract()

        # time
        _ = sel.xpath('//span[@class="item time"]/text()')
        _time = '' if not _ else _[0].extract()
        if not _time:
            item['time'] = None
        else:
            _time = re.sub(r'[^\u4E00-\u9FA5\s]', '-', _time)
            _time = _time[:10] + 'T' + _time[12:] + 'Z'
            item['time'] = _time

        # author
        _ = sel.xpath('//span[@class="item author"]/a/text()')
        item['author'] = '' if not _ else _[0].extract()

        # description
        _ = sel.xpath('//meta[@name="description"]/@content')
        item['description'] = '' if not _ else _[0].extract()[:-84]

        # content, image URLs and raw view
        _ = sel.xpath('//div[@class="article-content"]').extract()[0]
        _view = _[:-123] + '</div>' if len(_) > 200 else _
        _content = BeautifulSoup(_view, 'html.parser')
        item['content'] = _content.text
        _image_urls = []
        for img in _content.find_all('img'):
            # has_key() is Python 2 only; bs4 tags expose has_attr()
            if img.has_attr('src') and img['src'].startswith('http'):
                _image_urls.append(img['src'])
            elif img.has_attr('alt') and img['alt'].startswith('http'):
                _image_urls.append(img['alt'])
        item['image_urls'] = _image_urls
        item['view'] = _view
        return item
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule


class IGNSpider(CrawlSpider):
    # Combines horizontal (listing) and vertical (detail) crawling.
    name = "IGN"
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
        # Maximum number of pages to download items from; Scrapy closes
        # itself once this count is reached.
        'CLOSESPIDER_PAGECOUNT': 30,
    }
    allowed_domains = ['latam.ign.com']
    start_urls = ['https://latam.ign.com/se/?type=video&q=nintendo%20switch']
    # Delay between consecutive requests to the site
    download_delay = 1

    rules = (
        # Horizontal traversal by content type
        Rule(LinkExtractor(allow=r'type='), follow=True),
        # Horizontal traversal by pagination
        Rule(LinkExtractor(allow=r'&page=\d+'), follow=True),
        # One rule per content type:
        # Articles
        Rule(LinkExtractor(allow=r'/news/'), follow=True, callback='parse_articulo'),
        # Reviews
        Rule(LinkExtractor(allow=r'/review/'), follow=True, callback='parse_review'),
        # Videos
        Rule(LinkExtractor(allow=r'/video/'), follow=True, callback='parse_video'),
    )

    def parse_articulo(self, response):
        item = ItemLoader(Articulo(), response)  # project item class
        item.add_xpath('Titulo', './/h1/text()')
        item.add_xpath('Contenido', './/div[@id="id_text"]//*/text()')
        yield item.load_item()

    def parse_review(self, response):
        item = ItemLoader(Reviews(), response)  # project item class
        item.add_xpath('Titulo', './/div[@class="article-headline"]/h1/text()')
        item.add_xpath(
            'Calificacion',
            '//span[@class="side-wrapper side-wrapper hexagon-content"]/text()')
        yield item.load_item()

    def parse_video(self, response):
        item = ItemLoader(Videos(), response)  # project item class
        item.add_xpath('Titulo', './/h1/text()')
        item.add_xpath('FechaPublicacion', './/span[@class="publish-date"]/text()')
        yield item.load_item()
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CrawlQidianSpider(CrawlSpider):
    name = 'Crawl-qidian'
    allowed_domains = ['www.qidian.com', 'book.qidian.com']
    # start_urls = ['http://www.qidian.com/']
    start_urls = [
        'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page=1'
    ]

    # Link extractors for the listing pagination and the detail pages
    main_page = LinkExtractor(
        allow=r'action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page=\d',
        restrict_xpaths='//*[@class="lbf-pagination-next"]')
    page_info = LinkExtractor(allow=r'//book.qidian.com/info/\d')

    rules = (
        Rule(main_page, follow=False),
        Rule(page_info, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        try:
            book_name = response.xpath(
                ".//*[@class='book-info ']/h1/em/text()").extract()[0]
        except IndexError:
            book_name = 'unknown'
        try:
            book_author = response.xpath(
                ".//*[@class='book-info ']/h1/span/a/text()").extract()[0]
        except IndexError:
            book_author = 'unknown'
        try:
            book_span_tags = response.xpath(
                ".//*[@class='book-info ']/p[@class='tag']/span/text()").extract()
        except IndexError:
            book_span_tags = 'unknown'
        try:
            book_a_tags = response.xpath(
                ".//*[@class='book-info ']/p[@class='tag']/a/text()").extract()
        except IndexError:
            book_a_tags = 'unknown'
        try:
            book_intro = response.xpath(
                ".//*[@class='book-info ']/p[@class='intro']/text()").extract()[0]
        except IndexError:
            book_intro = 'unknown'
        try:
            book_score_str = response.xpath(".//*[@id='j_bookScore']")
            book_score = (
                book_score_str.xpath(".//span/*[@id='score1']/text()").extract()[0]
                + book_score_str.xpath(".//span/em/text()").extract()[0]
                + book_score_str.xpath(".//*[@id='score2']/text()").extract()[0])
        except IndexError:
            book_score = 'unknown'
        # print(response.body)
        print(book_name, book_author, book_span_tags, book_a_tags, book_intro)
from datetime import datetime

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Project helpers assumed in scope: getBrands, cleanString, filterBrands,
# allowed_shipping_list and the CouponsItem item class.


class OgenSpider(CrawlSpider):
    name = 'ogen'
    undetectable = False
    wait = False
    allowed_domains = ['ogen.org.il']
    start_urls = ['https://ogen.org.il/']
    brands = getBrands()

    # ajax_url = 'https://ogen.org.il/wp-admin/admin-ajax.php'
    # payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"action\"\r\n\r\nmatat_filter\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"data\"\r\n\r\nminPrice=0&maxPrice=1000\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"security\"\r\n\r\ncb03a93ccd\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"
    # headers = {
    #     'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
    #     'cache-control': "no-cache",
    #     'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    # }

    rules = [
        Rule(LinkExtractor(allow=('/product-category/',)), follow=True),
        Rule(LinkExtractor(allow=('/product/',)), callback='parse'),
    ]

    # def start_requests(self):
    #     yield scrapy.Request(self.start_urls[0], callback=self.ajax_parse)

    # def ajax_parse(self, response):
    #     result = requests.request("POST", self.ajax_url, data=self.payload, headers=self.headers)
    #     response = HtmlResponse(self.ajax_url, body=result.text, encoding='utf-8')
    #     products = [i for i in response.css("a::attr(href)").extract() if re.search(r"/product/", i)]
    #     for links in products:
    #         yield scrapy.Request(links, callback=self.parse)
    #     return super(OgenSpider, self).start_requests()

    def __init__(self, *args, **kwargs):
        super(OgenSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        description = cleanString(response.css("div.short-info p::text").getall())
        title = cleanString(response.css("h2.product-name::text").get()) + \
            cleanString(response.css("div.price").extract())
        yield CouponsItem(
            Title=title,
            supplier='992',
            brand=filterBrands(
                cleanString(response.css("h2.product-name::text").get()),
                self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
from scrapy.crawler import CrawlerRunner
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor


class StackoverflowSpider(CrawlSpider):
    name = "stackoverflow"
    start_urls = []
    allowed_domains = []
    url = ''
    # Note: 'url' is still empty when the class body builds these rules,
    # and an empty allow pattern matches every link.
    rules = (
        Rule(LinkExtractor(allow=url), callback='parse_asd'),
        Rule(LinkExtractor(allow=url), follow=True),
    )

    def parse_asd(self, response):
        item = QuestionItem()  # project item class; Ch is a project helper
        for quote in response.css('html'):
            counters_anal = quote.css('script').extract()
            # Presence checks for Google Analytics and Yandex.Metrica snippets
            yee = 'Yes' if 'https://www.google-analytics.com/analytics.js' in str(counters_anal) else 'No'
            res = 'Yes' if 'mc.yandex.ru/metrika' in str(counters_anal) else 'No'
            title = quote.css('title::text').extract_first()
            item['title'] = Ch.check('title', title)
            description = quote.css(
                'meta[name*=description]::attr(content), '
                'meta[name*=Description]::attr(content)').extract()
            h1 = quote.css('h1::text').extract()
            h2 = quote.css('h2::text, H2::text').extract()
            item['description'] = Ch.check('description', description)
            item['h1'] = Ch.check('h1', h1)
            item['h2'] = Ch.check('h2', h2)
            item['keyword'] = quote.css(
                'meta[name*=Keywords]::attr(content), '
                'meta[name*=keywords]::attr(content)').extract()
            item['link'] = response.url
            item['text'] = quote.css('p::text, span::text').extract()
            item['googl_anal'] = yee
            item['yandex_metrick'] = res
        return item

    def start_spider(self, url, short_url):
        self.start_urls.append(url)
        self.allowed_domains.append(short_url)
        self.url = url
        settings = get_project_settings()
        configure_logging(settings=settings)
        runner = CrawlerRunner(settings=settings)
        d = runner.crawl(StackoverflowSpider)
        d.addCallback(lambda response: reactor.stop())
        reactor.run(installSignalHandlers=0)
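
# For comparison, the reactor-driven pattern documented for CrawlerRunner in
# Scrapy looks roughly like this; a sketch, not a drop-in replacement for the
# start_spider method above:
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())
deferred = runner.crawl(StackoverflowSpider)
deferred.addBoth(lambda _: reactor.stop())  # stop the reactor on success or failure
reactor.run()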
import re
from datetime import datetime

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Project helpers assumed in scope: getBrands, itemHandler, categoryHandler,
# my_selenium_request_processor, cleanString, similar, filterBrands,
# allowed_shipping_list and the CouponsItem item class.


class DinersSpider(CrawlSpider):
    name = 'diners'
    undetectable = True
    wait = True
    elementId = 'cal-shop-brand'
    allowed_domains = ['diners-store.co.il']
    start_urls = ['https://www.diners-store.co.il/']
    brands = getBrands()
    integrator = '-כותרת משנה'  # Hebrew: "subtitle", used as a title separator
    rules = [
        Rule(LinkExtractor(allow=(), process_value=itemHandler),
             callback='parse', process_request=my_selenium_request_processor,
             follow=False),
        Rule(LinkExtractor(allow=(), process_value=categoryHandler),
             process_request=my_selenium_request_processor, follow=True),
    ]

    def __init__(self, *args, **kwargs):
        super(DinersSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        description = cleanString(response.css("div#full-description-text").extract())
        if not description:
            description = cleanString(
                response.css("div.banner-club-big-text-box").extract())
        greenbox = cleanString(response.css("h1.productTitle").extract()) + \
            cleanString(response.css("div.productSubTitle").extract())
        big_redbox = cleanString(response.css("td.product-list-checkboxes").extract())
        if similar(greenbox, big_redbox) > 0.9:
            title = greenbox
        else:
            m = re.search(r"'PriceDiscount':\s'\d+'", str(response.body))
            low_price = m.group(0) if m else ''
            title = (greenbox + self.integrator + big_redbox
                     + low_price.replace("'PriceDiscount':", ''))
        yield CouponsItem(
            Title=title,
            supplier='16',
            brand=filterBrands(
                cleanString(response.css("h1.productTitle").extract()),
                self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
import re
import urllib.parse
from datetime import datetime

import requests
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Project helpers assumed in scope: getBrands, my_selenium_request_processor,
# cleanString, filterBrands, allowed_shipping_list and the CouponsItem item class.


class PaisplusSpider(CrawlSpider):
    name = 'pais'
    undetectable = True
    wait = True
    elementId = 'accesability_container'
    allowed_domains = ['paisplus.co.il']
    apiBase = 'https://data.dolcemaster.co.il'
    start_urls = ['https://paisplus.co.il/']
    siteUuid = 'BBAD629F-E549-4612-9EAE-3AA9E85F1C33'
    linkBase = 'https://www.paisplus.co.il/benefits/'
    getBenefitDetails = urllib.parse.urljoin(apiBase, 'api/v5_1/public/benefits_details')
    headers = {'Accept': 'application/json'}
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=('/category/',)),
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=('/benefits/',)), callback='parse',
             process_request=my_selenium_request_processor, follow=False),
    ]

    def __init__(self, *args, **kwargs):
        super(PaisplusSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        m = re.search(r'https://www\.paisplus\.co\.il/benefits/(.+)/', response.url)
        if m:
            benefit_id = m.group(1)
            formdata = {'club_id': self.siteUuid, 'benefits_id': benefit_id}
            r = requests.post(self.getBenefitDetails, json=formdata)
            if r.status_code == 200:
                data = r.json().get('benefits')[0]
                description = cleanString(data['benefits_description'])
                title = cleanString(data['benefits_name'])
                yield CouponsItem(
                    Title=title,
                    supplier='991',
                    brand=filterBrands(description, self.brands),
                    JoinUrl=self.linkBase + data['benefits_id'],
                    Description=description,
                    ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                    DoorToDoorShipping=any(ext in (description + title)
                                           for ext in allowed_shipping_list),
                    cyclerun=self.cycleid)
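
# The benefit lookup above blocks on the synchronous 'requests' library from
# inside parse(). Scrapy's JsonRequest could keep the call asynchronous; a
# method sketch for this spider, assuming the API accepts the same JSON body
# (parse_benefit is a hypothetical callback that would unpack the response):
import re

from scrapy.http import JsonRequest


def parse(self, response):
    m = re.search(r'https://www\.paisplus\.co\.il/benefits/(.+)/', response.url)
    if m:
        formdata = {'club_id': self.siteUuid, 'benefits_id': m.group(1)}
        yield JsonRequest(self.getBenefitDetails, data=formdata,
                          callback=self.parse_benefit)  # hypothetical callback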
import sqlite3

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# SQLITE_FILE, SQLITE_TABLE and the MoviespiderItem item class are project
# definitions.


class AshvsashSpider(CrawlSpider):
    name = 'ashvsash'
    allowed_domains = ['m.ashvsash.com']
    start_urls = ['http://m.ashvsash.com/category/电影/']  # 电影 = "movies"
    rules = [
        Rule(LinkExtractor(allow=r'page/\d+'), follow=True),
        Rule(LinkExtractor(allow=r'/\d{4}/\d{2}/\d+'), callback='parse_item',
             follow=True, process_links='process_links'),
    ]

    def __init__(self):
        CrawlSpider.__init__(self)
        self.sqlite_file = SQLITE_FILE
        self.sqlite_table = SQLITE_TABLE
        self.conn = sqlite3.connect(self.sqlite_file)

    def parse_item(self, response):
        item = MoviespiderItem()
        article = response.css('div.article_container')
        name = article.css('h1::text').extract_first()
        image = article.css('.context img::attr(src)').extract_first()
        link = response.url
        ctime = article.css('.article_info .info_date::text').extract_first()
        category = article.css('.article_info .info_category a::text').extract_first()
        description = article.css('div[id=post_content]').extract_first()
        pan = response.css('.context h2').extract()[-1]

        item['name'] = name
        item['image'] = image
        item['link'] = link
        item['ctime'] = ctime
        item['category'] = category
        item['description'] = description
        item['pan'] = pan
        yield item

    def process_links(self, links):
        # Normalize URLs and drop links already recorded in SQLite
        for link in links:
            url = link.url
            if url.endswith('/#respond'):
                url = url[:-len('/#respond')]
            if url.endswith('/'):
                url = url.strip('/')
            cur = self.conn.execute(
                'select count(*) from tb_link where link=?;', (url,))
            if cur.fetchone()[0] == 0:
                yield link
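
# The dedup query in process_links assumes a 'tb_link' table already exists.
# A minimal schema sketch that would satisfy it: the 'link' column name comes
# from the query, while the file name stands in for the project's SQLITE_FILE.
import sqlite3

conn = sqlite3.connect('movies.db')
conn.execute('CREATE TABLE IF NOT EXISTS tb_link (link TEXT UNIQUE)')
conn.commit()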
import re
import urllib.parse

import requests
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Project helpers assumed in scope: getBrands, process_item_id,
# process_item_href, process_lst and process_cat.


class HvrSpider(CrawlSpider):
    name = 'hvr'
    undetectable = True
    elementId = 'wrap'
    wait = True
    allowed_domains = ['hvr.co.il']
    start_urls = []
    signin_url = 'https://hvr.co.il/signin.aspx'
    usrEId, username = '******', '052046133'
    pwdEId, password = '******', '5167722'
    brands = getBrands()
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    rules = [
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('data-item_id',), process_value=process_item_id),
             callback="parse_item", follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('href',), process_value=process_item_href),
             callback="parse_item", follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('href',), process_value=process_lst),
             callback="parse_lst", follow=True),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('href',), process_value=process_cat),
             follow=True),
    ]

    def start_requests(self):
        yield scrapy.Request(
            self.signin_url, callback=self.after_login,
            meta={"selenium": True, "login": True, "elementId": "tz"})

    def after_login(self, response):
        # Collect the .json endpoints referenced by title/data-json attributes
        result = [re.search(r"(.+?)\.json", i).group(0)
                  for i in response.css("div::attr(title)").extract()
                  if re.search(r"(.+?)\.json", i)]
        result2 = [re.search(r"(.+?)\.json", i).group(0)
                   for i in response.css("div::attr(data-json)").extract()
                   if re.search(r"(.+?)\.json", i)]
        for uri in result + result2:
            m_uri = re.search(r"(?<=\\).*", uri)
            uri = m_uri.group(0) if m_uri else uri
            r = requests.get('https://www.hvr.co.il/ajax/' + uri, headers=self.headers)
            m = re.findall(r"(page|url)':\s?'(.+?)'", str(r.json())) if r.status_code == 200 else None
            if m:
                for t, s in m:
                    n = re.search(r"(?=home_page\.aspx).*", s)
                    if t == 'page':
                        url = 'https://www.hvr.co.il/home_page.aspx?page=' + s
                    elif t == 'url' and n:
                        url = 'https://www.hvr.co.il/' + n.group(0)
                    self.start_urls.append(url)
        self.start_urls.append(response.url)
        return super(HvrSpider, self).start_requests()

    def parse_lst(self, response):
        template_links = re.findall(r'template_link:\s?"(.+)\d{5,8}"', str(response.body))
        for uri in template_links:
            yield scrapy.Request(
                urllib.parse.urljoin('https://www.hvr.co.il/', uri),
                callback=self.parse_item)

    def parse_item(self, response):
        print(response.url)

    # Handle start URLs the same way as item pages
    parse_start_url = parse_item
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

# Subclasses the CrawlSpiderWithParseMethod mock-server spider defined below.


class CrawlSpiderWithErrback(CrawlSpiderWithParseMethod):
    name = 'crawl_spider_with_errback'
    rules = (Rule(LinkExtractor(), callback='parse', errback='errback', follow=True),)

    def start_requests(self):
        test_body = b"""
        <html>
            <head><title>Page title</title></head>
            <body>
                <p><a href="/status?n=200">Item 200</a></p>  <!-- callback -->
                <p><a href="/status?n=201">Item 201</a></p>  <!-- callback -->
                <p><a href="/status?n=404">Item 404</a></p>  <!-- errback -->
                <p><a href="/status?n=500">Item 500</a></p>  <!-- errback -->
                <p><a href="/status?n=501">Item 501</a></p>  <!-- errback -->
            </body>
        </html>
        """
        url = self.mockserver.url("/alpayload")
        yield Request(url, method="POST", body=test_body)

    def errback(self, failure):
        self.logger.info('[errback] status %i', failure.value.response.status)
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class LiepinSpiderSpider(CrawlSpider):
    name = 'liepin_spider'
    allowed_domains = ['liepin.com']
    start_urls = [
        'https://www.liepin.com/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&d_sfrom=search_fp&key=python'
    ]
    # The last Rule in rules needs a trailing comma so it stays a tuple
    rules = (
        Rule(LinkExtractor(allow=r"https://www.liepin.com/job/\d+\.shtml.*",
                           restrict_xpaths=['//ul[@class="sojob-list"]//a']),
             callback="parse_job", follow=False),
        # Rule(LinkExtractor(allow=r"zhaopin/.+?curPage=\d+", restrict_xpaths=["//div[@class='pager']//a"]), follow=True)
    )

    def parse_job(self, response):
        title = response.css(".title-info h1::text").get()
        salary = response.css(".job-title-left p::text").get().strip()
        edu = response.css(".job-qualifications span:nth-child(1) ::text").get()
        experience = response.css(".job-qualifications span:nth-child(2) ::text").get()
        work_need_list = response.css(".content-word::text").getall()
        work_need = "".join(work_need_list).strip()
        item = LiepinItem(title=title, salary=salary, edu=edu,
                          experience=experience, work_need=work_need)  # project item class
        yield item
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from simpleDemo.items import QiubaiItem


class QiubaiSpider(CrawlSpider):
    name = 'qiubai'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/8hr/page/1/']
    rules = [
        # allow: extract links matching the regex (deny is the opposite)
        # restrict_xpaths: combine an XPath region with the regex
        # allow_domains / deny_domains also exist
        # follow: whether to keep following links from matched pages
        Rule(LinkExtractor(allow=r'.*?/8hr/page/\d'),
             callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        user_info = response.xpath('//div[@class="author clearfix"]')
        content_info = response.xpath('//div[@class="content"]')
        for li, content in zip(user_info, content_info):
            item = QiubaiItem()
            item['author'] = li.xpath('.//h2/text()').extract()[0]
            lineContent = ""
            contentList = content.xpath('span/text()').extract()
            for line in contentList:
                lineContent += line
            item['content'] = lineContent
            yield item
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# MockServerSpider comes from Scrapy's test helpers.


class CrawlSpiderWithParseMethod(MockServerSpider, CrawlSpider):
    """
    A CrawlSpider which overrides the 'parse' method
    """
    name = 'crawl_spider_with_parse_method'
    custom_settings: dict = {
        'RETRY_HTTP_CODES': [],  # no need to retry
    }
    rules = (Rule(LinkExtractor(), callback='parse', follow=True),)

    def start_requests(self):
        test_body = b"""
        <html>
            <head><title>Page title</title></head>
            <body>
                <p><a href="/status?n=200">Item 200</a></p>  <!-- callback -->
                <p><a href="/status?n=201">Item 201</a></p>  <!-- callback -->
            </body>
        </html>
        """
        url = self.mockserver.url("/alpayload")
        yield Request(url, method="POST", body=test_body)

    def parse(self, response, foo=None):
        self.logger.info('[parse] status %i (foo: %s)', response.status, foo)
        yield Request(self.mockserver.url("/status?n=202"),
                      self.parse, cb_kwargs={"foo": "bar"})
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CrawlSpiderWithErrback(MockServerSpider, CrawlSpider):
    name = 'crawl_spider_with_errback'
    custom_settings = {
        'RETRY_HTTP_CODES': [],  # no need to retry
    }
    rules = (Rule(LinkExtractor(), callback='callback', errback='errback', follow=True),)

    def start_requests(self):
        test_body = b"""
        <html>
            <head><title>Page title</title></head>
            <body>
                <p><a href="/status?n=200">Item 200</a></p>  <!-- callback -->
                <p><a href="/status?n=201">Item 201</a></p>  <!-- callback -->
                <p><a href="/status?n=404">Item 404</a></p>  <!-- errback -->
                <p><a href="/status?n=500">Item 500</a></p>  <!-- errback -->
                <p><a href="/status?n=501">Item 501</a></p>  <!-- errback -->
            </body>
        </html>
        """
        url = self.mockserver.url("/alpayload")
        yield Request(url, method="POST", body=test_body)

    def callback(self, response):
        self.logger.info('[callback] status %i', response.status)

    def errback(self, failure):
        self.logger.info('[errback] status %i', failure.value.response.status)
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class DemoSpider(CrawlSpider):
    name = 'demo'
    allowed_domains = ['www.transfermarkt.com']
    start_urls = ['https://www.transfermarkt.com/statistik/saisontransfers']
    # allowed_domains = ['how2j.cn']
    # start_urls = ['https://how2j.cn/stage/33.html']
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//li[@class="naechste-seite"]/a'),
             callback='parse_next', follow=True),
    )

    def parse_next(self, response):
        html = response.xpath(
            '//div[@class="responsive-table"]/div[@class="grid-view"]/table/tbody/tr')
        # html = response.xpath('//a[@class="list-group-item moduleItemLeft"]/span')
        for each in html:
            item = MyscrapyItem()  # project item class
            name = each.xpath(
                './td[2]/table[@class="inline-table"]/tr[1]/td[@class="hauptlink"]/a/text()'
            ).extract()
            age = each.xpath('./td[3]/text()').extract()
            value = each.xpath('./td[4]/text()').extract()
            # name = each.xpath('./td[2]/table/tbody/tr[2]/td/text()').extract()
            # name = each.xpath('./text()').extract()
            item['name'] = name[0]
            item['age'] = age[0]
            item['value'] = value[0]
            # item['name'] = '111'
            yield item
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# ProductLinkGetter, RestaurantIDsGetter, RestaurantSelector and the
# Restaurant item are project classes.


class RestaurantSpider(CrawlSpider):
    name = "RestaurantSpider"
    allowed_domains = ["domiciliosbogota.com"]
    start_urls = ('http://www.domiciliosbogota.com/',)
    productLinkGetter = ProductLinkGetter()
    rules = [
        Rule(LinkExtractor(allow=r"http://www\.domiciliosbogota\.com/$"),
             'parseMain')
    ]

    def parseMain(self, response):
        self.restaurantIDsGetter = RestaurantIDsGetter(response)
        linksExtractor = LinkExtractor(
            allow=r"http://www\.domiciliosbogota\.com/domicilios-.*")
        for link in linksExtractor.extract_links(response):
            yield Request(link.url, callback=self.parseRestaurants)

    def parseRestaurants(self, response):
        sel = RestaurantSelector(response)
        restaurant = Restaurant()
        restaurant["url"] = response.url
        restaurant["name"] = sel.getName()
        restaurant["id"] = self.restaurantIDsGetter.getID(
            "/" + response.url.split("/")[-1])
        restaurant["deliveryTimeInMinutes"] = sel.getDeliveryTimeInMinutes()
        restaurant["minOrderPrice"] = sel.getMinOrderPrice()
        restaurant["deliveryCost"] = sel.getDeliveryCost()
        restaurant["payMethods"] = sel.getPayMethods()
        restaurant["menu"] = sel.getMenuCategories()
        restaurant["tagCategories"] = sel.getTagCategories()
        restaurant["averagePunctuation"] = sel.getAveragePunctuation()
        restaurant["quantityOfComments"] = sel.getQuantityOfComments()
        return restaurant
import scrapy
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule


class RustSpider(scrapy.Spider):
    name = "rust"
    # Offsite filtering matches on domain only, so the domain goes here
    # without the /rheumatology path.
    allowed_domains = ["academic.oup.com"]
    start_urls = (
        "https://academic.oup.com/rheumatology/list-of-issues/2000?jn=Rheumatology",
    )
    # Note: plain scrapy.Spider ignores CrawlSpider 'rules'; links are
    # followed manually in parse() instead.
    rules = [
        Rule(LinkExtractor(canonicalize=True, unique=True),
             follow=True, callback="parse_items")
    ]

    def parse(self, response):
        extractor = LinkExtractor()
        for link in extractor.extract_links(response):
            yield Request(url=link.url, callback=self.parse_article,
                          dont_filter=False)

    def parse_article(self, response):
        # If the page links to a PDF, emit its URL
        for href in response.css('a[href$=".pdf"]::attr(href)').extract():
            print(href)
            yield {'pdf_url': href}  # a bare string is not a valid item
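
# To download the PDFs rather than just record their URLs, one option is
# Scrapy's FilesPipeline, which consumes a 'file_urls' item field. A sketch
# as a hypothetical subclass; the FILES_STORE path is an assumption:
class RustPdfSpider(RustSpider):
    name = "rust_pdf"
    custom_settings = {
        'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1},
        'FILES_STORE': './downloads',
    }

    def parse_article(self, response):
        for href in response.css('a[href$=".pdf"]::attr(href)').extract():
            # urljoin resolves relative hrefs against the page URL
            yield {'file_urls': [response.urljoin(href)]}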
import logging

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

logger = logging.getLogger(__name__)


class V1Crawler(CrawlSpider):
    name = "v1crawler"
    custom_settings = {}
    allowed_domains = ['news.baidu.com']
    start_urls = ['http://news.baidu.com']
    rules = (
        Rule(LinkExtractor(), callback='parse_rsp'),
    )

    @classmethod
    def schedule_runner(cls):
        print("++++++++++++++++++++++")

    # Do not override the parse method
    def parse_rsp(self, response):
        logger.info("+++++ %s +++++" % response.url)
from scrapy import Request
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# formatLink and the HttpScrapperItem item class are project definitions.


class FunesSpider(CrawlSpider):
    name = 'funes'
    allowed_domains = []
    handle_httpstatus_all = True
    rules = [
        Rule(LxmlLinkExtractor(allow=(), process_value=formatLink),
             'parse_items', follow=True)
    ]
    # custom_settings = {
    #     'FEED_URI': '%(domain)s.csv'
    # }

    def __init__(self, *args, **kwargs):
        super(FunesSpider, self).__init__(*args, **kwargs)
        self.domain = kwargs.pop("domain", "")
        self.allowed_domains.append(self.domain)

    def start_requests(self):
        yield Request(url='http://' + self.domain + '/')

    def parse_items(self, response):
        item = HttpScrapperItem()
        item["url"] = response.url
        item["status"] = response.status
        return item
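
# FunesSpider takes its target domain as a spider argument. A minimal driver
# sketch; the 'example.com' value is an assumption, and the same can be done
# from the shell with 'scrapy crawl funes -a domain=example.com':
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(FunesSpider, domain='example.com')
process.start()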
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class IataSpider(CrawlSpider):
    '''
    Crawl wikipedia collecting airport data.
    '''
    name = 'iata'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/IATA_airport_code']
    rules = (
        Rule(LinkExtractor(restrict_xpaths=[
            '//div[@class="mw-parser-output"]'
            '//a[contains(@href, "/wiki/List_of_airports_by_IATA")]'
        ]), callback='get_airport'),
    )

    def get_airport(self, response):  # pylint: disable=no-self-use, missing-docstring
        for record in response.xpath('//table[contains(@class, "sortable")]//tr[td]'):
            _x = record.xpath
            airport = {}
            airport['iata'] = _x('.//td[1]/text()').extract_first()
            airport['icao'] = _x('.//td[2]/text()').extract_first()
            airport['name'] = ''.join(_x('.//td[3]//text()').extract())
            airport['location'] = ''.join(_x('.//td[4]//text()').extract())
            airport['time'] = _x('.//td[5]//text()').extract_first()
            airport['dst'] = _x('.//td[6]//text()').extract_first()
            # Strip whitespace; empty or '\n'-only strings become None
            yield AirportItem(**{  # project item class
                k: v.strip() if v and v.strip() else None
                for k, v in airport.items()
            })