Example #1
class QidianSpider(RedisCrawlSpider):
    name = 'qidian'
    allowed_domains = ['qidian.com']

    start_urls = ['https://www.qidian.com/all']
    rules = [
        Rule(LinkExtractor(restrict_css=('.all-img-list .book-img-box a',)), callback='parse_profile_page', follow=True),
        Rule(LinkExtractor(restrict_css=('.lbf-pagination-item-list .lbf-pagination-next ',)), follow=True),
    ]

    def parse_profile_page(self, response):
        # self.logger.debug('Parse Profile Page. URL :  %s' % response.url)
        book = BookItem()

        name = response.css('.book-information .book-info  h1 em::text').extract_first()
        url = response.url
        author = response.css('.book-information .book-info .writer::text').extract_first()

        tag = response.xpath('string(//div[contains(@class,"book-information")]/div[contains(@class,"book-info")]/p[@class="tag"])').extract_first()
        tag = re.sub(r'\s+', ' ', tag)

        words = response.css('.book-information .book-info p em::text').extract_first()
        chapters = response.css('.j_catalog_block a i span::text').extract_first()
        comments = response.css('.j_discussion_block a i span::text').extract_first()

        book['name'] = name
        book['url'] = url
        book['author'] = author
        book['tag'] = tag
        book['words'] = words
        book['chapters'] = chapters
        book['comments'] = comments

        yield book
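The spider populates a BookItem imported from the project's items module, which is not shown here. A minimal sketch consistent with the fields used above (an assumption, not the original project's definition) might be:

import scrapy

# Hypothetical items.py sketch; field names are inferred from the spider above.
class BookItem(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
    author = scrapy.Field()
    tag = scrapy.Field()
    words = scrapy.Field()
    chapters = scrapy.Field()
    comments = scrapy.Field()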
Example #2
class LiepinSpider(CrawlSpider):
	name = 'liepin'
	allowed_domains = ['liepin.com']
	start_urls = ['https://www.liepin.com/zhaopin?key=python']

	rules = (
		# detail page URLs
		Rule(LinkExtractor(
			allow=r"https://www.liepin.com/job/\d+\.shtml.*",
			restrict_xpaths=['//ul[@class="sojob-list"]//a']),
			follow=False, callback="parse_detail"),
		# pagination URLs
		Rule(LinkExtractor(
			allow=r"/zhaopin/.+curPage=\d+",
			restrict_xpaths='//div[@class="pagerbar"]//a'), follow=True),

	)

	def parse_detail(self, response):
		print(response.url)
		title = response.css('.title-info h1::text').get()
		company = response.css('.title-info h3::text').get()
		city_lst = response.css('.basic-infor span::text').getall()
		city = ''.join(city_lst).strip()
		edu = response.css('.job-qualifications span:nth-child(1)::text').get()
		work = response.css('.job-qualifications span:nth-child(2)::text').get()
		desc_lst = response.css('.content-word::text').getall()
		desc = ''.join(desc_lst).strip()
		item = ZhaopinItem(title=title, company=company, city=city, edu=edu, work=work, desc=desc)
		yield item
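ZhaopinItem is built with keyword arguments, which scrapy.Item supports for declared fields. A plausible definition, assuming only the fields used above:

import scrapy

# Hypothetical sketch; scrapy.Item accepts keyword arguments for declared fields,
# so ZhaopinItem(title=..., company=..., ...) works as written in the spider.
class ZhaopinItem(scrapy.Item):
    title = scrapy.Field()
    company = scrapy.Field()
    city = scrapy.Field()
    edu = scrapy.Field()
    work = scrapy.Field()
    desc = scrapy.Field()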
Example #3
class IsracardSpider(CrawlSpider):
    name = 'isracard'
    undetectable = False
    wait = False
    allowed_domains = ['benefits.isracard.co.il']
    start_urls = ['https://benefits.isracard.co.il/']
    brands = getBrands()
    rules = [
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('onclick',), process_value=itemHandler),
             callback='parse', process_request=my_selenium_request_processor, follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('onclick',), process_value=categoryHandler),
             process_request=my_selenium_request_processor, follow=True),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'),
                           attrs=('onclick',), process_value=mainHandler),
             callback='parse', process_request=my_selenium_request_processor, follow=True)
    ]

    def __init__(self, *args, **kwargs):
        super(IsracardSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')   

    def parse(self, response):
        isValid = response.css('.benefit-details-txt').extract_first() is not None
        if isValid:
            description=cleanString(response.css("div.benefit-details-txt").extract())
            title=cleanString(response.css("div.benefit-info h1::text").extract_first())
            yield CouponsItem(Title=title,
                            supplier='996', 
                            brand=filterBrands(cleanString(response.css("div.benefit-info h1::text").extract_first()),self.brands),
                            JoinUrl=response.url,
                            Description=description,
                            ScrapeDate = datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                            DoorToDoorShipping= any(ext in (description+title) for ext in allowed_shipping_list), 
                            cyclerun=self.cycleid )
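This spider (and the Ogen, Diners and Paisplus spiders further down) yields a shared CouponsItem whose definition is not included. A sketch limited to the fields actually passed, offered only as an assumption:

import scrapy

# Hypothetical CouponsItem sketch covering the keyword arguments used by the
# Isracard, Ogen, Diners and Paisplus spiders in this collection.
class CouponsItem(scrapy.Item):
    Title = scrapy.Field()
    supplier = scrapy.Field()
    brand = scrapy.Field()
    JoinUrl = scrapy.Field()
    Description = scrapy.Field()
    ScrapeDate = scrapy.Field()
    DoorToDoorShipping = scrapy.Field()
    cyclerun = scrapy.Field()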
Example #4
class BlogSpider(RedisCrawlSpider):
    name = 'blog'
    # allowed_domains = ['blog.com']
    # start_urls = ['http://blog.com/']
    redis_key = 'blog:start_urls'

    page_link = LinkExtractor(restrict_xpaths=('//li[@class="SG_pgnext"]/a'))
    content_link = LinkExtractor(
        restrict_xpaths=('//span[@class="atc_title"]/a'))
    rules = [
        Rule(page_link, follow=True),
        Rule(content_link, callback='parse_content')
    ]

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(BlogSpider, self).__init__(*args, **kwargs)

    def parse_content(self, response):
        item = Blog()
        url = response.url
        title = response.xpath(
            '//h2[@class="titName SG_txta"]/text()').extract()[0].strip()
        item['url'] = url
        item['title'] = title
        yield item
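The Blog item only needs the two fields set in parse_content; a minimal assumed definition:

import scrapy

# Hypothetical sketch with just the fields the spider assigns.
class Blog(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()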
Example #5
class RecipeCrawlSpider0(CrawlSpider):
    name = 'gshow'

    allowed_domains = ['gshow.globo.com']
    start_urls = ['https://gshow.globo.com/receitas-gshow/']

    rules = (Rule(LinkExtractor(restrict_xpaths=(
        "//div[@class='load-more gui-color-primary-bg']", ), ),
                  callback='parse_category',
                  follow=True),
             Rule(LinkExtractor(allow=("/receitas/", ), ),
                  callback='parse_item',
                  follow=False))

    def parse_category(self, response):
        yield Request(response.url)

    def parse_item(self, response):
        item = DefaultItemLoader(item=Recipe(), response=response)

        item.add_xpath('dateModified',
                       "//time[@itemprop='dateModified']/@datetime")
        item.add_xpath('datePublished',
                       "//time[@itemprop='datePublished']/@datetime")
        item.add_xpath('description', "//meta[@name='description']/@content")
        item.add_xpath('image', "//meta[@itemprop='image']/@content")
        item.add_xpath('language', "//html/@lang")
        item.add_xpath('name', "//meta[@name='title']/@content")
        item.add_xpath('recipeIngredient',
                       "//li[@itemprop='recipeIngredient']/text()")
        item.add_xpath('recipeInstructions',
                       "//li[@itemprop='recipeInstructions']/text()")
        item.add_xpath('url', "//link[@rel='canonical']/@href")

        return item.load_item()
Example #6
class MstxSpider(CrawlSpider):

    name = 'meishichina'
    allowed_domains = ['home.meishichina.com']
    start_urls = ['http://home.meishichina.com/recipe-type.html']

    rules = (Rule(
        LinkExtractor(allow=(r'http://home.meishichina.com/recipe/\w+/$')),
        follow=True),
             Rule(LinkExtractor(
                 allow=(r'http://home.meishichina.com/recipe/\w+/page/\d+/$')),
                  follow=True),
             Rule(LinkExtractor(
                 allow=(r'http://home.meishichina.com/recipe-\d+.html$')),
                  callback='save_page'))

    def save_page(self, response):
        name = response.xpath(".//*[@id='recipe_title']/text()").extract()[0]
        print(name)
        cwd = os.getcwd() + '/data/' + 'meishichina'
        if not os.path.exists(cwd):
            os.makedirs(cwd)
        with open(cwd + '/' + name + '.html', 'wb') as f:
            f.write(response.body)
        time.sleep(random.randint(0, 2))
Example #7
class MovieSpiders(CrawlSpider):
    name = "doubanmoive"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["http://movie.douban.com/top250"]
    rules = [
        Rule(
            LinkExtractor(
                allow=(r'http://movie.douban.com/top250\?start=\d+.*'))),
        Rule(LinkExtractor(allow=(r'http://movie.douban.com/subject/\d+')),
             callback="parse_item"),
    ]

    def parse_item(self, response):
        sel = Selector(response)
        item = DoubanmoiveItem()
        item['name'] = sel.xpath(
            '//*[@id="content"]/h1/span[1]/text()').extract()
        item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(
            r'\((\d+)\)')
        item['score'] = sel.xpath(
            '//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
        item['director'] = sel.xpath(
            '//*[@id="info"]/span[1]/a/text()').extract()
        item['classification'] = sel.xpath(
            '//span[@property="v:genre"]/text()').extract()
        item['actor'] = sel.xpath(
            '//*[@id="info"]/span[3]/a[1]/text()').extract()
        return item
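DoubanmoiveItem is not shown; every field it needs appears in parse_item, so a sketch (assumed, not taken from the original project) is straightforward:

import scrapy

# Hypothetical items definition; each value is stored as the list returned
# by extract()/re(), since the spider does not pick a single element.
class DoubanmoiveItem(scrapy.Item):
    name = scrapy.Field()
    year = scrapy.Field()
    score = scrapy.Field()
    director = scrapy.Field()
    classification = scrapy.Field()
    actor = scrapy.Field()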
Example #8
class BookSpider(CrawlSpider):
    name = 'book'
    start_urls = ['https://book.douban.com/top250?icn=index-book250-all']
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@class = "pl2"]/a'),
             callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths='//span[@class = "next"]/a'),
             follow=True),
    )

    def parse_item(self, response):
        I = ItemLoader(item=Douban250Item(), response=response)
        I.add_xpath('BookName', '//h1/span/text()')
        I.add_xpath(
            'Author',
            '//div[@id="info"]/span[contains(text(),"作者:")]/following-sibling::a[1]/text()'
        )
        I.add_xpath(
            'Press',
            '//div[@id="info"]/span[contains(text(),"出版社:")]/following::text()[1]'
        )
        I.add_xpath(
            'Time',
            '//div[@id="info"]/span[contains(text(),"出版年:")]/following::text()[1]'
        )
        I.add_xpath(
            'Price',
            '//div[@id="info"]/span[contains(text(),"定价:")]/following::text()[1]'
        )
        I.add_xpath('Score', '//*[contains(@class,"rating_num")]/text()')
        return I.load_item()
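Because this example builds the item through ItemLoader, each field would come back as a list unless an output processor collapses it. A sketch of Douban250Item using TakeFirst is one way to get scalar values; this is an assumption about the project, not its actual code (in older Scrapy versions TakeFirst is imported from scrapy.loader.processors):

import scrapy
from itemloaders.processors import TakeFirst

# Hypothetical sketch: TakeFirst keeps the first extracted value per field,
# otherwise ItemLoader stores each field as a list.
class Douban250Item(scrapy.Item):
    BookName = scrapy.Field(output_processor=TakeFirst())
    Author = scrapy.Field(output_processor=TakeFirst())
    Press = scrapy.Field(output_processor=TakeFirst())
    Time = scrapy.Field(output_processor=TakeFirst())
    Price = scrapy.Field(output_processor=TakeFirst())
    Score = scrapy.Field(output_processor=TakeFirst())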
Example #9
class ArticleSpider(CrawlSpider):
    name = 'articles'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        Rule(LinkExtractor(allow='(/wiki/)((?!:).)*$'),
             callback='parse_items',
             follow=True,
             cb_kwargs={'is_article': True}),
        Rule(LinkExtractor(allow=r'.*'),
             callback='parse_items',
             cb_kwargs={'is_article': False})
    ]

    def parse_items(self, response, is_article):
        url = response.url
        print('URL is: {}'.format(url))
        title = response.css('h1::text').extract_first()
        if is_article:
            text = response.xpath(
                '//div[@id="mw-content-text"]//text()').extract()
            lastUpdated = response.css(
                'li#footer-info-lastmod::text').extract_first()
            lastUpdated = lastUpdated.replace('This page was last edited on ',
                                              '')
            print('Title is: {} '.format(title))
            print('Text is: {}'.format(text))
            print('Last updated: {}'.format(lastUpdated))
        else:
            print('This is not an article: {}'.format(title))
Example #10
class TripAdvisoryReviews(CrawlSpider):
    name = "TripAdvisoryReviews"
    custom_settings = {
        'USER_AGENT':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'CLOSESPIDER_PAGECOUNT': 100
    }
    start_urls = [
        'https://www.tripadvisor.cl/Hotels-g294292-Los_Lagos_Region-Hotels.html'
    ]
    # Delay (in seconds) between each request Scrapy sends to the seed page
    download_delay = 1
    allowed_domains = ['tripadvisor.cl']

    rules = (
        # Hotel listing pagination
        Rule(LinkExtractor(allow=r'-oa\d+-'), follow=True),
        # Hotel detail pages
        Rule(LinkExtractor(
            allow=r'/Hotel_Review-',
            restrict_xpaths=[
                '//div[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]'
            ]),
             follow=True),
        # Pagination of hotel reviews
        Rule(LinkExtractor(allow=r'-or\d+-'), follow=True),

        # Review details
        Rule(LinkExtractor(
            allow=r'/Profile/',
            restrict_xpaths=['//div[@data-test-target="reviews-tab"]']),
             follow=True,
             callback='parse_opinion'),
    )

    def parse_opinion(self, response):
        # Each user has many reviews, so iterate over all of them
        sel = Selector(response)
        opiniones = sel.xpath('//div[@id="content"]/div/div')
        autor = sel.xpath('//h1/span/text()').get()

        for opinion in opiniones:
            item = ItemLoader(Opinion(), opinion)
            item.add_value('autor', autor)
            item.add_xpath('titulo',
                           './/div[@class="_3IEJ3tAK _2K4zZcBv"]/text()')
            # div[@title] => divs that have a title attribute
            item.add_xpath(
                'hotel',
                './/div[contains(@class, "ui_card section")]//div[@title]/text()'
            )
            item.add_xpath(
                'contenido', './/q/text()',
                MapCompose(lambda i: i.replace('\n', '').replace('\r', '')))
            item.add_xpath(
                'calificacion',
                './/div[contains(@class, "ui_card section")]//a/div/span[contains(@class, "ui_bubble_rating")]/@class',
                MapCompose(lambda i: i.split('_')[-1]))
            yield item.load_item()
Example #11
class CommentarySpider(CrawlSpider):
    name = 'CommentarySpider'
    allowed_domains = ['wallstreetcn.com']
    start_urls = [
        'http://wallstreetcn.com/news?status=published&type=news&order=-created_at&limit=30&page=1',
    ]
    rules = [
        Rule(LxmlLinkExtractor(allow=("page=\d+",))),
        Rule(LxmlLinkExtractor(allow=("node/\d+")),follow=True,callback='parse_commentary'),
    ]
    def parse_commentary(self, response):
        sel = response.selector
        item = CommentaryItem()
        #get uri
        item['uri'] = get_base_url(response)
        print('Download from uri: %s' % item['uri'])
        # log.msg('Download from uri: %s' % item['uri'])
        #get title
        _ = sel.xpath('//h1[@class="article-title"]/text()')
        item['title'] = '' if not _ else _[0].extract()
        #get time
        _ = sel.xpath('//span[@class="item time"]/text()')
        _time = '' if not _ else _[0].extract()
        if not _time:
            item['time'] = None
        else:
            _time = re.sub(r'[^\u4E00-\u9FA5\s]', '-', _time)
            _time = _time[:10] + 'T' + _time[12:] + 'Z'
            item['time'] = _time
        #get author
        _ = sel.xpath('//span[@class="item author"]/a/text()')
        item['author'] = '' if not _ else _[0].extract()
        #get description
        _ = sel.xpath('//meta[@name="description"]/@content')
        item['description'] = '' if not _ else _[0].extract()[:-84]
        #get content & imgs & view
        _ = sel.xpath('//div[@class="article-content"]')
        _ = _.extract()[0]
        _view = _[:-123]+'</div>' if len(_) > 200 else _
        _content = BeautifulSoup(_view, 'html.parser')
        item['content'] = _content.text
        _image_urls = []
        for img in _content.find_all('img'):
            if img.has_attr('src') and img['src'].startswith('http'):
                _image_urls.append(img['src'])
            elif img.has_attr('alt') and img['alt'].startswith('http'):
                _image_urls.append(img['alt'])
            else:
                continue
        item['image_urls'] = _image_urls
        #item['image_urls'] = [img.src if img.src is not None and img.src.startswith('http') else img.alt if img.alt is not None else None for img in _content.find_all('img')]
        item['view'] = _view
        return item
Example #12
class IGNSpider(CrawlSpider):  # a spider that crawls both vertically and horizontally
    name = "IGN"
    custom_settings = {
        'USER_AGENT':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
        # Maximum number of pages from which items are downloaded; Scrapy shuts down when this limit is reached
        'CLOSESPIDER_PAGECOUNT': 30
    }

    allowed_domains = ['latam.ign.com']

    start_urls = ['https://latam.ign.com/se/?type=video&q=nintendo%20switch']
    # Delay (in seconds) between each request Scrapy sends to the seed page
    download_delay = 1

    rules = (
        # Horizontal crawling by content type
        Rule(LinkExtractor(allow=r'type='), follow=True),
        # Horizontal crawling by pagination
        Rule(LinkExtractor(allow=r'&page=\d+'), follow=True),
        # One rule per content type
        # Article
        Rule(LinkExtractor(allow=r'/news/'),
             follow=True,
             callback='parse_articulo'),
        # Review
        Rule(LinkExtractor(allow=r'/review/'),
             follow=True,
             callback='parse_review'),
        # Videos
        Rule(LinkExtractor(allow=r'/video/'),
             follow=True,
             callback='parse_video'),
    )

    def parse_articulo(self, response):
        item = ItemLoader(Articulo(), response)
        item.add_xpath('Titulo', './/h1/text()')
        item.add_xpath('Contenido', './/div[@id="id_text"]//*/text()')

        yield item.load_item()

    def parse_review(self, response):
        item = ItemLoader(Reviews(), response)
        item.add_xpath('Titulo', './/div[@class="article-headline"]/h1/text()')
        item.add_xpath(
            'Calificacion',
            '//span[@class="side-wrapper side-wrapper hexagon-content"]/text()'
        )

        yield item.load_item()

    def parse_video(self, response):
        item = ItemLoader(Videos(), response)
        item.add_xpath('Titulo', './/h1/text()')
        item.add_xpath('FechaPublicacion',
                       './/span[@class="publish-date"]/text()')

        yield item.load_item()
Example #13
class CrawlQidianSpider(CrawlSpider):
    name = 'Crawl-qidian'
    allowed_domains = ['www.qidian.com', 'book.qidian.com']
    # start_urls = ['http://www.qidian.com/']
    start_urls = [
        'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page=1'
    ]
    # define the link extractors
    # entry-page (pagination) and detail-page links
    main_page = LinkExtractor(
        allow=r'action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page=\d',
        restrict_xpaths='//*[@class="lbf-pagination-next"]')
    page_info = LinkExtractor(allow=r'//book.qidian.com/info/\d')
    rules = (Rule(main_page, follow=False),
             Rule(page_info, callback='parse_item', follow=True))

    def parse_item(self, response):
        try:
            book_name = response.xpath(
                ".//*[@class='book-info ']/h1/em/text()").extract()[0]
        except:
            book_name = 'noknow'
        try:
            book_author = response.xpath(
                ".//*[@class='book-info ']/h1/span/a/text()").extract()[0]
        except:
            book_author = 'noknow'
        try:
            book_span_tags = response.xpath(
                ".//*[@class='book-info ']/p[@class='tag']/span/text()"
            ).extract()
        except:
            book_span_tags = 'noknow'
        try:
            book_a_tags = response.xpath(
                ".//*[@class='book-info ']/p[@class='tag']/a/text()").extract(
                )
        except:
            book_a_tags = 'unknow'
        try:
            book_intro = response.xpath(
                ".//*[@class='book-info ']/p[@class='intro']/text()").extract(
                )[0]
        except:
            book_intro = 'unknow'
        try:
            book_score_str = response.xpath(".//*[@id='j_bookScore']")
            book_score = book_score_str.xpath(
                ".//span/*[@id='score1']/text()").extract(
                )[0] + book_score_str.xpath(
                    ".//span/em/text()").extract()[0] + book_score_str.xpath(
                        ".//*[@id='score2']/text()").extract()[0]
        except:
            book_score = 'unknow'
        # print(response.body)
        print(book_name, book_author, book_span_tags, book_a_tags, book_intro)
        pass
Example #14
class OgenSpider(CrawlSpider):
    name = 'ogen'
    undetectable = False
    wait = False
    allowed_domains = ['ogen.org.il']
    start_urls = ['https://ogen.org.il/']
    brands = getBrands()
    # ajax_url = 'https://ogen.org.il/wp-admin/admin-ajax.php'
    # payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"action\"\r\n\r\nmatat_filter\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"data\"\r\n\r\nminPrice=0&maxPrice=1000\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"security\"\r\n\r\ncb03a93ccd\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"
    # headers = {
    # 'content-type': "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
    # 'cache-control': "no-cache",
    # 'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    # }
    rules = [
        Rule(
            LinkExtractor(allow=('/product-category/')),
            follow=True,
        ),
        Rule(LinkExtractor(allow=('/product/')), callback='parse')
    ]

    # def start_requests(self):
    #     yield scrapy.Request(self.start_urls[0], callback=self.ajax_parse)

    # def ajax_parse(self, response):
    #     result = requests.request("POST", self.ajax_url, data=self.payload, headers=self.headers)
    #     response = HtmlResponse(self.ajax_url, body=result.text, encoding='utf-8')
    #     products = [i for i in response.css("a::attr(href)").extract() if re.search(r"/product/",i)]
    #     for links in products:
    #         yield scrapy.Request(links, callback=self.parse)
    #     return super(OgenSpider, self).start_requests()

    def __init__(self, *args, **kwargs):
        super(OgenSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        description = cleanString(
            response.css("div.short-info p::text").getall())
        title = cleanString(
            response.css("h2.product-name::text").get()) + cleanString(
                response.css("div.price").extract())
        yield CouponsItem(
            Title=title,
            supplier='992',
            brand=filterBrands(
                cleanString(response.css("h2.product-name::text").get()),
                self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
Example #15
class StackoverflowSpider(CrawlSpider):
    name = "stackoverflow"
    start_urls = []
    allowed_domains = []
    url = ''
    rules = (
        Rule(LinkExtractor(allow=url), callback='parse_asd'),
        Rule(LinkExtractor(allow=url), follow=True),
    )

    def parse_asd(self, response):
        item = QuestionItem()
        for quote in response.css('html'):
            counters_anal = quote.css('script').extract()
            if 'https://www.google-analytics.com/analytics.js' in str(
                    counters_anal):
                yee = 'Yes'
            else:
                yee = 'No'
            if 'mc.yandex.ru/metrika' in str(counters_anal):
                res = 'Yes'
            else:
                res = 'No'
            title = quote.css('title::text').extract_first(),

            item['title'] = Ch.check('title', title)
            description = quote.css(
                'meta[name*=description]::attr(content), meta[name*=Description]::attr(content)'
            ).extract(),
            h1 = quote.css('h1::text').extract(),
            h2 = quote.css('h2::text, H2::text').extract(),
            item['description'] = Ch.check('description', description)
            item['h1'] = Ch.check('h1', h1)
            item['h2'] = Ch.check('h2', h2)
            item['keyword'] = quote.css(
                'meta[name*=Keywords]::attr(content), meta[name*=keywords]::attr(content)'
            ).extract(),
            item['link'] = response.url
            item['text'] = quote.css('p::text, span::text').extract(),
            item['googl_anal'] = yee,
            item['yandex_metrick'] = res,
            return item

    def start_spider(self, url, short_url):

        self.start_urls.append(url)
        self.allowed_domains.append(short_url)
        self.url = url
        settings = get_project_settings()
        configure_logging(settings=settings)
        runner = CrawlerRunner(settings=get_project_settings())
        d = runner.crawl(StackoverflowSpider)
        d.addCallback(lambda response: reactor.stop())
        reactor.callLater(3, d.addCallback, None)
        reactor.run(installSignalHandlers=0)
Example #16
class DinersSpider(CrawlSpider):
    name = 'diners'
    undetectable = True
    wait = True
    elementId = 'cal-shop-brand'
    allowed_domains = ['diners-store.co.il']
    start_urls = ['https://www.diners-store.co.il/']
    brands = getBrands()
    integrator = '-כותרת משנה'
    rules = [
        Rule(LinkExtractor(allow=(), process_value=itemHandler),
             callback='parse',
             process_request=my_selenium_request_processor,
             follow=False),
        Rule(LinkExtractor(allow=(), process_value=categoryHandler),
             process_request=my_selenium_request_processor,
             follow=True)
    ]

    def __init__(self, *args, **kwargs):
        super(DinersSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        description = cleanString(
            response.css("div#full-description-text").extract())
        if not description:
            description = cleanString(
                response.css("div.banner-club-big-text-box").extract())
        greenbox = cleanString(
            response.css("h1.productTitle").extract()) + cleanString(
                response.css("div.productSubTitle").extract())
        big_redbox = cleanString(
            response.css("td.product-list-checkboxes").extract())
        if similar(greenbox, big_redbox) > 0.9:
            title = greenbox
        else:
            low_price = re.search(r"'PriceDiscount':\s'\d{1,}'",
                                  str(response.body)).group(0) if re.search(
                                      r"'PriceDiscount':\s'\d{1,}'",
                                      str(response.body)) else ''
            title = greenbox + self.integrator + big_redbox + low_price.replace(
                "'PriceDiscount':", '')
        yield CouponsItem(
            Title=title,
            supplier='16',
            brand=filterBrands(
                cleanString(response.css("h1.productTitle").extract()),
                self.brands),
            JoinUrl=response.url,
            Description=description,
            ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
            DoorToDoorShipping=any(ext in (description + title)
                                   for ext in allowed_shipping_list),
            cyclerun=self.cycleid)
Example #17
class PaisplusSpider(CrawlSpider):
    name = 'pais'
    undetectable = True
    wait = True
    elementId = 'accesability_container'
    allowed_domains = ['paisplus.co.il']
    apiBase = 'https://data.dolcemaster.co.il'
    start_urls = ['https://paisplus.co.il/']
    siteUuid = 'BBAD629F-E549-4612-9EAE-3AA9E85F1C33'
    linkBase = 'https://www.paisplus.co.il/benefits/'
    getBenefitDetails = urllib.parse.urljoin(
        apiBase, f'api/v5_1/public/benefits_details')
    headers = {'Accept': 'Accept: application/json'}
    brands = getBrands()

    rules = [
        Rule(LinkExtractor(allow=('/category/')),
             process_request=my_selenium_request_processor,
             follow=True),
        Rule(LinkExtractor(allow=('/benefits/')),
             callback='parse',
             process_request=my_selenium_request_processor,
             follow=False)
    ]

    def __init__(self, *args, **kwargs):
        super(PaisplusSpider, self).__init__(*args, **kwargs)
        self.cycleid = kwargs.get('cycleid', '')

    def parse(self, response):
        m = re.search(r'https://www\.paisplus\.co\.il/benefits/(.+)/',
                      response.url)
        if m:
            benefit_id = m.group(1)
            formdata = {
                'club_id': f'{self.siteUuid}',
                'benefits_id': f'{benefit_id}'
            }
            r = requests.post(self.getBenefitDetails, json=formdata)
            if r.status_code == 200:
                data = r.json().get('benefits')[0]
                description = cleanString(data['benefits_description'])
                title = cleanString(data['benefits_name'])
                yield CouponsItem(
                    Title=title,
                    supplier='991',
                    brand=filterBrands(description, self.brands),
                    JoinUrl=self.linkBase + data['benefits_id'],
                    Description=description,
                    ScrapeDate=datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
                    DoorToDoorShipping=any(ext in (description + title)
                                           for ext in allowed_shipping_list),
                    cyclerun=self.cycleid)
Example #18
class AshvsashSpider(CrawlSpider):
    name = 'ashvsash'
    allowed_domains = ['m.ashvsash.com']
    start_urls = ['http://m.ashvsash.com/category/电影/']

    rules = [
        Rule(LinkExtractor(allow=r'page/\d+'), follow=True),
        Rule(LinkExtractor(allow=r'/\d{4}/\d{2}/\d+'),
             callback='parse_item',
             follow=True,
             process_links='process_links')
    ]

    def __init__(self):
        CrawlSpider.__init__(self)
        self.sqlite_file = SQLITE_FILE
        self.sqlite_table = SQLITE_TABLE
        self.conn = sqlite3.connect(self.sqlite_file)

    def parse_item(self, response):
        item = MoviespiderItem()
        article = response.css('div.article_container')
        name = article.css('h1::text').extract_first()
        image = article.css('.context img::attr(src)').extract_first()
        link = response.url
        ctime = article.css('.article_info .info_date::text').extract_first()
        category = article.css(
            '.article_info .info_category a::text').extract_first()
        description = article.css('div[id=post_content]').extract_first()
        pan = response.css('.context h2').extract()[-1]
        item['name'] = name
        item['image'] = image
        item['link'] = link
        item['ctime'] = ctime
        item['category'] = category
        item['description'] = description
        item['pan'] = pan
        yield item

    def process_links(self, links):
        for link in links:
            url = link.url
            if url.endswith('/#respond'):
                length = len('/#respond')
                url = url[:-length]
            if url.endswith('/'):
                url = url.strip('/')
            cur = self.conn.execute(
                'select count(*) from tb_link where link=?;', (url, ))
            size = cur.fetchone()[0]
            if size == 0:
                yield link
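MoviespiderItem, SQLITE_FILE and SQLITE_TABLE come from elsewhere in the project; only the item's fields can be read off the spider. An assumed sketch:

import scrapy

# Hypothetical sketch; the SQLITE_FILE / SQLITE_TABLE settings used by the
# spider's __init__ are project configuration and are not reproduced here.
class MoviespiderItem(scrapy.Item):
    name = scrapy.Field()
    image = scrapy.Field()
    link = scrapy.Field()
    ctime = scrapy.Field()
    category = scrapy.Field()
    description = scrapy.Field()
    pan = scrapy.Field()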
Example #19
class HvrSpider(CrawlSpider):
    name = 'hvr'
    undetectable = True
    elementId = 'wrap'
    wait = True
    allowed_domains = ['hvr.co.il']
    start_urls = []
    signin_url = 'https://hvr.co.il/signin.aspx'
    usrEId,username = '******','052046133'
    pwdEId,password = '******','5167722'
    brands = getBrands()
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}
    rules = [
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'), attrs=('data-item_id',),
                           process_value=process_item_id), callback="parse_item", follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'), attrs=('href',),
                           process_value=process_item_href), callback="parse_item", follow=False),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'), attrs=('href',),
                           process_value=process_lst), callback="parse_lst", follow=True),
        Rule(LinkExtractor(allow=(), tags=('div', 'a', 'area', 'button'), attrs=('href',),
                           process_value=process_cat), follow=True)
    ]
    

    def start_requests(self):
        yield scrapy.Request(self.signin_url, callback=self.after_login,meta={"selenium":True,"login":True,"elementId":"tz"})

    def after_login(self,response):
        result = [re.search(r"(.+?)\.json",i).group(0) for i in response.css("div::attr(title)").extract() if re.search(r"(.+?)\.json",i)]
        result2 = [re.search(r"(.+?)\.json",i).group(0) for i in response.css("div::attr(data-json)").extract() if re.search(r"(.+?)\.json",i)]
        for uri in result+result2:
            uri = re.search(r"(?<=\\).*",uri).group(0) if re.search(r"(?<=\\).*",uri) else uri
            r = requests.get('https://www.hvr.co.il/ajax/'+uri,headers=self.headers)
            m = re.findall(r"(page|url)':\s?'(.+?)'",str(r.json())) if r.status_code == 200 else None
            if m:
                for t,s in m:
                    n = re.search(r"(?=home_page\.aspx).*", s)
                    if t == 'page':
                        url = 'https://www.hvr.co.il/home_page.aspx?page=' + s
                    elif t == 'url' and n:
                        url = 'https://www.hvr.co.il/'+n.group(0)
                    self.start_urls.append(url)
        self.start_urls.append(response.url)
        return super(HvrSpider, self).start_requests()


    def parse_lst(self, response):
        template_links = re.findall(r'template_link:\s?"(.+)\d{5,8}"',str(response.body))
        for uri in template_links:
            yield scrapy.Request(urllib.parse.urljoin('https://www.hvr.co.il/',uri), callback=self.parse_item)

    def parse_item(self, response):
        print(response.url)

    
    parse_start_url = parse_item
Example #20
class CrawlSpiderWithErrback(CrawlSpiderWithParseMethod):
    name = 'crawl_spider_with_errback'
    rules = (Rule(LinkExtractor(),
                  callback='parse',
                  errback='errback',
                  follow=True), )

    def start_requests(self):
        test_body = b"""
        <html>
            <head><title>Page title<title></head>
            <body>
                <p><a href="/status?n=200">Item 200</a></p>  <!-- callback -->
                <p><a href="/status?n=201">Item 201</a></p>  <!-- callback -->
                <p><a href="/status?n=404">Item 404</a></p>  <!-- errback -->
                <p><a href="/status?n=500">Item 500</a></p>  <!-- errback -->
                <p><a href="/status?n=501">Item 501</a></p>  <!-- errback -->
            </body>
        </html>
        """
        url = self.mockserver.url("/alpayload")
        yield Request(url, method="POST", body=test_body)

    def errback(self, failure):
        self.logger.info('[errback] status %i', failure.value.response.status)
Example #21
class LiepinSpiderSpider(CrawlSpider):
    name = 'liepin_spider'
    allowed_domains = ['liepin.com']
    start_urls = [
        'https://www.liepin.com/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&d_sfrom=search_fp&key=python'
    ]
    # the last Rule in rules needs a trailing comma (rules is a one-element tuple)

    rules = (
        Rule(LinkExtractor(allow=r"https://www.liepin.com/job/\d+\.shtml.*",
                           restrict_xpaths=['//ul[@class="sojob-list"]//a']),
             callback="parse_job",
             follow=False),
        # Rule(LinkExtractor(allow=r"zhaopin/.+?curPage=\d+",restrict_xpaths=["//div[@class='pager']//a"]),follow=True)
    )

    def parse_job(self, response):
        title = response.css(".title-info h1::text").get()
        salary = response.css(".job-title-left p::text").get().strip()
        edu = response.css(
            ".job-qualifications span:nth-child(1) ::text").get()
        experience = response.css(
            ".job-qualifications span:nth-child(2) ::text").get()
        work_need_list = response.css(".content-word::text").getall()
        work_need = "".join(work_need_list).strip()
        item = LiepinItem(title=title,
                          salary=salary,
                          edu=edu,
                          experience=experience,
                          work_need=work_need)
        yield item
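LiepinItem mirrors the keyword arguments above; an assumed minimal definition:

import scrapy

# Hypothetical sketch matching the keyword arguments in parse_job.
class LiepinItem(scrapy.Item):
    title = scrapy.Field()
    salary = scrapy.Field()
    edu = scrapy.Field()
    experience = scrapy.Field()
    work_need = scrapy.Field()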
Example #22
class QiubaiSpider(CrawlSpider):
    name = 'qiubai'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/8hr/page/1/']

    rules = [
        # allow: extract links that match the given regex; deny is the opposite
        # restrict_xpaths: restrict link extraction to these XPaths, combined with the regex
        # allow_domains, deny_domains
        # follow: whether to keep following links from the matched pages
        Rule(LinkExtractor(allow=r'.*?/8hr/page/\d'),
             callback='parse_item',
             follow=True),
    ]

    def parse_item(self, response):
        user_info = response.xpath('//div[@class="author clearfix"]')
        content_info = response.xpath('//div[@class="content"]')

        for li, content in zip(user_info, content_info):

            from simpleDemo.items import QiubaiItem
            item = QiubaiItem()

            item['author'] = li.xpath('.//h2/text()').extract()[0]

            lineContent = ""
            contentList = content.xpath('span/text()').extract()
            for line in contentList:
                lineContent += line

            item['content'] = lineContent

            yield item
Example #23
class CrawlSpiderWithParseMethod(MockServerSpider, CrawlSpider):
    """
    A CrawlSpider which overrides the 'parse' method
    """
    name = 'crawl_spider_with_parse_method'
    custom_settings: dict = {
        'RETRY_HTTP_CODES': [],  # no need to retry
    }
    rules = (Rule(LinkExtractor(), callback='parse', follow=True), )

    def start_requests(self):
        test_body = b"""
        <html>
            <head><title>Page title<title></head>
            <body>
                <p><a href="/status?n=200">Item 200</a></p>  <!-- callback -->
                <p><a href="/status?n=201">Item 201</a></p>  <!-- callback -->
            </body>
        </html>
        """
        url = self.mockserver.url("/alpayload")
        yield Request(url, method="POST", body=test_body)

    def parse(self, response, foo=None):
        self.logger.info('[parse] status %i (foo: %s)', response.status, foo)
        yield Request(self.mockserver.url("/status?n=202"),
                      self.parse,
                      cb_kwargs={"foo": "bar"})
Example #24
class CrawlSpiderWithErrback(MockServerSpider, CrawlSpider):
    name = 'crawl_spider_with_errback'
    custom_settings = {
        'RETRY_HTTP_CODES': [],  # no need to retry
    }
    rules = (Rule(LinkExtractor(),
                  callback='callback',
                  errback='errback',
                  follow=True), )

    def start_requests(self):
        test_body = b"""
        <html>
            <head><title>Page title<title></head>
            <body>
                <p><a href="/status?n=200">Item 200</a></p>  <!-- callback -->
                <p><a href="/status?n=201">Item 201</a></p>  <!-- callback -->
                <p><a href="/status?n=404">Item 404</a></p>  <!-- errback -->
                <p><a href="/status?n=500">Item 500</a></p>  <!-- errback -->
                <p><a href="/status?n=501">Item 501</a></p>  <!-- errback -->
            </body>
        </html>
        """
        url = self.mockserver.url("/alpayload")
        yield Request(url, method="POST", body=test_body)

    def callback(self, response):
        self.logger.info('[callback] status %i', response.status)

    def errback(self, failure):
        self.logger.info('[errback] status %i', failure.value.response.status)
Example #25
class DemoSpider(CrawlSpider):
    name = 'demo'
    allowed_domains = ['www.transfermarkt.com']
    start_urls = ['https://www.transfermarkt.com/statistik/saisontransfers']
    # allowed_domains = ['how2j.cn']
    # start_urls = ['https://how2j.cn/stage/33.html']
    rules = (
        Rule(LinkExtractor(restrict_xpaths=u'//li[@class="naechste-seite"]/a'), callback='parse_next', follow=True),
    )

    def parse_next(self, response):
        html = response.xpath('//div[@class="responsive-table"]/div[@class="grid-view"]/table/tbody/tr')
        # html = response.xpath('//a[@class="list-group-item moduleItemLeft"]/span')
        for each in html:
            item = MyscrapyItem()
            name = each.xpath('./td[2]/table[@class="inline-table"]/tr[1]/td[@class="hauptlink"]/a/text()').extract()
            age = each.xpath('./td[3]/text()').extract()
            value = each.xpath('./td[4]/text()').extract()
            # name = each.xpath('./td[2]/table/tbody/tr[2]/td/text()').extract()
            # name = each.xpath('./text()').extract()
            item['name'] = name[0]
            item['age'] = age[0]
            item['value'] = value[0]
            # item['name'] = '111'
            yield item
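MyscrapyItem only needs the three fields assigned in parse_next; a minimal assumed sketch:

import scrapy

# Hypothetical sketch with the fields used by parse_next.
class MyscrapyItem(scrapy.Item):
    name = scrapy.Field()
    age = scrapy.Field()
    value = scrapy.Field()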
Example #26
class RestaurantSpider(CrawlSpider):
    name = "RestaurantSpider"
    allowed_domains = ["domiciliosbogota.com"]
    start_urls = ('http://www.domiciliosbogota.com/', )
    productLinkGetter = ProductLinkGetter()
    rules = [
        Rule(LinkExtractor(allow=(r"http://www\.domiciliosbogota\.com/$")),
             'parseMain')
    ]

    def parseMain(self, response):
        self.restaurantIDsGetter = RestaurantIDsGetter(response)
        linksExtractor = LinkExtractor(
            allow=(r"http\:\/\/www\.domiciliosbogota\.com\/domicilios\-.*"))
        links = linksExtractor.extract_links(response)
        for link in links:
            yield Request(link.url, callback=self.parseRestaurants)

    def parseRestaurants(self, response):
        sel = RestaurantSelector(response)
        restaurant = Restaurant()
        restaurant["url"] = response.url
        restaurant["name"] = sel.getName()
        restaurant["id"] = self.restaurantIDsGetter.getID(
            "/" + response.url.split("/")[-1])
        restaurant["deliveryTimeInMinutes"] = sel.getDeliveryTimeInMinutes()
        restaurant["minOrderPrice"] = sel.getMinOrderPrice()
        restaurant["deliveryCost"] = sel.getDeliveryCost()
        restaurant["payMethods"] = sel.getPayMethods()
        restaurant["menu"] = sel.getMenuCategories()
        restaurant["tagCategories"] = sel.getTagCategories()
        restaurant["averagePunctuation"] = sel.getAveragePunctuation()
        restaurant["quantityOfComments"] = sel.getQuantityOfComments()
        return restaurant
Example #27
class RustSpider(scrapy.Spider):
    name = "rust"
    allowed_domains = ["academic.oup.com/rheumatology"]
    start_urls = (
        "https://academic.oup.com/rheumatology/list-of-issues/2000?jn=Rheumatology",
    )

    rules = [
        Rule(LinkExtractor(canonicalize=True, unique=True),
             follow=True,
             callback="parse_items")
    ]

    def parse(self, response):
        extractor = LinkExtractor()
        links = extractor.extract_links(response)
        for link in links:
            l = link.url
            yield Request(url=l,
                          callback=self.parse_article,
                          dont_filter=False)

    def parse_article(self, response):
        # if there is a pdf link
        for href in response.css('a[href$=".pdf"]::attr(href)').extract():
            print(href)
            yield (href)
Example #28
class V1Crawler(CrawlSpider):
    name = "v1crawler"

    custom_settings = {
    }

    allowed_domains = [
        'news.baidu.com'
    ]

    start_urls = [
        'http://news.baidu.com'
    ]

    rules = (
        Rule(LinkExtractor(), callback='parse_rsp'),
    )

    @classmethod
    def schedule_runner(cls):
        print("++++++++++++++++++++++")

    # do not override parse method
    def parse_rsp(self, response):
        logger.info("+++++ %s +++++" % response.url)
        """
class FunesSpider(CrawlSpider):
    name = 'funes'
    allowed_domains = []
    handle_httpstatus_all = True
    rules = [
        Rule(LxmlLinkExtractor(allow=(), process_value=formatLink),
             'parse_items',
             follow=True)
    ]

    #     customs_settings = {
    #       'FEED_URI' : '%(domain).csv'
    #      }
    #
    def __init__(self, *args, **kwargs):
        super(FunesSpider, self).__init__(*args, **kwargs)
        #         self.domain = kwargs.pop("domain","")
        self.allowed_domains.append(self.domain)

    def start_requests(self):
        yield Request(url='http://' + self.domain + '/')

    def parse_items(self, response):
        item = HttpScrapperItem()
        item["url"] = response.url
        item["status"] = response.status
        return item
Example #30
File: iata.py Project: aogier/tries
class IataSpider(CrawlSpider):
    '''
    Crawl wikipedia collecting airport data.
    '''
    name = 'iata'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/IATA_airport_code']

    rules = (Rule(LinkExtractor(restrict_xpaths=[
        '//div[@class="mw-parser-output"]'
        '//a[contains(@href, "/wiki/List_of_airports_by_IATA")]'
    ]),
                  callback='get_airport'), )

    def get_airport(self, response):  # pylint: disable=no-self-use, missing-docstring

        for record in response.xpath(
                '//table[contains(@class, "sortable")]//tr[td]'):
            _x = record.xpath
            airport = {}
            airport['iata'] = _x('.//td[1]/text()').extract_first()
            airport['icao'] = _x('.//td[2]/text()').extract_first()
            airport['name'] = ''.join(_x('.//td[3]//text()').extract())
            airport['location'] = ''.join(_x('.//td[4]//text()').extract())
            airport['time'] = _x('.//td[5]//text()').extract_first()
            airport['dst'] = _x('.//td[6]//text()').extract_first()

            # getting rid of empty or '\n' strings
            yield AirportItem(
                **{
                    k: v.strip() if v and v.strip() else None
                    for k, v in airport.items()
                })
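AirportItem is instantiated with **kwargs, so its fields must match the keys of the airport dict built in get_airport. A sketch under that assumption:

import scrapy

# Hypothetical sketch; the keys of the airport dict above become the item fields.
class AirportItem(scrapy.Item):
    iata = scrapy.Field()
    icao = scrapy.Field()
    name = scrapy.Field()
    location = scrapy.Field()
    time = scrapy.Field()
    dst = scrapy.Field()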