Example #1
class PoiSpider(CrawlSpider):
    district = '110108'
    name = 'poi'
    allowed_domains = ['poi86.com']
    start_urls = ('http://www.poi86.com/poi/amap/district/' + district +
                  '/1.html', )

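    # The first rule follows the paginated district listing pages; the second
    # matches individual POI detail pages and hands them to parse_item.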
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(r'http://www.poi86.com/poi/amap/district/' + district +
                       '/\d+.html'))),
        Rule(SgmlLinkExtractor(
            allow=(r'http://www.poi86.com/poi/amap/\d+.html')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = PoiItem()
        # '/html/body/div[2]/div/div[1]/h1'
        item['name'] = response.xpath(
            '/html/body/div[2]/div[1]/div[1]/h1/text()').extract()
        item['address'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[4]/text()').extract()
        item['category'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[6]/text()').extract()
        item['wgs_84'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[7]/text()').extract()
        item['gcj_02'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[8]/text()').extract()
        item['bd_09'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[9]/text()').extract()
        yield item
Example #2
class BloggerSpider(CrawlSpider):
    name = "TheHackerWay"
    start_urls = ['http://thehackerway.com']
    # URLs from which the spider will start crawling
    rules = [
        Rule(SgmlLinkExtractor(allow=[r'/\d{4}']),
             follow=True,
             callback='parse_blog'),
        # r'/\d{4}' : regular expression for http://thehackerway.com/YYYY URLs
        Rule(SgmlLinkExtractor(allow=[r'\d{4}/\d{2}/\d{2}/\w+']),
             callback='parse_blog')
    ]

    # http://thehackerway.com/YYYY/MM/DD/title URLs

    def parse_blog(self, response):
        print 'parsed link %s' % response.url
        hxs = HtmlXPathSelector(response)
        item = HackerWayItem()
        item['title'] = hxs.select(
            '//title/text()').extract()  # XPath selector for the title
        item['author'] = hxs.select(
            "//span[@class='author']/a/text()").extract(
            )  # XPath selector for the author
        item['tag'] = hxs.select("//meta[@property='og:title']/text()"
                                 ).extract()  # XPath selector for the tag
        item['date'] = hxs.select("//span[@class='date']/text()").extract(
        )  # XPath selector for the date
        return item  # Return the item.
Example #3
class AirbnbSpider(RentBaseSpider):
    name = "airbnb"
    allowed_domains = ["airbnb.com"]
    start_urls = ["https://www.airbnb.com/sitemaps"]

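    # The first two rules follow sitemap and pagination links without a callback;
    # the third sends each listing page to parse_product.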
    rules = (
        Rule(
            SgmlLinkExtractor(restrict_xpaths=("//*[@class='sitemap']//a", ))),
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths=("//*[@class='next next_page']/a", ))),
        Rule(SgmlLinkExtractor(restrict_xpaths=("//*[@itemprop='name']/a", )),
             callback="parse_product"),
    )

    def parse_product(self, response):
        """Function extracting values from product page"""

        item = PlaceItem()
        item['item_source'] = response.url
        item['name'] = self.get_name(response)
        item['price'] = self.get_price(response)
        yield item

    def get_name(self, response):
        return response.xpath("//*[@id='listing_name']/text()").extract()[0]

    def get_price(self, response):
        return response.xpath(
            "//*[contains(@class,'book-it__price-amount')]//text()").extract(
            )[0].strip()
Example #4
    def test_attrs(self):
        lx = self.extractor_cls(attrs="href")
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=("href", "src"),
                                tags=("a", "area", "img"),
                                deny_extensions=())
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=None)
        self.assertEqual(lx.extract_links(self.response), [])

        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)
        lx = SgmlLinkExtractor(attrs=("href"))
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])
Example #5
class ListSpider(CrawlSpider):
    # spider name
    name = "tutorial"
    # download delay
    download_delay = 1
    # allowed domains
    allowed_domains = ["news.cnblogs.com"]
    # start URLs
    start_urls = ["https://news.cnblogs.com"]
    # crawl rules
    rules = (
        Rule(SgmlLinkExtractor(
            allow=(r'https://news.cnblogs.com/n/page/\d', ))),
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_content'),
    )

    # parse the content
    def parse_content(self, response):
        item = TutorialItem()
        # current URL
        title = response.selector.xpath(
            '//div[@id="news_title"]')[0].extract().decode('utf-8')
        item['title'] = title
        author = response.selector.xpath('//div[@id="news_info"]/span/a/text()'
                                         )[0].extract().decode('utf-8')
        item['author'] = author
        releasedate = response.selector.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()')[0].extract(
            ).decode('utf-8')
        item['releasedate'] = releasedate
        yield item
Example #6
class DoubanCrawler(CrawlSpider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]
    #allowed_domains = ["fling.seas.upenn.edu/"]
    #start_urls = ["https://fling.seas.upenn.edu/~yinfeiy/"]

    rules = (Rule(
        SgmlLinkExtractor(allow=(
            r'http://movie\.douban\.com/top250\?start=\d+&filter=&type=', ))),
             Rule(SgmlLinkExtractor(
                 allow=(r'http://movie\.douban\.com/subject/\d+', )),
                  callback='parse_page',
                  follow=True))

    def start_requests(self):
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
        }
        for i, url in enumerate(self.start_urls):
            yield Request(url,
                          cookies={'over18': '1'},
                          callback=self.parse_page,
                          headers=headers)

    def parse_page(self, response):
        sel = Selector(response)
        item = DoubanMovieItem()
        item['name'] = sel.xpath(
            '//h1/span[@property="v:itemreviewed"]/text()').extract()
        item['desc'] = sel.xpath(
            '//div/span[@property="v:summary"]/text()').extract()
        item['url'] = response.url
        return item
Example #7
class HomeawaySpider(RentBaseSpider):
    name = "homeaway"
    allowed_domains = ["homeaway.com"]
    start_urls = ['https://www.homeaway.com/search']

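    # Follow the region-refinement and "next" pagination links, then parse each
    # search hit's detail page with parse_product.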
    rules = (
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths=("(//*[@class='region-refinement'])[6]",
                                 "//*[@class='next']/a"))),
        Rule(SgmlLinkExtractor(restrict_xpaths=(
            "//*[@class='hit-content']//*[@class='hit-headline']//a", )),
             callback="parse_product"),
    )

    def parse_product(self, response):
        """Function extracting values from product page"""

        item = PlaceItem()
        item['name'] = self.get_name(response)
        item['price'] = self.get_price(response)
        yield item

    def get_name(self, response):
        return response.xpath(
            "(//*[@class='container hidden-phone']//h1/text())").extract()[0]

    def get_price(self, response):
        # price for some places is unavailable, only available on request
        price = response.xpath("(//*[@class='price-large']/text())").extract()
        not_available_message = "Available on Inquiry"
        return price[0] if price else not_available_message
Example #8
 def test_deny_extensions(self):
     html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
     response = HtmlResponse("http://example.org/", body=html)
     lx = SgmlLinkExtractor(deny_extensions="jpg")
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.org/page.html', text=u'asd'),
     ])
Example #9
    def test_attrs(self):
        lx = self.extractor_cls(attrs="href")
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=("href","src"), tags=("a","area","img"), deny_extensions=())
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=None)
        self.assertEqual(lx.extract_links(self.response), [])

        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)
        lx = SgmlLinkExtractor(attrs=("href"))
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])
Example #10
 def test_attrs_sgml(self):
     html = """<html><area href="sample1.html"></area>
     <a ref="sample2.html">sample text 2</a></html>"""
     response = HtmlResponse("http://example.com/index.html", body=html)
     lx = SgmlLinkExtractor(attrs="href")
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.com/sample1.html', text=u''),
     ])
Example #11
 def __init__(self, selector=None, type='css', *args, **kwargs):
     if selector:
         if type not in ['css', 'xpath']:
             raise Exception('Selector type not supported.')
         if type == 'xpath':
             kwargs['restrict_xpaths'] = selector
         else:
             kwargs['restrict_xpaths'] = pyquery.PyQuery('a')._css_to_xpath(selector)
     SgmlLinkExtractor.__init__(self, *args, **kwargs)
Example #12
 def test_link_nofollow(self):
     html = """
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     """
     response = HtmlResponse("http://example.org/page.html", body=html)
     lx = SgmlLinkExtractor()
     self.assertEqual([link for link in lx.extract_links(response)], [
         Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
         Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
     ])
Example #13
class homespider(CrawlSpider):
    name = 'home'

    allowed_domains = ['qd.fang.lianjia.com']
    start_urls = []
    for i in range(1, 48):
        start_urls.append('http://qd.fang.lianjia.com/loupan/pg' + str(i))

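    # Follow development ("loupan") links inside the con-box container, then parse
    # each project's detail ("xiangqing") page with parse_item.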
    rules = (
        Rule(
            SgmlLinkExtractor(allow=('loupan/p_\w+', ),
                              restrict_xpaths="//div[@class = 'con-box']")),
        Rule(SgmlLinkExtractor(allow=('loupan/p_\w+/xiangqing/')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        torrent = Home_item()
        #=======================================================================
        # deny = ('loupan/p_\w+/xiangce.*', 'loupan/p_\w+/dongtai.*', 'loupan/p_\w+/pinglun.*','loupan/p_\w+/huxingtu.*','loupan/p_\w+/tuijian.*','loupan/p_\w+/peitao.*','loupan/p_\w+/%.*','loupan/p_\w+/xiangqing/%.*','loupan/p_\w+/xiangqing/.+')
        # torrent['name'] = response.xpath("//div[@class = 'col-1']/h2/a/text()").extract()
        # torrent['address'] = response.xpath("//span[@class = 'region']/text()").extract()
        # torrent['price'] = response.xpath("//span[@class = 'num']/text()").extract()
        # torrent['area'] = response.xpath("//div[@class = 'area']/text()").extract()
        # torrent['square'] = response.xpath("//div[@class = 'area']/span/text()").extract()
        #=======================================================================

        torrent['name'] = response.css("div.resb-name::text").extract()
        torrent['price'] = response.css(
            "ul.x-box span.label-val span::text").extract()
        torrent['where'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li/span[@class = 'label-val']/a/text()"
        ).extract()
        torrent['address'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li[5]/span[@class = 'label-val']/text()"
        ).extract()
        torrent['sellor'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li[7]/span[@class = 'label-val']/text()"
        ).extract()
        torrent['opentime'] = response.css("span.fq-open span::text").extract()
        torrent['gettime'] = response.css(
            "span.fq-handover span::text").extract()
        torrent['alltime'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[3]/li[8]/span[@class = 'label-val']/text()"
        ).extract()

        #  torrent['name'] = response.css("a.clear h1::text").extract()
        #  torrent['address'] = response.css("span.region::text").extract()
        #   torrent['price'] = response.css("p.jiage span.junjia::text").extract()
        # torrent['area'] = response.css("div.area::text").extract()
        #  torrent['square'] = response.css("div.area span::text").extract()
        return torrent
Example #14
class MantaSpider(CrawlSpider):
    name = 'manta'
    allowed_domains = ['manta.com']

    rules = (Rule(SgmlLinkExtractor(allow=r'Items/'),
                  callback='parse_item',
                  follow=True),
             Rule(SgmlLinkExtractor(allow=r"/c/[^/]*/[^/]*$"),
                  callback='parse_company_detail',
                  follow=True))

    def __init__(self, term=None, *args, **kwargs):
        super(MantaSpider, self).__init__(*args, **kwargs)
        if term:
            self.start_urls = ['http://www.manta.com/mb?search=%s' % term]
        else:
            self.start_urls = ['http://www.manta.com/']

    def parse_start_url(self, response):
        return self.parse_company(response)

    def parse_search_result(self, response):
        hxs = HtmlXPathSelector(response)
        elems = hxs.select('//a[contains(@class, "nextYes")]/@href').extract()
        if len(elems) >= 1:
            yield Request(elems[0], callback=self.parse_company)

    def parse_company(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for h in hxs.select('//div[contains(@class, "pbl")]'):
            c = Company()
            c['name'] = h.select('*/h2[@itemprop="name"]/a/text()').extract()
            c['manta_url'] = h.select(
                '*/h2[@itemprop="name"]/a/@href').extract()
            c['street'] = h.select(
                '*/div[@itemprop="streetAddress"]/text()').extract()
            c['locality'] = h.select(
                '*/div[@itemprop="addressLocality"]/text()').extract()
            c['region'] = h.select(
                '*/div[@itemprop="addressRegion"]/text()').extract()
            c['postal_code'] = h.select(
                '*/div[@itemprop="postalCode"]/text()').extract()
            c['phone'] = h.select(
                '*/div[@itemprop="telephone"]/text()').extract()
            c['website'] = h.select('*/div[@itemprop="url"]/text()').extract()
            yield c

    def parse_company_detail(self, response):
        print(response)
Example #15
 def _init_args(self, **kwargs):
     start_url = kwargs.get('START_URL', '')
     if start_url:
         self.start_urls = [start_url]
     self.rules = (Rule(SgmlLinkExtractor(allow=filter_rules),
                        callback="parse_resp",
                        follow=True,
                        process_links=self.put_links), )
     self.headers = {
         'Host': 'cn.futureelectronics.com',
         'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
         'Referer': 'http://cn.futureelectronics.com/zh/Pages/index.aspx'
     }
     self.cookies = {
         'SelectedCurrency': 'NY',
         'SelectedLanguage': 'zh-CN',
     }
     # product search
     self.product_url_pattern_0 = re.compile(filter_rules[0], re.IGNORECASE)
     # check whether a url is a product-detail url
     self.product_url_pattern_1 = re.compile(filter_rules[1], re.IGNORECASE)
     self.product_url_pattern_2 = re.compile(filter_rules[2], re.IGNORECASE)
     # extract product_id from the product-detail url and use it as goods_sn
     self.product_id_pattern_1 = re.compile(r'ProductID=([^&]+)',
                                            re.IGNORECASE)
     self.product_id_pattern_2 = re.compile(r'/Pages/(.*)\.aspx',
                                            re.IGNORECASE)
     # number of products per page
     self.limit_num = 10.0
Example #16
 def test_link_nofollow(self):
     html = """
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     <a href="http://google.com/something" rel="external nofollow">Something</a>
     """
     response = HtmlResponse("http://example.org/page.html", body=html)
     lx = SgmlLinkExtractor()
     self.assertEqual(
         [link for link in lx.extract_links(response)],
         [
             Link(url="http://example.org/page.html?action=print", text=u"Printer-friendly page", nofollow=True),
             Link(url="http://example.org/about.html", text=u"About us", nofollow=False),
             Link(url="http://google.com/something", text=u"Something", nofollow=True),
         ],
     )
Example #17
    def __init__(self, book_key, ct, *args, **kwargs):
        self.book_key = book_key
        self.ct = ct
        self.start_urls = ["http://m.88dushu.com/mulu/" + book_key + "-1/"]
        self.rules = (
            Rule(SgmlLinkExtractor(
                allow=(r'http://m.88dushu.com/mulu/' + book_key + '-\d+/', ),
                restrict_xpaths=('//a[text()="%s"]' % (self.nextpage))),
                 follow=True),
            Rule(SgmlLinkExtractor(
                allow=(r'http://m.88dushu.com/book/' + book_key + '-\d+/', ),
                restrict_xpaths=('//ul[@class="chapter" and not(@id)]')),
                 callback='parse_content',
                 follow=False),
        )

        super(ChapterSpider, self).__init__(*args, **kwargs)
Example #18
class MySpider(CrawlSpider):
    name = "stimdi"
    allowed_domains = ["stimdi.se"]
    start_urls = ["http://www.stimdi.se/tidslinjen/"]

    rules = (Rule(SgmlLinkExtractor(
        allow=(), restrict_xpaths=('//*[@id="content"]/div/div/h2/a')),
                  callback="parse",
                  follow=True), )

    def parse(self, response):
        i = 0
        print i
        for div in response.xpath('//*[@id="content"]/div/div'):
            print "IN FOR"
            item = AfeventItem()
            #Store data into lists
            item['title'] = div.xpath('//h2/a/text()').extract()[i]
            item['url'] = div.xpath('//h2/a/@href').extract()[i]
            item['location'] = ''
            item['description'] = div.xpath(
                '//*[@id="content"]/div/div[1]/a[1]/p/text()').extract()[i]

            #The following code changes the format of the date
            origDate = div.xpath('//p/text()').extract()[i]
            newDate = ''.join(origDate).replace(',', '').split()

            #Assign values to month names
            month = [
                "", "januari", "februari", "mars", "april", "maj", "juni",
                "juli", "augusti", "september", "oktober", "november",
                "december"
            ].index(newDate[1])
            #Assign a "0" in the beginning if month number is < 10
            if month < 10:
                zeroMonth = [0, month]
                zeroMonth = ''.join(map(str, zeroMonth))
            else:
                zeroMonth = month

            # same thing as above with the day
            if int(newDate[0]) < 10:
                zeroDate = [0, newDate[0]]
                zeroDate = ''.join(map(str, zeroDate))
            else:
                zeroDate = newDate[0]

            # Puts everything together and stores into item['date']
            finalDate = [newDate[2], zeroMonth, zeroDate]
            item['date'] = '-'.join(finalDate)
            print i

            if i < len(response.xpath('//*[@id="content"]/div/div')):
                print "I IF"
                print len(response.xpath('//*[@id="content"]/div/div'))
                i = i + 1

            yield item
Example #19
class etaoSpider(CrawlSpider):
    # name of spiders
    name = 'Spider'
    allowed_domains = ['gouwu.sogou.com']
    start_urls = [('http://gouwu.sogou.com/shop?query=' + searchWord)
                  for searchWord in lstData().lst]
    link_extractor = {
        'page': SgmlLinkExtractor(allow='/detail/\d+\.html.+'),
        'page_down': SgmlLinkExtractor(
            allow='/shop\?query=.+',
        ),  #restrict_xpaths = '//a[@class = "pagination-next"]'
    }
    _x_query = {
        'title': '//p[@class="title"]/a/@title',
        'name':
        '//span[@class="floatR hui61 mt1"]/text()',  #//li[2]/a/div[@class="ruyitao-market-name ruyitao-market-name-hightlight"]/text()
        'price':
        '//span[@class="shopprice font17"]/text()',  # 'price'    :    '//span[@class = "price"]/text()',
    }

    def __init__(self):
        CrawlSpider.__init__(self)
        # use any browser you wish
        self.browser = webdriver.Firefox()

    def __del__(self):
        self.browser.close()

    def parse(self, response):
        #crawl all display page
        for link in self.link_extractor['page_down'].extract_links(response):
            yield Request(url=link.url, callback=self.parse)
        #start browser
        self.browser.get(response.url)
        #loading time interval
        time.sleep(5)
        # get the data and write it to scrapy items
        etaoItem_loader = ItemLoader(item=EtaoItem(), response=response)
        url = str(response.url)
        etaoItem_loader.add_value('url', url)
        etaoItem_loader.add_xpath('title', self._x_query['title'])
        etaoItem_loader.add_xpath('name', self._x_query['name'])
        etaoItem_loader.add_xpath('price', self._x_query['price'])
        yield etaoItem_loader.load_item()
Example #20
class PiaohuaCrawlSpider(CrawlSpider):
    name = "PiaohuaCrawlSpider"
    allowed_domains = ['piaohua.com']
    start_urls = [
        'http://piaohua.com/html/aiqing/index.html',
        'http://piaohua.com/html/kehuan/index.html',
    ]
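    # Follow the "list_" pagination links in the page navigation block and parse
    # every matched listing page with parse_item.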
    rules = [
        Rule(SgmlLinkExtractor(allow=('list_'),
                               restrict_xpaths=("//div[@class='page']/a")),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        # print "parse_item>>>>>>"
        items = []
        sel = Selector(response)
        movie_list = sel.xpath("//div[@id='nml']//dl")
        for movie in movie_list:
            item = PiaohuaItem()
            item['linkurl'] = self.getLinkUrl(movie)
            item['name'] = self.getName(movie)
            item['imageurl'] = self.getImageUrl(movie)
            item['type'] = self.getType(response)

            movieDetail = self.getMovieDetail(item['linkurl'])
            # item['downloadlink'] = self.getDownloadLink(movieDetail)
            # item['updatetime'] = self.getUpdateTime(movieDetail)
            items.append(item)
        return items

    def getLinkUrl(self, site):
        return site.xpath("dt/a/@href").extract()[0]

    def getImageUrl(self, site):
        return site.xpath("dt//img/@src").extract()[0]

    def getName(self, site):
        return site.xpath("dd/strong/a/b/font/text()").extract()[0]

    def getType(self, response):
        return response.url.split('/')[-2]

    def getUpdateTime(self, site):
        str = site.xpath(
            "//div[@id='show']/div[@id='showdesc']/text()").extract()[0]
        return re.search(r'.*(\d{4}-\d{2}-\d{2}).*', str).group(1)

    def getDownloadLink(self, site):
        return site.xpath("//anchor/a/text()").extract()

    def getMovieDetail(self, url):
        url = 'http://piaohua.com' + url
        return Selector(Request(url=url))
Example #21
 def test_restrict_xpaths_with_html_entities(self):
     html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
     response = HtmlResponse("http://example.org/somepage/index.html",
                             body=html,
                             encoding='iso8859-15')
     links = SgmlLinkExtractor(
         restrict_xpaths='//p').extract_links(response)
     self.assertEqual(links, [
         Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC',
              text=u'text')
     ])
Example #22
class bbcSpider(CrawlSpider):
    name = "bbc"

    start_urls = ["http://www.bbc.com"]
    download_delay = 2
    handle_httpstatus_list = [301]

    rules = [
        Rule(
            SgmlLinkExtractor(
                allow=(r"http://www.bbc.com/.*?"),
                #  deny=("http:\/\/.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe).*",
                #          "http:\/\/.*#.*",
                #         "https:\/\/www\.bbc\.com\/w\/index\.php\?.*type=signup.*",
                #        "https:\/\/www\.bbc\.com\/w\/index\.php\?.*action=.*",
                #       "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Talk:.*",
                #           "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Category:.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Special:.*",
                #          "https:\/\/www\.bbc\.com\/sport.*",
                #          "https:\/\/www\.bbc\.com\/weather.*",
                #          "http:\/\/www\.bbc\.com\/earth.*",
                #          "http:\/\/www.bbc.com\/travel.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Special%3AUserLogin.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Special.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=User_talk:.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=User:.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Template:.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Template_talk:.*"
                # ),
                allow_domains=("www.bbc.com")),
            callback='parse_item',
            follow=True)
    ]

    def parse_item(self, response):
        item = BbcItem()
        title_tmp = response.xpath(
            '//*[@id="page"]//h1//text()').extract_first()
        title = title_tmp
        if title:
            title = title.encode('utf8')
        item['title'] = title
        content_tmp = response.xpath(
            '//*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]//p//text() | //*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]//h2//text()'
        ).extract()
        content = ''
        for con in content_tmp:
            if con[-1] == '.':
                con = con + ' '
            content = content + con.encode('utf-8')
        item['content'] = content
        link = str(response.url)
        item['url'] = link.encode('utf-8')
        return item
Example #23
class MySpider(CrawlSpider):
    name = "af"
    allowed_domains = ["afconsult.com"]
    start_urls = [
        "http://www.afconsult.com/sv/jobba-hos-oss/event-seminarier--massor/"
    ]
    rules = (Rule(SgmlLinkExtractor(
        allow=(), restrict_xpaths=('//*[@id="CalendarContainer"]/div')),
                  callback="parser",
                  follow=True), )

    def parser(self, response):
        i = 0
        for div in response.xpath('//*[@id="CalendarContainer"]/div/div/a'):
            item = AfeventItem()
            print "response.xpath"
            item['title'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/h1/text()'
            ).extract()[i]
            item['venue'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[2]/text()'
            ).extract()[i]
            item['date'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/span/text()'
            ).extract()[i]
            item['time'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[2]/span[2]/span/text()|//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[3]/span/text()[3]'
            ).extract()[i]
            item['url'] = div.xpath(
                '//*[@id="mainContent"]/section/div/div/nav/ul/li[4]/a/@href'
            ).extract()[i]
            follow_url_1 = div.xpath(
                '//*[@id="mainContent"]/section/div/div/nav/ul/li[4]/a/@href'
            ).extract()[i]
            follow_url = 'http://www.afconsult.com' + follow_url_1
            request = Request(follow_url, callback=self.parse_url)
            request.meta['item'] = item

            if i < len(
                    response.xpath('//*[@id="CalendarContainer"]/div/div/a')):
                i = i + 1
                print i
            yield request

    def parse_url(self, response):
        item = response.meta['item']
        item['description'] = ''.join(
            response.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article//text()'
            ).extract())
        print "parse_url"
        yield item
Example #24
class HwzSpider(CrawlSpider):
    name = "hwz"
    allowed_domains = ["hardwarezone.com.sg"]
    start_urls = [
        "http://forums.hardwarezone.com.sg/current-affairs-lounge-17/"
    ]
    rules = (
        # Extract links matching 'garage-sales-18/.*html' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(SgmlLinkExtractor(allow=('garage\-sales\-18/.*\.html', )), callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(
            allow=('current\-affairs\-lounge\-17/.*\.html', )),
             callback='parse_item',
             follow=True),

        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        #Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
    )

    def insert_posts(self, posts):
        return

    """
    When writing crawl spider rules, avoid using parse as callback, since the CrawlSpider uses the parse method itself to implement its logic. So if you override the parse method, the crawl spider will no longer work.
    """

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        posts = hxs.select("//div[@id='posts']/div[@class='post-wrapper']")
        items = []

        for post in posts:
            item = {}
            item['author_id'] = ''.join(
                post.select(".//a[@class='bigusername']/text()").extract())
            item['url'] = response.url
            item['body'] = '\n'.join(
                map(lambda x: x.strip('\t\n\r'),
                    post.select(".//td[@class='alt1']/div/text()").extract()))
            item['title'] = '\n'.join(
                map(lambda x: x.strip('\t\n\r'),
                    post.select(
                        "//h2[@class='header-gray']/text()").extract()))
            item['date_posted'] = ''.join(
                map(lambda x: x.strip(' \t\n\r#').strip(),
                    post.select(".//td[@class='thead']/text()").extract())
            )  # todo: deal with Today and Yesterday
            # item['date_posted'] = normalizeFriendlyDate(' '.join(map(lambda x:x.strip(' \t\n\r'),post.select(".//td[@class='thead']/text()").extract()))) # todo: deal with Today and Yesterday
            items.append(item)
        # self.insert_posts(items)
        print(items)
        return items
Example #25
 def __init__(self, process_idx, book_class, *args, **kwargs):
     self.idx = int(process_idx)
     self.book_class = book_class
     if self.idx > 0:
         self.start_urls = [
             "http://m.88dushu.com/wapsort/" + book_class + "-" +
             process_idx + "0/"
         ]
         allow_url = r'http://m.88dushu.com/wapsort/' + book_class + r'-' + process_idx + r'[1-9]/'
     else:
         self.start_urls = [
             "http://m.88dushu.com/wapsort/" + book_class + "-1/"
         ]
         allow_url = r'http://m.88dushu.com/wapsort/' + book_class + r'-[1-9]/'
     self.rules = (
         Rule(
             SgmlLinkExtractor(allow=(allow_url, ),
                               restrict_xpaths=('//a[text()="%s"]' %
                                                (self.nextpage2)))),
         Rule(SgmlLinkExtractor(
             allow=(r'http://m.88dushu.com/info/\d+/', ),
             restrict_xpaths=('//div[@class="block_img"]')),
              callback='parse_book',
              follow=False),
         Rule(SgmlLinkExtractor(allow=(r'http://m.88dushu.com/mulu/\d+/', ),
                                restrict_xpaths=('//a[text()="%s"]' %
                                                 (self.startRead))),
              follow=True),
         Rule(SgmlLinkExtractor(
             allow=(r'http://m.88dushu.com/mulu/\d+-\d+/', ),
             restrict_xpaths=('//a[text()="%s"]' % (self.nextpage))),
              follow=True),
         Rule(SgmlLinkExtractor(
             allow=(r'http://m.88dushu.com/book/\d+-\d+/', ),
             restrict_xpaths=('//ul[@class="chapter" and not(@id)]')),
              callback='parse_content',
              follow=False),
     )
     super(ListSpider, self).__init__(*args, **kwargs)
Example #26
class QQNewsSpider(CrawlSpider):
    # spider name
    name = "tutorial"
    # download delay
    download_delay = 1
    # allowed domains
    allowed_domains = ["news.cnblogs.com"]
    # start URLs
    start_urls = ["https://news.cnblogs.com"]
    # crawl rules; a Rule without a callback just follows matching URLs recursively
    rules = [
        Rule(SgmlLinkExtractor(
            allow=(r'https://news.cnblogs.com/n/page/\d', ))),
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_item'),
    ]

    # content-parsing callback
    def parse_item(self, response):
        print('***********************')
        item = QqnewsItem()

        # current URL
        title = response.selector.xpath(
            '//*[@id="news_title"]/a')[0].extract().decode('utf-8')
        item['title'] = title
        print(title)

        author = response.selector.xpath('//div[@id="news_info"]/span/a/text()'
                                         )[0].extract().decode('utf-8')
        item['author'] = author

        release_date = response.selector.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()')[0].extract(
            ).decode('utf-8')
        item['release_date'] = release_date

        yield item
Example #27
class CnblogsSpider(CrawlSpider):
    # spider name
    name = 'cnblogs'  # unique identifier, specified when launching the spider
    # download delay
    download_delay = 2
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['https://news.cnblogs.com/']
    # crawl rules; a Rule without a callback just follows matching URLs recursively
    rules = (
        # URLs matching this rule are not scraped for content; only their links are extracted
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/page/\d', ))
             ),
        # URLs matching this rule have their content extracted
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse'))

    # content-parsing callback
    def parse(self, response):
        # current URL
        for resp in response.selector.xpath('//div[@class="content"]'):
            item = ScrapyspiderItem()

            title = resp.xpath('h2/a/text()').extract()
            item['title'] = title[0].decode('utf-8')

            url = resp.xpath('h2/a/@href').extract()
            item['url'] = 'https://news.cnblogs.com' + url[0].decode('utf-8')

            author = resp.xpath(
                'div[@class="entry_footer"]/a/text()').extract()
            item['author'] = author[0].strip().decode('utf-8')

            date = resp.xpath(
                'div[@class="entry_footer"]/span[@class="gray"]/text()'
            ).extract()
            item['date'] = date[0].decode('utf-8')

            yield item
Example #28
class RecursiveScraperSpider(CrawlSpider):
    name = "rs"
    allowed_domains = ["cse.iitd.ernet.in"]
    start_urls = ["http://www.cse.iitd.ernet.in/~naveen"]
    rules = (
        Rule(SgmlLinkExtractor(allow=("cse\.iitd\.ernet\.in/~naveen/.*", )), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        sel = Selector(response)
        item = RecursivescraperItem()
        item['URL'] = response.request.url
        item['content'] = sel.xpath('/html/body/table/tbody/tr[3]/td[1]/text()[1]').extract()
        return item
Example #29
class CSDNBlogCrawlSpider(CrawlSpider):
    name = "CSDNBlogCrawlSpider"
    allowed_domains = ['blog.csdn.net']
    start_urls = ['http://blog.csdn.net/u012150179/article/details/11749017']
    rules = [
        Rule(SgmlLinkExtractor(allow=('/u012150179/article/details'),
                               restrict_xpaths=('//li[@class="next_article"]')),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        # print "parse_item>>>>>>"
        item = {}
        sel = Selector(response)
        blog_url = str(response.url)
        blog_name = sel.xpath('//div[@id="article_details"]/div/h1/span/a/text()').extract()

        item['blog_name'] = [n.encode('utf-8') for n in blog_name]
        item['blog_url'] = blog_url.encode('utf-8')

        yield item

    def getImageUrl(self, item):
        imageurl = item.xpath("a[@class='img']/img/@src").extract()
        if imageurl:
            return imageurl[0]
        else:
            return ''

    def getLink(self, item):
        link = item.xpath("a[@class='img']/@href").extract()
        if link:
            return link[0]
        else:
            return ''

    def getUpdateTime(self, item):
        updatetime = item.xpath("span/text()").extract()
        if updatetime:
            return updatetime[0]
        else:
            return item.xpath("span/font/text()").extract()[0]

    def getName(self, item):
        name = item.xpath("a/strong/font/font/text()").extract()
        if name:
            return name[0]
        else:
            return item.xpath("a/strong/font/text()").extract()[0]
Example #30
class ZhilianSpider(scrapy.Spider):
    name = "zhilian"
    # allowed_domains =
    start_urls = ["http://jobs.zhaopin.com/bj2140003/"]

    rules = (Rule(
        SgmlLinkExtractor(allow=(r'http://jobs.zhaopin.com/[0-9]+.htm', )),
        callback='parse_page',
        follow=True), )

    def parse_page(self, response):
        sel = Selector(response)
        item = ZhiLianItems()
        item['name'] = sel.xpath('/html/body/div[5]/div[1]/div[1]/h1/text()')
        item['company'] = sel.xpath(
            '/html/body/div[5]/div[1]/div[1]/h2/a/text()')
        return item
Example #31
class ArticleSpider(CrawlSpider):
    name = "article"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = [
        "http://en.wikipedia.org/wiki/Python_%28programming_language%29"
    ]
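    # Follow internal /wiki/ links whose path contains no colon (skipping
    # namespaced pages such as Special: or Category:) and parse them with parse_item.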
    rules = [
        Rule(SgmlLinkExtractor(allow=('(/wiki/)((?!:).)*$'), ),
             callback="parse_item",
             follow=True)
    ]

    def parse_item(self, response):
        item = Article()
        title = response.xpath('//h1/text()')[0].extract()
        print("Title is: " + title)
        item['title'] = title
        return item
Example #32
class NoticiasSpider(CrawlSpider):
	name = 'NoticiasSpider'
	allowed_domains = ['20minutos.es']
	start_urls = ['http://www.20minutos.es/']
	rules = (
		Rule(SgmlLinkExtractor(allow=(r'deportes/noticia/(\w|\d|-|/)*/', )), callback='parse_news', follow=False),
	)

	def parse_news(self, response):
		hxs = HtmlXPathSelector(response)
		elemento = Noticia()

		elemento['titulo'] = hxs.select('//h1[contains(@class, "article-title")]/text()')[0].extract()
		elemento['titulo'] = elemento['titulo'].encode('utf-8')
		elemento['fecha'] = hxs.select('//a[contains(@title, "Noticias del ")]/text()')[0].extract()
		elemento['fecha'] = elemento['fecha'].encode('utf-8')
		elemento['enlace'] = response.url

		return elemento
Example #33
class StackSpider(CrawlSpider):
    name = "stackcrawl"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://stackoverflow.com/questions?sort=newest",
    ]
    rules = (Rule(SgmlLinkExtractor(allow=('&page=\d')),
                  callback='parse',
                  follow=True), )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        questions = hxs.xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = StackItem()
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            yield item
Example #34
 def parse(self, response):
     link_ex = SgmlLinkExtractor(allow=(r'https://movie.douban.com/subject/\d+'))
     for i in link_ex.extract_links(response):
         yield Request(i.url, callback=self.parse_item, headers=self.headers)