class PoiSpider(CrawlSpider):
    """Crawl poi86.com and extract POI records for one administrative district.

    Follows the district's paginated listing pages, then parses each POI
    detail page into a PoiItem (name, address, category, and coordinates
    in three datums).
    """

    # AMap administrative district code baked into the start/rule URLs below.
    district = '110108'
    name = 'poi'
    allowed_domains = ['poi86.com']
    start_urls = ('http://www.poi86.com/poi/amap/district/' + district +
                  '/1.html', )
    rules = (
        # District pagination pages: follow only (no callback).
        Rule(
            SgmlLinkExtractor(
                allow=(r'http://www.poi86.com/poi/amap/district/' + district +
                       '/\d+.html'))),
        # POI detail pages: parse into items.
        Rule(SgmlLinkExtractor(
            allow=(r'http://www.poi86.com/poi/amap/\d+.html')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract one POI from a detail page.

        NOTE(review): these absolute li[N] XPaths are layout-dependent and
        will silently return [] if the site template shifts — verify against
        a live page.
        """
        item = PoiItem()
        # '/html/body/div[2]/div/div[1]/h1'
        item['name'] = response.xpath(
            '/html/body/div[2]/div[1]/div[1]/h1/text()').extract()
        item['address'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[4]/text()').extract()
        item['category'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[6]/text()').extract()
        # Coordinates in the three common Chinese mapping datums.
        item['wgs_84'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[7]/text()').extract()
        item['gcj_02'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[8]/text()').extract()
        item['bd_09'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[9]/text()').extract()
        yield item
class BloggerSpider(CrawlSpider): name = "TheHackerWay" start_urls = ['http://thehackerway.com'] # urls desde las cuales el spider comenzará el proceso de crawling rules = [ Rule(SgmlLinkExtractor(allow=[r'/\d{4}']), follow=True, callback='parse_blog'), # r'/\d+' : expression regular para http://thehackerway.com/X URLs Rule(SgmlLinkExtractor(allow=[r'\d{4}/\d{2}\d{2}/\w+']), callback='parse_blog') ] # http://thehackerway.com/YYYY/MM/DD/titulo URLs def parse_blog(self, response): print 'link parseado %s' % response.url hxs = HtmlXPathSelector(response) item = HackerWayItem() item['title'] = hxs.select( '//title/text()').extract() # Selector XPath para el titulo item['author'] = hxs.select( "//span[@class='author']/a/text()").extract( ) # Selector XPath para el author item['tag'] = hxs.select("//meta[@property='og:title']/text()" ).extract() # Selector XPath para el tag item['date'] = hxs.select("//span[@class='date']/text()").extract( ) # Selector XPath para la fecha return item # Retornando el Item.
class AirbnbSpider(RentBaseSpider):
    """Crawl airbnb.com from the sitemap index and scrape listing pages.

    Follows sitemap section links and pagination, then parses each listing
    (anchors under itemprop='name') into a PlaceItem.
    """

    name = "airbnb"
    allowed_domains = ["airbnb.com"]
    start_urls = ["https://www.airbnb.com/sitemaps"]
    rules = (
        # Sitemap section links — follow only.
        Rule(
            SgmlLinkExtractor(restrict_xpaths=("//*[@class='sitemap']//a", ))),
        # Pagination within a section.
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths=("//*[@class='next next_page']/a", ))),
        # Individual listing pages.
        Rule(SgmlLinkExtractor(restrict_xpaths=("//*[@itemprop='name']/a", )),
             callback="parse_product"),
    )

    def parse_product(self, response):
        """Function extracting values from product page"""
        item = PlaceItem()
        item['item_source'] = response.url
        item['name'] = self.get_name(response)
        item['price'] = self.get_price(response)
        yield item

    def get_name(self, response):
        # First text node of the listing-name element; raises IndexError if absent.
        return response.xpath("//*[@id='listing_name']/text()").extract()[0]

    def get_price(self, response):
        # Price amount, whitespace-stripped; raises IndexError if absent.
        return response.xpath(
            "//*[contains(@class,'book-it__price-amount')]//text()").extract(
            )[0].strip()
def test_attrs(self):
    """Link extraction restricted via the ``attrs`` argument.

    Covers: attrs as a single string; an (href, src) tuple with extra tags
    and extension filtering disabled; attrs=None (extracts nothing); and
    attrs=("href") — a parenthesized string, not a tuple — on an
    SgmlLinkExtractor with a typo'd 'ref' attribute that must be ignored.
    """
    lx = self.extractor_cls(attrs="href")
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])
    # With src allowed and deny_extensions=() the .jpg image is extracted too.
    lx = self.extractor_cls(attrs=("href", "src"),
                            tags=("a", "area", "img"),
                            deny_extensions=())
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])
    # attrs=None disables attribute matching entirely.
    lx = self.extractor_cls(attrs=None)
    self.assertEqual(lx.extract_links(self.response), [])
    # Only the area's real href is picked up; 'ref' is not an href.
    html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    lx = SgmlLinkExtractor(attrs=("href"))
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
class ListSpider(CrawlSpider):
    """Crawl news.cnblogs.com list pages and scrape article metadata."""

    # Spider name.
    name = "tutorial"
    # Download delay (seconds) between requests.
    download_delay = 1
    # Allowed domains.
    allowed_domains = ["news.cnblogs.com"]
    # Start URL.
    start_urls = ["https://news.cnblogs.com"]
    # Crawl rules: follow pagination, parse article pages.
    rules = (
        Rule(SgmlLinkExtractor(
            allow=(r'https://news.cnblogs.com/n/page/\d', ))),
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_content'),
    )

    # Content-parsing callback.
    def parse_content(self, response):
        """Extract title / author / release date from one article page.

        NOTE(review): .extract() returns unicode in Python 2, and calling
        .decode('utf-8') on unicode only survives ASCII content — confirm
        this does not raise on non-ASCII titles.
        """
        item = TutorialItem()
        # Current article page.
        title = response.selector.xpath(
            '//div[@id="news_title"]')[0].extract().decode('utf-8')
        item['title'] = title
        author = response.selector.xpath('//div[@id="news_info"]/span/a/text()'
                                         )[0].extract().decode('utf-8')
        item['author'] = author
        releasedate = response.selector.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()')[0].extract(
            ).decode('utf-8')
        item['releasedate'] = releasedate
        yield item
class DoubanCrawler(CrawlSpider):
    """Crawl Douban's Top 250 movie list and scrape each movie's page."""

    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]
    #allowed_domains = ["fling.seas.upenn.edu/"]
    #start_urls = ["https://fling.seas.upenn.edu/~yinfeiy/"]
    # NOTE(review): rule patterns use http:// while start_urls is https:// —
    # confirm the extractor still matches the scheme actually served.
    rules = (Rule(
        SgmlLinkExtractor(allow=(
            r'http://movie\.douban\.com/top250\?start=\d+&filter=&type=', ))),
             Rule(SgmlLinkExtractor(
                 allow=(r'http://movie\.douban\.com/subject/\d+', )),
                  callback='parse_page',
                  follow=True))

    def start_requests(self):
        """Issue start requests with a desktop UA and the over18 cookie.

        NOTE(review): callback=self.parse_page bypasses CrawlSpider's default
        parse (which applies the rules) for the start pages — confirm that
        link-following from the top250 page is still intended.
        """
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
        }
        for i, url in enumerate(self.start_urls):
            yield Request(url,
                          cookies={'over18': '1'},
                          callback=self.parse_page,
                          headers=headers)

    def parse_page(self, response):
        """Extract movie title, summary, and URL from a subject page."""
        sel = Selector(response)
        item = DoubanMovieItem()
        item['name'] = sel.xpath(
            '//h1/span[@property="v:itemreviewed"]/text()').extract()
        item['desc'] = sel.xpath(
            '//div/span[@property="v:summary"]/text()').extract()
        item['url'] = response.url
        return item
class HomeawaySpider(RentBaseSpider):
    """Crawl homeaway.com search results and scrape listing name/price."""

    name = "homeaway"
    allowed_domains = ["homeaway.com"]
    start_urls = ['https://www.homeaway.com/search']
    rules = (
        # The sixth region-refinement block plus result pagination — follow only.
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths=("(//*[@class='region-refinement'])[6]",
                                 "//*[@class='next']/a"))),
        # Listing detail pages.
        Rule(SgmlLinkExtractor(restrict_xpaths=(
            "//*[@class='hit-content']//*[@class='hit-headline']//a", )),
             callback="parse_product"),
    )

    def parse_product(self, response):
        """Function extracting values from product page"""
        item = PlaceItem()
        item['name'] = self.get_name(response)
        item['price'] = self.get_price(response)
        yield item

    def get_name(self, response):
        # Listing headline; raises IndexError if the element is missing.
        return response.xpath(
            "(//*[@class='container hidden-phone']//h1/text())").extract()[0]

    def get_price(self, response):
        # price for some places is unavailable, only available on request
        price = response.xpath("(//*[@class='price-large']/text())").extract()
        not_available_message = "Available on Inquiry"
        return price[0] if price else not_available_message
def test_deny_extensions(self):
    """deny_extensions given as a plain string filters matching file types."""
    html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
    response = HtmlResponse("http://example.org/", body=html)
    lx = SgmlLinkExtractor(deny_extensions="jpg")
    # The .jpg link is dropped; only the page link remains.
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.org/page.html', text=u'asd'),
    ])
def test_attrs(self):
    """The ``attrs`` argument controls which tag attributes yield links.

    Same scenarios as the generic extractor test: string attrs, (href, src)
    with deny_extensions=(), attrs=None, and the parenthesized-string
    attrs=("href") case against SgmlLinkExtractor.
    """
    lx = self.extractor_cls(attrs="href")
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])
    # src on img is honoured once deny_extensions stops filtering .jpg.
    lx = self.extractor_cls(attrs=("href", "src"),
                            tags=("a", "area", "img"),
                            deny_extensions=())
    self.assertEqual(lx.extract_links(self.response), [
        Link(url='http://example.com/sample1.html', text=u''),
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample2.jpg', text=u''),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ])
    # attrs=None extracts nothing.
    lx = self.extractor_cls(attrs=None)
    self.assertEqual(lx.extract_links(self.response), [])
    # The typo'd 'ref' attribute must not be treated as a link source.
    html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    lx = SgmlLinkExtractor(attrs=("href"))
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
def test_attrs_sgml(self):
    """SgmlLinkExtractor(attrs="href") ignores the typo'd 'ref' attribute."""
    html = """<html><area href="sample1.html"></area> <a ref="sample2.html">sample text 2</a></html>"""
    response = HtmlResponse("http://example.com/index.html", body=html)
    lx = SgmlLinkExtractor(attrs="href")
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.com/sample1.html', text=u''),
    ])
def __init__(self, selector=None, type='css', *args, **kwargs):
    """Build a link extractor optionally restricted by a selector.

    selector: a CSS or XPath expression limiting where links are taken from.
    type: 'css' (translated to XPath via pyquery) or 'xpath' (used as-is);
    anything else raises. Remaining arguments pass straight through to
    SgmlLinkExtractor.
    """
    if selector:
        if type == 'xpath':
            restriction = selector
        elif type == 'css':
            # pyquery's internal translator converts the CSS selector to XPath.
            restriction = pyquery.PyQuery('a')._css_to_xpath(selector)
        else:
            raise Exception('Selector type not supported.')
        kwargs['restrict_xpaths'] = restriction
    SgmlLinkExtractor.__init__(self, *args, **kwargs)
def test_link_nofollow(self):
    """rel="nofollow" is reflected in Link.nofollow; its absence gives False."""
    html = """
    <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
    <a href="about.html">About us</a>
    """
    response = HtmlResponse("http://example.org/page.html", body=html)
    lx = SgmlLinkExtractor()
    self.assertEqual([link for link in lx.extract_links(response)], [
        Link(url='http://example.org/page.html?action=print',
             text=u'Printer-friendly page',
             nofollow=True),
        Link(url='http://example.org/about.html',
             text=u'About us',
             nofollow=False),
    ])
class homespider(CrawlSpider):
    """Crawl qd.fang.lianjia.com new-home listings and scrape detail pages.

    BUGFIX: the domain filter attribute was misspelled `allow_domains`,
    which Scrapy silently ignores; the offsite middleware only honours
    `allowed_domains`.
    """

    name = 'home'
    allowed_domains = ['qd.fang.lianjia.com']
    # Listing index pages pg1..pg47.
    start_urls = []
    for i in range(1, 48):
        start_urls.append('http://qd.fang.lianjia.com/loupan/pg' + str(i))
    rules = (
        # Development links inside the result box — follow only.
        Rule(
            SgmlLinkExtractor(allow=('loupan/p_\w+', ),
                              restrict_xpaths="//div[@class = 'con-box']")),
        # Detail ("xiangqing") pages — parse into items.
        Rule(SgmlLinkExtractor(allow=('loupan/p_\w+/xiangqing/')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract one development's details from its xiangqing page."""
        torrent = Home_item()
        torrent['name'] = response.css("div.resb-name::text").extract()
        torrent['price'] = response.css(
            "ul.x-box span.label-val span::text").extract()
        torrent['where'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li/span[@class = 'label-val']/a/text()"
        ).extract()
        torrent['address'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li[5]/span[@class = 'label-val']/text()"
        ).extract()
        torrent['sellor'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li[7]/span[@class = 'label-val']/text()"
        ).extract()
        torrent['opentime'] = response.css("span.fq-open span::text").extract()
        torrent['gettime'] = response.css(
            "span.fq-handover span::text").extract()
        torrent['alltime'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[3]/li[8]/span[@class = 'label-val']/text()"
        ).extract()
        return torrent
class MantaSpider(CrawlSpider):
    """Crawl manta.com business listings, optionally seeded by a search term.

    BUGFIX: parse_search_result yielded `Requeset(...)` (typo), raising
    NameError whenever a next-page link was found; corrected to `Request`.
    """

    name = 'manta'
    allowed_domains = ['manta.com']
    rules = (Rule(SgmlLinkExtractor(allow=r'Items/'),
                  callback='parse_item',
                  follow=True),
             Rule(SgmlLinkExtractor(allow=r"/c/[^/]*/[^/]*$"),
                  callback='parse_company_detail',
                  follow=True))

    def __init__(self, term=None, *args, **kwargs):
        """Start from a search for `term`, or the homepage if none is given."""
        super(MantaSpider, self).__init__(*args, **kwargs)
        if term:
            self.start_urls = ['http://www.manta.com/mb?search=%s' % term]
        else:
            self.start_urls = ['http://www.manta.com/']

    def parse_start_url(self, response):
        # The start page is itself a results page.
        return self.parse_company(response)

    def parse_search_result(self, response):
        """Follow the 'next' pagination link, if present."""
        hxs = HtmlXPathSelector(response)
        elems = hxs.select('//a[contains(@class, "nextYes")]/@href').extract()
        if len(elems) >= 1:
            yield Request(elems[0], callback=self.parse_company)

    def parse_company(self, response):
        """Yield one Company item per listing block on a results page."""
        hxs = HtmlXPathSelector(response)
        for h in hxs.select('//div[contains(@class, "pbl")]'):
            c = Company()
            c['name'] = h.select('*/h2[@itemprop="name"]/a/text()').extract()
            c['manta_url'] = h.select(
                '*/h2[@itemprop="name"]/a/@href').extract()
            c['street'] = h.select(
                '*/div[@itemprop="streetAddress"]/text()').extract()
            c['locality'] = h.select(
                '*/div[@itemprop="addressLocality"]/text()').extract()
            c['region'] = h.select(
                '*/div[@itemprop="addressRegion"]/text()').extract()
            c['postal_code'] = h.select(
                '*/div[@itemprop="postalCode"]/text()').extract()
            c['phone'] = h.select(
                '*/div[@itemprop="telephone"]/text()').extract()
            c['website'] = h.select('*/div[@itemprop="url"]/text()').extract()
            yield c

    def parse_company_detail(self, response):
        # Placeholder: detail pages are only logged for now.
        print(response)
def _init_args(self, **kwargs):
    """Initialise start URLs, crawl rules, headers, cookies and URL patterns.

    Reads START_URL from kwargs; relies on a module-level `filter_rules`
    sequence both for the link-extraction rule and for the compiled
    patterns below.
    """
    start_url = kwargs.get('START_URL', '')
    if start_url:
        self.start_urls = [start_url]
    self.rules = (Rule(SgmlLinkExtractor(allow=filter_rules),
                       callback="parse_resp",
                       follow=True,
                       process_links=self.put_links), )
    self.headers = {
        'Host': 'cn.futureelectronics.com',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
        'Referer': 'http://cn.futureelectronics.com/zh/Pages/index.aspx'
    }
    # NOTE(review): 'SelectedCurrency': 'NY' looks unusual for a currency
    # code — verify against the site's actual cookie values.
    self.cookies = {
        'SelectedCurrency': 'NY',
        'SelectedLanguage': 'zh-CN',
    }
    # Product search.
    self.product_url_pattern_0 = re.compile(filter_rules[0], re.IGNORECASE)
    # Decide whether a URL is a product-detail URL.
    self.product_url_pattern_1 = re.compile(filter_rules[1], re.IGNORECASE)
    self.product_url_pattern_2 = re.compile(filter_rules[2], re.IGNORECASE)
    # Extract product_id from a product-detail URL, used as goods_sn.
    self.product_id_pattern_1 = re.compile(r'ProductID=([^&]+)', re.IGNORECASE)
    self.product_id_pattern_2 = re.compile(r'/Pages/(.*)\.aspx', re.IGNORECASE)
    # Number of products per page.
    self.limit_num = 10.0
def test_link_nofollow(self):
    """nofollow is detected even inside a multi-token rel attribute."""
    html = """
    <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
    <a href="about.html">About us</a>
    <a href="http://google.com/something" rel="external nofollow">Something</a>
    """
    response = HtmlResponse("http://example.org/page.html", body=html)
    lx = SgmlLinkExtractor()
    self.assertEqual(
        [link for link in lx.extract_links(response)],
        [
            Link(url="http://example.org/page.html?action=print",
                 text=u"Printer-friendly page",
                 nofollow=True),
            Link(url="http://example.org/about.html",
                 text=u"About us",
                 nofollow=False),
            # rel="external nofollow" still counts as nofollow.
            Link(url="http://google.com/something",
                 text=u"Something",
                 nofollow=True),
        ],
    )
def __init__(self, book_key, ct, *args, **kwargs):
    """Build per-book crawl rules for m.88dushu.com.

    book_key: the site's book id, interpolated into /mulu/ and /book/ URLs.
    ct: stored but unused here — presumably consumed elsewhere; verify caller.
    Relies on self.nextpage (a class attribute holding the 'next page'
    anchor text) being defined before the rules are built.
    """
    self.book_key = book_key
    self.ct = ct
    self.start_urls = ["http://m.88dushu.com/mulu/" + book_key + "-1/"]
    self.rules = (
        # Table-of-contents pagination: follow 'next page' links only.
        Rule(SgmlLinkExtractor(
            allow=(r'http://m.88dushu.com/mulu/' + book_key + '-\d+/', ),
            restrict_xpaths=('//a[text()="%s"]' % (self.nextpage))),
             follow=True),
        # Chapter pages: parse content, do not follow further.
        Rule(SgmlLinkExtractor(
            allow=(r'http://m.88dushu.com/book/' + book_key + '-\d+/', ),
            restrict_xpaths=('//ul[@class="chapter" and not(@id)]')),
             callback='parse_content',
             follow=False),
    )
    super(ChapterSpider, self).__init__(*args, **kwargs)
class MySpider(CrawlSpider):
    """Scrape event entries from stimdi.se's timeline page.

    NOTE(review): using "parse" as a CrawlSpider rule callback overrides the
    method CrawlSpider itself uses to apply its rules — the Scrapy docs warn
    this can break link following; confirm it is intended.
    """

    name = "stimdi"
    allowed_domains = ["stimdi.se"]
    start_urls = ["http://www.stimdi.se/tidslinjen/"]
    rules = (Rule(SgmlLinkExtractor(
        allow=(), restrict_xpaths=('//*[@id="content"]/div/div/h2/a')),
                  callback="parse",
                  follow=True), )

    def parse(self, response):
        """Yield one AfeventItem per event div, normalising the Swedish date.

        Uses a manual counter `i` to index parallel page-wide extract()
        lists — assumes every XPath yields exactly one entry per event div;
        verify against the live markup.
        """
        i = 0
        print i
        for div in response.xpath('//*[@id="content"]/div/div'):
            print "IN FOR"
            item = AfeventItem()
            # Store data into lists
            item['title'] = div.xpath('//h2/a/text()').extract()[i]
            item['url'] = div.xpath('//h2/a/@href').extract()[i]
            item['location'] = ''
            item['description'] = div.xpath(
                '//*[@id="content"]/div/div[1]/a[1]/p/text()').extract()[i]
            # The following code changes the format of the date
            origDate = div.xpath('//p/text()').extract()[i]
            newDate = ''.join(origDate).replace(',', '').split()
            # Map the Swedish month name to its number (its index in this list).
            month = [
                "", "januari", "februari", "mars", "april", "maj", "juni",
                "juli", "augusti", "september", "oktober", "november",
                "december"
            ].index(newDate[1])
            # Prefix a "0" if the month number is < 10.
            if month < 10:
                zeroMonth = [0, month]
                zeroMonth = ''.join(map(str, zeroMonth))
            else:
                zeroMonth = month
            # Same thing as above with the day.
            if int(newDate[0]) < 10:
                zeroDate = [0, newDate[0]]
                zeroDate = ''.join(map(str, zeroDate))
            else:
                zeroDate = newDate[0]
            # Assemble YYYY-MM-DD and store it in item['date'].
            finalDate = [newDate[2], zeroMonth, zeroDate]
            item['date'] = '-'.join(finalDate)
            print i
            if i < len(response.xpath('//*[@id="content"]/div/div')):
                print "I IF"
                print len(response.xpath('//*[@id="content"]/div/div'))
                i = i + 1
            yield item
class etaoSpider(CrawlSpider):
    """Crawl gouwu.sogou.com search results, rendering pages with Selenium.

    BUGFIX: the domain filter attribute was misspelled `allow_domain`;
    Scrapy's offsite middleware only reads `allowed_domains`, so the typo'd
    attribute had no effect.
    """

    # name of spiders
    name = 'Spider'
    allowed_domains = ['gouwu.sogou.com']
    # One search-result start URL per configured search word.
    start_urls = [('http://gouwu.sogou.com/shop?query=' + searchWord)
                  for searchWord in lstData().lst]
    link_extractor = {
        'page': SgmlLinkExtractor(allow='/detail/\d+\.html.+'),
        'page_down': SgmlLinkExtractor(allow='/shop\?query=.+', ),
        #restrict_xpaths = '//a[@class = "pagination-next"]'
    }
    _x_query = {
        'title': '//p[@class="title"]/a/@title',
        'name': '//span[@class="floatR hui61 mt1"]/text()',
        #//li[2]/a/div[@class="ruyitao-market-name ruyitao-market-name-hightlight"]/text()
        'price': '//span[@class="shopprice font17"]/text()',
        # 'price' : '//span[@class = "price"]/text()',
    }

    def __init__(self):
        CrawlSpider.__init__(self)
        # use any browser you wish
        self.browser = webdriver.Firefox()

    def __del__(self):
        self.browser.close()

    def parse(self, response):
        """Follow result pagination, render the page, and load item fields."""
        # crawl all display pages
        for link in self.link_extractor['page_down'].extract_links(response):
            yield Request(url=link.url, callback=self.parse)
        # start browser
        self.browser.get(response.url)
        # loading time interval
        time.sleep(5)
        # get the data and write it to scrapy items
        etaoItem_loader = ItemLoader(item=EtaoItem(), response=response)
        url = str(response.url)
        etaoItem_loader.add_value('url', url)
        etaoItem_loader.add_xpath('title', self._x_query['title'])
        etaoItem_loader.add_xpath('name', self._x_query['name'])
        etaoItem_loader.add_xpath('price', self._x_query['price'])
        yield etaoItem_loader.load_item()
class PiaohuaCrawlSpider(CrawlSpider):
    """Crawl piaohua.com movie list pages (aiqing / kehuan sections)."""

    name = "PiaohuaCrawlSpider"
    allowed_domains = ['piaohua.com']
    start_urls = [
        'http://piaohua.com/html/aiqing/index.html',
        'http://piaohua.com/html/kehuan/index.html',
    ]
    rules = [
        # Pagination ('list_' URLs) within each section's page bar.
        Rule(SgmlLinkExtractor(allow=('list_'),
                               restrict_xpaths=("//div[@class='page']/a")),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        """Build one PiaohuaItem per movie entry on a list page."""
        # print "parse_item>>>>>>"
        items = []
        sel = Selector(response)
        movie_list = sel.xpath("//div[@id='nml']//dl")
        for movie in movie_list:
            item = PiaohuaItem()
            item['linkurl'] = self.getLinkUrl(movie)
            item['name'] = self.getName(movie)
            item['imageurl'] = self.getImageUrl(movie)
            item['type'] = self.getType(response)
            movieDetail = self.getMovieDetail(item['linkurl'])
            # item['downloadlink'] = self.getDownloadLink(movieDetail)
            # item['updatetime'] = self.getUpdateTime(movieDetail)
            items.append(item)
        return items

    def getLinkUrl(self, site):
        # Relative detail-page URL from the entry's title anchor.
        return site.xpath("dt/a/@href").extract()[0]

    def getImageUrl(self, site):
        return site.xpath("dt//img/@src").extract()[0]

    def getName(self, site):
        return site.xpath("dd/strong/a/b/font/text()").extract()[0]

    def getType(self, response):
        # Section name taken from the URL path (e.g. 'aiqing', 'kehuan').
        return response.url.split('/')[-2]

    def getUpdateTime(self, site):
        str = site.xpath(
            "//div[@id='show']/div[@id='showdesc']/text()").extract()[0]
        return re.search(r'.*(\d{4}-\d{2}-\d{2}).*', str).group(1)

    def getDownloadLink(self, site):
        return site.xpath("//anchor/a/text()").extract()

    def getMovieDetail(self, url):
        # NOTE(review): Selector(Request(...)) does not fetch anything — a
        # Request is not a Response, so this cannot yield detail-page HTML.
        # Both call sites that would consume the result are commented out
        # in parse_item above; confirm before re-enabling them.
        url = 'http://piaohua.com' + url
        return Selector(Request(url=url))
def test_restrict_xpaths_with_html_entities(self):
    """Non-ASCII href characters are emitted percent-encoded (as UTF-8)."""
    html = '<html><body><p><a href="/♥/you?c=€">text</a></p></body></html>'
    response = HtmlResponse("http://example.org/somepage/index.html",
                            body=html,
                            encoding='iso8859-15')
    links = SgmlLinkExtractor(restrict_xpaths='//p').extract_links(response)
    self.assertEqual(links, [
        Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC',
             text=u'text')
    ])
class bbcSpider(CrawlSpider):
    """Crawl www.bbc.com article pages, collecting title, body text, and URL."""

    name = "bbc"
    start_urls = ["http://www.bbc.com"]
    download_delay = 2
    handle_httpstatus_list = [301]
    rules = [
        Rule(
            SgmlLinkExtractor(allow=(r"http://www.bbc.com/.*?"),
                              allow_domains=("www.bbc.com")),
            callback='parse_item',
            # BUGFIX: was follow='true' (a string); use the boolean so intent
            # is explicit rather than relying on string truthiness.
            follow=True)
    ]

    def parse_item(self, response):
        """Extract the headline and paragraph/heading text of one article."""
        item = BbcItem()
        title_tmp = response.xpath(
            '//*[@id="page"]//h1//text()').extract_first()
        title = title_tmp
        if title:
            title = title.encode('utf8')
        item['title'] = title
        content_tmp = response.xpath(
            '//*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]//p//text() | //*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]//h2//text()'
        ).extract()
        content = ''
        for con in content_tmp:
            # Guard against empty strings before indexing con[-1]; append a
            # space after sentence-ending fragments so text doesn't run together.
            if con and con[-1] == '.':
                con = con + ' '
            content = content + con.encode('utf-8')
        item['content'] = content
        link = str(response.url)
        item['url'] = link.encode('utf-8')
        return item
class MySpider(CrawlSpider):
    """Scrape event listings from afconsult.com's calendar.

    Follows calendar entries, then requests each event's detail page to fill
    in the description, passing the partially-built item via request.meta.
    """

    name = "af"
    allowed_domains = ["afconsult.com"]
    start_urls = [
        "http://www.afconsult.com/sv/jobba-hos-oss/event-seminarier--massor/"
    ]
    rules = (Rule(SgmlLinkExtractor(
        allow=(), restrict_xpaths=('//*[@id="CalendarContainer"]/div')),
                  callback="parser",
                  follow=True), )

    def parser(self, response):
        """Yield one detail-page Request per calendar entry.

        Uses a manual counter `i` to index page-wide extract() lists —
        assumes each XPath returns one value per calendar anchor; verify
        against the live markup.
        """
        i = 0
        for div in response.xpath('//*[@id="CalendarContainer"]/div/div/a'):
            item = AfeventItem()
            print "response.xpath"
            item['title'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/h1/text()'
            ).extract()[i]
            item['venue'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[2]/text()'
            ).extract()[i]
            item['date'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/span/text()'
            ).extract()[i]
            item['time'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[2]/span[2]/span/text()|//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[3]/span/text()[3]'
            ).extract()[i]
            item['url'] = div.xpath(
                '//*[@id="mainContent"]/section/div/div/nav/ul/li[4]/a/@href'
            ).extract()[i]
            follow_url_1 = div.xpath(
                '//*[@id="mainContent"]/section/div/div/nav/ul/li[4]/a/@href'
            ).extract()[i]
            # The detail page carries the description; carry the item along.
            follow_url = 'http://www.afconsult.com' + follow_url_1
            request = Request(follow_url, callback=self.parse_url)
            request.meta['item'] = item
            if i < len(
                    response.xpath('//*[@id="CalendarContainer"]/div/div/a')):
                i = i + 1
                print i
            yield request

    def parse_url(self, response):
        """Fill in the description from the detail page and yield the item."""
        item = response.meta['item']
        item['description'] = ''.join(
            response.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article//text()'
            ).extract())
        print "parse_url"
        yield item
class HwzSpider(CrawlSpider):
    """Scrape forum posts from HardwareZone's current-affairs lounge."""

    name = "hwz"
    allowed_domains = ["hardwarezone.com.sg"]
    start_urls = [
        "http://forums.hardwarezone.com.sg/current-affairs-lounge-17/"
    ]
    rules = (
        # Extract links matching 'garage-sales-18/.*html' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(SgmlLinkExtractor(allow=('garage\-sales\-18/.*\.html', )), callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(
            allow=('current\-affairs\-lounge\-17/.*\.html', )),
             callback='parse_item',
             follow=True),
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        #Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
    )

    def insert_posts(self, posts):
        # NOTE(review): this ignores `posts` and returns an advice string —
        # it reads like a placeholder for a real DB insert; confirm before use.
        return """
        When writing crawl spider rules, avoid using parse as callback, since
        the CrawlSpider uses the parse method itself to implement its logic.
        So if you override the parse method, the crawl spider will no longer work.
        """

    def parse_item(self, response):
        """Collect author/url/body/title/date for every post on a thread page."""
        hxs = HtmlXPathSelector(response)
        posts = hxs.select("//div[@id='posts']/div[@class='post-wrapper']")
        items = []
        for post in posts:
            item = {}
            item['author_id'] = ''.join(
                post.select(".//a[@class='bigusername']/text()").extract())
            item['url'] = response.url
            item['body'] = '\n'.join(
                map(lambda x: x.strip('\t\n\r'),
                    post.select(".//td[@class='alt1']/div/text()").extract()))
            item['title'] = '\n'.join(
                map(lambda x: x.strip('\t\n\r'),
                    post.select(
                        "//h2[@class='header-gray']/text()").extract()))
            item['date_posted'] = ''.join(
                map(lambda x: x.strip(' \t\n\r#').strip(),
                    post.select(".//td[@class='thead']/text()").extract())
            )  # todo: deal with Today and Yesterday
            # item['date_posted'] = normalizeFriendlyDate(' '.join(map(lambda x:x.strip(' \t\n\r'),post.select(".//td[@class='thead']/text()").extract()))) # todo: deal with Today and Yesterday
            items.append(item)
        # self.insert_posts(items)
        print(items)
        return items
def __init__(self, process_idx, book_class, *args, **kwargs):
    """Build crawl rules for one worker's slice of an 88dushu.com category.

    process_idx: worker index as a string; slice N starts at list page N0 and
    follows pages N1..N9, while slice 0 starts at page 1 and follows 1..9.
    book_class: category id used in /wapsort/ URLs.
    Relies on class attributes self.nextpage2 / self.startRead /
    self.nextpage holding the exact link text of the pagination anchors.
    """
    self.idx = int(process_idx)
    self.book_class = book_class
    if self.idx > 0:
        self.start_urls = [
            "http://m.88dushu.com/wapsort/" + book_class + "-" + process_idx +
            "0/"
        ]
        allow_url = r'http://m.88dushu.com/wapsort/' + book_class + r'-' + process_idx + r'[1-9]/'
    else:
        self.start_urls = [
            "http://m.88dushu.com/wapsort/" + book_class + "-1/"
        ]
        allow_url = r'http://m.88dushu.com/wapsort/' + book_class + r'-[1-9]/'
    self.rules = (
        # Category list pagination — follow the 'next page' anchor only.
        Rule(
            SgmlLinkExtractor(allow=(allow_url, ),
                              restrict_xpaths=('//a[text()="%s"]' %
                                               (self.nextpage2)))),
        # Book info pages.
        Rule(SgmlLinkExtractor(
            allow=(r'http://m.88dushu.com/info/\d+/', ),
            restrict_xpaths=('//div[@class="block_img"]')),
             callback='parse_book',
             follow=False),
        # 'Start reading' link into the table of contents.
        Rule(SgmlLinkExtractor(allow=(r'http://m.88dushu.com/mulu/\d+/', ),
                               restrict_xpaths=('//a[text()="%s"]' %
                                                (self.startRead))),
             follow=True),
        # Table-of-contents pagination.
        Rule(SgmlLinkExtractor(
            allow=(r'http://m.88dushu.com/mulu/\d+-\d+/', ),
            restrict_xpaths=('//a[text()="%s"]' % (self.nextpage))),
             follow=True),
        # Chapter pages — parse content, do not follow further.
        Rule(SgmlLinkExtractor(
            allow=(r'http://m.88dushu.com/book/\d+-\d+/', ),
            restrict_xpaths=('//ul[@class="chapter" and not(@id)]')),
             callback='parse_content',
             follow=False),
    )
    super(ListSpider, self).__init__(*args, **kwargs)
class QQNewsSpider(CrawlSpider):
    """Crawl news.cnblogs.com and scrape article title/author/date."""

    # Spider name.
    name = "tutorial"
    # Download delay (seconds).
    download_delay = 1
    # Allowed domains.
    allowed_domains = ["news.cnblogs.com"]
    # Start URL.
    start_urls = ["https://news.cnblogs.com"]
    # Crawl rules; a rule without a callback just follows matching URLs recursively.
    rules = [
        Rule(SgmlLinkExtractor(
            allow=(r'https://news.cnblogs.com/n/page/\d', ))),
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_item'),
    ]

    # Content-parsing callback.
    def parse_item(self, response):
        """Extract title / author / release date from one article page.

        NOTE(review): .decode('utf-8') on the unicode returned by extract()
        is a Python-2 idiom that breaks on non-ASCII content — verify.
        """
        print('***********************')
        item = QqnewsItem()
        # Current article.
        title = response.selector.xpath(
            '//*[@id="news_title"]/a')[0].extract().decode('utf-8')
        item['title'] = title
        print(title)
        author = response.selector.xpath('//div[@id="news_info"]/span/a/text()'
                                         )[0].extract().decode('utf-8')
        item['author'] = author
        release_date = response.selector.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()')[0].extract(
            ).decode('utf-8')
        item['release_date'] = release_date
        yield item
class CnblogsSpider(CrawlSpider):
    """Crawl news.cnblogs.com list pages and scrape article summaries.

    BUGFIX: the item callback was named `parse`, overriding the method
    CrawlSpider itself uses to apply its rules — the Scrapy docs explicitly
    warn a CrawlSpider must not override parse. Renamed to `parse_item`
    (and updated the rule callback) so rule-based link following works.
    """

    # Unique spider name used to launch it.
    name = 'cnblogs'
    # Download delay (seconds).
    download_delay = 2
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['https://news.cnblogs.com/']
    # Crawl rules; a rule without a callback just follows matching URLs.
    rules = (
        # Pagination pages: follow links only, extract nothing.
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/page/\d', ))
             ),
        # Article pages: extract content.
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_item'))

    # Content-parsing callback.
    def parse_item(self, response):
        """Yield one item per entry block on the page.

        NOTE(review): .decode('utf-8') on unicode is a Python-2 idiom that
        breaks on non-ASCII — verify before porting to Python 3.
        """
        for resp in response.selector.xpath('//div[@class="content"]'):
            item = ScrapyspiderItem()
            title = resp.xpath('h2/a/text()').extract()
            item['title'] = title[0].decode('utf-8')
            url = resp.xpath('h2/a/@href').extract()
            item['url'] = 'https://news.cnblogs.com' + url[0].decode('utf-8')
            author = resp.xpath(
                'div[@class="entry_footer"]/a/text()').extract()
            item['author'] = author[0].strip().decode('utf-8')
            date = resp.xpath(
                'div[@class="entry_footer"]/span[@class="gray"]/text()'
            ).extract()
            item['date'] = date[0].decode('utf-8')
            yield item
class RecursiveScraperSpider(CrawlSpider):
    """Recursively scrape pages under www.cse.iitd.ernet.in/~naveen.

    BUGFIX: the allow pattern read 'cse\.iltd\.ernet\.in' ('iltd' typo), so
    it could never match the actual 'iitd' host given in allowed_domains /
    start_urls and no links were followed.
    """

    name = "rs"
    allowed_domains = ["cse.iitd.ernet.in"]
    start_urls = ["http://www.cse.iitd.ernet.in/~naveen"]
    rules = (
        Rule(SgmlLinkExtractor(allow=("cse\.iitd\.ernet\.in/\~naveen/.*", )),
             callback='parse_item',
             follow=True),
    )

    def parse_item(self, response):
        """Store the page URL and a fixed table cell's text.

        NOTE(review): the XPath includes /tbody/, which browsers insert but
        raw HTML often lacks — verify it matches the markup as served.
        """
        sel = Selector(response)
        item = RecursivescraperItem()
        item['URL'] = response.request.url
        item['content'] = sel.xpath(
            '/html/body/table/tbody/tr[3]/td[1]/text()[1]').extract()
        return item
class CSDNBlogCrawlSpider(CrawlSpider):
    """Walk a CSDN blog via its 'next article' links, scraping name and URL."""

    name = "CSDNBlogCrawlSpider"
    allowed_domains = ['blog.csdn.net']
    start_urls = ['http://blog.csdn.net/u012150179/article/details/11749017']
    rules = [
        # Follow the chain of posts through the 'next_article' element only.
        Rule(SgmlLinkExtractor(
            allow=('/u012150179/article/details'),
            restrict_xpaths=('//li[@class="next_article"]')),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        """Extract the post title(s) and URL into a plain dict item."""
        # print "parse_item>>>>>>"
        item = {}
        sel = Selector(response)
        blog_url = str(response.url)
        blog_name = sel.xpath(
            '//div[@id="article_details"]/div/h1/span/a/text()').extract()
        item['blog_name'] = [n.encode('utf-8') for n in blog_name]
        item['blog_url'] = blog_url.encode('utf-8')
        yield item

    # NOTE(review): the helpers below are not called anywhere in this class —
    # presumably kept for reuse elsewhere; verify before removing.
    def getImageUrl(self, item):
        imageurl = item.xpath("a[@class='img']/img/@src").extract()
        if imageurl:
            return imageurl[0]
        else:
            return ''

    def getLink(self, item):
        link = item.xpath("a[@class='img']/@href").extract()
        if link:
            return link[0]
        else:
            return ''

    def getUpdateTime(self, item):
        updatetime = item.xpath("span/text()").extract()
        if updatetime:
            return updatetime[0]
        else:
            return item.xpath("span/font/text()").extract()[0]

    def getName(self, item):
        name = item.xpath("a/strong/font/font/text()").extract()
        if name:
            return name[0]
        else:
            return item.xpath("a/strong/font/text()").extract()[0]
class ZhilianSpider(scrapy.Spider):
    """Scrape job title and company name from zhaopin.com job pages.

    NOTE(review): `rules` only takes effect on CrawlSpider subclasses — on a
    plain scrapy.Spider it is ignored; confirm the intended base class.
    """

    name = "zhilian"
    # allowed_domains =
    start_urls = ["http://jobs.zhaopin.com/bj2140003/"]
    rules = (Rule(
        SgmlLinkExtractor(allow=(r'http://jobs.zhaopin.com/[0-9]+.htm', )),
        callback='parse_page',
        follow=True), )

    def parse_page(self, response):
        """Extract the posting's title and company name.

        BUGFIX: the original stored raw SelectorList objects in the item;
        .extract() is required to obtain the text values.
        """
        sel = Selector(response)
        item = ZhiLianItems()
        item['name'] = sel.xpath(
            '/html/body/div[5]/div[1]/div[1]/h1/text()').extract()
        item['company'] = sel.xpath(
            '/html/body/div[5]/div[1]/div[1]/h2/a/text()').extract()
        return item
class ArticleSpider(CrawlSpider):
    """Crawl Wikipedia article pages (any /wiki/ URL containing no colon)."""

    name = "article"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = [
        "http://en.wikipedia.org/wiki/Python_%28programming_language%29"
    ]
    rules = [
        Rule(SgmlLinkExtractor(allow=('(/wiki/)((?!:).)*$'), ),
             callback="parse_item",
             follow=True)
    ]

    def parse_item(self, response):
        """Grab the page's first <h1> text as the article title."""
        heading = response.xpath('//h1/text()')[0].extract()
        print("Title is: " + heading)
        article = Article()
        article['title'] = heading
        return article
class NoticiasSpider(CrawlSpider):
    """Scrape sports news articles from 20minutos.es."""

    name = 'NoticiasSpider'
    allowed_domains = ['20minutos.es']
    start_urls = ['http://www.20minutos.es/']
    rules = (
        # Sports article URLs only; do not follow further links from them.
        Rule(SgmlLinkExtractor(allow=(r'deportes/noticia/(\w|\d|-|/)*/', )),
             callback='parse_news',
             follow=False),
    )

    def parse_news(self, response):
        """Extract title, date, and URL of one article into a Noticia item."""
        hxs = HtmlXPathSelector(response)
        elemento = Noticia()
        elemento['titulo'] = hxs.select(
            '//h1[contains(@class, "article-title")]/text()')[0].extract()
        elemento['titulo'] = elemento['titulo'].encode('utf-8')
        # Date taken from the 'Noticias del ...' anchor's text.
        elemento['fecha'] = hxs.select(
            '//a[contains(@title, "Noticias del ")]/text()')[0].extract()
        elemento['fecha'] = elemento['fecha'].encode('utf-8')
        elemento['enlace'] = response.url
        return elemento
class StackSpider(CrawlSpider):
    """Crawl Stack Overflow's newest-questions listing pages.

    NOTE(review): using "parse" as a CrawlSpider rule callback overrides the
    method CrawlSpider uses internally to apply its rules — per the Scrapy
    docs this can stop link following; confirm it is intended.
    """

    name = "stackcrawl"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://stackoverflow.com/questions?sort=newest",
    ]
    rules = (Rule(SgmlLinkExtractor(allow=('&page=\d')),
                  callback='parse',
                  follow=True), )

    def parse(self, response):
        """Yield one StackItem (title, url) per question summary on the page."""
        hxs = HtmlXPathSelector(response)
        questions = hxs.xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = StackItem()
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            yield item
def parse(self, response):
    """Follow every Douban movie-subject link found on the page.

    Extracts matching links with an SgmlLinkExtractor and yields a Request
    (with this spider's headers) for each, parsed by self.parse_item.
    """
    extractor = SgmlLinkExtractor(
        allow=(r'https://movie.douban.com/subject/\d+'))
    for link in extractor.extract_links(response):
        yield Request(link.url, callback=self.parse_item,
                      headers=self.headers)