Code example #1
File: PoiSpider.py  Project: Silocean/PaperCode
class PoiSpider(CrawlSpider):
    district = '110108'
    name = 'poi'
    allowed_domains = ['poi86.com']
    start_urls = ('http://www.poi86.com/poi/amap/district/' + district +
                  '/1.html', )

    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(r'http://www.poi86.com/poi/amap/district/' + district +
                       '/\d+.html'))),
        Rule(SgmlLinkExtractor(
            allow=(r'http://www.poi86.com/poi/amap/\d+.html')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = PoiItem()
        # '/html/body/div[2]/div/div[1]/h1'
        item['name'] = response.xpath(
            '/html/body/div[2]/div[1]/div[1]/h1/text()').extract()
        item['address'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[4]/text()').extract()
        item['category'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[6]/text()').extract()
        item['wgs_84'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[7]/text()').extract()
        item['gcj_02'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[8]/text()').extract()
        item['bd_09'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[9]/text()').extract()
        yield item
Code example #2
class BloggerSpider(CrawlSpider):
    name = "TheHackerWay"
    start_urls = ['http://thehackerway.com']
    # URLs from which the spider starts crawling
    rules = [
        Rule(SgmlLinkExtractor(allow=[r'/\d{4}']),
             follow=True,
             callback='parse_blog'),
        # regular expression for http://thehackerway.com/YYYY archive URLs
        Rule(SgmlLinkExtractor(allow=[r'\d{4}/\d{2}/\d{2}/\w+']),
             callback='parse_blog')
    ]

    # regular expression for http://thehackerway.com/YYYY/MM/DD/title article URLs

    def parse_blog(self, response):
        print 'parsed link %s' % response.url
        hxs = HtmlXPathSelector(response)
        item = HackerWayItem()
        item['title'] = hxs.select('//title/text()').extract()  # XPath selector for the title
        item['author'] = hxs.select("//span[@class='author']/a/text()").extract()  # XPath selector for the author
        item['tag'] = hxs.select("//meta[@property='og:title']/text()").extract()  # XPath selector for the tag
        item['date'] = hxs.select("//span[@class='date']/text()").extract()  # XPath selector for the date
        return item  # Return the item.
Code example #3
class AirbnbSpider(RentBaseSpider):
    name = "airbnb"
    allowed_domains = ["airbnb.com"]
    start_urls = ["https://www.airbnb.com/sitemaps"]

    rules = (
        Rule(
            SgmlLinkExtractor(restrict_xpaths=("//*[@class='sitemap']//a", ))),
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths=("//*[@class='next next_page']/a", ))),
        Rule(SgmlLinkExtractor(restrict_xpaths=("//*[@itemprop='name']/a", )),
             callback="parse_product"),
    )

    def parse_product(self, response):
        """Function extracting values from product page"""

        item = PlaceItem()
        item['item_source'] = response.url
        item['name'] = self.get_name(response)
        item['price'] = self.get_price(response)
        yield item

    def get_name(self, response):
        return response.xpath("//*[@id='listing_name']/text()").extract()[0]

    def get_price(self, response):
        return response.xpath(
            "//*[contains(@class,'book-it__price-amount')]//text()").extract(
            )[0].strip()
Code example #4
    def test_attrs(self):
        lx = self.extractor_cls(attrs="href")
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=("href", "src"),
                                tags=("a", "area", "img"),
                                deny_extensions=())
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=None)
        self.assertEqual(lx.extract_links(self.response), [])

        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)
        lx = SgmlLinkExtractor(attrs=("href"))
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])
Code example #5
class ListSpider(CrawlSpider):
    # spider name
    name = "tutorial"
    # download delay
    download_delay = 1
    # allowed domains
    allowed_domains = ["news.cnblogs.com"]
    # start URLs
    start_urls = ["https://news.cnblogs.com"]
    # crawl rules
    rules = (
        Rule(SgmlLinkExtractor(
            allow=(r'https://news.cnblogs.com/n/page/\d', ))),
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_content'),
    )

    # parse the article content
    def parse_content(self, response):
        item = TutorialItem()
        # current URL
        title = response.selector.xpath(
            '//div[@id="news_title"]')[0].extract().decode('utf-8')
        item['title'] = title
        author = response.selector.xpath('//div[@id="news_info"]/span/a/text()'
                                         )[0].extract().decode('utf-8')
        item['author'] = author
        releasedate = response.selector.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()')[0].extract(
            ).decode('utf-8')
        item['releasedate'] = releasedate
        yield item
Code example #6
File: douban_crawler.py  Project: yinfeiy/hot_douban
class DoubanCrawler(CrawlSpider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]
    #allowed_domains = ["fling.seas.upenn.edu/"]
    #start_urls = ["https://fling.seas.upenn.edu/~yinfeiy/"]

    rules = (Rule(
        SgmlLinkExtractor(allow=(
            r'http://movie\.douban\.com/top250\?start=\d+&filter=&type=', ))),
             Rule(SgmlLinkExtractor(
                 allow=(r'http://movie\.douban\.com/subject/\d+', )),
                  callback='parse_page',
                  follow=True))

    def start_requests(self):
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
        }
        for i, url in enumerate(self.start_urls):
            yield Request(url,
                          cookies={'over18': '1'},
                          callback=self.parse_page,
                          headers=headers)

    def parse_page(self, response):
        sel = Selector(response)
        item = DoubanMovieItem()
        item['name'] = sel.xpath(
            '//h1/span[@property="v:itemreviewed"]/text()').extract()
        item['desc'] = sel.xpath(
            '//div/span[@property="v:summary"]/text()').extract()
        item['url'] = response.url
        return item
Code example #7
class HomeawaySpider(RentBaseSpider):
    name = "homeaway"
    allowed_domains = ["homeaway.com"]
    start_urls = ['https://www.homeaway.com/search']

    rules = (
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths=("(//*[@class='region-refinement'])[6]",
                                 "//*[@class='next']/a"))),
        Rule(SgmlLinkExtractor(restrict_xpaths=(
            "//*[@class='hit-content']//*[@class='hit-headline']//a", )),
             callback="parse_product"),
    )

    def parse_product(self, response):
        """Function extracting values from product page"""

        item = PlaceItem()
        item['name'] = self.get_name(response)
        item['price'] = self.get_price(response)
        yield item

    def get_name(self, response):
        return response.xpath(
            "(//*[@class='container hidden-phone']//h1/text())").extract()[0]

    def get_price(self, response):
        # price for some places is unavailable, only available on request
        price = response.xpath("(//*[@class='price-large']/text())").extract()
        not_available_message = "Available on Inquiry"
        return price[0] if price else not_available_message
Code example #8
 def test_deny_extensions(self):
     html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
     response = HtmlResponse("http://example.org/", body=html)
     lx = SgmlLinkExtractor(deny_extensions="jpg")
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.org/page.html', text=u'asd'),
     ])
Code example #9
    def test_attrs(self):
        lx = self.extractor_cls(attrs="href")
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=("href","src"), tags=("a","area","img"), deny_extensions=())
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=None)
        self.assertEqual(lx.extract_links(self.response), [])

        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)
        lx = SgmlLinkExtractor(attrs=("href"))
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])
Code example #10
 def test_attrs_sgml(self):
     html = """<html><area href="sample1.html"></area>
     <a ref="sample2.html">sample text 2</a></html>"""
     response = HtmlResponse("http://example.com/index.html", body=html)
     lx = SgmlLinkExtractor(attrs="href")
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.com/sample1.html', text=u''),
     ])
Code example #11
File: __init__.py  Project: epigos/news
 def __init__(self, selector=None, type='css', *args, **kwargs):
     if selector:
         if type not in ['css', 'xpath']:
             raise Exception('Selector type not supported.')
         if type == 'xpath':
             kwargs['restrict_xpaths'] = selector
         else:
             kwargs['restrict_xpaths'] = pyquery.PyQuery('a')._css_to_xpath(selector)
     SgmlLinkExtractor.__init__(self, *args, **kwargs)
Code example #12
 def test_link_nofollow(self):
     html = """
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     """
     response = HtmlResponse("http://example.org/page.html", body=html)
     lx = SgmlLinkExtractor()
     self.assertEqual([link for link in lx.extract_links(response)], [
         Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
         Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
     ])
Code example #13
class homespider(CrawlSpider):
    name = 'home'

    allowed_domains = ['qd.fang.lianjia.com']
    start_urls = []
    for i in range(1, 48):
        start_urls.append('http://qd.fang.lianjia.com/loupan/pg' + str(i))

    rules = (
        Rule(
            SgmlLinkExtractor(allow=('loupan/p_\w+', ),
                              restrict_xpaths="//div[@class = 'con-box']")),
        Rule(SgmlLinkExtractor(allow=('loupan/p_\w+/xiangqing/')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        torrent = Home_item()
        #=======================================================================
        # deny = ('loupan/p_\w+/xiangce.*', 'loupan/p_\w+/dongtai.*', 'loupan/p_\w+/pinglun.*','loupan/p_\w+/huxingtu.*','loupan/p_\w+/tuijian.*','loupan/p_\w+/peitao.*','loupan/p_\w+/%.*','loupan/p_\w+/xiangqing/%.*','loupan/p_\w+/xiangqing/.+')
        # torrent['name'] = response.xpath("//div[@class = 'col-1']/h2/a/text()").extract()
        # torrent['address'] = response.xpath("//span[@class = 'region']/text()").extract()
        # torrent['price'] = response.xpath("//span[@class = 'num']/text()").extract()
        # torrent['area'] = response.xpath("//div[@class = 'area']/text()").extract()
        # torrent['square'] = response.xpath("//div[@class = 'area']/span/text()").extract()
        #=======================================================================

        torrent['name'] = response.css("div.resb-name::text").extract()
        torrent['price'] = response.css(
            "ul.x-box span.label-val span::text").extract()
        torrent['where'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li/span[@class = 'label-val']/a/text()"
        ).extract()
        torrent['address'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li[5]/span[@class = 'label-val']/text()"
        ).extract()
        torrent['sellor'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li[7]/span[@class = 'label-val']/text()"
        ).extract()
        torrent['opentime'] = response.css("span.fq-open span::text").extract()
        torrent['gettime'] = response.css(
            "span.fq-handover span::text").extract()
        torrent['alltime'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[3]/li[8]/span[@class = 'label-val']/text()"
        ).extract()

        #  torrent['name'] = response.css("a.clear h1::text").extract()
        #  torrent['address'] = response.css("span.region::text").extract()
        #   torrent['price'] = response.css("p.jiage span.junjia::text").extract()
        # torrent['area'] = response.css("div.area::text").extract()
        #  torrent['square'] = response.css("div.area span::text").extract()
        return torrent
Code example #14
File: manta.py  Project: roadt/scrapybot
class MantaSpider(CrawlSpider):
    name = 'manta'
    allowed_domains = ['manta.com']

    rules = (Rule(SgmlLinkExtractor(allow=r'Items/'),
                  callback='parse_item',
                  follow=True),
             Rule(SgmlLinkExtractor(allow=r"/c/[^/]*/[^/]*$"),
                  callback='parse_company_detail',
                  follow=True))

    def __init__(self, term=None, *args, **kwargs):
        super(MantaSpider, self).__init__(*args, **kwargs)
        if term:
            self.start_urls = ['http://www.manta.com/mb?search=%s' % term]
        else:
            self.start_urls = ['http://www.manta.com/']

    def parse_start_url(self, response):
        return self.parse_company(response)

    def parse_search_result(self, response):
        hxs = HtmlXPathSelector(response)
        elems = hxs.select('//a[contains(@class, "nextYes")]/@href').extract()
        if len(elems) >= 1:
            yield Request(elems[0], callback=self.parse_company)

    def parse_company(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for h in hxs.select('//div[contains(@class, "pbl")]'):
            c = Company()
            c['name'] = h.select('*/h2[@itemprop="name"]/a/text()').extract()
            c['manta_url'] = h.select(
                '*/h2[@itemprop="name"]/a/@href').extract()
            c['street'] = h.select(
                '*/div[@itemprop="streetAddress"]/text()').extract()
            c['locality'] = h.select(
                '*/div[@itemprop="addressLocality"]/text()').extract()
            c['region'] = h.select(
                '*/div[@itemprop="addressRegion"]/text()').extract()
            c['postal_code'] = h.select(
                '*/div[@itemprop="postalCode"]/text()').extract()
            c['phone'] = h.select(
                '*/div[@itemprop="telephone"]/text()').extract()
            c['website'] = h.select('*/div[@itemprop="url"]/text()').extract()
            yield c

    def parse_company_detail(self, response):
        print(response)
Code example #15
File: bak.py  Project: Gzigithub/workspace
 def _init_args(self, **kwargs):
     start_url = kwargs.get('START_URL', '')
     if start_url:
         self.start_urls = [start_url]
     self.rules = (Rule(SgmlLinkExtractor(allow=filter_rules),
                        callback="parse_resp",
                        follow=True,
                        process_links=self.put_links), )
     self.headers = {
         'Host': 'cn.futureelectronics.com',
         'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
         'Referer': 'http://cn.futureelectronics.com/zh/Pages/index.aspx'
     }
     self.cookies = {
         'SelectedCurrency': 'NY',
         'SelectedLanguage': 'zh-CN',
     }
      # product search
     self.product_url_pattern_0 = re.compile(filter_rules[0], re.IGNORECASE)
      # check whether a URL is a product detail URL
     self.product_url_pattern_1 = re.compile(filter_rules[1], re.IGNORECASE)
     self.product_url_pattern_2 = re.compile(filter_rules[2], re.IGNORECASE)
      # extract product_id from the product detail URL and use it as goods_sn
     self.product_id_pattern_1 = re.compile(r'ProductID=([^&]+)',
                                            re.IGNORECASE)
     self.product_id_pattern_2 = re.compile(r'/Pages/(.*)\.aspx',
                                            re.IGNORECASE)
      # number of products per page
     self.limit_num = 10.0
Code example #16
 def test_link_nofollow(self):
     html = """
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     <a href="http://google.com/something" rel="external nofollow">Something</a>
     """
     response = HtmlResponse("http://example.org/page.html", body=html)
     lx = SgmlLinkExtractor()
     self.assertEqual(
         [link for link in lx.extract_links(response)],
         [
             Link(url="http://example.org/page.html?action=print", text=u"Printer-friendly page", nofollow=True),
             Link(url="http://example.org/about.html", text=u"About us", nofollow=False),
             Link(url="http://google.com/something", text=u"Something", nofollow=True),
         ],
     )
Code example #17
File: spider_chapter.py  Project: akens/tutorial
    def __init__(self, book_key, ct, *args, **kwargs):
        self.book_key = book_key
        self.ct = ct
        self.start_urls = ["http://m.88dushu.com/mulu/" + book_key + "-1/"]
        self.rules = (
            Rule(SgmlLinkExtractor(
                allow=(r'http://m.88dushu.com/mulu/' + book_key + '-\d+/', ),
                restrict_xpaths=('//a[text()="%s"]' % (self.nextpage))),
                 follow=True),
            Rule(SgmlLinkExtractor(
                allow=(r'http://m.88dushu.com/book/' + book_key + '-\d+/', ),
                restrict_xpaths=('//ul[@class="chapter" and not(@id)]')),
                 callback='parse_content',
                 follow=False),
        )

        super(ChapterSpider, self).__init__(*args, **kwargs)
Code example #18
class MySpider(CrawlSpider):
    name = "stimdi"
    allowed_domains = ["stimdi.se"]
    start_urls = ["http://www.stimdi.se/tidslinjen/"]

    rules = (Rule(SgmlLinkExtractor(
        allow=(), restrict_xpaths=('//*[@id="content"]/div/div/h2/a')),
                  callback="parse",
                  follow=True), )

    def parse(self, response):
        i = 0
        print i
        for div in response.xpath('//*[@id="content"]/div/div'):
            print "IN FOR"
            item = AfeventItem()
            #Store data into lists
            item['title'] = div.xpath('//h2/a/text()').extract()[i]
            item['url'] = div.xpath('//h2/a/@href').extract()[i]
            item['location'] = ''
            item['description'] = div.xpath(
                '//*[@id="content"]/div/div[1]/a[1]/p/text()').extract()[i]

            #The following code changes the format of the date
            origDate = div.xpath('//p/text()').extract()[i]
            newDate = ''.join(origDate).replace(',', '').split()

            #Assign values to month names
            month = [
                "", "januari", "februari", "mars", "april", "maj", "juni",
                "juli", "augusti", "september", "oktober", "november",
                "december"
            ].index(newDate[1])
            #Assign a "0" in the beginning if month number is < 10
            if month < 10:
                zeroMonth = [0, month]
                zeroMonth = ''.join(map(str, zeroMonth))
            else:
                zeroMonth = month

            # same thing as above with the day
            if int(newDate[0]) < 10:
                zeroDate = [0, newDate[0]]
                zeroDate = ''.join(map(str, zeroDate))
            else:
                zeroDate = newDate[0]

            # Puts everything together and stores it in item['date']
            finalDate = [newDate[2], zeroMonth, zeroDate]
            item['date'] = '-'.join(finalDate)
            print i

            if i < len(response.xpath('//*[@id="content"]/div/div')):
                print "IN IF"
                print len(response.xpath('//*[@id="content"]/div/div'))
                i = i + 1

            yield item
Code example #19
File: et2.py  Project: mabo1215/crawlJD
class etaoSpider(CrawlSpider):
    # name of spiders
    name = 'Spider'
    allowed_domains = ['gouwu.sogou.com']
    start_urls = [('http://gouwu.sogou.com/shop?query=' + searchWord)
                  for searchWord in lstData().lst]
    link_extractor = {
        'page': SgmlLinkExtractor(allow='/detail/\d+\.html.+'),
        'page_down': SgmlLinkExtractor(
            allow='/shop\?query=.+',
        ),  #restrict_xpaths = '//a[@class = "pagination-next"]'
    }
    _x_query = {
        'title': '//p[@class="title"]/a/@title',
        'name':
        '//span[@class="floatR hui61 mt1"]/text()',  #//li[2]/a/div[@class="ruyitao-market-name ruyitao-market-name-hightlight"]/text()
        'price':
        '//span[@class="shopprice font17"]/text()',  # 'price'    :    '//span[@class = "price"]/text()',
    }

    def __init__(self):
        CrawlSpider.__init__(self)
        # use any browser you wish
        self.browser = webdriver.Firefox()

    def __del__(self):
        self.browser.close()

    def parse(self, response):
        #crawl all display page
        for link in self.link_extractor['page_down'].extract_links(response):
            yield Request(url=link.url, callback=self.parse)
        #start browser
        self.browser.get(response.url)
        #loading time interval
        time.sleep(5)
        # get the data and write it to scrapy items
        etaoItem_loader = ItemLoader(item=EtaoItem(), response=response)
        url = str(response.url)
        etaoItem_loader.add_value('url', url)
        etaoItem_loader.add_xpath('title', self._x_query['title'])
        etaoItem_loader.add_xpath('name', self._x_query['name'])
        etaoItem_loader.add_xpath('price', self._x_query['price'])
        yield etaoItem_loader.load_item()
Code example #20
class PiaohuaCrawlSpider(CrawlSpider):
    name = "PiaohuaCrawlSpider"
    allowed_domains = ['piaohua.com']
    start_urls = [
        'http://piaohua.com/html/aiqing/index.html',
        'http://piaohua.com/html/kehuan/index.html',
    ]
    rules = [
        Rule(SgmlLinkExtractor(allow=('list_'),
                               restrict_xpaths=("//div[@class='page']/a")),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        # print "parse_item>>>>>>"
        items = []
        sel = Selector(response)
        movie_list = sel.xpath("//div[@id='nml']//dl")
        for movie in movie_list:
            item = PiaohuaItem()
            item['linkurl'] = self.getLinkUrl(movie)
            item['name'] = self.getName(movie)
            item['imageurl'] = self.getImageUrl(movie)
            item['type'] = self.getType(response)

            movieDetail = self.getMovieDetail(item['linkurl'])
            # item['downloadlink'] = self.getDownloadLink(movieDetail)
            # item['updatetime'] = self.getUpdateTime(movieDetail)
            items.append(item)
        return items

    def getLinkUrl(self, site):
        return site.xpath("dt/a/@href").extract()[0]

    def getImageUrl(self, site):
        return site.xpath("dt//img/@src").extract()[0]

    def getName(self, site):
        return site.xpath("dd/strong/a/b/font/text()").extract()[0]

    def getType(self, response):
        return response.url.split('/')[-2]

    def getUpdateTime(self, site):
        str = site.xpath(
            "//div[@id='show']/div[@id='showdesc']/text()").extract()[0]
        return re.search(r'.*(\d{4}-\d{2}-\d{2}).*', str).group(1)

    def getDownloadLink(self, site):
        return site.xpath("//anchor/a/text()").extract()

    def getMovieDetail(self, url):
        url = 'http://piaohua.com' + url
        return Selector(Request(url=url))
Code example #21
 def test_restrict_xpaths_with_html_entities(self):
     html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
     response = HtmlResponse("http://example.org/somepage/index.html",
                             body=html,
                             encoding='iso8859-15')
     links = SgmlLinkExtractor(
         restrict_xpaths='//p').extract_links(response)
     self.assertEqual(links, [
         Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC',
              text=u'text')
     ])
Code example #22
File: bbc_spider.py  Project: Jasmine1111/bbc_spider
class bbcSpider(CrawlSpider):
    name = "bbc"

    start_urls = ["http://www.bbc.com"]
    download_delay = 2
    handle_httpstatus_list = [301]

    rules = [
        Rule(
            SgmlLinkExtractor(
                allow=(r"http://www.bbc.com/.*?"),
                #  deny=("http:\/\/.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe).*",
                #          "http:\/\/.*#.*",
                #         "https:\/\/www\.bbc\.com\/w\/index\.php\?.*type=signup.*",
                #        "https:\/\/www\.bbc\.com\/w\/index\.php\?.*action=.*",
                #       "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Talk:.*",
                #           "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Category:.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Special:.*",
                #          "https:\/\/www\.bbc\.com\/sport.*",
                #          "https:\/\/www\.bbc\.com\/weather.*",
                #          "http:\/\/www\.bbc\.com\/earth.*",
                #          "http:\/\/www.bbc.com\/travel.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Special%3AUserLogin.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Special.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=User_talk:.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=User:.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Template:.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Template_talk:.*"
                # ),
                allow_domains=("www.bbc.com")),
            callback='parse_item',
            follow=True)
    ]

    def parse_item(self, response):
        item = BbcItem()
        title_tmp = response.xpath(
            '//*[@id="page"]//h1//text()').extract_first()
        title = title_tmp
        if title:
            title = title.encode('utf8')
        item['title'] = title
        content_tmp = response.xpath(
            '//*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]//p//text() | //*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]//h2//text()'
        ).extract()
        content = ''
        for con in content_tmp:
            if con[-1] == '.':
                con = con + ' '
            content = content + con.encode('utf-8')
        item['content'] = content
        link = str(response.url)
        item['url'] = link.encode('utf-8')
        return item
Code example #23
class MySpider(CrawlSpider):
    name = "af"
    allowed_domains = ["afconsult.com"]
    start_urls = [
        "http://www.afconsult.com/sv/jobba-hos-oss/event-seminarier--massor/"
    ]
    rules = (Rule(SgmlLinkExtractor(
        allow=(), restrict_xpaths=('//*[@id="CalendarContainer"]/div')),
                  callback="parser",
                  follow=True), )

    def parser(self, response):
        i = 0
        for div in response.xpath('//*[@id="CalendarContainer"]/div/div/a'):
            item = AfeventItem()
            print "response.xpath"
            item['title'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/h1/text()'
            ).extract()[i]
            item['venue'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[2]/text()'
            ).extract()[i]
            item['date'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/span/text()'
            ).extract()[i]
            item['time'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[2]/span[2]/span/text()|//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[3]/span/text()[3]'
            ).extract()[i]
            item['url'] = div.xpath(
                '//*[@id="mainContent"]/section/div/div/nav/ul/li[4]/a/@href'
            ).extract()[i]
            follow_url_1 = div.xpath(
                '//*[@id="mainContent"]/section/div/div/nav/ul/li[4]/a/@href'
            ).extract()[i]
            follow_url = 'http://www.afconsult.com' + follow_url_1
            request = Request(follow_url, callback=self.parse_url)
            request.meta['item'] = item

            if i < len(
                    response.xpath('//*[@id="CalendarContainer"]/div/div/a')):
                i = i + 1
                print i
            yield request

    def parse_url(self, response):
        item = response.meta['item']
        item['description'] = ''.join(
            response.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article//text()'
            ).extract())
        print "parse_url"
        yield item
Code example #24
class HwzSpider(CrawlSpider):
    name = "hwz"
    allowed_domains = ["hardwarezone.com.sg"]
    start_urls = [
        "http://forums.hardwarezone.com.sg/current-affairs-lounge-17/"
    ]
    rules = (
        # Extract links matching 'garage-sales-18/.*html' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(SgmlLinkExtractor(allow=('garage\-sales\-18/.*\.html', )), callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(
            allow=('current\-affairs\-lounge\-17/.*\.html', )),
             callback='parse_item',
             follow=True),

        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        #Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
    )

    def insert_posts(self, posts):
        return

    """
    When writing crawl spider rules, avoid using parse as callback, since the CrawlSpider uses the parse method itself to implement its logic. So if you override the parse method, the crawl spider will no longer work.
    """

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        posts = hxs.select("//div[@id='posts']/div[@class='post-wrapper']")
        items = []

        for post in posts:
            item = {}
            item['author_id'] = ''.join(
                post.select(".//a[@class='bigusername']/text()").extract())
            item['url'] = response.url
            item['body'] = '\n'.join(
                map(lambda x: x.strip('\t\n\r'),
                    post.select(".//td[@class='alt1']/div/text()").extract()))
            item['title'] = '\n'.join(
                map(lambda x: x.strip('\t\n\r'),
                    post.select(
                        "//h2[@class='header-gray']/text()").extract()))
            item['date_posted'] = ''.join(
                map(lambda x: x.strip(' \t\n\r#').strip(),
                    post.select(".//td[@class='thead']/text()").extract())
            )  # todo: deal with Today and Yesterday
            # item['date_posted'] = normalizeFriendlyDate(' '.join(map(lambda x:x.strip(' \t\n\r'),post.select(".//td[@class='thead']/text()").extract()))) # todo: deal with Today and Yesterday
            items.append(item)
        # self.insert_posts(items)
        print(items)
        return items
Code example #25
 def __init__(self, process_idx, book_class, *args, **kwargs):
     self.idx = int(process_idx)
     self.book_class = book_class
     if self.idx > 0:
         self.start_urls = [
             "http://m.88dushu.com/wapsort/" + book_class + "-" +
             process_idx + "0/"
         ]
         allow_url = r'http://m.88dushu.com/wapsort/' + book_class + r'-' + process_idx + r'[1-9]/'
     else:
         self.start_urls = [
             "http://m.88dushu.com/wapsort/" + book_class + "-1/"
         ]
         allow_url = r'http://m.88dushu.com/wapsort/' + book_class + r'-[1-9]/'
     self.rules = (
         Rule(
             SgmlLinkExtractor(allow=(allow_url, ),
                               restrict_xpaths=('//a[text()="%s"]' %
                                                (self.nextpage2)))),
         Rule(SgmlLinkExtractor(
             allow=(r'http://m.88dushu.com/info/\d+/', ),
             restrict_xpaths=('//div[@class="block_img"]')),
              callback='parse_book',
              follow=False),
         Rule(SgmlLinkExtractor(allow=(r'http://m.88dushu.com/mulu/\d+/', ),
                                restrict_xpaths=('//a[text()="%s"]' %
                                                 (self.startRead))),
              follow=True),
         Rule(SgmlLinkExtractor(
             allow=(r'http://m.88dushu.com/mulu/\d+-\d+/', ),
             restrict_xpaths=('//a[text()="%s"]' % (self.nextpage))),
              follow=True),
         Rule(SgmlLinkExtractor(
             allow=(r'http://m.88dushu.com/book/\d+-\d+/', ),
             restrict_xpaths=('//ul[@class="chapter" and not(@id)]')),
              callback='parse_content',
              follow=False),
     )
     super(ListSpider, self).__init__(*args, **kwargs)
Code example #26
class QQNewsSpider(CrawlSpider):
    # spider name
    name = "tutorial"
    # download delay
    download_delay = 1
    # allowed domains
    allowed_domains = ["news.cnblogs.com"]
    # start URLs
    start_urls = ["https://news.cnblogs.com"]
    # crawl rules; a rule without a callback just follows matching URLs recursively
    rules = [
        Rule(SgmlLinkExtractor(
            allow=(r'https://news.cnblogs.com/n/page/\d', ))),
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_item'),
    ]

    # content-parsing function
    def parse_item(self, response):
        print('***********************')
        item = QqnewsItem()

        # current URL
        title = response.selector.xpath(
            '//*[@id="news_title"]/a')[0].extract().decode('utf-8')
        item['title'] = title
        print(title)

        author = response.selector.xpath('//div[@id="news_info"]/span/a/text()'
                                         )[0].extract().decode('utf-8')
        item['author'] = author

        release_date = response.selector.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()')[0].extract(
            ).decode('utf-8')
        item['release_date'] = release_date

        yield item
Code example #27
File: cnblogs.py  Project: dscdtc/python_demo_set
class CnblogsSpider(CrawlSpider):
    # spider name
    name = 'cnblogs'  # unique identifier, specified when launching the spider
    # download delay
    download_delay = 2
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['https://news.cnblogs.com/']
    # crawl rules; a rule without a callback just follows matching URLs recursively
    rules = (
        # URLs matching this rule are not scraped; only the links on those pages are extracted
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/page/\d', ))
             ),
        # URLs matching this rule have their content extracted
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse'))

    # content-parsing function
    def parse(self, response):
        # current URL
        for resp in response.selector.xpath('//div[@class="content"]'):
            item = ScrapyspiderItem()

            title = resp.xpath('h2/a/text()').extract()
            item['title'] = title[0].decode('utf-8')

            url = resp.xpath('h2/a/@href').extract()
            item['url'] = 'https://news.cnblogs.com' + url[0].decode('utf-8')

            author = resp.xpath(
                'div[@class="entry_footer"]/a/text()').extract()
            item['author'] = author[0].strip().decode('utf-8')

            date = resp.xpath(
                'div[@class="entry_footer"]/span[@class="gray"]/text()'
            ).extract()
            item['date'] = date[0].decode('utf-8')

            yield item
Code example #28
File: Recursive.py  Project: Mr-Perfection/scrap
class RecursiveScraperSpider(CrawlSpider):
    name = "rs"
    allowed_domains = ["cse.iitd.ernet.in"]
    start_urls = ["http://www.cse.iitd.ernet.in/~naveen"]
    rules = (
        Rule(SgmlLinkExtractor(allow=("cse\.iitd\.ernet\.in/\~naveen/.*", )), callback='parse_item', follow=True),
        )
    
    def parse_item(self, response):
        sel = Selector(response)
        item = RecursivescraperItem()
        item['URL'] = response.request.url
        item['content'] = sel.xpath('/html/body/table/tbody/tr[3]/td[1]/text()[1]').extract()
        return item
Code example #29
class CSDNBlogCrawlSpider(CrawlSpider):
    name = "CSDNBlogCrawlSpider"
    allowed_domains = ['blog.csdn.net']
    start_urls = ['http://blog.csdn.net/u012150179/article/details/11749017']
    rules = [
        Rule(SgmlLinkExtractor(allow=('/u012150179/article/details'),
                               restrict_xpaths=('//li[@class="next_article"]')),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        # print "parse_item>>>>>>"
        item = {}
        sel = Selector(response)
        blog_url = str(response.url)
        blog_name = sel.xpath('//div[@id="article_details"]/div/h1/span/a/text()').extract()

        item['blog_name'] = [n.encode('utf-8') for n in blog_name]
        item['blog_url'] = blog_url.encode('utf-8')

        yield item

    def getImageUrl(self, item):
        imageurl = item.xpath("a[@class='img']/img/@src").extract()
        if imageurl:
            return imageurl[0]
        else:
            return ''

    def getLink(self, item):
        link = item.xpath("a[@class='img']/@href").extract()
        if link:
            return link[0]
        else:
            return ''

    def getUpdateTime(self, item):
        updatetime = item.xpath("span/text()").extract()
        if updatetime:
            return updatetime[0]
        else:
            return item.xpath("span/font/text()").extract()[0]

    def getName(self, item):
        name = item.xpath("a/strong/font/font/text()").extract()
        if name:
            return name[0]
        else:
            return item.xpath("a/strong/font/text()").extract()[0]
Code example #30
File: ZhilianSpider.py  Project: sjl421/quick-python
class ZhilianSpider(scrapy.Spider):
    name = "zhilian"
    # allowed_domains =
    start_urls = ["http://jobs.zhaopin.com/bj2140003/"]

    rules = (Rule(
        SgmlLinkExtractor(allow=(r'http://jobs.zhaopin.com/[0-9]+.htm', )),
        callback='parse_page',
        follow=True), )

    def parse_page(self, response):
        sel = Selector(response)
        item = ZhiLianItems()
        item['name'] = sel.xpath('/html/body/div[5]/div[1]/div[1]/h1/text()')
        item['company'] = sel.xpath(
            '/html/body/div[5]/div[1]/div[1]/h2/a/text()')
        return item
Code example #31
class ArticleSpider(CrawlSpider):
    name = "article"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = [
        "http://en.wikipedia.org/wiki/Python_%28programming_language%29"
    ]
    rules = [
        Rule(SgmlLinkExtractor(allow=('(/wiki/)((?!:).)*$'), ),
             callback="parse_item",
             follow=True)
    ]

    def parse_item(self, response):
        item = Article()
        title = response.xpath('//h1/text()')[0].extract()
        print("Title is: " + title)
        item['title'] = title
        return item
Code example #32
class NoticiasSpider(CrawlSpider):
	name = 'NoticiasSpider'
	allowed_domains = ['20minutos.es']
	start_urls = ['http://www.20minutos.es/']
	rules = (
		Rule(SgmlLinkExtractor(allow=(r'deportes/noticia/(\w|\d|-|/)*/', )), callback='parse_news', follow=False),
	)

	def parse_news(self, response):
		hxs = HtmlXPathSelector(response)
		elemento = Noticia()

		elemento['titulo'] = hxs.select('//h1[contains(@class, "article-title")]/text()')[0].extract()
		elemento['titulo'] = elemento['titulo'].encode('utf-8')
		elemento['fecha'] = hxs.select('//a[contains(@title, "Noticias del ")]/text()')[0].extract()
		elemento['fecha'] = elemento['fecha'].encode('utf-8')
		elemento['enlace'] = response.url

		return elemento
Code example #33
class StackSpider(CrawlSpider):
    name = "stackcrawl"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://stackoverflow.com/questions?sort=newest",
    ]
    rules = (Rule(SgmlLinkExtractor(allow=('&page=\d')),
                  callback='parse',
                  follow=True), )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        questions = hxs.xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = StackItem()
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            yield item
Code example #34
 def parse(self, response):
     link_ex = SgmlLinkExtractor(allow=(r'https://movie.douban.com/subject/\d+'))
     for i in link_ex.extract_links(response):
         yield Request(i.url, callback=self.parse_item, headers=self.headers)