Example #1
class PoiSpider(CrawlSpider):
    district = '110108'
    name = 'poi'
    allowed_domains = ['poi86.com']
    start_urls = ('http://www.poi86.com/poi/amap/district/' + district +
                  '/1.html', )

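    # The first rule follows the paginated district listing pages; the second
    # matches individual POI detail pages and hands them to parse_item.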
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(r'http://www.poi86.com/poi/amap/district/' + district +
                       '/\d+.html'))),
        Rule(SgmlLinkExtractor(
            allow=(r'http://www.poi86.com/poi/amap/\d+.html')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = PoiItem()
        # '/html/body/div[2]/div/div[1]/h1'
        item['name'] = response.xpath(
            '/html/body/div[2]/div[1]/div[1]/h1/text()').extract()
        item['address'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[4]/text()').extract()
        item['category'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[6]/text()').extract()
        item['wgs_84'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[7]/text()').extract()
        item['gcj_02'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[8]/text()').extract()
        item['bd_09'] = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/ul/li[9]/text()').extract()
        yield item
Example #2
class BloggerSpider(CrawlSpider):
    name = "TheHackerWay"
    start_urls = ['http://thehackerway.com']
    # URLs from which the spider will start crawling
    rules = [
        Rule(SgmlLinkExtractor(allow=[r'/\d{4}']),
             follow=True,
             callback='parse_blog'),
        # r'/\d{4}' : regular expression for http://thehackerway.com/YYYY URLs
        Rule(SgmlLinkExtractor(allow=[r'\d{4}/\d{2}/\d{2}/\w+']),
             callback='parse_blog')
    ]

    # http://thehackerway.com/YYYY/MM/DD/title URLs

    def parse_blog(self, response):
        print 'parsed link %s' % response.url
        hxs = HtmlXPathSelector(response)
        item = HackerWayItem()
        item['title'] = hxs.select(
            '//title/text()').extract()  # XPath selector for the title
        item['author'] = hxs.select(
            "//span[@class='author']/a/text()").extract(
            )  # XPath selector for the author
        item['tag'] = hxs.select("//meta[@property='og:title']/text()"
                                 ).extract()  # XPath selector for the tag
        item['date'] = hxs.select("//span[@class='date']/text()").extract(
        )  # XPath selector for the date
        return item  # Return the item.
Example #3
class AirbnbSpider(RentBaseSpider):
    name = "airbnb"
    allowed_domains = ["airbnb.com"]
    start_urls = ["https://www.airbnb.com/sitemaps"]

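    # The first two rules follow sitemap and pagination links without a callback;
    # the third sends each listing page to parse_product.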
    rules = (
        Rule(
            SgmlLinkExtractor(restrict_xpaths=("//*[@class='sitemap']//a", ))),
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths=("//*[@class='next next_page']/a", ))),
        Rule(SgmlLinkExtractor(restrict_xpaths=("//*[@itemprop='name']/a", )),
             callback="parse_product"),
    )

    def parse_product(self, response):
        """Function extracting values from product page"""

        item = PlaceItem()
        item['item_source'] = response.url
        item['name'] = self.get_name(response)
        item['price'] = self.get_price(response)
        yield item

    def get_name(self, response):
        return response.xpath("//*[@id='listing_name']/text()").extract()[0]

    def get_price(self, response):
        return response.xpath(
            "//*[contains(@class,'book-it__price-amount')]//text()").extract(
            )[0].strip()
Example #4
    def test_attrs(self):
        lx = self.extractor_cls(attrs="href")
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=("href", "src"),
                                tags=("a", "area", "img"),
                                deny_extensions=())
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=None)
        self.assertEqual(lx.extract_links(self.response), [])

        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)
        lx = SgmlLinkExtractor(attrs=("href"))
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])
Example #5
class ListSpider(CrawlSpider):
    # spider name
    name = "tutorial"
    # download delay
    download_delay = 1
    # allowed domains
    allowed_domains = ["news.cnblogs.com"]
    # start URLs
    start_urls = ["https://news.cnblogs.com"]
    # crawl rules
    rules = (
        Rule(SgmlLinkExtractor(
            allow=(r'https://news.cnblogs.com/n/page/\d', ))),
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_content'),
    )

    # parse the content
    def parse_content(self, response):
        item = TutorialItem()
        # current URL
        title = response.selector.xpath(
            '//div[@id="news_title"]')[0].extract().decode('utf-8')
        item['title'] = title
        author = response.selector.xpath('//div[@id="news_info"]/span/a/text()'
                                         )[0].extract().decode('utf-8')
        item['author'] = author
        releasedate = response.selector.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()')[0].extract(
            ).decode('utf-8')
        item['releasedate'] = releasedate
        yield item
Example #6
class DoubanCrawler(CrawlSpider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]
    #allowed_domains = ["fling.seas.upenn.edu/"]
    #start_urls = ["https://fling.seas.upenn.edu/~yinfeiy/"]

    rules = (Rule(
        SgmlLinkExtractor(allow=(
            r'http://movie\.douban\.com/top250\?start=\d+&filter=&type=', ))),
             Rule(SgmlLinkExtractor(
                 allow=(r'http://movie\.douban\.com/subject/\d+', )),
                  callback='parse_page',
                  follow=True))

    def start_requests(self):
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
        }
        for i, url in enumerate(self.start_urls):
            yield Request(url,
                          cookies={'over18': '1'},
                          callback=self.parse_page,
                          headers=headers)

    def parse_page(self, response):
        sel = Selector(response)
        item = DoubanMovieItem()
        item['name'] = sel.xpath(
            '//h1/span[@property="v:itemreviewed"]/text()').extract()
        item['desc'] = sel.xpath(
            '//div/span[@property="v:summary"]/text()').extract()
        item['url'] = response.url
        return item
Example #7
class HomeawaySpider(RentBaseSpider):
    name = "homeaway"
    allowed_domains = ["homeaway.com"]
    start_urls = ['https://www.homeaway.com/search']

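    # Follow the region-refinement and "next" pagination links, then parse each
    # search hit's detail page with parse_product.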
    rules = (
        Rule(
            SgmlLinkExtractor(
                restrict_xpaths=("(//*[@class='region-refinement'])[6]",
                                 "//*[@class='next']/a"))),
        Rule(SgmlLinkExtractor(restrict_xpaths=(
            "//*[@class='hit-content']//*[@class='hit-headline']//a", )),
             callback="parse_product"),
    )

    def parse_product(self, response):
        """Function extracting values from product page"""

        item = PlaceItem()
        item['name'] = self.get_name(response)
        item['price'] = self.get_price(response)
        yield item

    def get_name(self, response):
        return response.xpath(
            "(//*[@class='container hidden-phone']//h1/text())").extract()[0]

    def get_price(self, response):
        # price for some places is unavailable, only available on request
        price = response.xpath("(//*[@class='price-large']/text())").extract()
        not_available_message = "Available on Inquiry"
        return price[0] if price else not_available_message
Example #8
 def test_deny_extensions(self):
     html = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
     response = HtmlResponse("http://example.org/", body=html)
     lx = SgmlLinkExtractor(deny_extensions="jpg")
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.org/page.html', text=u'asd'),
     ])
Example #9
    def test_attrs(self):
        lx = self.extractor_cls(attrs="href")
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=("href","src"), tags=("a","area","img"), deny_extensions=())
        self.assertEqual(lx.extract_links(self.response), [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ])

        lx = self.extractor_cls(attrs=None)
        self.assertEqual(lx.extract_links(self.response), [])

        html = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        response = HtmlResponse("http://example.com/index.html", body=html)
        lx = SgmlLinkExtractor(attrs=("href"))
        self.assertEqual(lx.extract_links(response), [
            Link(url='http://example.com/sample1.html', text=u''),
        ])
Example #10
 def test_attrs_sgml(self):
     html = """<html><area href="sample1.html"></area>
     <a ref="sample2.html">sample text 2</a></html>"""
     response = HtmlResponse("http://example.com/index.html", body=html)
     lx = SgmlLinkExtractor(attrs="href")
     self.assertEqual(lx.extract_links(response), [
         Link(url='http://example.com/sample1.html', text=u''),
     ])
Example #11
 def __init__(self, selector=None, type='css', *args, **kwargs):
     if selector:
         if type not in ['css', 'xpath']:
             raise Exception('Selector type not supported.')
         if type == 'xpath':
             kwargs['restrict_xpaths'] = selector
         else:
             kwargs['restrict_xpaths'] = pyquery.PyQuery('a')._css_to_xpath(selector)
     SgmlLinkExtractor.__init__(self, *args, **kwargs)
Example #12
 def test_link_nofollow(self):
     html = """
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     """
     response = HtmlResponse("http://example.org/page.html", body=html)
     lx = SgmlLinkExtractor()
     self.assertEqual([link for link in lx.extract_links(response)], [
         Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
         Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
     ])
Example #13
class homespider(CrawlSpider):
    name = 'home'

    allowed_domains = ['qd.fang.lianjia.com']
    start_urls = []
    for i in range(1, 48):
        start_urls.append('http://qd.fang.lianjia.com/loupan/pg' + str(i))

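    # Follow development ("loupan") links inside the con-box container, then parse
    # each project's detail ("xiangqing") page with parse_item.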
    rules = (
        Rule(
            SgmlLinkExtractor(allow=('loupan/p_\w+', ),
                              restrict_xpaths="//div[@class = 'con-box']")),
        Rule(SgmlLinkExtractor(allow=('loupan/p_\w+/xiangqing/')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        torrent = Home_item()
        #=======================================================================
        # deny = ('loupan/p_\w+/xiangce.*', 'loupan/p_\w+/dongtai.*', 'loupan/p_\w+/pinglun.*','loupan/p_\w+/huxingtu.*','loupan/p_\w+/tuijian.*','loupan/p_\w+/peitao.*','loupan/p_\w+/%.*','loupan/p_\w+/xiangqing/%.*','loupan/p_\w+/xiangqing/.+')
        # torrent['name'] = response.xpath("//div[@class = 'col-1']/h2/a/text()").extract()
        # torrent['address'] = response.xpath("//span[@class = 'region']/text()").extract()
        # torrent['price'] = response.xpath("//span[@class = 'num']/text()").extract()
        # torrent['area'] = response.xpath("//div[@class = 'area']/text()").extract()
        # torrent['square'] = response.xpath("//div[@class = 'area']/span/text()").extract()
        #=======================================================================

        torrent['name'] = response.css("div.resb-name::text").extract()
        torrent['price'] = response.css(
            "ul.x-box span.label-val span::text").extract()
        torrent['where'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li/span[@class = 'label-val']/a/text()"
        ).extract()
        torrent['address'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li[5]/span[@class = 'label-val']/text()"
        ).extract()
        torrent['sellor'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[1]/li[7]/span[@class = 'label-val']/text()"
        ).extract()
        torrent['opentime'] = response.css("span.fq-open span::text").extract()
        torrent['gettime'] = response.css(
            "span.fq-handover span::text").extract()
        torrent['alltime'] = response.xpath(
            "//div[@class = 'big-left fl']/ul[3]/li[8]/span[@class = 'label-val']/text()"
        ).extract()

        #  torrent['name'] = response.css("a.clear h1::text").extract()
        #  torrent['address'] = response.css("span.region::text").extract()
        #   torrent['price'] = response.css("p.jiage span.junjia::text").extract()
        # torrent['area'] = response.css("div.area::text").extract()
        #  torrent['square'] = response.css("div.area span::text").extract()
        return torrent
Example #14
class MantaSpider(CrawlSpider):
    name = 'manta'
    allowed_domains = ['manta.com']

    rules = (Rule(SgmlLinkExtractor(allow=r'Items/'),
                  callback='parse_item',
                  follow=True),
             Rule(SgmlLinkExtractor(allow=r"/c/[^/]*/[^/]*$"),
                  callback='parse_company_detail',
                  follow=True))

    def __init__(self, term=None, *args, **kwargs):
        super(MantaSpider, self).__init__(*args, **kwargs)
        if term:
            self.start_urls = ['http://www.manta.com/mb?search=%s' % term]
        else:
            self.start_urls = ['http://www.manta.com/']

    def parse_start_url(self, response):
        return self.parse_company(response)

    def parse_search_result(self, response):
        hxs = HtmlXPathSelector(response)
        elems = hxs.select('//a[contains(@class, "nextYes")]/@href').extract()
        if len(elems) >= 1:
            yield Request(elems[0], callback=self.parse_company)

    def parse_company(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for h in hxs.select('//div[contains(@class, "pbl")]'):
            c = Company()
            c['name'] = h.select('*/h2[@itemprop="name"]/a/text()').extract()
            c['manta_url'] = h.select(
                '*/h2[@itemprop="name"]/a/@href').extract()
            c['street'] = h.select(
                '*/div[@itemprop="streetAddress"]/text()').extract()
            c['locality'] = h.select(
                '*/div[@itemprop="addressLocality"]/text()').extract()
            c['region'] = h.select(
                '*/div[@itemprop="addressRegion"]/text()').extract()
            c['postal_code'] = h.select(
                '*/div[@itemprop="postalCode"]/text()').extract()
            c['phone'] = h.select(
                '*/div[@itemprop="telephone"]/text()').extract()
            c['website'] = h.select('*/div[@itemprop="url"]/text()').extract()
            yield c

    def parse_company_detail(self, response):
        print(response)
Example #15
 def _init_args(self, **kwargs):
     start_url = kwargs.get('START_URL', '')
     if start_url:
         self.start_urls = [start_url]
     self.rules = (Rule(SgmlLinkExtractor(allow=filter_rules),
                        callback="parse_resp",
                        follow=True,
                        process_links=self.put_links), )
     self.headers = {
         'Host': 'cn.futureelectronics.com',
         'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
         'Referer': 'http://cn.futureelectronics.com/zh/Pages/index.aspx'
     }
     self.cookies = {
         'SelectedCurrency': 'NY',
         'SelectedLanguage': 'zh-CN',
     }
     # product search
     self.product_url_pattern_0 = re.compile(filter_rules[0], re.IGNORECASE)
     # check whether a url is a product-detail url
     self.product_url_pattern_1 = re.compile(filter_rules[1], re.IGNORECASE)
     self.product_url_pattern_2 = re.compile(filter_rules[2], re.IGNORECASE)
     # extract product_id from the product-detail url and use it as goods_sn
     self.product_id_pattern_1 = re.compile(r'ProductID=([^&]+)',
                                            re.IGNORECASE)
     self.product_id_pattern_2 = re.compile(r'/Pages/(.*)\.aspx',
                                            re.IGNORECASE)
     # number of products per page
     self.limit_num = 10.0
Example #16
 def test_link_nofollow(self):
     html = """
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     <a href="http://google.com/something" rel="external nofollow">Something</a>
     """
     response = HtmlResponse("http://example.org/page.html", body=html)
     lx = SgmlLinkExtractor()
     self.assertEqual(
         [link for link in lx.extract_links(response)],
         [
             Link(url="http://example.org/page.html?action=print", text=u"Printer-friendly page", nofollow=True),
             Link(url="http://example.org/about.html", text=u"About us", nofollow=False),
             Link(url="http://google.com/something", text=u"Something", nofollow=True),
         ],
     )
Example #17
    def __init__(self, book_key, ct, *args, **kwargs):
        self.book_key = book_key
        self.ct = ct
        self.start_urls = ["http://m.88dushu.com/mulu/" + book_key + "-1/"]
        self.rules = (
            Rule(SgmlLinkExtractor(
                allow=(r'http://m.88dushu.com/mulu/' + book_key + '-\d+/', ),
                restrict_xpaths=('//a[text()="%s"]' % (self.nextpage))),
                 follow=True),
            Rule(SgmlLinkExtractor(
                allow=(r'http://m.88dushu.com/book/' + book_key + '-\d+/', ),
                restrict_xpaths=('//ul[@class="chapter" and not(@id)]')),
                 callback='parse_content',
                 follow=False),
        )

        super(ChapterSpider, self).__init__(*args, **kwargs)
Example #18
class MySpider(CrawlSpider):
    name = "stimdi"
    allowed_domains = ["stimdi.se"]
    start_urls = ["http://www.stimdi.se/tidslinjen/"]

    rules = (Rule(SgmlLinkExtractor(
        allow=(), restrict_xpaths=('//*[@id="content"]/div/div/h2/a')),
                  callback="parse",
                  follow=True), )

    def parse(self, response):
        i = 0
        print i
        for div in response.xpath('//*[@id="content"]/div/div'):
            print "IN FOR"
            item = AfeventItem()
            #Store data into lists
            item['title'] = div.xpath('//h2/a/text()').extract()[i]
            item['url'] = div.xpath('//h2/a/@href').extract()[i]
            item['location'] = ''
            item['description'] = div.xpath(
                '//*[@id="content"]/div/div[1]/a[1]/p/text()').extract()[i]

            #The following code changes the format of the date
            origDate = div.xpath('//p/text()').extract()[i]
            newDate = ''.join(origDate).replace(',', '').split()

            #Assign values to month names
            month = [
                "", "januari", "februari", "mars", "april", "maj", "juni",
                "juli", "augusti", "september", "oktober", "november",
                "december"
            ].index(newDate[1])
            #Assign a "0" in the beginning if month number is < 10
            if month < 10:
                zeroMonth = [0, month]
                zeroMonth = ''.join(map(str, zeroMonth))
            else:
                zeroMonth = month

            # same thing as above with the day
            if int(newDate[0]) < 10:
                zeroDate = [0, newDate[0]]
                zeroDate = ''.join(map(str, zeroDate))
            else:
                zeroDate = newDate[0]

            # Puts everything together and stores into item['date']
            finalDate = [newDate[2], zeroMonth, zeroDate]
            item['date'] = '-'.join(finalDate)
            print i

            if i < len(response.xpath('//*[@id="content"]/div/div')):
                print "I IF"
                print len(response.xpath('//*[@id="content"]/div/div'))
                i = i + 1

            yield item
Example #19
class etaoSpider(CrawlSpider):
    # name of spiders
    name = 'Spider'
    allowed_domains = ['gouwu.sogou.com']
    start_urls = [('http://gouwu.sogou.com/shop?query=' + searchWord)
                  for searchWord in lstData().lst]
    link_extractor = {
        'page': SgmlLinkExtractor(allow='/detail/\d+\.html.+'),
        'page_down': SgmlLinkExtractor(
            allow='/shop\?query=.+',
        ),  #restrict_xpaths = '//a[@class = "pagination-next"]'
    }
    _x_query = {
        'title': '//p[@class="title"]/a/@title',
        'name':
        '//span[@class="floatR hui61 mt1"]/text()',  #//li[2]/a/div[@class="ruyitao-market-name ruyitao-market-name-hightlight"]/text()
        'price':
        '//span[@class="shopprice font17"]/text()',  # 'price'    :    '//span[@class = "price"]/text()',
    }

    def __init__(self):
        CrawlSpider.__init__(self)
        # use any browser you wish
        self.browser = webdriver.Firefox()

    def __del__(self):
        self.browser.close()

    def parse(self, response):
        #crawl all display page
        for link in self.link_extractor['page_down'].extract_links(response):
            yield Request(url=link.url, callback=self.parse)
        #start browser
        self.browser.get(response.url)
        #loading time interval
        time.sleep(5)
        # get the data and write it to scrapy items
        etaoItem_loader = ItemLoader(item=EtaoItem(), response=response)
        url = str(response.url)
        etaoItem_loader.add_value('url', url)
        etaoItem_loader.add_xpath('title', self._x_query['title'])
        etaoItem_loader.add_xpath('name', self._x_query['name'])
        etaoItem_loader.add_xpath('price', self._x_query['price'])
        yield etaoItem_loader.load_item()
Example #20
class PiaohuaCrawlSpider(CrawlSpider):
    name = "PiaohuaCrawlSpider"
    allowed_domains = ['piaohua.com']
    start_urls = [
        'http://piaohua.com/html/aiqing/index.html',
        'http://piaohua.com/html/kehuan/index.html',
    ]
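    # Follow the "list_" pagination links in the page navigation block and parse
    # every matched listing page with parse_item.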
    rules = [
        Rule(SgmlLinkExtractor(allow=('list_'),
                               restrict_xpaths=("//div[@class='page']/a")),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        # print "parse_item>>>>>>"
        items = []
        sel = Selector(response)
        movie_list = sel.xpath("//div[@id='nml']//dl")
        for movie in movie_list:
            item = PiaohuaItem()
            item['linkurl'] = self.getLinkUrl(movie)
            item['name'] = self.getName(movie)
            item['imageurl'] = self.getImageUrl(movie)
            item['type'] = self.getType(response)

            movieDetail = self.getMovieDetail(item['linkurl'])
            # item['downloadlink'] = self.getDownloadLink(movieDetail)
            # item['updatetime'] = self.getUpdateTime(movieDetail)
            items.append(item)
        return items

    def getLinkUrl(self, site):
        return site.xpath("dt/a/@href").extract()[0]

    def getImageUrl(self, site):
        return site.xpath("dt//img/@src").extract()[0]

    def getName(self, site):
        return site.xpath("dd/strong/a/b/font/text()").extract()[0]

    def getType(self, response):
        return response.url.split('/')[-2]

    def getUpdateTime(self, site):
        str = site.xpath(
            "//div[@id='show']/div[@id='showdesc']/text()").extract()[0]
        return re.search(r'.*(\d{4}-\d{2}-\d{2}).*', str).group(1)

    def getDownloadLink(self, site):
        return site.xpath("//anchor/a/text()").extract()

    def getMovieDetail(self, url):
        url = 'http://piaohua.com' + url
        return Selector(Request(url=url))
Example #21
 def test_restrict_xpaths_with_html_entities(self):
     html = '<html><body><p><a href="/&hearts;/you?c=&euro;">text</a></p></body></html>'
     response = HtmlResponse("http://example.org/somepage/index.html",
                             body=html,
                             encoding='iso8859-15')
     links = SgmlLinkExtractor(
         restrict_xpaths='//p').extract_links(response)
     self.assertEqual(links, [
         Link(url='http://example.org/%E2%99%A5/you?c=%E2%82%AC',
              text=u'text')
     ])
Example #22
class bbcSpider(CrawlSpider):
    name = "bbc"

    start_urls = ["http://www.bbc.com"]
    download_delay = 2
    handle_httpstatus_list = [301]

    rules = [
        Rule(
            SgmlLinkExtractor(
                allow=(r"http://www.bbc.com/.*?"),
                #  deny=("http:\/\/.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe).*",
                #          "http:\/\/.*#.*",
                #         "https:\/\/www\.bbc\.com\/w\/index\.php\?.*type=signup.*",
                #        "https:\/\/www\.bbc\.com\/w\/index\.php\?.*action=.*",
                #       "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Talk:.*",
                #           "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Category:.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Special:.*",
                #          "https:\/\/www\.bbc\.com\/sport.*",
                #          "https:\/\/www\.bbc\.com\/weather.*",
                #          "http:\/\/www\.bbc\.com\/earth.*",
                #          "http:\/\/www.bbc.com\/travel.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Special%3AUserLogin.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Special.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=User_talk:.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=User:.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Template:.*",
                #          "https:\/\/www\.bbc\.com\/w\/index\.php\?.*title=Template_talk:.*"
                # ),
                allow_domains=("www.bbc.com")),
            callback='parse_item',
            follow=True)
    ]

    def parse_item(self, response):
        item = BbcItem()
        title_tmp = response.xpath(
            '//*[@id="page"]//h1//text()').extract_first()
        title = title_tmp
        if title:
            title = title.encode('utf8')
        item['title'] = title
        content_tmp = response.xpath(
            '//*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]//p//text() | //*[@id="page"]/div[2]/div[2]/div/div[1]/div[1]//h2//text()'
        ).extract()
        content = ''
        for con in content_tmp:
            if con[-1] == '.':
                con = con + ' '
            content = content + con.encode('utf-8')
        item['content'] = content
        link = str(response.url)
        item['url'] = link.encode('utf-8')
        return item
Example #23
class MySpider(CrawlSpider):
    name = "af"
    allowed_domains = ["afconsult.com"]
    start_urls = [
        "http://www.afconsult.com/sv/jobba-hos-oss/event-seminarier--massor/"
    ]
    rules = (Rule(SgmlLinkExtractor(
        allow=(), restrict_xpaths=('//*[@id="CalendarContainer"]/div')),
                  callback="parser",
                  follow=True), )

    def parser(self, response):
        i = 0
        for div in response.xpath('//*[@id="CalendarContainer"]/div/div/a'):
            item = AfeventItem()
            print "response.xpath"
            item['title'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/h1/text()'
            ).extract()[i]
            item['venue'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[2]/text()'
            ).extract()[i]
            item['date'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/span/text()'
            ).extract()[i]
            item['time'] = div.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[2]/span[2]/span/text()|//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article/p[3]/span/text()[3]'
            ).extract()[i]
            item['url'] = div.xpath(
                '//*[@id="mainContent"]/section/div/div/nav/ul/li[4]/a/@href'
            ).extract()[i]
            follow_url_1 = div.xpath(
                '//*[@id="mainContent"]/section/div/div/nav/ul/li[4]/a/@href'
            ).extract()[i]
            follow_url = 'http://www.afconsult.com' + follow_url_1
            request = Request(follow_url, callback=self.parse_url)
            request.meta['item'] = item

            if i < len(
                    response.xpath('//*[@id="CalendarContainer"]/div/div/a')):
                i = i + 1
                print i
            yield request

    def parse_url(self, response):
        item = response.meta['item']
        item['description'] = ''.join(
            response.xpath(
                '//*[@id="mainContent"]/main/section[3]/div[1]/div[1]/article//text()'
            ).extract())
        print "parse_url"
        yield item
Example #24
class HwzSpider(CrawlSpider):
    name = "hwz"
    allowed_domains = ["hardwarezone.com.sg"]
    start_urls = [
        "http://forums.hardwarezone.com.sg/current-affairs-lounge-17/"
    ]
    rules = (
        # Extract links matching 'garage-sales-18/.*html' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(SgmlLinkExtractor(allow=('garage\-sales\-18/.*\.html', )), callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(
            allow=('current\-affairs\-lounge\-17/.*\.html', )),
             callback='parse_item',
             follow=True),

        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        #Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
    )

    def insert_posts(self, posts):
        return

    """
    When writing crawl spider rules, avoid using parse as callback, since the CrawlSpider uses the parse method itself to implement its logic. So if you override the parse method, the crawl spider will no longer work.
    """

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        posts = hxs.select("//div[@id='posts']/div[@class='post-wrapper']")
        items = []

        for post in posts:
            item = {}
            item['author_id'] = ''.join(
                post.select(".//a[@class='bigusername']/text()").extract())
            item['url'] = response.url
            item['body'] = '\n'.join(
                map(lambda x: x.strip('\t\n\r'),
                    post.select(".//td[@class='alt1']/div/text()").extract()))
            item['title'] = '\n'.join(
                map(lambda x: x.strip('\t\n\r'),
                    post.select(
                        "//h2[@class='header-gray']/text()").extract()))
            item['date_posted'] = ''.join(
                map(lambda x: x.strip(' \t\n\r#').strip(),
                    post.select(".//td[@class='thead']/text()").extract())
            )  # todo: deal with Today and Yesterday
            # item['date_posted'] = normalizeFriendlyDate(' '.join(map(lambda x:x.strip(' \t\n\r'),post.select(".//td[@class='thead']/text()").extract()))) # todo: deal with Today and Yesterday
            items.append(item)
        # self.insert_posts(items)
        print(items)
        return items
Example #25
 def __init__(self, process_idx, book_class, *args, **kwargs):
     self.idx = int(process_idx)
     self.book_class = book_class
     if self.idx > 0:
         self.start_urls = [
             "http://m.88dushu.com/wapsort/" + book_class + "-" +
             process_idx + "0/"
         ]
         allow_url = r'http://m.88dushu.com/wapsort/' + book_class + r'-' + process_idx + r'[1-9]/'
     else:
         self.start_urls = [
             "http://m.88dushu.com/wapsort/" + book_class + "-1/"
         ]
         allow_url = r'http://m.88dushu.com/wapsort/' + book_class + r'-[1-9]/'
     self.rules = (
         Rule(
             SgmlLinkExtractor(allow=(allow_url, ),
                               restrict_xpaths=('//a[text()="%s"]' %
                                                (self.nextpage2)))),
         Rule(SgmlLinkExtractor(
             allow=(r'http://m.88dushu.com/info/\d+/', ),
             restrict_xpaths=('//div[@class="block_img"]')),
              callback='parse_book',
              follow=False),
         Rule(SgmlLinkExtractor(allow=(r'http://m.88dushu.com/mulu/\d+/', ),
                                restrict_xpaths=('//a[text()="%s"]' %
                                                 (self.startRead))),
              follow=True),
         Rule(SgmlLinkExtractor(
             allow=(r'http://m.88dushu.com/mulu/\d+-\d+/', ),
             restrict_xpaths=('//a[text()="%s"]' % (self.nextpage))),
              follow=True),
         Rule(SgmlLinkExtractor(
             allow=(r'http://m.88dushu.com/book/\d+-\d+/', ),
             restrict_xpaths=('//ul[@class="chapter" and not(@id)]')),
              callback='parse_content',
              follow=False),
     )
     super(ListSpider, self).__init__(*args, **kwargs)
Example #26
class QQNewsSpider(CrawlSpider):
    # spider name
    name = "tutorial"
    # download delay
    download_delay = 1
    # allowed domains
    allowed_domains = ["news.cnblogs.com"]
    # start URLs
    start_urls = ["https://news.cnblogs.com"]
    # crawl rules; a Rule without a callback just follows matching URLs recursively
    rules = [
        Rule(SgmlLinkExtractor(
            allow=(r'https://news.cnblogs.com/n/page/\d', ))),
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_item'),
    ]

    # content-parsing callback
    def parse_item(self, response):
        print('***********************')
        item = QqnewsItem()

        # current URL
        title = response.selector.xpath(
            '//*[@id="news_title"]/a')[0].extract().decode('utf-8')
        item['title'] = title
        print(title)

        author = response.selector.xpath('//div[@id="news_info"]/span/a/text()'
                                         )[0].extract().decode('utf-8')
        item['author'] = author

        release_date = response.selector.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()')[0].extract(
            ).decode('utf-8')
        item['release_date'] = release_date

        yield item
Example #27
class CnblogsSpider(CrawlSpider):
    # spider name
    name = 'cnblogs'  # unique identifier, specified when launching the spider
    # download delay
    download_delay = 2
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['https://news.cnblogs.com/']
    # crawl rules; a Rule without a callback just follows matching URLs recursively
    rules = (
        # URLs matching this rule are not scraped for content; only their links are extracted
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/page/\d', ))
             ),
        # URLs matching this rule have their content extracted
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse'))

    # content-parsing callback
    def parse(self, response):
        # current URL
        for resp in response.selector.xpath('//div[@class="content"]'):
            item = ScrapyspiderItem()

            title = resp.xpath('h2/a/text()').extract()
            item['title'] = title[0].decode('utf-8')

            url = resp.xpath('h2/a/@href').extract()
            item['url'] = 'https://news.cnblogs.com' + url[0].decode('utf-8')

            author = resp.xpath(
                'div[@class="entry_footer"]/a/text()').extract()
            item['author'] = author[0].strip().decode('utf-8')

            date = resp.xpath(
                'div[@class="entry_footer"]/span[@class="gray"]/text()'
            ).extract()
            item['date'] = date[0].decode('utf-8')

            yield item
Example #28
class RecursiveScraperSpider(CrawlSpider):
    name = "rs"
    allowed_domains = ["cse.iitd.ernet.in"]
    start_urls = ["http://www.cse.iitd.ernet.in/~naveen"]
    rules = (
        Rule(SgmlLinkExtractor(allow=("cse\.iitd\.ernet\.in/~naveen/.*", )), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        sel = Selector(response)
        item = RecursivescraperItem()
        item['URL'] = response.request.url
        item['content'] = sel.xpath('/html/body/table/tbody/tr[3]/td[1]/text()[1]').extract()
        return item
Example #29
class CSDNBlogCrawlSpider(CrawlSpider):
    name = "CSDNBlogCrawlSpider"
    allowed_domains = ['blog.csdn.net']
    start_urls = ['http://blog.csdn.net/u012150179/article/details/11749017']
    rules = [
        Rule(SgmlLinkExtractor(allow=('/u012150179/article/details'),
                               restrict_xpaths=('//li[@class="next_article"]')),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        # print "parse_item>>>>>>"
        item = {}
        sel = Selector(response)
        blog_url = str(response.url)
        blog_name = sel.xpath('//div[@id="article_details"]/div/h1/span/a/text()').extract()

        item['blog_name'] = [n.encode('utf-8') for n in blog_name]
        item['blog_url'] = blog_url.encode('utf-8')

        yield item

    def getImageUrl(self, item):
        imageurl = item.xpath("a[@class='img']/img/@src").extract()
        if imageurl:
            return imageurl[0]
        else:
            return ''

    def getLink(self, item):
        link = item.xpath("a[@class='img']/@href").extract()
        if link:
            return link[0]
        else:
            return ''

    def getUpdateTime(self, item):
        updatetime = item.xpath("span/text()").extract()
        if updatetime:
            return updatetime[0]
        else:
            return item.xpath("span/font/text()").extract()[0]

    def getName(self, item):
        name = item.xpath("a/strong/font/font/text()").extract()
        if name:
            return name[0]
        else:
            return item.xpath("a/strong/font/text()").extract()[0]
Example #30
class ZhilianSpider(scrapy.Spider):
    name = "zhilian"
    # allowed_domains =
    start_urls = ["http://jobs.zhaopin.com/bj2140003/"]

    rules = (Rule(
        SgmlLinkExtractor(allow=(r'http://jobs.zhaopin.com/[0-9]+.htm', )),
        callback='parse_page',
        follow=True), )

    def parse_page(self, response):
        sel = Selector(response)
        item = ZhiLianItems()
        item['name'] = sel.xpath('/html/body/div[5]/div[1]/div[1]/h1/text()')
        item['company'] = sel.xpath(
            '/html/body/div[5]/div[1]/div[1]/h2/a/text()')
        return item
Example #31
class ArticleSpider(CrawlSpider):
    name = "article"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = [
        "http://en.wikipedia.org/wiki/Python_%28programming_language%29"
    ]
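    # Follow internal /wiki/ links whose path contains no colon (skipping
    # namespaced pages such as Special: or Category:) and parse them with parse_item.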
    rules = [
        Rule(SgmlLinkExtractor(allow=('(/wiki/)((?!:).)*$'), ),
             callback="parse_item",
             follow=True)
    ]

    def parse_item(self, response):
        item = Article()
        title = response.xpath('//h1/text()')[0].extract()
        print("Title is: " + title)
        item['title'] = title
        return item
Example #32
class NoticiasSpider(CrawlSpider):
	name = 'NoticiasSpider'
	allowed_domains = ['20minutos.es']
	start_urls = ['http://www.20minutos.es/']
	rules = (
		Rule(SgmlLinkExtractor(allow=(r'deportes/noticia/(\w|\d|-|/)*/', )), callback='parse_news', follow=False),
	)

	def parse_news(self, response):
		hxs = HtmlXPathSelector(response)
		elemento = Noticia()

		elemento['titulo'] = hxs.select('//h1[contains(@class, "article-title")]/text()')[0].extract()
		elemento['titulo'] = elemento['titulo'].encode('utf-8')
		elemento['fecha'] = hxs.select('//a[contains(@title, "Noticias del ")]/text()')[0].extract()
		elemento['fecha'] = elemento['fecha'].encode('utf-8')
		elemento['enlace'] = response.url

		return elemento
Example #33
class StackSpider(CrawlSpider):
    name = "stackcrawl"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://stackoverflow.com/questions?sort=newest",
    ]
    rules = (Rule(SgmlLinkExtractor(allow=('&page=\d')),
                  callback='parse',
                  follow=True), )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        questions = hxs.xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = StackItem()
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            yield item
Example #34
 def parse(self, response):
     link_ex = SgmlLinkExtractor(allow=(r'https://movie.douban.com/subject/\d+'))
     for i in link_ex.extract_links(response):
         yield Request(i.url, callback=self.parse_item, headers=self.headers)