Example #1
    def __init__(self, url=None):
        if url:
            # fetch the unvisited-URL list with POST; the service also uses PUT (create), GET (read), DELETE (delete)
            # unvisitedurls http://localhost:5000/unvisitedurls?start=0&offset=10&spider=douban
            unirest.timeout(180)
            req = unirest.post(url, headers={"Accept":"application/json"})
            self.start_urls = [data['url'] for data in req.body['data']]
            self.name = url[url.find('spider=')+7:]

            self.visitedurldict = OrderedDict()
            self.datadict       = OrderedDict()
            self.filedict       = OrderedDict()
            self.deadurldict    = OrderedDict()

            self.visitedurldict['urls'] = []
            self.datadict['datas']      = []
            self.filedict['files']      = []
            self.deadurldict['urls']    = []

            self.rules = (
                Rule(sle(allow=("http://book.douban.com/isbn/\d+$")), callback="parse", follow=True),
                Rule(sle(allow=("http://book.douban.com/subject/\d+$")), callback="parse", follow=True),
            )
        # connect spider_closed so cleanup runs on close (in place of __del__)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
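For reference, a minimal sketch of the payload this constructor expects from the unvisitedurls service, and of how start_urls and name are derived from it. The payload below is illustrative only; its shape is inferred from req.body['data'] and is not part of the original example.

# Illustrative only: the service response shape is an assumption inferred
# from req.body['data'] above; the URLs are placeholders.
sample_body = {
    "data": [
        {"url": "http://book.douban.com/subject/1084336/"},
        {"url": "http://book.douban.com/isbn/9787532752096/"},
    ]
}
url = "http://localhost:5000/unvisitedurls?start=0&offset=10&spider=douban"

start_urls = [entry["url"] for entry in sample_body["data"]]
name = url[url.find("spider=") + 7:]   # everything after "spider=" -> "douban"
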
Example #2
class HrtencentSpider(CrawlSpider):
    name = "hrtencent"
    allowed_domains = ["tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php?start=%d" % d for d in range(0, 20, 10)
    ]
    rules = [
        Rule(sle(allow=("/position_detail.php\?id=\d*.*", )), callback='parse_2'),
        Rule(sle(allow=("/position.php\?&start=\d{,2}#a")), follow=True, callback='parse_1')
    ]

    def parse_2(self, response):
        items = []
        sel = Selector(response)
        sites = sel.css('.tablelist')
        for site in sites:
            item = PositionDetailItem()
            item['sharetitle'] = site.css('.h #sharetitle::text').extract()
            item['bottomline'] = site.css('.bottomline td::text').extract()
            # item['duty'] = site.css('.c .l2::text').extract()
            item['link'] = response.url
            items.append(item)
            print(repr(item).decode("unicode-escape") + '\n')
        # info('parsed ' + str(response))
        self.parse_1(response)
        return items

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request
Example #3
class DoubanBookSpider(CrawlSpider):
	name = "doubanbook"
	allowed_domains = ["douban.com"]
	start_urls = [
		"https://book.douban.com/tag"
	]
	rules = [
		Rule(sle(allow=("/subject/\d+$")), callback="parse_2"),
		Rule(sle(allow=("/tag/[^/]+$", )), follow=True)
	]

	def parse_2(self, response):
		items = []
		sel = Selector(response)
		# #wrapper means <div id='wrapper'>
		sites = sel.css('#wrapper')
		for site in sites:
			item = DoubanSubjectItem()
			item['title'] = site.css("h1 span::text").extract()
			item['link'] = response.url 
			item['content_intro'] = site.css('#link-report .intro p::text').extract()
			items.append(item)

			print item
		return items

	def process_request(self, request):
		return request
Example #4
    def __init__(self, url=None):

        #print "here i am"
        if url:
            # fetch the unvisited-URL list with POST; the service also uses PUT (create), GET (read), DELETE (delete)
            # unvisitedurls http://localhost:5000/unvisitedurls?start=0&offset=10&spider=6w
            unirest.timeout(180)
            req = unirest.post(url, headers={"Accept":"application/json"})
            self.start_urls = [data['url'] for data in req.body['data']]
            self.name = url[url.find('spider=')+7:]

            self.visitedurldict = OrderedDict()
            self.datadict       = OrderedDict()
            self.filedict       = OrderedDict()
            self.deadurldict    = OrderedDict()

            self.visitedurldict['urls'] = []
            self.datadict['datas']      = []
            self.filedict['files']      = []
            self.deadurldict['urls']    = []

            self.rules = (
                Rule(sle(allow=("http://book.douban.com/isbn/\d+$")), callback="parse", follow=True),
                Rule(sle(allow=("http://book.douban.com/subject/\d+$")), callback="parse", follow=True),
            )
        # connect spider_closed so cleanup runs on close (in place of __del__)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
Example #5
class alexaCNSpider(CrawlSpider):
    name = "alexa.cn"
    allowed_domains = ["alexa.com"]
    start_urls = [
        "http://www.alexa.com/",
        "http://www.alexa.com/topsites/category/World/Chinese_Simplified_CN",
    ]
    rules = [
        Rule(sle(allow=(
            "/topsites/category;?[0-9]*/Top/World/Chinese_Simplified_CN/.*$")),
             callback='parse_category_top_xxx',
             follow=True),
        Rule(sle(allow=("/topsites/category/World/Chinese_Simplified_CN$", )),
             callback='parse_category_top_xxx',
             follow=True),
        #Rule(sle(allow=("/people/[^/]+$", )), callback='parse_people', follow=True),
    ]

    # www.alexa.com/topsites/category/Top/Computers
    # www.alexa.com/topsites/category;1/Top/Computers
    def parse_category_top_xxx(self, response):
        info('parsed ' + str(response))
        items = []
        sel = Selector(response)

        sites = sel.css('.site-listing')
        for site in sites:
            item = alexaSiteInfoItem()
            item['url'] = site.css(
                'a[href*=siteinfo]::attr(href)')[0].extract()
            item['name'] = site.css('a[href*=siteinfo]::text')[0].extract()
            item['description'] = site.css('.description::text')[0].extract()
            remainder = site.css('.remainder::text')
            if remainder:
                item['description'] += remainder[0].extract()
            # more specific
            item['category'] = urllib.unquote('/'.join(
                response.url.split('/')[-3:])).decode('utf-8')
            items.append(item)
        return items

    def parse_category_top(self, response):
        info('parsed ' + str(response))
        items = []
        sel = Selector(response)

        categories = sel.css('li a[href*="/topsites/category/Top/"]')
        for category in categories:
            item = alexaCategoryItem()
            item['url'] = category.css('::attr(href)')[0].extract()
            item['name'] = category.css('::text')[0].extract()
            items.append(item)
        return items
Example #6
class hacker_newsSpider(CommonSpider):
    name = "hacker_news"
    allowed_domains = ["news.ycombinator.com"]
    start_urls = [
        "https://news.ycombinator.com/",
    ]
    rules = [
        Rule(sle(allow=("https://news.ycombinator.com/$")),
             callback='parse_1',
             follow=True),
    ]

    list_css_rules = {
        'title': '.storylink::text',
        'desc': '.subtext .score::text',
    }

    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        # x = self.parse_with_rules(response, self.content_css_rules, dict)
        print(json.dumps(x, ensure_ascii=False, indent=2))
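Many of these examples subclass a CommonSpider base class and call self.parse_with_rules(), whose definition is not included in these excerpts. Below is a minimal sketch of how such a helper might work, assuming nested dicts of CSS selectors like list_css_rules above; only the method name and call signature come from the examples, the body is an assumption.

from scrapy.spiders import CrawlSpider


class CommonSpider(CrawlSpider):
    # Hypothetical reconstruction, not the project's actual base class.
    def parse_with_rules(self, response, rules, item_class, force_list=False):
        # force_list appears as a fourth positional argument in some examples;
        # its exact effect is not visible in these excerpts.
        item = item_class()
        for key, rule in rules.items():
            if isinstance(rule, dict):
                # 'key' is a container selector; apply its sub-rules to each matching node
                item[key] = [
                    {name: node.css(sel).extract() for name, sel in rule.items()}
                    for node in response.css(key)
                ]
            else:
                item[key] = response.css(rule).extract()
        return [item]

Under this sketch, an expression like x[0]['#proxylist tr'] in Example #9 below would yield one dict of extracted fields per matching table row.
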
Example #7
class twitchSpider(CommonSpider):
    name = "twitch"
    allowed_domains = ["twitch.tv"]
    start_urls = [
        "https://www.twitch.tv/directory/game/Hearthstone%3A%20Heroes%20of%20Warcraft"
    ]
    rules = [
        Rule(sle(allow=(
            "https://www.twitch.tv/directory/game/Hearthstone%3A%20Heroes%20of%20Warcraft"
        )),
             callback='parse_1',
             follow=True),
    ]

    list_css_rules = {
        '.content': {
            'room_name': '.meta .title a::text',
            'author': '.meta .info a::text',
            'people_count': '.meta .info a::attr(data-ember-action)'
        }
    }

    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        # x = self.parse_with_rules(response, self.content_css_rules, dict)
        # import pdb; pdb.set_trace()  # leftover debugging hook, disabled
        print(json.dumps(x, ensure_ascii=False, indent=2))
Example #8
class v2exSpider(CommonSpider):
    name = "v2ex"
    allowed_domains = ["v2ex.com"]
    start_urls = [
        "http://www.v2ex.com/",
    ]
    rules = [
        Rule(sle(allow=("http://www.v2ex.com/$")),
             callback='parse_1',
             follow=True),
    ]

    list_css_rules = {
        '.cell.item': {
            'title': '.item_title a::text',
            'node': '.node::text',
            'author': '.node+ strong a::text',
            'reply_count': '.count_livid::text'
        }
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        # import pdb; pdb.set_trace()
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        print(json.dumps(x, ensure_ascii=False, indent=2))
Example #9
class samairSpider(CommonSpider):
    name = "samair"
    allowed_domains = ["samair.ru"]
    start_urls = ['http://www.samair.ru/proxy/']
    rules = [
        Rule(sle(allow=("proxy/$")), callback='parse_1', follow=True),
    ]

    list_css_rules = {
        '#proxylist tr': {
            'ip':
            "td:nth-child(1) *::text",  #, "xpath:.//*[not(contains(@style,'display:none'))]/text()"],
            #'port': 'td:nth-child(3) a::text',
            'anonymity': 'td:nth-child(2) *::text',
            'last_checked': 'td:nth-child(3) *::text',
            'country': 'td:nth-child(4) a::text',
        }
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        items = []
        n = response.css('tbody tr')
        #import pdb; pdb.set_trace()
        x = self.parse_with_rules(response, self.list_css_rules, dict, True)
        x = x[0]['#proxylist tr']
        pp.pprint(x)
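        # note: the parsed rows are only pretty-printed here; nothing is returned or yielded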
Example #10
class HouseSpider(CrawlSpider):
    name = 'house'
    allowed_domains = ['hz.fang.lianjia.com']
    start_urls = ['http://hz.fang.lianjia.com/loupan']

    rules = (Rule(sle(allow=("/pg\d{0,4}")),
                  follow=False,
                  callback='parse_item'), )

    def parse_item(self, response):
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        houses = sel.xpath('//div[@class="resblock-desc-wrapper"]')
        for house in houses:
            item = HouseItem()
            house_name = house.xpath(
                'div[@class="resblock-name"]/a/text()').extract()
            house_address = house.xpath(
                'div[@class="resblock-location"]/a/text()').extract()
            house_price = house.xpath(
                'div[@class="resblock-price"]/div[@class="main-price"]/span/text()'
            ).extract()
            house_url = house.xpath(
                'div[@class="resblock-name"]/a/@href').extract()
            url = base_url + '/' + ''.join(house_url).split('/')[2]

            item['house_name'] = house_name
            item['house_address'] = house_address
            item['house_price'] = house_price[0] + house_price[1].strip()
            item['house_url'] = url
            items.append(item)
        return items
Example #11
class CnbetaSpider(CrawlSpider):
    name = 'cnbeta'
    allowed_domains = ['cnbeta.com']
    start_urls = ['http://www.cnbeta.com/']

    rules = [
        Rule(sle(allow=("/articles/.*\.htm")),
             callback='parse_cnbeta',
             follow=True),
    ]

    def parse_cnbeta(self, response):
        logger.debug('parse_cnbeta: ' + response.url)
        articlelist = Selector(response).xpath(
            '//div[@class="cnbeta-article"]')
        items = []
        for article in articlelist:
            item = CnbetaItem()
            item['title'] = article.xpath(
                'header[@class="title"]/h1/text()').extract_first()
            item['publishtime'] = article.xpath(
                'header[@class="title"]/div[@class="meta"]/span/text()'
            ).extract_first()

            logger.info('function: parse_cnbeta, url: ' + response.url +
                        ' , item: ' + str(item))
            items.append(item)
        return items
Example #12
class pandatvSpider(CommonSpider):
    name = "pandatv"
    allowed_domains = ["panda.tv"]
    start_urls = [
        "http://www.panda.tv/all",
    ]
    rules = [
        Rule(sle(allow=("http://www.panda.tv/all")), callback='parse_1', follow=True),
    ]

    list_css_rules = { 
        '.video-list-item.video-no-tag': {
            'room_name': '.video-title::text',
            'author': '.video-nickname::text',
            'people_count': '.video-number::text',
            'tag': '.video-cate::text',
        }   
    }   

    content_css_rules = { 
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse '+response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        # x = self.parse_with_rules(response, self.content_css_rules, dict)
        print(json.dumps(x, ensure_ascii=False, indent=2))
Example #13
class github_trendingSpider(CommonSpider):
    name = "github_trending"
    allowed_domains = ["github.com"]
    start_urls = [
        "http://www.github.com/trending",
    ]
    rules = [
        Rule(sle(allow=("/trending$")), callback='parse_1', follow=True),
    ]

    list_css_rules = {
        '.repo-list-item': {
            'repo_name': '.repo-list-name a::attr(href)',
            'repo_meta': '.repo-list-meta::text',
        }
    }

    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        # x = self.parse_with_rules(response, self.content_css_rules, dict)
        print(json.dumps(x, ensure_ascii=False, indent=2))
Example #14
class amazonbookSpider(CommonSpider):
    name = "amazonbook"
    allowed_domains = ["amazon.com", "www.amazon.com"]
    start_urls = [
        #"http://www.amazon.com/b/ref=s9_acss_bw_en_BGG15eve_d_1_6?_encoding=UTF8&node=17&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-top-3&pf_rd_r=0XCRZV6SDKBTKDPH8SFR&pf_rd_t=101&pf_rd_p=2293718502&pf_rd_i=283155",
        "http://www.amazon.com/books-used-books-textbooks/b?node=283155",
    ]
    rules = [
        #Rule(sle(allow=("/gp/product/.*")), callback='parse_1', follow=True),
        Rule(sle(allow=("/books-used-books-textbooks/.*")),
             callback='parse_0',
             follow=True),
    ]

    css_rules = {
        ".inner .a-row": {
            "url": ".title::attr(href)",
            #"desc": "span::text"
            "title": ".s9TitleText::text",
            "comments": ".a-icon-row .a-size-small::text",
        }
    }

    def parse_0(self, response):
        info('Parse 0 ' + response.url)
        pp.pprint(self.parse_with_rules(response, self.css_rules, dict))

    #.inner .a-row
    def parse_1(self, response):
        info('Parse 1 ' + response.url)
Example #15
class youtube_trendingSpider(CommonSpider):
    name = "youtube_trending"
    allowed_domains = ["youtube.com"]
    start_urls = [
        "https://www.youtube.com/feed/trending",
    ]
    rules = [
        Rule(sle(allow=("feed/trending$")), callback='parse_1', follow=True),
    ]

    list_css_rules = {
        '.yt-lockup-content': {
            'video_title': '.yt-lockup-title a::text',
            'author': '.yt-lockup-byline a::text',
        }
    }

    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        # x = self.parse_with_rules(response, self.content_css_rules, dict)
        print(json.dumps(x, ensure_ascii=False, indent=2))
Example #16
class alexa_topsitesSpider(CommonSpider):
    name = "alexa_topsites"
    allowed_domains = ["alexa.com"]
    start_urls = [
        "http://www.alexa.com/topsites",
    ]
    rules = [
        Rule(sle(allow=("http://www.alexa.com/topsites")), callback='parse_1', follow=True),
    ]

    list_css_rules = { 
        '.site-listing': {
            'rank': '.count::text',
            'name': '.desc-paragraph a::text',
            'desc': '.description::text'
        }   
    }   

    content_css_rules = { 
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse '+response.url)
        x = self.parse_with_rules(response, self.list_css_rules, alexa_topsitesItem)
        # x = self.parse_with_rules(response, self.content_css_rules, dict)
        print(json.dumps(x, ensure_ascii=False, indent=2))
        # pp.pprint(x)
        return self.parse_with_rules(response, self.list_css_rules, alexa_topsitesItem)
Example #17
class QuotesSpider(CrawlSpider):
    name = 'planet_char'
    rotate_user_agent = True
    allowed_domains = ['www.anime-planet.com']
    start_urls = ['https://www.anime-planet.com']

    rules = (Rule(sle(allow='/characters/', ), callback='parse_anime_links', follow=True),)

    def parse_anime_links(self, response):
        server = []
        server = anime_planet_character()

        server['name'] = \
            response.css('#siteContainer > h1::text').get()
        server['gender'] = \
            response.css('#siteContainer > section.pure-g.entryBar > div:nth-child(1)::text').get()
        server['hair_color'] = \
            response.css('#siteContainer > section.pure-g.entryBar > div:nth-child(2)::text').get()
        server['image_url'] = \
            response.css(
                '#siteContainer > section:nth-child(10) > div.pure-1.md-2-3 > div.pure-g.entrySynopsis > div.pure-1-2.md-1-3 > div > img::attr(src)').get()
        server['about'] = \
            response.css(
                '#siteContainer > section:nth-child(10) > div.pure-1.md-2-3 > div.pure-g.entrySynopsis > div.pure-1.md-2-3 > div::text').getall()
        server['tags'] = \
            response.css(
                '#siteContainer > section:nth-child(10) > div.pure-1.md-2-3 > div.pure-g.entrySynopsis > div.pure-1.md-2-3 > div.tags > ul > li > a::text').getall()

        return server
Example #18
class proxy4freeSpider(CommonSpider):
    name = "proxy4free"
    allowed_domains = ["proxy4free.com"]
    start_urls = ['http://www.proxy4free.com/list/webproxy1.html']
    rules = [
        Rule(sle(allow=("list/webproxy[0-9]+\.html")),
             callback='parse_1',
             follow=True),
    ]

    list_css_rules = {
        'tbody tr': {
            'domain':
            "td:nth-child(2) *::text",  #, "xpath:.//*[not(contains(@style,'display:none'))]/text()"],
            #'port': 'td:nth-child(3) a::text',
            'country': 'td:nth-child(4) *::text',
            'rating': 'td:nth-child(5) *::text',
            'access_time': 'td:nth-child(6) *::text',
            'uptime': 'td:nth-child(7) *::text',
            'online_since': 'td:nth-child(8) *::text',
            'last_checked': 'td:nth-child(9) *::text',
            'features_hian': 'td:nth-child(10) *::text',
            'features_ssl': 'td:nth-child(11) *::text',
        }
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        items = []
        n = response.css('tbody tr')
        #import pdb; pdb.set_trace()
        x = self.parse_with_rules(response, self.list_css_rules, dict, True)
        x = x[0]['tbody tr']
        pp.pprint(x)
Example #19
class templateSpider(CommonSpider):
    name = "template"
    allowed_domains = ["template.com"]
    start_urls = [
        "http://www.template.com/",
    ]
    rules = [
        Rule(sle(allow=(
            "/topsites/category;?[0-9]*/Top/World/Chinese_Simplified_CN/.*$")),
             callback='parse_1',
             follow=True),
    ]

    list_css_rules = {
        '.linkto': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }

    list_css_rules_2 = {
        '#listZone .Q-tpWrap': {
            'url': '.linkto::attr(href)',
            'name': '.linkto::text'
        }
    }

    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
Example #20
class proxylistorgSpider(CommonSpider):
    name = "proxylistorg"
    allowed_domains = ["proxy-list.org"]
    start_urls = ['https://proxy-list.org/english/index.php']
    rules = [
        Rule(sle(allow=("english/index.php(\?p=[0-9]+)?$")),
             callback='parse_1',
             follow=True),
    ]

    list_css_rules = {
        '#proxy-table .table ul': {
            'ip':
            "li:nth-child(1)::text",  #, "xpath:.//*[not(contains(@style,'display:none'))]/text()"],
            #'port': 'td:nth-child(3) a::text',
            'anonymity': 'li:nth-child(4)::text',
            'speed': 'li:nth-child(3)::text',
            'ssl': 'li:nth-child(2)::text',
            'country': 'li:nth-child(5) *::text',
        }
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        items = []
        n = response.css('tbody tr')
        #import pdb; pdb.set_trace()
        x = self.parse_with_rules(response, self.list_css_rules, dict, True)
        x = x[0]['#proxy-table .table ul']
        pp.pprint(x)
Example #21
class musasiSpider(CommonSpider):
    name = "musasi"
    allowed_domains = ["http://www.musasi.jp"]
    start_urls = ["http://www.musasi.jp/"]
    rules = [
        Rule(sle(allow=("/explanation/[0-9]*/.*$")),
             callback='parse_1',
             follow=True),
    ]

    list_css_rules = {
        '.linkto': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }

    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def start_requests(self):
        return [
            scrapy.FormRequest("/ichikawa-chuo/login",
                               formdata={
                                   'username': '******',
                                   'password': '******'
                               },
                               callback=self.parse_1)
        ]

    def parse_1(self, response):
        info('Parse ' + response.url)
Example #22
class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["dooland.com"]
    start_urls = [
        # "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        # "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
        "http://www.dooland.com/magazine/article_784457.html"
    ]

    rules = [
        # Rule(sle(allow=('/Resources/')),callback='parse_details'),
        Rule(sle(allow=('/magazine/article_784457\.html')),
             callback='parseBooks',
             follow=False),
        # Rule(sle(allow=()), callback='parse_pages'),
        # pagination
        # Rule(sle(allow=(),restrict_xpaths=('//*[@id="page"]')),callback='parse_pages',follow=True)
    ]

    def parseBooks(self, response):
        for sel in response.xpath('//*[@id="main"]/article/div[1]/div[1]'):
            item = ArticleItem()
            item['url'] = response.url
            # item['title'] = sys.getdefaultencoding()
            item['title'] = sel.xpath('h1/text()').extract()[0].strip()
            #
            #
            return item
Example #23
class QuotesSpider(CrawlSpider):

    name = 'anime_list'
    allowed_domains = ['myanimelist.net']
    start_urls = \
        ['https://myanimelist.net/anime/34134/One_Punch_Man_2nd_Season']

    rules = (Rule(sle(
        allow='https://myanimelist.net/anime/',
        deny=('userrecs', 'video', 'episode', 'characters', 'featured',
              'profile', 'login\.php', 'anime\.php', 'password\.php',
              'pressroom', 'people', 'advertising', 'producer', 'membership',
              'manga_translation_battle', 'watch', 'about\.php',
              'register\.php', 'genre', 'reviews', 'stats', 'forum', 'clubs',
              'character', 'news', 'modules')),
                  callback='parse_images',
                  follow=True), )

    def parse_images(self, response):
        image = []
        image = AjarWallAlphaItem()

        image['image_url'] = response.request.url
        image['image_pixels'] = \
            response.css('#contentWrapper > div:nth-child(1) > h1 > span::text'
                         ).get()

        return image
Example #24
class PictureSpider(CrawlSpider):
    name = "picture"
    allowed_domains = ["tumblr.com"]
    start_urls = [
        "https://nanue1.tumblr.com/",
    ]
    rules = [
        #Rule(sle(allow=(r"/likes")),callback='parse_likes',follow = True),
        #Rule(sle(allow=(r"/likes")),callback='parse_likes'),
        Rule(sle(allow=(r"/following")),
             callback='parse_following',
             follow=True),
        Rule(sle(allow=(r"/archive")), callback='parse_archive', follow=True),
    ]

    def get_archive_post_pic(self, response):
        print "Parse archive post page for pic......"
        #pic_page_urls=re.findall('https://\d+.media.tumblr.com/\S+/\S+_\d+.jpg',response.body)
        pic_page_urls = re.findall(
            'https://\d+.media.tumblr.com/\S+/\S+_1280.jpg', response.body)
        if pic_page_urls:
            item = TumblrPictureItem()
            for pic_page_url in pic_page_urls:
                print(pic_page_url)
                item['url'] = [urlparse.urljoin(response.url, pic_page_url)]
                yield item

    def parse_archive(self, response):
        print "Parse archive......"
        print response.url
        archive_post_urls = response.xpath(
            '//*[@class="hover"]/@href').extract()
        for archive_post_url in archive_post_urls:
            req = Request(archive_post_url,
                          dont_filter=True,
                          callback=self.get_archive_post_pic)
            yield req

    def parse_following(self, response):
        print "Parse following user......"
        follows_blog_urls = response.xpath(
            '//*[@class="blog-name"]/@href').extract()
        for follow_blog_url in follows_blog_urls:
            req = Request(follow_blog_url, dont_filter=True)
            print(follow_blog_url)
            yield req
Example #25
class XiCiDaiLiSpider(CrawlSpider):
    name = 'xicidaili'
    allowed_domains = ['www.xicidaili.com']
    start_urls = [
        'http://www.xicidaili.com/nn',
        'http://www.xicidaili.com/nt',
        'http://www.xicidaili.com/wn'
    ]
     
    rules = [
        Rule(sle(allow=("/nn/[\d]{1,2}$")), callback='parse_proxy', follow=True),
        Rule(sle(allow=("/nt/[\d]{1,2}$")), callback='parse_proxy', follow=True),
        Rule(sle(allow=("/wn/[\d]{1,2}$")), callback='parse_proxy', follow=True),
    ]
    
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    } 
    
    def start_requests(self): 
        for url in self.start_urls:
            yield Request(url, headers=self.headers, dont_filter=True)
                
    def parse_proxy(self, response):
        logger.debug('parse_proxy: ' + response.url)
        table = Selector(response).xpath('//table[@id="ip_list"]')[0]
        trs = table.xpath('.//tr')[1:]   # skip the header row

        items = []
        for tr in trs:
            item = XiCiDaiLiItem()
            item['ip'] = tr.xpath('td[2]/text()').extract()[0]
            item['port'] = tr.xpath('td[3]/text()').extract()[0]
            item['position'] = tr.xpath('string(td[4])').extract()[0].strip()
            item['type'] = tr.xpath('td[6]/text()').extract()[0]
            item['speed'] = tr.xpath('td[7]/div/@title').re('\d+\.\d*')[0]
            item['last_check_time'] = tr.xpath('td[10]/text()').extract()[0]
            
            if 'http' in item['type'].lower(): 
                logger.info('function: parse_proxy, url: ' + response.url + ' , item: ' + str(item))
                items.append(item)
            
        return items
Example #26
class qqnewsSpider(CommonSpider):
    name = "qqnews"
    allowed_domains = ["tencent.com", 'qq.com']
    start_urls = ['http://news.qq.com/society_index.shtml']
    rules = [
        Rule(sle(allow=('society_index.shtml')),
             callback='parse_0',
             follow=True),
        Rule(sle(allow=(".*[0-9]{8}.*htm$")), callback='parse_1', follow=True),
    ]

    list_css_rules = {
        '.linkto': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }

    list_css_rules_2 = {
        '#listZone .Q-tpWrap': {
            'url': '.linkto::attr(href)',
            'name': '.linkto::text'
        }
    }

    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_0(self, response):
        info('Parse0 ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        pp.pprint(x)
        #return self.parse_with_rules(response, self.list_css_rules, qqnewsItem)

    def parse_1(self, response):
        info('Parse1 ' + response.url)
        x = self.parse_with_rules(response, self.content_css_rules, dict)
        pp.pprint(x)
        #import pdb; pdb.set_trace()

    @staticmethod
    def parse_2(response):
        info('Parse2 ' + response.url)
Example #27
class QuotesSpider(CrawlSpider):

    name = 'pdf'
    allowed_domains = ['www.pdfdrive.com']
    start_urls = ['https://www.pdfdrive.com/']

    rules = (Rule(sle(allow='', deny=('/auth/', '/home/', '/search\?',
             '/category/')), callback='parse_anime', follow=True), )

    def parse_anime(self, response):
        item = []
        item = pdfdrive()

        # item['name'] = \
        #     response.css('#main > div > div.widget.info > div > div:nth-child(1)'
        #                  ).get()

        item['main_url'] = response.url
        item['name'] = \
            response.css('body > div.dialog > div.dialog-main > div.dialog-left > div.ebook-main > div.ebook-right > div > h1::text'
                         ).get()
        item['image_url'] = \
            response.css('body > div.dialog > div.dialog-main > div.dialog-left > div.ebook-main > div.ebook-left > a > img::attr(src)'
                         ).get()
        item['year_pub'] = \
            response.css('body > div.dialog > div.dialog-main > div.dialog-left > div.ebook-main > div.ebook-right > div > div.ebook-file-info > span:nth-child(3)::text'
                         ).get()
        item['total_pages'] = \
            response.css('body > div.dialog > div.dialog-main > div.dialog-left > div.ebook-main > div.ebook-right > div > div.ebook-file-info > span:nth-child(1)::text'
                         ).get()
        item['book_size'] = \
            response.css('body > div.dialog > div.dialog-main > div.dialog-left > div.ebook-main > div.ebook-right > div > div.ebook-file-info > span:nth-child(5)::text'
                         ).get()
        item['no_downloads'] = \
            response.css('body > div.dialog > div.dialog-main > div.dialog-left > div.ebook-main > div.ebook-right > div > div.ebook-file-info > span.info-green.hidemobile::text'
                         ).get()
        item['book_language'] = \
            response.css('body > div.dialog > div.dialog-main > div.dialog-left > div.ebook-main > div.ebook-right > div > div.ebook-file-info > span:nth-child(9)::text'
                         ).get()
        item['book_id'] = \
            response.css('#previewButtonMain::attr(data-id)').get()
        item['book_preview'] = \
            response.css('#previewButtonMain::attr(data-preview)').get()
        item['book_buy'] = \
            response.css('body > div.dialog > div.dialog-main > div.dialog-left > div.ebook-main > div.ebook-buttons > div > a::attr(href)'
                         ).get()
        item['book_quotes'] = \
            response.css('body > div.dialog > div.dialog-main > div.dialog-left > div.quotes::text'
                         ).get()
        item['book_author'] = \
            response.css('body > div.dialog > div.dialog-main > div.dialog-left > div.ebook-main > div.ebook-right > div > div.ebook-author > a > span::text'
                         ).get()
        item['book_tags'] = \
            response.css('body > div.dialog > div.dialog-main > div.dialog-left > div.ebook-main > div.ebook-right > div > div.ebook-tags > a::text'
                         ).getall()
        item['download_url'] = \
            response.css('#download-button-link::attr(href)').get()
        return item
Example #28
class youku_Spider(CommonSpider):
    name = "movies_spider"
    allowed_domains = ["list.youku.com", "v.youku.com"]
    start_urls = [
        "http://list.youku.com/category/show/c_97.html",
        "http://list.youku.com/category/show/c_96.html",
        "http://list.youku.com/category/show/c_100.html",
    ]
    rules = [
        Rule(sle(allow=(
            "list.youku.com/category/show/c_[10967]+_?[_a-z0-9]*\.html")),
             callback='parse_1',
             follow=True),
        Rule(sle(allow=("v.youku.com/v_show/id_[\S+]+.html.*?")),
             callback='parse_tv',
             follow=True),
    ]

    # used on the list pages
    content_css_rules = {
        'title': 'div.p-thumb a::attr(title)',
        'url': 'div.p-thumb a::attr(href)',
        'bg_img_url': 'div.p-thumb img::attr(src)',
        # 'images': '#Cnt-Main-Article-QQ img::attr(src)',
        # 'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    # used on the player pages
    player_css_rules = {
        'category': 'h1.title a::text',
        'title': 'div.tvinfo h3::text',
        'sub_title': '#vpofficiallistv5_wrap div.items .item::attr(title)',
        'seq': '#vpofficiallistv5_wrap div.items .item::attr(seq)',
        'url': '#vpofficiallistv5_wrap div.items .item a::attr(href)',
    }

    def parse_1(self, response):
        # info('Parse ' + response.url)
        x = self.parse_with_rules(response, self.content_css_rules, dict)
        # print(json.dumps(x, ensure_ascii=False, indent=2))
        return x

    def parse_tv(self, response):
        x = self.parse_with_rules(response, self.player_css_rules, dict)
        return x
Example #29
    def __init__(self, conf_module='TestSpiderConfig', *args, **kwargs):
        cm = __import__(conf_module, globals=globals())
        conf = cm.Config()
        self.name = conf.name
        self.allowed_domains = conf.allowed_domains
        self.start_urls = conf.start_urls
        self.rules = [Rule(sle(allow=(c.allowed_rule_regex)), callback='parse_1', cb_kwargs=c.paras, follow=True) for c in conf.ex_rules]
        info(self.start_urls)
        info(self.rules)
        super(general_spiderSpider, self).__init__(*args, **kwargs)
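The conf_module argument names a module that must expose a Config class; the real TestSpiderConfig is not shown in these examples. A minimal sketch of a config module that would satisfy the attributes read above (all names and values below are assumptions):

# hypothetical_config.py -- sketch only; the attribute names mirror what the
# constructor above reads (name, allowed_domains, start_urls, ex_rules with
# allowed_rule_regex and paras); everything else is invented.
class ExRule(object):
    def __init__(self, allowed_rule_regex, paras):
        self.allowed_rule_regex = allowed_rule_regex
        self.paras = paras


class Config(object):
    def __init__(self):
        self.name = "example_spider"
        self.allowed_domains = ["example.com"]
        self.start_urls = ["http://www.example.com/"]
        self.ex_rules = [ExRule(r"/articles/\d+\.html$", {"category": "news"})]

At run time the conf_module value would typically be supplied through Scrapy's -a option, e.g. scrapy crawl <spider> -a conf_module=hypothetical_config, which Scrapy passes to __init__ as a keyword argument.
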
Example #30
class sinanewsSpider(CommonSpider):
    name = "sinanews"
    allowed_domains = ["news.sina.com.cn"]
    start_urls = [
        "http://news.sina.com.cn/",
    ]
    rules = [
        Rule(sle(allow=("http://news.sina.com.cn/$")), callback='parse_0'),
        Rule(sle(allow=(".*doc[^/]*shtml$")), callback='parse_1'), #, follow=True),
        #Rule(sle(allow=('/c/2015-11-19/doc-ifxkszhk0386278.shtml')), callback='parse_1', follow=True, process_request='process_request'),
    ]

    list_css_rules = {
        '#blk_yw_01 a': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }

    content_css_rules = {
        'text': 'p::text',
        'images': 'img::attr(src)',
        'images-desc': '.img_descr::text',
        # need url analysis for video
        #'video': '#J_Article_Player',
    }

    def process_request(self, r):
        info('process '+str(r))
        return r
    
    def parse_0(self, response):
        info('Parse 0 '+response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        pp.pprint(x)
        #pdb.set_trace()
        #return self.parse_with_rules(response, self.list_css_rules, sinanewsItem)

    def parse_1(self, response):
        info('Parse 1 '+response.url)
        x = self.parse_with_rules(response, self.content_css_rules, dict)
        pp.pprint(x)
Example #31
class LianjiaSpider(scrapy.Spider):
    name = "lianjia"
    allowed_domains = ["lianjia.com"]
    start_urls = ["http://tj.lianjia.com/ershoufang/pg2/"]

    # response.css('.page-box .house-lst-page-box::attr(page-url)').extract()
    # response.css('.page-box .house-lst-page-box::attr(page-data)').extract()

    rules = (
        Rule(sle(allow=('http://tj.lianjia.com/ershoufang/pg\d+$')),
             callback='parse_item'),

        #  items = LinkExtractor(allow=('/ershoufang/pg2')).extract_links(response)
        #  for i in items:
        #       print i
        #

        # Rule(SgmlLinkExtractor(allow=('huhuuu/p/',)), callback='parse_item'),
        # Rule(SgmlLinkExtractor(allow=('huhuuu/archive/',)), callback='parse_item'),
    )

    # rules = [
    #     Rule(sle(allow=("/pg\d+$")), callback='parse', follow=True),
    # ]
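    # Note: overriding parse() is discouraged on a CrawlSpider, because
    # CrawlSpider drives its rule handling from parse(); with this override
    # in place, the rules above (and their parse_item callback below) will
    # not be applied.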

    def parse(self, response):
        filename = response.url.split("/")[-2]
        with open(filename, 'wb') as f:
            f.write(response.body)

    # def parse(self, response):
    #     items = []
    #     sel = Selector(response)
    #     sites = sel.css('.info')
    #     for site in sites:
    #         item = TutorialItem()
    #         item['title'] = site.css('.info .title a::text').extract()
    #         items.append(item)
    #
    #         print item['title']
    #     return items

    def parse_item(self, response):
        items = []
        sel = Selector(response)
        sites = sel.css('.info')
        for site in sites:
            item = TutorialItem()
            item['title'] = site.css('.info .title a::text').extract()
            items.append(item)

            print(item['title'])
        return items
Example #32
class DoubanBookSpider(CrawlSpider):
    name = "douban_book"
    allowed_domains = ["douban.com"]
    start_urls = ["http://book.douban.com/tag/"]

    rules = (
        Rule(sle(allow=("/tag/[^/]+/?$", )), callback="parse_1"),
        Rule(sle(allow=("/tag/$", )),
             follow=True,
             process_request='_process_request'),
    )

    # NOTE: depth index is hidden.
    depth_class_list = [
        '.*/tag/?$',
        '.*/tag/.+/?',
    ]

    def _cal_depth(self, response):
        """
        Calculate the depth of response, and call corresponding method or stop
        crawl.
        """
        url = response.url
        for depth, depth_regexp in enumerate(self.depth_class_list):
            if re.match(depth_regexp, url):
                return depth
        # warn("Unknown url depth: " + url)
        # If the url pattern is unknown, then return -1.
        return -1

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request

Example #33
    def __init__(self, conf_module='TestSpiderConfig', *args, **kwargs):
        if conf_module.endswith(".py"):
            conf_module = conf_module[:-3]
        cm  = __import__(conf_module, globals=globals())
        conf = cm.Config()
        self.name = conf.name
        self.allowed_domains = conf.allowed_domains
        self.start_urls = conf.start_urls
        self.rules = [Rule(sle(allow=(c.allowed_rule_regex)), callback='parse_1', cb_kwargs=c.paras, follow=conf.follow) for c in conf.ex_rules]
        self.conf = conf

        info(self.start_urls)
        info(self.rules)
        info([[c.allowed_rule_regex, c.paras] for c in conf.ex_rules])
        # import pdb; pdb.set_trace()
        super(general_json_spiderSpider, self).__init__(*args, **kwargs)
Example #34
    def __init__(self, software, *args, **kwargs):
        self.software = software
        if software == "wireshark" or software == "FFmpeg" or software == "openssl":
            self.start_urls = ['https://github.com/%s/%s/commits/master/?page=0' % (software,software),'https://github.com/%s/%s/commits/master/?page=101' % (software,software)]
            self.rules = [
                Rule(sle(allow=('/%s/%s/commit/[a-f0-9]+' % (software,software)),restrict_xpaths=('//a[@class="message"]')),follow=True,callback='parse_item'),
                Rule(sle(allow=('https://github.com/%s/%s/commits/master\?page=\d+' % (software,software)),restrict_xpaths=('//div[@class="pagination"]/a[contains(.,"Older")]'))),
            ]
        elif software == "httpd":
            self.start_urls = ['https://github.com/apache/%s/commits/trunk?page=0' % software,'https://github.com/apache/%s/commits/trunk?page=101' % software]
            self.rules = [
                Rule(sle(allow=('/apache/%s/commit/[a-f0-9]+' % software),restrict_xpaths=('//a[@class="message"]')),follow=True,callback='parse_item'),
                Rule(sle(allow=('https://github.com/apache/%s/commits/trunk\?page=\d+' % software),restrict_xpaths=('//div[@class="pagination"]/a[contains(.,"Older")]'))),
            ]
        elif software == "linux":
            self.start_urls = ['https://github.com/torvalds/%s/commits/master' % software,'https://github.com/torvalds/%s/commits/master?page=101' % software]
            self.rules = [
                Rule(sle(allow=('/torvalds/%s/commit/[a-f0-9]+' % software),restrict_xpaths=('//a[@class="message"]')),follow=True,callback='parse_item'),
                Rule(sle(allow=('https://github.com/torvalds/%s/commits/master\?page=\d+' % software),restrict_xpaths=('//div[@class="pagination"]/a[contains(.,"Older")]'))),
            ]
        super(SpiderGithub, self).__init__()
Example #35
    def __init__(self, forum_id=58, digit=1, *args, **kwargs):
        self.start_urls = [self.ip_format % d for d in [int(forum_id)]]
        self.rules = [Rule(sle(allow=("/forum/forum-" + str(forum_id) + "-[0-9]{," + str(digit) + "}\.html")), follow=True, callback='parse_1'),]
        super(sisSpider, self).__init__(*args, **kwargs)