Example 1
class Zsk(CrawlSpider):
    name = "zsk"
    allowed_domains = ["csdn.net"]
    start_urls = ["http://lib.csdn.net"]
    rules = [  # rules defining which URLs to crawl
        # (no callback implies follow=True by default)
        #Rule(LinkExtractor(allow=('/base/\w+'),deny=('/base/\w+/resource/\w+'))),
        Rule(sle(allow=("/bases/\w+")), follow=True, callback='parse_item'),
        Rule(sle(allow=("/base/\w+")), follow=False, callback='parse_item')
    ]

    def parse_item(self, response):
        #print response.url
        #logtxt = open("F:\\HW\\scrapyzsk\\csdnzsk\\zsk.txt",'a+')
        #logtxt.write(response.url+'\r\n')
        #logtxt.close()
        try:
            hxs = HtmlXPathSelector(response)
            item = csdnzsk.items.CsdnzskItem()
            item['name'] = hxs.select(
                "//div[@class='banner_log']/em/text()")[0].extract()
            item['url'] = response.url
            print """**********%s\r\n""" % response.url
            return item
        except Exception, e:
            print Exception, ":", e
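Note: HtmlXPathSelector is a legacy Scrapy API that was later replaced by Selector and response.xpath. A minimal sketch of the same extraction with the current API (using a plain dict in place of CsdnzskItem) could look like this:

    def parse_item(self, response):
        # hedged sketch: same fields as above, modern Selector API, no HtmlXPathSelector
        name = response.xpath("//div[@class='banner_log']/em/text()").extract_first()
        return {'name': name, 'url': response.url}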
Example 2
class HrtencentSpider(CrawlSpider):
    name = "hrtencent"
    allowed_domains = ["tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php?start=%d" % d for d in range(0, 20, 10)
    ]
    rules = [
        Rule(sle(allow=("/position_detail.php\?id=\d*.*", )), callback='parse_2'),
        Rule(sle(allow=("/position.php\?&start=\d{,2}#a")), follow=True, callback='parse_1')
    ]

    def parse_2(self, response):
        items = []
        sel = Selector(response)
        sites = sel.css('.tablelist')
        for site in sites:
            item = PositionDetailItem()
            item['sharetitle'] = site.css('.h #sharetitle::text').extract()
            item['bottomline'] = site.css('.bottomline td::text').extract()
            # item['duty'] = site.css('.c .l2::text').extract()
            item['link'] = response.url
            items.append(item)
            print repr(item).decode("unicode-escape") + '\n'
        # info('parsed ' + str(response))
        self.parse_1(response)
        return items

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request
Example 3
class HrtencentSpider(CrawlSpider):
    name = "hrtencent"
    allowed_domains = ["dewen.io"]
    start_urls = [
        "http://www.dewen.io/questions?page=%d" % d for d in range(1, 10, 1)
    ]
    rules = [
        Rule(sle(allow=("/q/\d*")), callback='parse_2'),
        Rule(sle(allow=("/questions?page=\d{,4}")),
             follow=True,
             callback='parse_1')
    ]

    def parse_2(self, response):
        items = []
        item = PositionDetailItem()
        sel = Selector(response)
        site = sel.css('.container')
        item['sharetitle'] = site.css('#title::text').extract()
        item['description'] = site.css('#qst_content').extract()
        #item['duty'] = site.css('.c .l2::text').extract()
        item['link'] = response.url
        item['tags'] = site.css('#topic a::text').extract()

        #get content images url
        images_1 = sel.css('#qst_content img::attr(loadsrc)').extract()
        images_2 = sel.css('.post_area img::attr(loadsrc)').extract()
        item['image_urls'] = images_1 + images_2

        answers = []

        an_articles = site.css('.ans_item')
        for an_article in an_articles:
            answer = {}
            answer['description'] = an_article.css('.post_area').extract()
            answer['votes'] = an_article.css('.voting::attr(score)').extract()
            if an_article.css('.best_ans_text').extract():
                answer['chosen'] = 1
            else:
                answer['chosen'] = 0
            answers.append(answer)

        #item['answers'] = answers

        item['answers'] = answers

        items.append(item)

        print repr(item).decode("unicode-escape") + '\n'
        # info('parsed ' + str(response))
        self.parse_1(response)
        return items

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request
Example 4
class tmtSpider(CrawlSpider):
    name = 'tmt'

    allowed_domains = ['www.tmtpost.com']

    start_urls = ['http://www.tmtpost.com/lists/hot_list']

    rules = [
        # the hot-topic list has 5 pages in total, 10 articles per page
        Rule(sle(allow=('www.tmtpost.com/hot/\d+')), follow=True),
        Rule(sle(allow=('www.tmtpost.com/\d+.html')), callback='parse_tmt')
    ]

    def parse_tmt(self, response):
        items = []

        sel = Selector(response)

        sites = sel.xpath('/html')

        for site in sites:
            item = TmtItem()

            item['product_url'] = response.url
            item['title'] = site.xpath('//article/h1/text()').extract()

            # in the browser, XPath Checker can extract the time with //article//span[@class="time"]/text()
            # but Scrapy cannot fetch it, even though it is not JS-generated content; interesting
            item['pub_date'] = site.xpath(
                '//div[@class="post-info"]/span[2]/text()').extract()
            item['intro_content'] = site.xpath(
                '//p[@class="post-abstract"]/text()').extract()

            items.append(item)
        return items
Example 5
class DoubanBookSpider(CrawlSpider):
    name = "doubanbook"
    allowed_domains = ["douban.com"]
    start_urls = [
        "http://book.douban.com/tag/"
    ]
    rules = [
        Rule(sle(allow=("/subject/\d+/?$")), callback='parse_2'),
        Rule(sle(allow=("/tag/[^/]+/?$", )), follow=True),
        Rule(sle(allow=("/tag/$", )), follow=True),
    ]

    def parse_2(self, response):
        items = []
        sel = Selector(response)
        sites = sel.css('#wrapper')
        for site in sites:
            item = DoubanSubjectItem()
            item['title'] = site.css('h1 span::text').extract()
            item['link'] = response.url
            item['content_intro'] = site.css('#link-report .intro p::text').extract()
            items.append(item)
            print repr(item).decode("unicode-escape") + '\n'
            # print item
        # info('parsed ' + str(response))
        return items

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request
Example 6
class IcibaSpider(CrawlSpider):
    name = 'iciba'
    allowed_domains = ['iciba.com']
    start_urls = ['http://news.iciba.com/dailysentence/history.html']

    rules = [
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        Rule(sle(allow=(
            "/appv3/wwwroot/ds.php\?action=history&ob=1&order=2&page=\d+#nav",
        )),
             follow=True),

        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(sle(allow=("/dailysentence/detail-\d+.html#nav")),
             callback='parse_item'),
    ]

    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = IcibaspiderItem()
        item['en'] = response.xpath(
            "//li[@class='en']/descendant::text()").extract()
        item['cn'] = response.xpath(
            "//li[@class='cn']/descendant::text()").extract()
        item['url'] = response.url

        return item
Example 7
class w61856Spider(CrawlSpider):
    siteid = 1  # id of the site being scraped
    sitename = u'鲜直达'  # name of the site being scraped
    name = "XianZhiDa"
    allowed_domains = ["61856.com"]
    start_urls = [
        "http://www.61856.com/category.php?id=16",
        "http://www.61856.com/category.php?id=17"
    ]
    # rules = [ # rules defining which URLs to crawl
    #     Rule(sle(allow=("/position.php\?&start=\d{,4}#a")), follow=True, callback='parse_item')
    # ]
    rules = [  # rules defining which URLs to crawl
        Rule(sle(allow=(
            "/category.php\?id=16&price_min=0&price_max=0&page=\d{,4}&sort=sort_order&order=DESC"
        )),
             follow=True,
             callback='parse_item'),
        Rule(sle(allow=(
            "/category.php\?id=17&price_min=0&price_max=0&page=\d{,4}&sort=sort_order&order=DESC"
        )),
             follow=True,
             callback='parse_item')
    ]

    def parse_item(self, response):  # extract data into Items, mainly with XPath and CSS selectors
        log.start(logfile='log.txt', loglevel=log.WARNING)
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        catalog = sel.css('div.box_1 div.sp_13').xpath('text()').extract()[0]
        sites = sel.css('div.centerPadd div.sp_16')
        for site in sites:
            item = GuoShuItem()
            item['siteid'] = self.siteid
            item['sitename'] = self.sitename
            item['name'] = site.css('p a').xpath('text()').extract()[0]
            relative_url = site.css('p a').xpath('@href').extract()[0]
            item['detailurl'] = urlparse.urljoin(
                base_url, relative_url)  #urljoin_rfc(base_url, relative_url)
            item['catalog'] = catalog
            item['guige'] = site.css('.shop').xpath('text()').extract()[0]
            price = site.css('.shop_s2').xpath('text()').extract()
            item['price'] = price[0].split('/')[0].replace("¥", "")
            item['danwei'] = price[0].split('/')[1]
            items.append(item)
            # print repr(item).decode("unicode-escape") + '\n'
            # log.msg('item %s' % repr(item).decode("unicode-escape"),level=log.WARNING)

        # info('parsed ' + str(response))
        return items

    def _process_request(self, request):
        # info('process ' + str(request))
        return request
Example 8
class alexaCNSpider(CrawlSpider):
    name = "alexa.cn"
    allowed_domains = ["alexa.com"]
    start_urls = [
        "http://www.alexa.com/",
        "http://www.alexa.com/topsites/category/World/Chinese_Simplified_CN",
    ]
    rules = [
        Rule(sle(allow=(
            "/topsites/category;?[0-9]*/Top/World/Chinese_Simplified_CN/.*$")),
             callback='parse_category_top_xxx',
             follow=True),
        Rule(sle(allow=("/topsites/category/World/Chinese_Simplified_CN$", )),
             callback='parse_category_top_xxx',
             follow=True),
        #Rule(sle(allow=("/people/[^/]+$", )), callback='parse_people', follow=True),
    ]

    # www.alexa.com/topsites/category/Top/Computers
    # www.alexa.com/topsites/category;1/Top/Computers
    def parse_category_top_xxx(self, response):
        info('parsed ' + str(response))
        items = []
        sel = Selector(response)

        sites = sel.css('.site-listing')
        for site in sites:
            item = alexaSiteInfoItem()
            item['url'] = site.css(
                'a[href*=siteinfo]::attr(href)')[0].extract()
            item['name'] = site.css('a[href*=siteinfo]::text')[0].extract()
            item['description'] = site.css('.description::text')[0].extract()
            remainder = site.css('.remainder::text')
            if remainder:
                item['description'] += remainder[0].extract()
            # more specific
            item['category'] = urllib.unquote('/'.join(
                response.url.split('/')[-3:])).decode('utf-8')
            items.append(item)
        return items

    def parse_category_top(self, response):
        info('parsed ' + str(response))
        items = []
        sel = Selector(response)

        categories = sel.css('li a[href*="/topsites/category/Top/"]')
        for category in categories:
            item = alexaCategoryItem()
            item['url'] = category.css('::attr(href)')[0].extract()
            item['name'] = category.css('::text')[0].extract()
            items.append(item)
        return items
Example 9
class MiHeSpider(CrawlSpider):
    siteid = 3  # id of the site being scraped
    sitename = u'米禾'  # name of the site being scraped
    name = "MiHe"
    allowed_domains = ["ranlixu.com"]
    start_urls = [
        "http://www.ranlixu.com/class.asp?larCode=1",
        "http://www.ranlixu.com/class.asp?larCode=701"
        # "http://www.ranlixu.com/list.asp?ProdId=A03026"
    ]
    rules = [  # rules defining which URLs to crawl
        Rule(sle(allow=("/class.asp\?larCode=1&Page=\d{,4}")), follow=True),
        Rule(sle(allow=("/class.asp\?larCode=701&Page=\d{,4}")), follow=True),
        Rule(sle(allow=("/list.asp\?ProdId=G\d{,10}")),
             follow=True,
             callback='parse_item'),
        Rule(sle(allow=("/list.asp\?ProdId=A\d{,10}")),
             follow=True,
             callback='parse_item')
    ]

    def parse_item(self, response):  # extract data into Items, mainly with XPath and CSS selectors
        log.start(logfile='log.txt', loglevel=log.WARNING)
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        catalog = sel.css('div.cc').xpath('text()').extract()[2]
        catalog = catalog[catalog.index(u'品牌:'):].replace("\r\n", "").replace(
            "品牌:", "").lstrip().rstrip()
        item = GuoShuItem()
        item['siteid'] = self.siteid
        item['sitename'] = self.sitename
        item['name'] = sel.css('div.cc h2').xpath('text()').extract()[0]
        item['detailurl'] = base_url
        item['catalog'] = catalog
        item['guige'] = sel.css('div.cc b').xpath('text()').extract()[0]
        price = sel.css('div.cc').xpath(
            './/font[@color="red"]/text()').extract()[0]
        item['price'] = price
        item['danwei'] = item['guige']
        items.append(item)
        # print repr(item).decode("unicode-escape") + '\n'
        # log.msg('item %s' % repr(item).decode("unicode-escape"),level=log.WARNING)

        # info('parsed ' + str(response))
        return items

    def _process_request(self, request):
        # info('process ' + str(request))
        return request
Example 10
class amazonbookSpider(CommonSpider):
    name = "amazonbook"
    allowed_domains = ["amazon.com", "www.amazon.com"]
    start_urls = [
        #"http://www.amazon.com/b/ref=s9_acss_bw_en_BGG15eve_d_1_6?_encoding=UTF8&node=17&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-top-3&pf_rd_r=0XCRZV6SDKBTKDPH8SFR&pf_rd_t=101&pf_rd_p=2293718502&pf_rd_i=283155",
        "http://www.amazon.com/books-used-books-textbooks/b?node=283155",
    ]
    rules = [
        #Rule(sle(allow=("/gp/product/.*")), callback='parse_1', follow=True),
        Rule(sle(allow=("/books-used-books-textbooks/.*")), callback='parse_0', follow=True),
    ]

    css_rules = {
        ".inner .a-row": {
            "url": ".title::attr(href)",
            #"desc": "span::text"
            "title": ".s9TitleText::text",
            "comments": ".a-icon-row .a-size-small::text",
        }
    }

    def parse_0(self, response):
        info('Parse 0 '+response.url)
        pp.pprint(self.parse_with_rules(response, self.css_rules, dict))

    #.inner .a-row
    def parse_1(self, response):
        info('Parse 1 '+response.url)
Example 11
class templateSpider(CommonSpider):
    name = "template"
    allowed_domains = ["template.com"]
    start_urls = [
        "http://www.template.com/",
    ]
    rules = [
        Rule(sle(allow=("/topsites/category;?[0-9]*/Top/World/Chinese_Simplified_CN/.*$")), callback='parse_1', follow=True),
    ]

    list_css_rules = { 
        '.linkto': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }   
    }   

    list_css_rules_2 = { 
        '#listZone .Q-tpWrap': {
            'url': '.linkto::attr(href)',
            'name': '.linkto::text'
        }   
    }   

    content_css_rules = { 
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_1(self, response):
        info('Parse '+response.url)
Example 12
class DoubanBookSpider(CrawlSpider):
    name = "ypgs"
    allowed_domains = ["07938.com"]
    start_urls = ["http://www.07938.com/zheligushi/"]
    rules = [
        Rule(sle(allow=("/\d+.html$")), callback='parse_2'),
        # Rule(sle(allow=("/tag/[^/]+/?$", )), follow=True),
        # Rule(sle(allow=("/tag/$", )), follow=True),
    ]

    def parse_2(self, response):
        item = StoryItem()
        sel = Selector(response)
        item['title'] = sel.css("h1::text").extract()
        content = sel.css(".content")
        # TODO: handle content formatting
        item['content'] = self.process_content(content)
        # info('parsed ' + str(response))
        return item

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request

    def process_content(self, content):
        # TODO process content

        return content
Example 13
class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["tencent.com"]
    start_urls = ["http://hr.tencent.com/position.php"]
    rules = [
        Rule(sle(allow=("/position.php\?&start=\d{,4}#a")),
             follow=True,
             callback='parse_item')
    ]

    def parse_item(self, response):
        p = Pinyin()
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.tablelist tr.even')
        for site in sites_even:
            item = Website()
            item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
            item['description'] = site.css(
                'tr > td:nth-child(2)::text').extract()[0]
            url = site.css('tr > td:nth-child(4)::text').extract()[0]
            item['url'] = p.get_pinyin(url, u'')
            item['address'] = url
            item['num'] = int(
                site.css('tr > td:nth-child(3)::text').extract()[0])
            item['date'] = site.css('tr > td:nth-child(5)::text').extract()[0]
            item['uid'] = item['date'] + '-' + url + '-' + item['name']
            items.append(item)
        return items
Example 14
class MySpider(CrawlSpider):
    name = "taobao"
    allowed_domains = ["taobao.com"]
    start_urls = [
        "https://www.taobao.com",
    ]
    rules = [ # rules defining which URLs to crawl
              # search one level below the start page
        Rule(sle(allow=("https://www\.taobao\.com/market/.+"))),
        Rule(sle(allow=("item\.taobao\.com/item\.htm\?spm=.+")), follow=True, callback='parse_item')

    ]
    def parse_item(self, response):
        items = []
        # // filename = response.url.split("/")[-2]
        print "taobao =========================%s"%response.url
        return items
Example 15
class Jiandan_net(CrawlSpider):
    """定义蜘蛛功能"""
    name = "jiandan"
    allowed_domains = ['jandan.net']
    start_urls = [  # initial crawl URLs
        "http://jandan.net/new"
    ]

    rules = (
        Rule(sle(allow=(r"/[\d]{4}/[\d]{2}/[\d]{2}/[\d\w-]{0,50}\.html$")),
             callback="prase_detail"),
        # Rule(sle(allow=(r"/page/[23]"))),
        Rule(sle(deny=(r"/tag/.*")), follow=False),
        # Rule(sle(deny=(r"/author/.*"))),
        Rule(sle(deny=(r"/page/.*")), follow=False),
        Rule(sle(deny=(r"(/v|/duan|/pic|/guanyu|/feed|/app|/|/author/.*)$")),
             follow=False),
    )

    def parse_detail(self, response):

        Items = []
        resBody = Selector(text=response.body)
        base_url = get_base_url(response)  # get the source URL
        print '***********************************', base_url
        title = resBody.xpath('//title/text()').extract()[0]  # get the title
        publishtime = resBody.xpath('//div[@class="time_s"]/text()').re(
            "\s@\s(.*)\s,\s(.*)")  # get the publish time (needs further processing)
        author = resBody.xpath('//div[@class="time_s"]/a/text()').extract()[
            0]  # get the author
        content = resBody.xpath(
            '//div[@class="post f"]//p').extract()  # get the main page content

        item = Jiandan()
        item['title'] = title
        item['source_url'] = base_url
        item['publish_time'] = publishtime
        item['source_author'] = author
        item['content'] = content

        # file_d = open("./jiandan.txt",'a+')
        # mark = len(file_d.readlines())+1
        # file_d.write(str(mark)+"    "+str(get_base_url(response))+title.encode('utf-8')+str(content)+"\n")
        # file_d.close()
        return item
Example 16
class qqnewsSpider(CommonSpider):
    name = "qqnews"
    allowed_domains = ["tencent.com", 'qq.com']
    start_urls = ['http://news.qq.com/society_index.shtml']
    rules = [
        Rule(sle(allow=('society_index.shtml')),
             callback='parse_0',
             follow=True),
        Rule(sle(allow=(".*[0-9]{8}.*htm$")), callback='parse_1', follow=True),
    ]

    list_css_rules = {
        '.linkto': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }

    list_css_rules_2 = {
        '#listZone .Q-tpWrap': {
            'url': '.linkto::attr(href)',
            'name': '.linkto::text'
        }
    }

    content_css_rules = {
        'text': '#Cnt-Main-Article-QQ p *::text',
        'images': '#Cnt-Main-Article-QQ img::attr(src)',
        'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
    }

    def parse_0(self, response):
        info('Parse0 ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        pp.pprint(x)
        #return self.parse_with_rules(response, self.list_css_rules, qqnewsItem)

    def parse_1(self, response):
        info('Parse1 ' + response.url)
        x = self.parse_with_rules(response, self.content_css_rules, dict)
        pp.pprint(x)
        #import pdb; pdb.set_trace()

    def parse_2(self, response):
        info('Parse2 ' + response.url)
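The css_rules dictionaries above drive a generic extractor, parse_with_rules, provided by the project's CommonSpider base class. A hypothetical sketch of such a helper, assuming a string value means "extract this CSS expression directly" and a dict value means "build one record per node matched by the outer selector", might look like:

def parse_with_rules(response, rules, item_class=dict):
    # hypothetical sketch only; the real helper lives in the project's CommonSpider
    item = item_class()
    for key, value in rules.items():
        if isinstance(value, dict):
            # nested rule: one record per node matched by the outer selector
            item[key] = [
                dict((field, node.css(css).extract()) for field, css in value.items())
                for node in response.css(key)
            ]
        else:
            # flat rule: extract the CSS expression directly
            item[key] = response.css(value).extract()
    return item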
Example 17
 def __init__(self, forum_id=58, digit=1, *args, **kwargs):
     self.start_urls = [self.ip_format % d for d in [int(forum_id)]]
     self.rules = [
         Rule(sle(allow=("/forum/forum-" + forum_id + "-[0-9]{," + digit +
                         "}\.html")),
              follow=True,
              callback='parse_1'),
     ]
     super(sisSpider, self).__init__(*args, **kwargs)
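Spider arguments such as forum_id and digit arrive as strings when passed on the command line (scrapy crawl sis -a forum_id=143 -a digit=2), but the defaults above are integers, hence the str() calls. Note also that CrawlSpider compiles its rules inside __init__, so dynamically built rules must be assigned before calling the parent constructor, as this snippet does. A self-contained sketch of the same pattern (class name and URL are placeholders):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor as sle

class ForumSpider(CrawlSpider):
    # placeholder spider illustrating the pattern above
    name = "forum_sketch"
    ip_format = 'http://example.com/forum/forum-%d-1.html'

    def __init__(self, forum_id=58, digit=1, *args, **kwargs):
        self.start_urls = [self.ip_format % int(forum_id)]
        # rules must exist before CrawlSpider.__init__, which compiles them
        self.rules = [
            Rule(sle(allow=(r"/forum/forum-%s-[0-9]{,%s}\.html" % (forum_id, digit),)),
                 follow=True, callback='parse_1'),
        ]
        super(ForumSpider, self).__init__(*args, **kwargs)

    def parse_1(self, response):
        self.logger.info('parsed %s', response.url)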
Example 18
class TencentSpider(CrawlSpider):
    name = "tencent"
    allowed_domains = ["tencent.com"]
    start_urls = ["http://hr.tencent.com/position.php"]
    rules = [  # rules defining which URLs to crawl
        Rule(sle(allow=("/position.php\?&start=\d{,4}#a")),
             follow=True,
             callback='parse_item')
    ]

    def parse(self, response):  # extract data into Items, mainly with XPath and CSS selectors
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.tablelist tr.even')
        for site in sites_even:
            yield self.parsePages(site, base_url)
        sites_odd = sel.css('table.tablelist tr.odd')
        for site in sites_odd:
            yield self.parsePages(site, base_url)

        next_page = sel.css(
            'table.tablelist tr.f #next ::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse)

    def parsePages(self, site, base_url):
        item = TencentItem()
        item['name'] = site.css('.l.square a ::text').extract_first()
        relative_url = site.css('.l.square a').xpath('@href').extract()[0]
        item['detailLink'] = urljoin_rfc(base_url, relative_url)
        item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
        item['location'] = site.css('tr > td:nth-child(4)::text').extract()
        item['number'] = site.css('tr > td:nth-child(3)::text').extract()
        item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()

        return scrapy.Request(item['detailLink'],
                              meta={'item': item},
                              callback=self.parseDetail)

    def parseDetail(self, response):
        sel = Selector(response)
        item = response.meta['item']

        responsibilities = []
        lis = sel.css('table.tablelist tr:nth-child(3) ul li')
        for li in lis:
            responsibilities.append(li.css('::text').extract_first())
        item['responsibilities'] = responsibilities

        requirements = []
        lis = sel.css('table.tablelist tr:nth-child(3) ul li')
        for li in lis:
            requirements.append(li.css('::text').extract_first())
        item['requirements'] = requirements
        return item
Example 19
class sinanewsSpider(CommonSpider):
    name = "sinanews"
    allowed_domains = ["news.sina.com.cn"]
    start_urls = [
        "http://news.sina.com.cn/",
    ]
    rules = [
        Rule(sle(allow=("http://news.sina.com.cn/$")), callback='parse_0'),
        Rule(sle(allow=(".*doc[^/]*shtml$")),
             callback='parse_1'),  #, follow=True),
        #Rule(sle(allow=('/c/2015-11-19/doc-ifxkszhk0386278.shtml')), callback='parse_1', follow=True, process_request='process_request'),
    ]

    list_css_rules = {
        '#blk_yw_01 a': {
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }

    content_css_rules = {
        'text': 'p::text',
        'images': 'img::attr(src)',
        'images-desc': '.img_descr::text',
        # need url analysis for video
        #'video': '#J_Article_Player',
    }

    def process_request(self, r):
        info('process ' + str(r))
        return r

    def parse_0(self, response):
        info('Parse 0 ' + response.url)
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        pp.pprint(x)
        #pdb.set_trace()
        #return self.parse_with_rules(response, self.list_css_rules, sinanewsItem)

    def parse_1(self, response):
        info('Parse 1 ' + response.url)
        x = self.parse_with_rules(response, self.content_css_rules, dict)
        pp.pprint(x)
Example 20
class MySpider(CrawlSpider):
    name = "jingdong"
    allowed_domains = ["jd.com","3.cn"]
    start_urls = [
        "http://www.jd.com/",
    ]
    # Todo:summary In a CrawlSpider the rules rely on the parent class's parse to start crawling from the first-level page.
    # Mind the regexes for page navigation and filtering; they must go into the list in order.
    # Logically the parent class's parse seems to be invoked by default, matching the links on each page against the rules.
    rules = ( # rules defining which URLs to crawl
         Rule(sle(allow=("http://channel.jd.com/.+\.html"))),
         Rule(sle(allow=("http://item.jd.com/\d+\.html")), follow=True, callback='parse_item'),
              )
    # def parse_page(self, response):
    #     print response.url
    #     yield Request(response.url,callback=self.parse_item )

    def parse_item(self,response):
        sel = Selector(response)
        filename = response.url.split("/")[-1]
        item = JingdongItem()
        item["url"] = [response.url]
        item["name"] = sel.xpath('//*[@id="name"]/h1/text()').extract()
        # the price is generated by JS...
        # item["price"] = sel.xpath('//div[2]/div[2]/strong/text()').extract()
        # see the write-up at http://blog.csdn.net/lanshanlei/article/details/42741179
        productid = os.path.splitext(filename)[-2]  #response.url[19:29]
        priceUrl = 'http://p.3.cn/prices/mgets?skuIds=J_' + productid + 'J_'
        r = Request(priceUrl,callback= self.parsePrice)
        r.meta['item'] = item
        yield r

    def parsePrice(self,response):
        sel = Selector(response)
        item = response.meta['item']
        try:
            price = sel.xpath("//text()").extract()[0].encode('utf-8').split('"')[7]
        except Exception,ex:
            print ex
            price = -2

        item['price'] = [price]
        return item
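Since the p.3.cn price endpoint returns JSON, parsing it with json.loads is more robust than splitting the raw body on quote characters. A hedged sketch of parsePrice under that assumption (the exact response layout may differ):

    # requires `import json` at module level
    def parsePrice(self, response):
        item = response.meta['item']
        try:
            # the endpoint typically returns a JSON list like [{"id": "J_...", "p": "99.00", ...}]
            data = json.loads(response.body)
            price = data[0]['p']
        except Exception as ex:
            print ex
            price = -2
        item['price'] = [price]
        return item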
Example 21
class TencentSpider(CrawlSpider):
    name = "tencent"
    allowed_domains = ["tencent.com"]
    start_urls = ["http://hr.tencent.com/position.php"]
    rules = [  # rules defining which URLs to crawl
        Rule(sle(allow=("/position.php\?&start=\d{,4}#a")),
             follow=True,
             callback='parse_item')
    ]

    def parse_item(self, response):  # extract data into Items, mainly with XPath and CSS selectors
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.tablelist tr.even')
        for site in sites_even:
            item = TencentItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css(
                'tr > td:nth-child(2)::text').extract()[0]
            item['workLocation'] = site.css(
                'tr > td:nth-child(4)::text').extract()[0]
            item['recruitNumber'] = site.css(
                'tr > td:nth-child(3)::text').extract()[0]
            item['publishTime'] = site.css(
                'tr > td:nth-child(5)::text').extract()[0]
            items.append(item)
            #print repr(item).decode("unicode-escape") + '\n'

        sites_odd = sel.css('table.tablelist tr.odd')
        for site in sites_odd:
            item = TencentItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css(
                'tr > td:nth-child(2)::text').extract()[0]
            item['workLocation'] = site.css(
                'tr > td:nth-child(4)::text').extract()[0]
            item['recruitNumber'] = site.css(
                'tr > td:nth-child(3)::text').extract()[0]
            item['publishTime'] = site.css(
                'tr > td:nth-child(5)::text').extract()[0]
            items.append(item)
            #print repr(item).decode("unicode-escape") + '\n'

        info('parsed ' + str(response))
        return items

    def _process_request(self, request):
        info('process ' + str(request))
        return request
Example 22
class DoubanBookSpider(CrawlSpider):
    name = "douban_book"
    allowed_domains = ["douban.com"]
    start_urls = ["http://book.douban.com/tag/"]

    rules = (
        Rule(sle(allow=("/tag/[^/]+/?$", )), callback="parse_1"),
        Rule(sle(allow=("/tag/$", )),
             follow=True,
             process_request='_process_request'),
    )

    # NOTE: depth index is hidden.
    depth_class_list = [
        '.*/tag/?$',
        '.*/tag/.+/?',
    ]

    def _cal_depth(self, response):
        """
        Calculate the depth of response, and call corresponding method or stop
        crawl.
        """
        url = response.url
        for depth, depth_regexp in enumerate(self.depth_class_list):
            if re.match(depth_regexp, url):
                return depth
        # warn("Unknown url depth: " + url)
        # If the url pattern is unknown, then return -1.
        return -1

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request

Example 23
class tencentDemoSpider(CrawlSpider):
    name = "tencent"
    allowed_domains = ["tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php"
    ]
    # Todo:summary CrawlSpider parses start_urls with its default parse; this example merges parse and parse_item into one method.
    # The links from the first-level start_urls are matched against the rules. Another approach is to skip the rules and override
    # parse directly, extracting the product URLs and the next-page URL there; that also works well.
    rules = [  # rules defining which URLs to crawl
               Rule(sle(allow=("/position\.php\?&start=\d{,4}#a")), follow=True, callback='parse')
            ]
    # whatever parse yields to the pipeline must be an Item, a dict, or a Request.
    def parse(self, response):  # extract data into Items, mainly with XPath and CSS selectors
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.tablelist tr.even')
        for site in sites_even:
            item = TencentdemoItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
            # items.append(item)
            yield item
            # print repr(item).decode("unicode-escape") + '\n'

        sites_odd = sel.css('table.tablelist tr.odd')
        for site in sites_odd:
            item = TencentdemoItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['detailLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
            item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
            item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()

            yield item
            # items.append(item)
            # print repr(item).decode("unicode-escape") + '\n'

        # info('parsed ' + str(response))

        # Todo:summary continue with the next page
        urls = sel.xpath('//*[@id="next"]/@href').extract()
        for url in urls:
            print url
            yield Request(urljoin_rfc(base_url, url), callback=self.parse)
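Caution: the Scrapy docs advise against using parse as a rule callback, because CrawlSpider uses parse internally to drive its rules; overriding parse or registering it as a callback (as this example and Example 18 do) can stop the rules from being applied. A sketch of the safer naming:

    rules = [
        # a distinct callback name keeps CrawlSpider's built-in parse free to drive the rules
        Rule(sle(allow=("/position\.php\?&start=\d{,4}#a")), follow=True, callback='parse_list')
    ]

    def parse_list(self, response):
        # identical extraction logic to the parse method above, under a non-reserved name
        pass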
Example 24
class vipSpider(CrawlSpider):
    name = 'vip'

    allowed_domains = ['m.vip.com']

    start_urls = ['http://m.vip.com']

    rules = [
        Rule(sle(allow=(u'index\.php.*')), follow=True),
        Rule(sle(allow=('m.vip.com/product.*')), callback='parse_vip'),
        Rule(sle(allow=('m.vip.com/brand.*')), follow=True),
        # Rule(sle(allow=(u'product.*')),callback='parse_vip')
    ]

    def parse_vip(self, response):
        items = []

        sel = Selector(response)

        sites = sel.xpath('/html')

        for site in sites:
            item = VipItem()

            item['product_url'] = response.url
            item['image_url'] = site.xpath(
                '//li[@style="width: 224px; display: table-cell; vertical-align: top;"]//img[@src][1]'
            ).re(r'src=(.*?) data')
            item['price'] = site.xpath(
                '//span[@class="u-detail-price"]/text()').extract()
            item['name'] = site.xpath('//h1/text()').extract()

            # item['brand']=
            # item['location']=
            # item['material']=

            items.append(item)

        return items
Example 25
class sisSpider(CrawlSpider):
    name = "sis"
    ip = "38.103.161.147"
    allowed_domains = [ip]
    ip_format = 'http://' + ip + '/forum/forum-%d-1.html'
    start_urls = [ip_format % d for d in [143, 230]]
    rules = [
        Rule(sle(allow=("/forum/thread-\d*-1-1\.html")), callback='parse_2'),
        Rule(sle(allow=("/forum/forum-\d*-1\.html")),
             follow=True,
             callback='parse_1'),
    ]

    def parse_2(self, response):
        items = []
        sel = Selector(response)
        sites = sel.css('.postcontent')[0:1]
        for site in sites:
            item = SisItem()
            item['title'] = site.css('.postmessage h2::text').extract()
            item['imgs'] = site.css('.postmessage img::attr(src)').extract()
            item['torrents'] = site.css(
                '.t_attachlist a[href*=attachment]').extract()
            # item['duty'] = site.css('.c .l2::text').extract()
            item['link'] = response.url
            items.append(item)
            print repr(item).decode("unicode-escape") + '\n'
        # info('parsed ' + str(response))
        self.parse_1(response)
        return items

    def parse_1(self, response):
        # url cannot encode to Chinese easily.. XXX
        info('parsed ' + str(response))

    def _process_request(self, request):
        info('process ' + str(request))
        return request
Example 26
class BaiduEncySpider(CrawlSpider):
    name = 'baiduEncy'
    allowed_domains = ['baike.baidu.com']
    start_urls = [
        'http://baike.baidu.com/wenhua',
        'http://baike.baidu.com/dili',
        'http://baike.baidu.com/shenghuo',
    ]
    rules = [
        Rule(sle(allow=("/view/\d+.htm$")), callback='parse1'),
        Rule(sle(allow=("/view/\d+$/\d+.htm$")), callback='parse1'),
        Rule(sle(allow=("/\w+$", )), follow=True),
    ]

    def parse1(self, response):
        item = BaiduencyclopediaItem()
        item['id'] = response.url.split('/')[-1].split('.')[0]
        item['name'] = response.xpath('//span[@class="lemmaTitleH1"]/text()').extract()
        summary = response.xpath('//div[@class="card-summary-content"]').extract()
        item['summaryText'] = re.compile('<[^>]*>').sub('', summary[0])
        rawAttrs = response.css('.biItemInner')
        attr = {}
        for rAttr in rawAttrs:
            attrName = ''.join(rAttr.css('.biTitle ::text').extract())
            attrValue = ''.join(rAttr.css('.biContent ::text').extract())
            attr[attrName] = attrValue
        item['attr'] = attr
        rawLables = response.xpath('//span[@class="taglist"]/text()')
        lable = []
        for rLable in rawLables:
            lab = rLable.extract()
            lable.append(lab)
        item['lable'] = lable
        return [item, ]
        
    def _process_request(self, request):
        info('process ' + str(request))
        return request
Example 27
class ClassName(CrawlSpider):
    """docstring for ClassName"""
    name = "bdhub"
    allowed_domains = ["ibmbigdatahub.com"]
    start_urls = ["http://www.ibmbigdatahub.com/blogs"]
    rules = [
        Rule(sle(allow=("/blogs\?page=\d{,4}")),
             follow=True,
             callback='parse_item')
    ]

    def isempty(self, var):
        if len(var):
            return var[0]
        else:
            return None

    def parse_item(self, response):
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        siteviews = sel.css('div.view-content div.views-row')
        for view in siteviews:
            item = DbhubItem()
            view = view.css('div.node__teaser')
            item['author'] = self.isempty(
                view.css('div.node__attributes span.blogger__name').xpath(
                    'a/text()').extract())
            item['authorTitle'] = self.isempty(
                view.css('div.node__attributes span.blogger__title-and-company'
                         ).xpath('text()').extract())
            item['imageUrl'] = self.isempty(
                view.css('div.blog__image a').xpath('img/@src').extract())
            item['title'] = self.isempty(
                view.css('h2').xpath('a/text()').extract())
            item['date'] = self.isempty(
                view.css('div.node__attributes span.blog__created-date').xpath(
                    'text()').extract())
            item['summery'] = self.isempty(
                view.css('div.blog__summary').xpath('text()').extract())
            relative_url = self.isempty(
                view.css('h2').xpath('a/@href').extract())
            item['link'] = urljoin_rfc(base_url, relative_url)
            item['baseUrl'] = response.url
            items.append(item)
        info("parsed " + str(response))
        return items

    def parse_start_url(self, response):
        return self.parse_item(response)
Example 28
class E21jobSpider(CrawlSpider):
    name = "e21job"
    allowed_domains = ["job.e21.edu.cn"]
    start_urls = [
        "http://job.e21.edu.cn/stu_more.php?page=0&fenye=yes"
    ]
    rules = [
        Rule(sle(allow=("stu_more.php\?page=\d{,5}&fenye=yes")), follow=True, callback='parse_item')
    ]

    def parse_item(self, response):
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        #sites = sel.css('table:nth-child(5)').css('table:nth-child(2)').css('table.black12').css('tr')
        for i in range(1, 50, 2):
            item = GraduateItem()
            query = 'tr:nth-child(%d)' %i
            #print query
            site = sel.css('table:nth-child(5)').css('table:nth-child(2)').css('table.black12').css(query)
            array = site.css('a').xpath('text()').extract()
            if len(array) == 1 :
                item['name'] = array[0]
            array = site.css('a').xpath('@href').extract()
            if len(array) == 1 :
                relative_url = array[0]
                item['detailLink'] = urljoin_rfc(base_url, relative_url)
            array = site.css('td:nth-child(2)::text').extract()
            if len(array) == 1 :
                item['school'] = array[0]
            array = site.css('td:nth-child(3)::text').extract()
            if len(array) == 1 :
                item['specialty'] = array[0]
            array = site.css('td:nth-child(4)::text').extract()
            if len(array) == 1 :
                item['education'] = array[0]
            items.append(item)
            #print repr(item).decode("unicode-escape") + '\n'

        info('parsed ' + str(response))
        #log.msg(str('parsed ' + str(response)), level=log.INFO)
        return items


    def _process_request(self, request):
        #print request
        info('process ' + str(request))
        # log.msg(str('process ' + str(request)), level=log.INFO)
        return request
Example 29
class MySpider(CrawlSpider):
    name = "yaohao"
    allowed_domains = ["bjhjyd.gov.cn"]
    start_urls = [
        "http://www.bjhjyd.gov.cn/",
    ]
    rules = [ # rules defining which URLs to crawl
              # search one level below the start page
        Rule(sle(allow=("https://www\.taobao\.com/market/.+"))),
        Rule(sle(allow=("item\.taobao\.com/item\.htm\?spm=.+")), follow=True, callback='parse_item')
    ]

    def parse_start_url(self, response):
        print response.url
        sel = Selector(response)
        print "+++++++++++++++++++++++"
        print sel.xpath('//*[@id="getValidCode"]').extract()
        return None

    def parse_item(self, response):
        items = []
    # // filename = response.url.split("/")[-2]
        print "taobao =========================%s"%response.url
        return items
Example 30
class templateSpider(CommonSpider):
    name = "template"
    allowed_domains = ["template.com"]
    start_urls = [
        "http://www.template.com/",
    ]
    rules = [
        Rule(sle(allow=(
            "/topsites/category;?[0-9]*/Top/World/Chinese_Simplified_CN/.*$")),
             callback='parse',
             follow=True),
    ]

    def parse(self, response):
        info('Parse ' + response.url)
Example 31
 def __init__(self, forum_id=58, digit=1, *args, **kwargs):
     self.start_urls = [self.ip_format % d for d in [int(forum_id)]]
     self.rules = [Rule(sle(allow=("/forum/forum-" + str(forum_id) + "-[0-9]{," + str(digit) + "}\.html")), follow=True,
                        callback='parse_1'), ]
     super(sisSpider, self).__init__(*args, **kwargs)