Example #1
    def parse_detail(self, response):
        tczufangItem = TcZufangItem()
        response_selector = Selector(response)
        # Field extraction can be debugged interactively with scrapy shell
        # Post title
        raw_title = list_first_item(
            response_selector.xpath(
                u'//div[contains(@class,"house-title")]/h1[contains(@class,"c_333 f20")]/text()'
            ).extract())
        if raw_title:
            tczufangItem['title'] = raw_title.encode('utf8')
        # Post publication time; needs further processing
        raw_time = list_first_item(
            response_selector.xpath(
                u'//div[contains(@class,"house-title")]/p[contains(@class,"house-update-info c_888 f12")]/text()'
            ).extract())
        # Guard against a missing node or a non-matching timestamp
        pub_times = re.findall(r'\d+-\d+-\d+\s+\d+:\d+:\d+', raw_time) if raw_time else []
        tczufangItem['pub_time'] = pub_times[0] if pub_times else None
        # Rent
        tczufangItem['money'] = list_first_item(
            response_selector.xpath(
                u'//div[contains(@class,"house-pay-way f16")]/span[contains(@class,"c_ff552e")]/b[contains(@class,"f36")]/text()'
            ).extract())
        # Rental type (whole flat / shared)
        raw_method = list_first_item(
            response_selector.xpath(
                u'//ul[contains(@class,"f14")]/li[1]/span[2]/text()').extract())
        tczufangItem['method'] = raw_method.encode('utf8') if raw_method else None
        # Breadcrumb links: [0] community, [1] district, [2] sub-district
        links = response_selector.xpath(
            u'//ul[contains(@class,"f14")]/li/span/a[contains(@class,"c_333")]/text()'
        ).extract()
        # District: district and sub-district joined with "-"
        raw_area = '-'.join(links[1:3]) if len(links) > 2 else None
        tczufangItem['area'] = raw_area.encode('utf8') if raw_area else None
        # Residential community
        raw_community = links[0] if links else None
        tczufangItem['community'] = raw_community.encode('utf8') if raw_community else None
        # Detail page url
        tczufangItem['targeturl'] = response.url
        # City the post belongs to (taken from the subdomain)
        tczufangItem['city'] = response.url.split("//")[1].split('.')[0]
        yield tczufangItem
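None of these examples define `list_first_item`. Judging from its call sites (it wraps the list returned by `.extract()` and its result is checked against `None`), it is almost certainly a small helper along the following lines. This is an inference from usage, not the project's actual code:

def list_first_item(items):
    # Return the first element of a possibly empty list, or None.
    # Sketch of the undefined project helper, inferred from call sites.
    return items[0] if items else None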
Example #2
    def parse(self, response):
        # Base url of the site being visited, e.g. http://dg.58.com
        response_url = re.findall(r'^http://\w+\.58\.com', response.url)
        response_selector = Selector(response)
        # Pages whose <title> matches this generic city index title are skipped
        ress = '【东莞分类信息】 东莞免费发布信息网 - 东莞58同城'
        recog = response_selector.xpath(
            u'/html/head/title/text()').extract()[0].encode('utf8')
        if recog != ress:
            next_link = list_first_item(
                response_selector.xpath(
                    u'//div[contains(@class,"pager")]/a[contains(@class,"next")]/@href'
                ).extract())
            if next_link:
                yield Request(url=next_link, callback=self.parse)
            for detail_link in response_selector.xpath(
                    u'//div[contains(@class,"listBox")]/ul[contains(@class,"listUl")]/li/@logr'
            ).extract():
                # 58.com limits direct crawling of detail pages, so the detail page id
                # is rebuilt from the @logr attribute, which looks like:
                # gz_2_39755299868183_28191154595392_sortid:1486483205000 @ ses:busitime ^ desc @ pubid:5453707
                # Build the detail page url
                detail_link = response_url[0] + '/zufang/' + detail_link.split('_')[3] + 'x.shtml'
                # Hand the detail page over to parse_detail
                if detail_link:
                    yield Request(url=detail_link, callback=self.parse_detail)
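The fourth underscore-separated field of `@logr` is the detail page id. A standalone check of the url assembly, where the sample `logr` value is taken verbatim from the comment in the code and everything else mirrors the spider:

# Rebuild a detail page url from a sample @logr attribute value.
logr = 'gz_2_39755299868183_28191154595392_sortid:1486483205000 @ ses:busitime ^ desc @ pubid:5453707'
base = 'http://dg.58.com'
detail_url = base + '/zufang/' + logr.split('_')[3] + 'x.shtml'
print detail_url  # http://dg.58.com/zufang/28191154595392x.shtml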
Example #3
import re

from scrapy.selector import Selector
from scrapy_redis.spiders import RedisSpider

# list_first_item, inserintotc and inserintota are project helpers whose
# module is not shown in these examples.


class TczufangSpider(RedisSpider):
    name = 'basic'
    start_urls = (
        'http://dg.58.com/chuzu/',
        'http://sw.58.com/chuzu/',
        'http://sz.58.com/chuzu/',
        'http://gz.58.com/chuzu/',
        # 'http://fs.58.com/chuzu/',
        # 'http://zs.58.com/chuzu/',
        # 'http://zh.58.com/chuzu/',
        # 'http://huizhou.58.com/chuzu/',
        # 'http://jm.58.com/chuzu/',
        # 'http://st.58.com/chuzu/',
        # 'http://zhanjiang.58.com/chuzu/',
        # 'http://zq.58.com/chuzu/',
        # 'http://mm.58.com/chuzu/',
        # 'http://jy.58.com/chuzu/',
        # 'http://mz.58.com/chuzu/',
        # 'http://qingyuan.58.com/chuzu/',
        # 'http://yj.58.com/chuzu/',
        # 'http://sg.58.com/chuzu/',
        # 'http://heyuan.58.com/chuzu/',
        # 'http://yf.58.com/chuzu/',
        # 'http://chaozhou.58.com/chuzu/',
        # 'http://taishan.58.com/chuzu/',
        # 'http://yangchun.58.com/chuzu/',
        # 'http://sd.58.com/chuzu/',
        # 'http://huidong.58.com/chuzu/',
        # 'http://boluo.58.com/chuzu/',
    )
    # redis_key = 'tczufangCrawler:start_urls'
    # Parse the pages downloaded from start_urls. Each page serves two purposes:
    # 1. Extract the url of the next listing page and hand it to the scheduler,
    #    so that it becomes the crawler's next request.
    # 2. Extract the detail page urls, which are parsed in a second step.
    redis_key = 'start_urls'
    def parse(self, response):
        # Base url of the site being visited, e.g. http://dg.58.com
        response_url = re.findall(r'^http://\w+\.58\.com', response.url)
        response_selector = Selector(response)
        next_link = list_first_item(response_selector.xpath(
            u'//div[contains(@class,"pager")]/a[contains(@class,"next")]/@href').extract())
        detail_links = response_selector.xpath(
            u'//div[contains(@class,"listBox")]/ul[contains(@class,"listUl")]/li/@logr').extract()

        # Instead of yielding Request(next_link, callback=self.parse), the next
        # listing page is pushed into a redis queue for the scheduler.
        if next_link and detail_links:
            inserintotc(next_link, 1)
            print '#########[success] the next link ' + next_link + ' was inserted into the redis queue#########'
        for detail_link in detail_links:
            # 58.com limits direct crawling of detail pages, so the detail page id
            # is rebuilt from the @logr attribute, which looks like:
            # gz_2_39755299868183_28191154595392_sortid:1486483205000 @ ses:busitime ^ desc @ pubid:5453707
            # Build the detail page url
            detail_link = response_url[0] + '/zufang/' + detail_link.split('_')[3] + 'x.shtml'
            # Push the detail page into a redis queue for the second-stage parse
            if detail_link:
                inserintota(detail_link, 2)
                print '[success] the detail link ' + detail_link + ' was inserted into the redis queue'
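`inserintotc` and `inserintota` are also left undefined. The log messages say they push urls into a redis queue, so a plausible sketch with redis-py could look like the following; the connection settings and queue names are pure assumptions:

import redis

# Hypothetical sketch of the undefined helpers inserintotc / inserintota.
# Host, port, db and the queue names are assumptions; only "push a url
# into a redis queue" is implied by the log messages above.
r = redis.Redis(host='localhost', port=6379, db=0)

def inserintotc(link, kind):
    # Queue the next listing page for the scheduler.
    r.lpush('tc_queue', link)

def inserintota(link, kind):
    # Queue a detail page for the second-stage parser.
    r.lpush('ta_queue', link)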
Example #4
    def parse_detail(self, response):
        tczufangItem = TcZufangItem()
        response_selector = Selector(response)
        # Field extraction can be debugged interactively with scrapy shell
        # Post title
        raw_title = list_first_item(
            response_selector.xpath(
                u'//div[contains(@class,"card-top")]/p[contains(@class,"card-title")]/i/text()'
            ).extract())
        tczufangItem['title'] = raw_title
        # Post publication time is not extracted on this page layout
        tczufangItem['pub_time'] = None
        # Rent
        money = list_first_item(
            response_selector.xpath(
                u'//div[contains(@class,"card-top")]/ul[contains(@class,"card-pay f-clear")]/li[contains(@class,"price")]/span[contains(@class,"num")]/text()'
            ).extract())
        if money:
            money = money.encode('utf8')
        tczufangItem['money'] = money
        # Rental type (whole flat / shared)
        raw_method = list_first_item(
            response_selector.xpath(
                u'//div[contains(@class,"fang-info")]/span[2]/text()').extract())
        if raw_method:
            raw_method = raw_method.encode('utf8')
        tczufangItem['method'] = raw_method
        # District: join district, sub-district and, when present, a third level with "-"
        area = list_first_item(response_selector.xpath(
            u'//div[contains(@class,"card-item f-clear")][2]/p/span/a[1]/text()'
        ).extract())
        area2 = list_first_item(response_selector.xpath(
            u'//div[contains(@class,"card-item f-clear")]/p/span/a[2]/text()'
        ).extract())
        if area and area2:
            area += '-' + area2
        area1 = response_selector.xpath(
            u'//div[contains(@class,"card-item f-clear")]/p/span/a[3]/text()'
        ).extract()
        if area and area1:
            area += '-' + area1[0]
        if area:
            area = area.encode('utf8')
        tczufangItem['area'] = area
        # Residential community; fall back to the first link when the plain
        # text node is empty
        raw_community = list_first_item(
            response_selector.xpath(
                u'//div[contains(@class,"card-item f-clear")]/p/span/text()').extract())
        if raw_community and raw_community.strip():
            tczufangItem['community'] = raw_community.strip().encode('utf8')
        else:
            tczufangItem['community'] = list_first_item(
                response_selector.xpath(
                    u'//div[contains(@class,"card-item f-clear")]/p/span[1]/a/text()'
                ).extract())

        # Detail page url
        tczufangItem['targeturl'] = response.url
        # City the post belongs to (taken from the subdomain)
        tczufangItem['city'] = response.url.split("//")[1].split('.')[0]
        yield tczufangItem
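All of these examples call `.encode('utf8')` on extracted text because they target Python 2, where XPath extraction returns `unicode` objects; encoding turns them into UTF-8 byte strings before they are stored in the item. A minimal illustration (the sample string is arbitrary):

# Python 2: .extract() yields unicode; the items here store UTF-8 bytes.
title = u'整租'                # what .extract() returns
raw = title.encode('utf8')    # '\xe6\x95\xb4\xe7\xa7\x9f'
print type(title), type(raw)  # <type 'unicode'> <type 'str'>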
Example #5
    def parse(self, response):
        tczufangItem = TcZufangItem()
        response_url = re.findall(r'^http://\w+\.58\.com', response.url)
        response_selector = Selector(response)
        # Field extraction can be debugged interactively with scrapy shell
        # Post title
        raw_title = list_first_item(
            response_selector.xpath(
                u'//div[contains(@class,"house-title")]/h1[contains(@class,"c_333 f20")]/text()'
            ).extract())
        if raw_title:
            tczufangItem['title'] = raw_title.encode('utf8')
        # Post publication time; needs further processing
        raw_time = list_first_item(
            response_selector.xpath(
                u'//div[contains(@class,"house-title")]/p[contains(@class,"house-update-info c_888 f12")]/text()'
            ).extract())
        try:
            tczufangItem['pub_time'] = re.findall(
                r'\d+-\d+-\d+\s+\d+:\d+:\d+', raw_time)[0]
        except (TypeError, IndexError):
            # raw_time is missing or contains no timestamp
            tczufangItem['pub_time'] = 0
        # Rent
        tczufangItem['money'] = list_first_item(
            response_selector.xpath(
                u'//div[contains(@class,"house-pay-way f16")]/span[contains(@class,"c_ff552e")]/b[contains(@class,"f36")]/text()'
            ).extract())
        # Rental type (whole flat / shared)
        raw_method = list_first_item(
            response_selector.xpath(
                u'//ul[contains(@class,"f14")]/li[1]/span[2]/text()').extract())
        try:
            tczufangItem['method'] = raw_method.encode('utf8')
        except AttributeError:
            # raw_method is None
            tczufangItem['method'] = 0
        # Breadcrumb links: [0] community, [1] district, [2] sub-district
        links = response_selector.xpath(
            u'//ul[contains(@class,"f14")]/li/span/a[contains(@class,"c_333")]/text()'
        ).extract()
        # District: district and sub-district joined with "-"
        area = links[1] if len(links) > 1 else ''
        area2 = links[2] if len(links) > 2 else ''
        raw_area = '-'.join(part for part in (area, area2) if part)
        if raw_area:
            raw_area = raw_area.encode('utf8')
        tczufangItem['area'] = raw_area if raw_area else None
        # Residential community
        if links:
            raw_community = links[0]
            tczufangItem['community'] = raw_community.encode('utf8') if raw_community else None
        else:
            tczufangItem['community'] = 0
        # Detail page url
        tczufangItem['targeturl'] = response.url
        # City the post belongs to (taken from the subdomain)
        tczufangItem['city'] = response.url.split("//")[1].split('.')[0]
        # Contact phone number shown on the post
        try:
            tczufangItem['phone'] = response_selector.xpath(
                u'//div[contains(@class,"house-fraud-tip")]/span[1]/em[contains(@class,"phone-num")]/text()'
            ).extract()[0]
        except IndexError:
            tczufangItem['phone'] = 0
        # First listing image
        try:
            tczufangItem['img1'] = response_selector.xpath(
                u'//ul[contains(@class,"pic-list-wrap pa")]/li[1]/@data-src'
            ).extract()[0]
        except IndexError:
            tczufangItem['img1'] = 0
        # Second listing image
        try:
            tczufangItem['img2'] = response_selector.xpath(
                u'//ul[contains(@class,"pic-list-wrap pa")]/li[2]/@data-src'
            ).extract()[0]
        except IndexError:
            tczufangItem['img2'] = 0
        yield tczufangItem
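`TcZufangItem` itself never appears in these examples, but every field it needs is visible in the assignments above. A Scrapy item declaration covering them could look like this; the field names come straight from the code, while the class body itself is reconstructed:

import scrapy

class TcZufangItem(scrapy.Item):
    # Fields inferred from the assignments in the examples above.
    title = scrapy.Field()      # post title
    pub_time = scrapy.Field()   # publication time
    money = scrapy.Field()      # rent
    method = scrapy.Field()     # rental type
    area = scrapy.Field()       # district
    community = scrapy.Field()  # residential community
    targeturl = scrapy.Field()  # detail page url
    city = scrapy.Field()       # city (from the subdomain)
    phone = scrapy.Field()      # contact phone
    img1 = scrapy.Field()       # first listing image
    img2 = scrapy.Field()       # second listing image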