Example #1
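All seven snippets below are Python 2, old-Scrapy code and share roughly the
following imports; AnjukeItem and global_spider are project-local modules
whose exact paths are assumptions:

# Shared imports inferred from usage in the examples below.
import re
import time
import hashlib

from redis import Redis
from scrapy import signals
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.exceptions import DontCloseSpider
from scrapy.contrib.spiders import CrawlSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy_redis.spiders import RedisSpider

from anjuke.items import AnjukeItem   # assumed project item module
import global_spider                  # assumed project-local id counter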
class anjuke_spider(RedisSpider):

    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    redis_key = 'anjuke_beijing_spider:page_url'

    allowed_domains = ["anjuke.com"]

    def parse(self, response):

        sel = Selector(response)
        try:
            # cur_page doubles as a sanity check: if the pagination widget
            # is missing, the IndexError below aborts the whole page.
            cur_page = sel.xpath(
                '//div[@class="multi-page"]/i[@class="curr"]/text()').extract(
                )[0]
            links = SgmlLinkExtractor(
                allow=(r'http://.+\.anjuke\.com/prop/view/.+'),
                restrict_xpaths=('//ul[@id="houselist-mod"]'),
                unique=False).extract_links(response)
            r = Redis()
            for link in links:
                try:
                    r.lpush('anjuke_beijing_spider:data_url', link.url)
                except Exception as e:
                    print Exception, ":", e
        except Exception as e:
            print Exception, ":", e
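
The spider above blocks on the anjuke_beijing_spider:page_url Redis list
(scrapy-redis feeds a RedisSpider from whatever key its redis_key attribute
names) and pushes every property-detail link it finds onto a second list
for a downstream spider. A minimal sketch of seeding that queue, assuming a
local Redis and hypothetical listing-page URLs:

# Seed the page-URL queue the spider above consumes (URLs are examples).
from redis import Redis

r = Redis()
for page in range(1, 6):
    r.lpush('anjuke_beijing_spider:page_url',
            'http://beijing.anjuke.com/sale/p%s/' % page)

Example #2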
class anjuke_spider(RedisSpider):

    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    redis_key = 'anjuke_spider:page_url'

    allowed_domains = ["anjuke.com"]

    def parse(self, response):

        sel = Selector(response)
        try:
            # cur_page doubles as a sanity check: if the pagination widget
            # is missing, the IndexError below aborts the whole page.
            cur_page = sel.xpath(
                '//div[@class="multi-page"]/i[@class="curr"]/text()').extract(
                )[0]
            links = SgmlLinkExtractor(
                allow=(r'http://.+\.anjuke\.com/prop/view/.+'),
                restrict_xpaths=('//ul[@id="houselist-mod"]'),
                unique=False).extract_links(response)
            r = Redis()
            for link in links:
                try:
                    anjuke_id = (re.search(r'A\d+',
                                           link.url)).group(0).replace(
                                               'A', '')
                    filte_flag = r.sismember('anjuke_gz:history', anjuke_id)
                    if not filte_flag:
                        r.sadd('anjuke_spider:data_url', link.url)
                        r.sadd('anjuke_gz:history', anjuke_id)
                except Exception as e:
                    print Exception, ":", e
        except Exception as e:
            print Exception, ":", e
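
Example #2 harvests the same links but de-duplicates on the listing's
numeric id (the A<digits> fragment of the URL) against the
anjuke_gz:history set before queueing. The separate SISMEMBER check and
SADD insert are not atomic; since SADD returns how many members it
actually added, both steps collapse into one call, as in this sketch:

# Equivalent check-and-add in a single round trip: sadd() returns 1 only
# when anjuke_id was not already in the history set.
if r.sadd('anjuke_gz:history', anjuke_id):
    r.sadd('anjuke_spider:data_url', link.url)

Example #3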
class anjuke_spider(RedisSpider):

    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    redis_key = 'anjuke_beijing_spider:start_urls'

    allowed_domains = ["anjuke.com"]

    def parse(self, response):

        url_format = 'http://beijing.anjuke.com/sale/'
        upper_bound = 1000
        division = 60
        step = upper_bound / division
        for section_index in range(division):

            spider_upper_bound = step * (section_index + 1)
            spider_lower_bound = step * section_index
            # open the last price band upward so listings above
            # upper_bound are not missed by the integer-step slicing
            if section_index == division - 1:
                spider_upper_bound = 99999
            print spider_upper_bound, spider_lower_bound
            try:
                r = Redis()
                for page_index in range(1, 51):
                    url = url_format + ('p%s/' % page_index) + (
                        '?to_price=%s&from_price=%s' %
                        (spider_upper_bound, spider_lower_bound))
                    print url
                    r.lpush('anjuke_beijing_spider:page_url', url)
                #r.lpush('anjuke_spider:start_urls','http://guangzhou.anjuke.com/sale/')
            except Exception as e:
                print Exception, ":", e
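
Example #3 is a seeder rather than a scraper: it slices the 0-1000 (万元,
i.e. 10k-yuan units) price range into 60 bands and, for each band, queues
the first 50 result pages, working around the site's 50-page pagination
cap by filtering on from_price/to_price. With Python 2 integer division
the bands only reach 960, which is why the last band must be opened
upward:

# Quick check of the band boundaries generated above (Python 2 division).
upper_bound, division = 1000, 60
step = upper_bound / division                  # 16
print step * (division - 1), step * division   # 944 960

Example #4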
class anjuke_spider(RedisSpider):

    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    redis_key = 'anjuke_spider:start_urls'
    custom_settings = {
        'EXTENSIONS': {
            'scrapy.telnet.TelnetConsole': None,
            'anjuke.auto_lpush_start.auto_lpush': 400
        },
    }
    allowed_domains = ["anjuke.com"]

    def parse(self, response):

        url_format = 'http://guangzhou.anjuke.com/sale/'
        upper_bound = 1000
        division = 500
        step = upper_bound / division
        for section_index in range(division):

            spider_upper_bound = step * (section_index + 1)
            spider_lower_bound = step * section_index
            # open the last price band upward so listings above
            # upper_bound are not missed by the integer-step slicing
            if section_index == division - 1:
                spider_upper_bound = 99999
            print spider_upper_bound, spider_lower_bound
            try:
                r = Redis()
                for page_index in range(1, 51):
                    url = url_format + ('o5-p%s/' % page_index) + (
                        '?to_price=%s&from_price=%s' %
                        (spider_upper_bound, spider_lower_bound))
                    print url
                    r.sadd('anjuke_spider:page_url', url)
                #r.lpush('anjuke_spider:start_urls','http://guangzhou.anjuke.com/sale/')
            except Exception as e:
                print Exception, ":", e
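
Example #4 is the Guangzhou variant of the same seeder: it sorts results
by update time (the o5- path segment), stores page URLs in a Redis set
(sadd) so repeated runs don't queue duplicates, disables the telnet
console, and registers a project extension
(anjuke.auto_lpush_start.auto_lpush) through custom_settings. That
extension is not shown in this collection; a hypothetical skeleton of
what it might look like, re-seeding the start-URL key whenever the
spider goes idle:

# Hypothetical sketch of the auto_lpush extension named above; the real
# project code may differ.
from redis import Redis
from scrapy import signals

class auto_lpush(object):

    def __init__(self, crawler):
        self.redis = Redis()
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_idle(self, spider):
        # re-seed the start-URL list; the RedisSpider picks it up on its
        # next idle poll, keeping the crawl cycling
        self.redis.lpush('anjuke_spider:start_urls',
                         'http://guangzhou.anjuke.com/sale/')

Example #5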
class anjuke_spider(RedisSpider):

    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    redis_key = 'anjuke_spider:data_url'

    allowed_domains = ["anjuke.com"]

    def _item_init(self, item):

        item['anjuke_id'] = ''
        item['deploy_time'] = ''
        item['Cur_url'] = ''
        item['City'] = ''
        item['District'] = ''
        item['Block'] = ''
        item['Estate'] = ''
        item['Title'] = ''
        item['Price'] = ''
        item['Layout'] = ''
        item['Decoration'] = ''
        item['Location'] = ''
        item['Area'] = ''
        item['Unit_Price'] = ''
        item['Years'] = ''
        item['Orientation'] = ''
        item['Downpayment'] = ''
        item['Type'] = ''
        item['Floor'] = ''
        item['Monthly_Payments'] = ''
        item['Desc'] = ''
        item['Agent'] = ''
        item['Agent_Phone'] = ''
        item['Agent_Company'] = ''

        return item

    def parse(self, response):

        sel = Selector(response)
        item = AnjukeItem()
        item = self._item_init(item)

        try:
            house_info = sel.xpath(
                '//h4[@class="block-title houseInfo-title"]/span/text()'
            ).extract()[0]
            item['anjuke_id'] = (re.search(r"\d{9,}", house_info)).group(0)
            item['deploy_time'] = (re.search(
                r"\d{4}%s\d{2}%s\d{2}%s" %
                ('年'.decode("utf-8"), '月'.decode("utf-8"),
                 '日'.decode("utf-8")), house_info)).group(0)
        except Exception as e:
            print Exception, ":", e
        try:
            item['Cur_url'] = response.url
        except Exception as e:
            print Exception, ":", e
        try:
            item['City'] = (sel.xpath(
                '//*[@id="content"]/div[1]/a[2]/text()').extract()[0]).replace(
                    '二手房'.decode("utf8"), '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['District'] = (sel.xpath(
                '//*[@id="content"]/div[1]/a[3]/text()').extract()[0]).replace(
                    '二手房'.decode("utf8"), '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['Block'] = (sel.xpath(
                '//*[@id="content"]/div[1]/a[4]/text()').extract()[0]).replace(
                    '二手房'.decode("utf8"), '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['Estate'] = sel.xpath(
                '//*[@id="content"]/div[1]/a[4]/text()').extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Title'] = sel.xpath(
                '//*[@id="content"]/div[@class="wrapper"]/h3[@class="long-title"]/text()'
            ).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Price'] = (re.compile(r'<[^>]+>', re.S)).sub(
                '',
                sel.xpath('//*[@id="content"]/div[2]/div[1]/div[1]/span[1]').
                extract()[0])
        except Exception as e:
            print Exception, ":", e
        try:
            item['Layout'] = (sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '房型:'.decode("utf8")).extract()[0]).replace('\n',
                                                              '').replace(
                                                                  '\t', '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['Decoration'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '装修程度:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Location'] = (re.compile(r'<[^>]+>', re.S)).sub(
                '',
                sel.xpath(
                    '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/p'
                    % '位置:'.decode("utf8")).extract()[0]).replace('\n',
                                                                  '').replace(
                                                                      '\t', '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['Area'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '面积:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Unit_Price'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '房屋单价:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Years'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '年代:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Orientation'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '朝向:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Downpayment'] = (sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '参考首付:'.decode("utf8")).extract()[0]).replace('\n',
                                                                '').replace(
                                                                    '\t', '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['Type'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '类型:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Floor'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '楼层:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Monthly_Payments'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/span/text()'
                % '参考月供:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Desc'] = (re.compile(r'<[^>]+>', re.S)).sub(
                '',
                sel.xpath(
                    '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[3]/div/div'
                ).extract()[0])
        except Exception as e:
            print Exception, ":", e
        try:
            item['Agent'] = sel.xpath(
                '//p[@class="broker-name"]/a/text()').extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Agent_Phone'] = (sel.xpath(
                '//p[@class="broker-mobile"]/text()').extract()[0]).replace(
                    ' ', '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['Agent_Company'] = sel.xpath(
                '//div[@class="broker-company"]/a[1]/text()').extract()[0]
        except Exception as e:
            print Exception, ":", e
        yield item
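
Example #5 is the detail-page parser: every field gets its own try/except
so that one missing node does not cost the whole item, at the price of a
lot of boilerplate. A small helper, sketched here (it is not part of the
original project), expresses the same first-match-or-default pattern in
one line per field:

# Hypothetical helper collapsing the try/except-per-field pattern.
def first_or_default(sel, xpath, default=''):
    matches = sel.xpath(xpath).extract()
    return matches[0] if matches else default

# usage, e.g.:
# item['Agent'] = first_or_default(sel, '//p[@class="broker-name"]/a/text()')

Example #6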
class anjuke_spider(CrawlSpider):

    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    batch_id = 0
    start_urls = []
    r = Redis(host="192.168.10.39")

    allowed_domains = ["fang.com"]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        spider.crawler.signals.connect(spider.spider_idle,
                                       signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        for req in self.start_requests():
            self.crawler.engine.crawl(req, spider=self)
        raise DontCloseSpider

    def start_requests(self):
        url_format = 'http://esf.fang.com/chushou/3_%s.htm'
        try:
            for id_code in self.r.zrevrange('house:zset', 0, 1000):
                url = url_format % (id_code)
                print url
                yield Request(url=url,
                              method='GET',
                              callback=self.parse,
                              dont_filter=True)
        except Exception as e:
            print Exception, ":", e

    def _item_init(self, item):
        item['fang_id'] = ''
        item['url'] = ''
        item['body'] = ''
        return item

    def parse(self, response):

        sel = Selector(response)
        item = AnjukeItem()
        item = self._item_init(item)
        try:
            fang_info = {'title': '', 'info': '', 'desc': '', 'pic_tab': ''}
            url = item['url'] = response.url
            fang_id = item['fang_id'] = (re.search(r'\d+_\d+', url)).group(0)
            item['body'] = (response.body).decode('gbk').encode('utf8')

            try:
                fang_info['title'] = sel.xpath(
                    '//div[@class="mainBoxL"]/div[@class="title"]').extract(
                    )[0]
            except Exception as e:
                print Exception, ":", e
            try:
                fang_info['info'] = sel.xpath(
                    '//div[@class="houseInfor clearfix"]/div[@class="inforTxt"]'
                ).extract()[0]
            except Exception as e:
                print Exception, ":", e

            try:
                fang_info['desc'] = sel.xpath(
                    '//div[@id="hsPro-pos"]/div[@class="describe mt10"]'
                ).extract()[0]
            except Exception as e:
                print Exception, ":", e

            try:
                fang_info['pic_tab'] = sel.xpath(
                    '//div[@id="hsPic-pos"]').extract()[0]
            except Exception as e:
                print Exception, ":", e

            # Fingerprint of the extracted fields, apparently intended for
            # change detection; the digest is computed but never used here.
            m = hashlib.md5()
            m.update(str(fang_info))
            follow_value = m.hexdigest()
            yield item

        except Exception as e:
            print Exception, ":", e
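
Example #6 polls fang.com detail pages for the ids held in a Redis sorted
set. The from_crawler/spider_idle pairing is the keep-alive trick: when
Scrapy would normally close an idle spider, spider_idle re-schedules
start_requests() and raises DontCloseSpider, so the spider cycles over the
top 1000 ids indefinitely. The md5 digest of the extracted fields
(follow_value) looks like the start of a change-detection scheme, but it
is never stored or compared here.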
Example #7
class anjuke_spider(CrawlSpider):

    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    batch_id = 0
    start_urls = []

    allowed_domains = ["fang.com"]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        spider.crawler.signals.connect(spider.spider_idle,
                                       signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        for req in self.start_requests():
            self.crawler.engine.crawl(req, spider=self)
        raise DontCloseSpider

    def start_requests(self):
        url_format = 'http://esf.fang.com/house/'
        try:
            for page_index in range(1, 31):
                url = url_format + 'i3%s/' % (page_index)
                yield Request(url=url,
                              method='GET',
                              callback=self.parse,
                              dont_filter=True,
                              meta={
                                  'submit_time':
                                  str(
                                      time.strftime("%Y-%m-%d %H:%M:%S",
                                                    time.localtime())),
                                  'page_index':
                                  page_index,
                                  'batch_id':
                                  self.batch_id
                              })
            self.batch_id += 1
        except Exception as e:
            print Exception, ":", e

    def _item_init(self, item):
        item['fang_id'] = ''
        item['batch_id'] = ''
        item['submit_time'] = ''
        item['schedule_time'] = ''
        item['received_time'] = ''
        item['page_index'] = ''
        item['rank'] = ''
        item['update_tag'] = ''
        item['update_time'] = ''
        item['server_time'] = ''
        return item

    def parse(self, response):

        item = AnjukeItem()
        item = self._item_init(item)
        sel = Selector(response)
        item['batch_id'] = batch_id = response.meta['batch_id']
        item['submit_time'] = submit_time = response.meta['submit_time']
        # 'schedule_time' and 'received_time' are presumably stamped into
        # request.meta by a project middleware; start_requests() above does
        # not set them.
        item['schedule_time'] = schedule_time = str(
            time.strftime("%Y-%m-%d %H:%M:%S",
                          time.localtime(response.meta['schedule_time'])))
        item['received_time'] = received_time = str(
            time.strftime("%Y-%m-%d %H:%M:%S",
                          time.localtime(response.meta['received_time'])))
        item['page_index'] = page_index = response.meta['page_index']
        # Shift the HTTP Date header (GMT) forward eight hours to get
        # China Standard Time.
        server_time = time.mktime(
            time.strptime(response.headers['Date'],
                          "%a, %d %b %Y %H:%M:%S %Z")) + 8 * 3600
        item['server_time'] = str(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(server_time)))
        try:
            # fang.com serves a "sorry" placeholder page when it throttles;
            # retry such a page up to two times, then give up on it.
            if sel.xpath('//div[@class="list sorry_word"]'):
                retry_count = int(response.meta.get('retry_count', 0))
                if retry_count <= 2:
                    print "retry......"
                    yield Request(url=response.url,
                                  method='GET',
                                  callback=self.parse,
                                  meta={
                                      'submit_time': submit_time,
                                      'schedule_time': schedule_time,
                                      'received_time': received_time,
                                      'retry_count': retry_count + 1,
                                      'page_index': page_index,
                                      'batch_id': batch_id
                                  })
                # a placeholder page has no listings, so stop here
                return
            dl_list = sel.xpath(
                '//div[@class="houseList"]/dl[@class="list rel"]')
            # When more than 30 rows come back, the first row is promoted
            # content, so skip it; rank is the listing's 1-based position
            # across the whole result set.
            start_index = 1 if len(dl_list) > 30 else 0
            for dl_index in range(start_index, len(dl_list)):
                try:
                    item['fang_id'] = (re.search(
                        r'\d_\d+', (dl_list[dl_index].xpath(
                            './dd[@class="info rel floatr"]/p[@class="title"]/a/@href'
                        ).extract()[0]))).group(0)
                    item['rank'] = 30 * (page_index -
                                         1) + dl_index + 1 - start_index
                    item['update_tag'] = update_tag = dl_list[
                        dl_index].xpath(
                            './dd[@class="info rel floatr"]/p[@class="gray6 mt10"]/span[@class="ml10 gray9"]/text()'
                        ).extract()[0]
                    # Turn relative tags such as "5分钟前更新" ("updated 5
                    # minutes ago") into absolute timestamps by subtracting
                    # the implied offset from the server time.
                    for suffix, unit in (('秒前更新', 1), ('分钟前更新', 60),
                                         ('小时前更新', 3600), ('天前更新',
                                                            86400)):
                        suffix = suffix.decode("utf-8")
                        if re.match(r'\d+' + suffix, update_tag):
                            deviation = int(
                                update_tag.replace(suffix, '')) * unit
                            item['update_time'] = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                            break
                except Exception as e:
                    print Exception, ":", e
                yield item
        except Exception as e:
            print Exception, ":", e
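
Example #7 measures listing freshness: it stamps each request with
submit/schedule/receive times, converts the server's Date header to UTC+8,
and turns relative tags such as "5分钟前更新" ("updated 5 minutes ago") into
absolute timestamps by subtracting the implied offset from the server
time, retrying up to twice when fang.com serves its "sorry" placeholder.
A worked example of the timestamp arithmetic, with hypothetical values:

# The update-tag arithmetic used above, on made-up values.
import time

server_time = time.mktime(time.strptime('2016-05-01 12:00:00',
                                        '%Y-%m-%d %H:%M:%S'))
deviation = 5 * 60    # "5分钟前更新" -> five minutes, in seconds
print time.strftime('%Y-%m-%d %H:%M:%S',
                    time.localtime(server_time - deviation))
# prints: 2016-05-01 11:55:00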