# -*- coding: utf-8 -*-
# Listing-page spider (Beijing): pops listing-page URLs from a Redis queue,
# extracts detail-page links and pushes them onto the queue consumed by the
# detail spider. Imports were not part of this excerpt; the project-local
# ``global_spider`` counter module path is assumed.
from redis import Redis
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor  # path assumed for the old Scrapy release in use
from scrapy_redis.spiders import RedisSpider

from anjuke import global_spider  # assumed import path


class anjuke_spider(RedisSpider):
    # Every spider instance gets a unique name from the shared counter.
    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    redis_key = 'anjuke_beijing_spider:page_url'
    allowed_domains = ["anjuke.com"]

    def parse(self, response):
        sel = Selector(response)
        try:
            # Current page number of the paginated listing (extracted but not used further).
            cur_page = sel.xpath(
                '//div[@class="multi-page"]/i[@class="curr"]/text()').extract()[0]
            # Detail-page links inside the house list block.
            links = SgmlLinkExtractor(
                allow=(r'http://.+\.anjuke\.com/prop/view/.+'),
                restrict_xpaths=('//ul[@id="houselist-mod"]'),
                unique=0).extract_links(response)
            r = Redis()
            for link in links:
                try:
                    # Queue every detail URL for the detail spider.
                    r.lpush('anjuke_beijing_spider:data_url', link.url)
                except Exception as e:
                    print Exception, ":", e
        except Exception as e:
            print Exception, ":", e
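# -----------------------------------------------------------------------------
# The shared ``global_spider`` counter module imported above is not included in
# this excerpt. A minimal sketch of what it presumably provides, so the
# ``get_spider_id`` / ``spider_id_add`` calls used by every spider make sense;
# the real module may differ.
#
# _spider_id = 0
#
# def get_spider_id():
#     return _spider_id
#
# def spider_id_add():
#     global _spider_id
#     _spider_id += 1
# -----------------------------------------------------------------------------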
# -*- coding: utf-8 -*-
# Listing-page spider (Guangzhou queue): pops listing-page URLs from Redis,
# extracts detail-page links and de-duplicates them against a history set keyed
# by the numeric anjuke id before queueing them for the detail spider.
import re

from redis import Redis
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor  # path assumed for the old Scrapy release in use
from scrapy_redis.spiders import RedisSpider

from anjuke import global_spider  # assumed import path


class anjuke_spider(RedisSpider):
    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    redis_key = 'anjuke_spider:page_url'
    allowed_domains = ["anjuke.com"]

    def parse(self, response):
        sel = Selector(response)
        try:
            cur_page = sel.xpath(
                '//div[@class="multi-page"]/i[@class="curr"]/text()').extract()[0]
            links = SgmlLinkExtractor(
                allow=(r'http://.+\.anjuke\.com/prop/view/.+'),
                restrict_xpaths=('//ul[@id="houselist-mod"]'),
                unique=0).extract_links(response)
            r = Redis()
            for link in links:
                try:
                    # The listing id is embedded in the URL as "A<digits>".
                    anjuke_id = re.search(r'A\d+', link.url).group(0).replace('A', '')
                    # Skip listings that were already seen in a previous run.
                    filte_flag = r.sismember('anjuke_gz:history', anjuke_id)
                    if not filte_flag:
                        r.sadd('anjuke_spider:data_url', link.url)
                        r.sadd('anjuke_gz:history', anjuke_id)
                except Exception as e:
                    print Exception, ":", e
        except Exception as e:
            print Exception, ":", e
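# -----------------------------------------------------------------------------
# One way to kick the pipeline off (a sketch, not part of the repo): push a seed
# URL into the start_urls queue that the Beijing seeder spider below listens on.
# The seed URL here mirrors the url_format used by that seeder; the exact value
# expected in production is an assumption.
#
# from redis import Redis
# r = Redis()
# r.lpush('anjuke_beijing_spider:start_urls', 'http://beijing.anjuke.com/sale/')
# -----------------------------------------------------------------------------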
# -*- coding: utf-8 -*-
# Seed spider (Beijing): the spider only ever requests list pages 1-50, so the
# price range is split into buckets and 50 page URLs are generated per bucket,
# then pushed onto the Redis queue consumed by the listing-page spider above.
from redis import Redis
from scrapy_redis.spiders import RedisSpider

from anjuke import global_spider  # assumed import path


class anjuke_spider(RedisSpider):
    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    redis_key = 'anjuke_beijing_spider:start_urls'
    allowed_domains = ["anjuke.com"]

    def parse(self, response):
        url_format = 'http://beijing.anjuke.com/sale/'
        upper_bound = 1000   # price ceiling for the bucketed range
        division = 60        # number of price buckets
        step = upper_bound / division
        for section_index in range(division):
            spider_upper_bound = step * (section_index + 1)
            spider_lower_bound = step * section_index
            # The last bucket is meant to be open-ended; note that with the
            # integer step of 16 the lower bound tops out at 944, so this
            # branch never fires with these constants.
            if spider_lower_bound >= 999:
                spider_upper_bound = 99999
            print spider_upper_bound, spider_lower_bound
            try:
                r = Redis()
                for page_index in range(1, 51):
                    url = url_format + ('p%s/' % page_index) + (
                        '?to_price=%s&from_price=%s' %
                        (spider_upper_bound, spider_lower_bound))
                    print url
                    r.lpush('anjuke_beijing_spider:page_url', url)
                    # r.lpush('anjuke_spider:start_urls', 'http://guangzhou.anjuke.com/sale/')
            except Exception as e:
                print Exception, ":", e
# -*- coding: utf-8 -*-
# Seed spider (Guangzhou): same price-bucketing trick as the Beijing seeder,
# with finer buckets (500) and the "o5" sort flag in the listing URL. The
# auto_lpush extension registered below seeds this spider's start queue.
from redis import Redis
from scrapy_redis.spiders import RedisSpider

from anjuke import global_spider  # assumed import path


class anjuke_spider(RedisSpider):
    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    redis_key = 'anjuke_spider:start_urls'
    custom_settings = {
        'EXTENSIONS': {
            'scrapy.telnet.TelnetConsole': None,
            'anjuke.auto_lpush_start.auto_lpush': 400
        },
    }
    allowed_domains = ["anjuke.com"]

    def parse(self, response):
        url_format = 'http://guangzhou.anjuke.com/sale/'
        upper_bound = 1000
        division = 500
        step = upper_bound / division
        for section_index in range(division):
            spider_upper_bound = step * (section_index + 1)
            spider_lower_bound = step * section_index
            # Last bucket is meant to be open-ended (unreached with step=2).
            if spider_lower_bound >= 999:
                spider_upper_bound = 99999
            print spider_upper_bound, spider_lower_bound
            try:
                r = Redis()
                for page_index in range(1, 51):
                    url = url_format + ('o5-p%s/' % page_index) + (
                        '?to_price=%s&from_price=%s' %
                        (spider_upper_bound, spider_lower_bound))
                    print url
                    # Page URLs go into a set, so duplicates across runs are dropped.
                    r.sadd('anjuke_spider:page_url', url)
                    # r.lpush('anjuke_spider:start_urls', 'http://guangzhou.anjuke.com/sale/')
            except Exception as e:
                print Exception, ":", e
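# -----------------------------------------------------------------------------
# The ``anjuke.auto_lpush_start.auto_lpush`` extension registered above is not
# part of this excerpt. A minimal sketch of what such an extension might look
# like, assuming it pushes a seed URL into the spider's Redis start queue when
# the spider opens; the seed URL and key handling are assumptions.
#
# from redis import Redis
# from scrapy import signals
#
# class auto_lpush(object):
#     @classmethod
#     def from_crawler(cls, crawler):
#         ext = cls()
#         crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
#         return ext
#
#     def spider_opened(self, spider):
#         # Only RedisSpiders carry a redis_key attribute.
#         if getattr(spider, 'redis_key', None):
#             Redis().lpush(spider.redis_key, 'http://guangzhou.anjuke.com/sale/')
# -----------------------------------------------------------------------------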
# -*- coding: utf-8 -*-
# Detail-page spider: pops property URLs from Redis and parses every field of
# the listing into an AnjukeItem. Each field is wrapped in its own try/except
# so that a single missing node does not drop the whole item.
import re

from scrapy.selector import Selector
from scrapy_redis.spiders import RedisSpider

from anjuke import global_spider        # assumed import path
from anjuke.items import AnjukeItem     # assumed import path


class anjuke_spider(RedisSpider):
    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    redis_key = 'anjuke_spider:data_url'
    allowed_domains = ["anjuke.com"]

    def _item_init(self, item):
        # Pre-fill every field so missing values serialize as empty strings.
        item['anjuke_id'] = ''
        item['deploy_time'] = ''
        item['Cur_url'] = ''
        item['City'] = ''
        item['District'] = ''
        item['Block'] = ''
        item['Estate'] = ''
        item['Title'] = ''
        item['Price'] = ''
        item['Layout'] = ''
        item['Decoration'] = ''
        item['Location'] = ''
        item['Area'] = ''
        item['Unit_Price'] = ''
        item['Years'] = ''
        item['Orientation'] = ''
        item['Downpayment'] = ''
        item['Type'] = ''
        item['Floor'] = ''
        item['Monthly_Payments'] = ''
        item['Desc'] = ''
        item['Agent'] = ''
        item['Agent_Phone'] = ''
        item['Agent_Company'] = ''
        return item

    def parse(self, response):
        sel = Selector(response)
        item = AnjukeItem()
        item = self._item_init(item)
        try:
            # The header line carries both the listing id and its publish date.
            house_info = sel.xpath(
                '//h4[@class="block-title houseInfo-title"]/span/text()'
            ).extract()[0]
            item['anjuke_id'] = re.search(r"\d{9,}", house_info).group(0)
            item['deploy_time'] = re.search(
                r"\d{4}%s\d{2}%s\d{2}%s" % ('年'.decode("utf-8"),
                                            '月'.decode("utf-8"),
                                            '日'.decode("utf-8")),
                house_info).group(0)
        except Exception as e:
            print Exception, ":", e
        try:
            item['Cur_url'] = response.url
        except Exception as e:
            print Exception, ":", e
        try:
            item['City'] = (sel.xpath(
                '//*[@id="content"]/div[1]/a[2]/text()').extract()[0]).replace(
                    '二手房'.decode("utf8"), '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['District'] = (sel.xpath(
                '//*[@id="content"]/div[1]/a[3]/text()').extract()[0]).replace(
                    '二手房'.decode("utf8"), '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['Block'] = (sel.xpath(
                '//*[@id="content"]/div[1]/a[4]/text()').extract()[0]).replace(
                    '二手房'.decode("utf8"), '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['Estate'] = sel.xpath(
                '//*[@id="content"]/div[1]/a[4]/text()').extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Title'] = sel.xpath(
                '//*[@id="content"]/div[@class="wrapper"]/h3[@class="long-title"]/text()'
            ).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            # Strip markup from the price block.
            item['Price'] = re.compile(r'<[^>]+>', re.S).sub(
                '',
                sel.xpath('//*[@id="content"]/div[2]/div[1]/div[1]/span[1]'
                          ).extract()[0])
        except Exception as e:
            print Exception, ":", e
        try:
            item['Layout'] = (sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '房型:'.decode("utf8")).extract()[0]).replace('\n', '').replace('\t', '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['Decoration'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '装修程度:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Location'] = (re.compile(r'<[^>]+>', re.S)).sub(
                '',
                sel.xpath(
                    '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/p'
                    % '位置:'.decode("utf8")).extract()[0]).replace('\n', '').replace('\t', '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['Area'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '面积:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Unit_Price'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '房屋单价:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Years'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '年代:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Orientation'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '朝向:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Downpayment'] = (sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '参考首付:'.decode("utf8")).extract()[0]).replace('\n', '').replace('\t', '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['Type'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '类型:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Floor'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/text()'
                % '楼层:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Monthly_Payments'] = sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]/dd/span/text()'
                % '参考月供:'.decode("utf8")).extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Desc'] = re.compile(r'<[^>]+>', re.S).sub(
                '',
                sel.xpath(
                    '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[3]/div/div'
                ).extract()[0])
        except Exception as e:
            print Exception, ":", e
        try:
            item['Agent'] = sel.xpath(
                '//p[@class="broker-name"]/a/text()').extract()[0]
        except Exception as e:
            print Exception, ":", e
        try:
            item['Agent_Phone'] = (sel.xpath(
                '//p[@class="broker-mobile"]/text()').extract()[0]).replace(' ', '')
        except Exception as e:
            print Exception, ":", e
        try:
            item['Agent_Company'] = sel.xpath(
                '//div[@class="broker-company"]/a[1]/text()').extract()[0]
        except Exception as e:
            print Exception, ":", e
        yield item
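# -----------------------------------------------------------------------------
# The AnjukeItem class used throughout these spiders is not part of this
# excerpt. A minimal sketch covering every field the spiders assign, assuming a
# plain scrapy Item; the real items.py may define more fields or metadata.
#
# from scrapy.item import Item, Field
#
# class AnjukeItem(Item):
#     # anjuke.com detail fields
#     anjuke_id = Field();  deploy_time = Field();  Cur_url = Field()
#     City = Field();       District = Field();     Block = Field()
#     Estate = Field();     Title = Field();        Price = Field()
#     Layout = Field();     Decoration = Field();   Location = Field()
#     Area = Field();       Unit_Price = Field();   Years = Field()
#     Orientation = Field();Downpayment = Field();  Type = Field()
#     Floor = Field();      Monthly_Payments = Field()
#     Desc = Field();       Agent = Field();        Agent_Phone = Field()
#     Agent_Company = Field()
#     # fang.com fields
#     fang_id = Field();    url = Field();          body = Field()
#     batch_id = Field();   submit_time = Field();  schedule_time = Field()
#     received_time = Field(); page_index = Field(); rank = Field()
#     update_tag = Field(); update_time = Field();  server_time = Field()
# -----------------------------------------------------------------------------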
# -*- coding: utf-8 -*-
# fang.com detail spider: re-seeds itself from a Redis zset whenever the engine
# goes idle, fetches each listing's "chushou" page and stores the raw body.
import hashlib
import re

from redis import Redis
from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider

from anjuke import global_spider        # assumed import path
from anjuke.items import AnjukeItem     # assumed import path


class anjuke_spider(CrawlSpider):
    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    batch_id = 0
    start_urls = []
    r = Redis(host="192.168.10.39")
    allowed_domains = ["fang.com"]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        # Keep the spider alive: re-seed requests whenever the engine idles.
        spider.crawler.signals.connect(spider.spider_idle,
                                       signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        for req in self.start_requests():
            self.crawler.engine.crawl(req, spider=self)
        raise DontCloseSpider

    def start_requests(self):
        url_format = 'http://esf.fang.com/chushou/3_%s.htm'
        try:
            # Crawl the top-scored ids (indices 0-1000) from the zset.
            for id_code in self.r.zrevrange('house:zset', 0, 1000):
                url = url_format % id_code
                print url
                yield Request(url=url, method='GET', callback=self.parse,
                              dont_filter=True)
        except Exception as e:
            print Exception, ":", e

    def _item_init(self, item):
        item['fang_id'] = ''
        item['url'] = ''
        item['body'] = ''
        return item

    def parse(self, response):
        sel = Selector(response)
        item = AnjukeItem()
        item = self._item_init(item)
        try:
            fang_info = {'title': '', 'info': '', 'desc': '', 'pic_tab': ''}
            url = item['url'] = response.url
            fang_id = item['fang_id'] = re.search(r'\d+_\d+', url).group(0)
            # fang.com serves GBK; re-encode the body as UTF-8 before storing.
            item['body'] = response.body.decode('gbk').encode('utf8')
            try:
                fang_info['title'] = sel.xpath(
                    '//div[@class="mainBoxL"]/div[@class="title"]').extract()[0]
            except Exception as e:
                print Exception, ":", e
            try:
                fang_info['info'] = sel.xpath(
                    '//div[@class="houseInfor clearfix"]/div[@class="inforTxt"]'
                ).extract()[0]
            except Exception as e:
                print Exception, ":", e
            try:
                fang_info['desc'] = sel.xpath(
                    '//div[@id="hsPro-pos"]/div[@class="describe mt10"]'
                ).extract()[0]
            except Exception as e:
                print Exception, ":", e
            try:
                fang_info['pic_tab'] = sel.xpath(
                    '//div[@id="hsPic-pos"]').extract()[0]
            except Exception as e:
                print Exception, ":", e
            # Fingerprint of the visible listing content (currently unused).
            m = hashlib.md5()
            m.update(str(fang_info))
            follow_value = m.hexdigest()
            yield item
        except Exception as e:
            print Exception, ":", e
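# -----------------------------------------------------------------------------
# The md5 fingerprint (follow_value) computed above is never used in this
# excerpt. A hypothetical sketch of how it could be used to detect changed
# listings between crawls; the Redis key name and helper are assumptions, not
# part of the project.
#
# from redis import Redis
#
# def listing_changed(fang_id, follow_value, r=None):
#     """Return True when the stored fingerprint differs from the new one."""
#     r = r or Redis(host="192.168.10.39")
#     old = r.hget('fang:fingerprint', fang_id)   # hypothetical key name
#     if old == follow_value:
#         return False
#     r.hset('fang:fingerprint', fang_id, follow_value)
#     return True
# -----------------------------------------------------------------------------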
# -*- coding: utf-8 -*-
# fang.com listing-rank spider: walks the first 30 list pages per batch and
# records, for every listing, its rank on the page and how recently it was
# updated, derived from the "N秒前/分钟前/小时前/天前更新" tag and the server clock.
import re
import time

from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider

from anjuke import global_spider        # assumed import path
from anjuke.items import AnjukeItem     # assumed import path


class anjuke_spider(CrawlSpider):
    spider_id = global_spider.get_spider_id()
    global_spider.spider_id_add()
    name = "anjuke_spider%s" % spider_id
    batch_id = 0
    start_urls = []
    allowed_domains = ["fang.com"]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        # Re-seed a new batch of list pages every time the engine goes idle.
        spider.crawler.signals.connect(spider.spider_idle,
                                       signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        for req in self.start_requests():
            self.crawler.engine.crawl(req, spider=self)
        raise DontCloseSpider

    def start_requests(self):
        url_format = 'http://esf.fang.com/house/'
        try:
            for page_index in range(1, 31):
                url = url_format + 'i3%s/' % page_index
                yield Request(url=url, method='GET', callback=self.parse,
                              dont_filter=True,
                              meta={
                                  'submit_time': str(
                                      time.strftime("%Y-%m-%d %H:%M:%S",
                                                    time.localtime())),
                                  'page_index': page_index,
                                  'batch_id': self.batch_id
                              })
            self.batch_id += 1
        except Exception as e:
            print Exception, ":", e

    def _item_init(self, item):
        item['fang_id'] = ''
        item['batch_id'] = ''
        item['submit_time'] = ''
        item['schedule_time'] = ''
        item['received_time'] = ''
        item['page_index'] = ''
        item['rank'] = ''
        item['update_tag'] = ''
        item['update_time'] = ''
        item['server_time'] = ''
        return item

    def parse(self, response):
        item = AnjukeItem()
        item = self._item_init(item)
        sel = Selector(response)
        item['batch_id'] = batch_id = response.meta['batch_id']
        item['submit_time'] = submit_time = response.meta['submit_time']
        # schedule_time / received_time are expected to be stamped into meta by
        # a downloader middleware that is not part of this excerpt.
        item['schedule_time'] = schedule_time = str(
            time.strftime("%Y-%m-%d %H:%M:%S",
                          time.localtime(response.meta['schedule_time'])))
        item['received_time'] = received_time = str(
            time.strftime("%Y-%m-%d %H:%M:%S",
                          time.localtime(response.meta['received_time'])))
        item['page_index'] = page_index = response.meta['page_index']
        # Convert the GMT Date header to local (UTC+8) epoch seconds.
        server_time = time.mktime(
            time.strptime(response.headers['Date'],
                          "%a, %d %b %Y %H:%M:%S %Z")) + 8 * 3600
        item['server_time'] = str(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(server_time)))
        try:
            # fang.com sometimes returns a "sorry" placeholder page; retry it
            # up to three times before giving up.
            if sel.xpath('//div[@class="list sorry_word"]') != []:
                if 'retry_count' in response.meta:
                    retry_count = int(response.meta['retry_count'])
                else:
                    retry_count = 0
                if retry_count <= 2:
                    print "retry......"
                    yield Request(url=response.url, method='GET',
                                  callback=self.parse,
                                  meta={
                                      'submit_time': submit_time,
                                      'schedule_time': schedule_time,
                                      'received_time': received_time,
                                      'retry_count': retry_count + 1,
                                      'page_index': page_index,
                                      'batch_id': batch_id
                                  })
                else:
                    return
            if len(sel.xpath(
                    '//div[@class="houseList"]/dl[@class="list rel"]')) > 30:
                # More than 30 entries: the extra entry at the top is skipped
                # (likely a promoted listing), so ranks count from dl_index.
                dl_list = sel.xpath(
                    '//div[@class="houseList"]/dl[@class="list rel"]')
                for dl_index in range(1, len(dl_list)):
                    try:
                        item['fang_id'] = fang_id = (re.search(
                            r'\d_\d+',
                            dl_list[dl_index].xpath(
                                './dd[@class="info rel floatr"]/p[@class="title"]/a/@href'
                            ).extract()[0])).group(0)
                        item['rank'] = rank = 30 * (page_index - 1) + dl_index
                        item['update_tag'] = update_tag = dl_list[dl_index].xpath(
                            './dd[@class="info rel floatr"]/p[@class="gray6 mt10"]/span[@class="ml10 gray9"]/text()'
                        ).extract()[0]
                        # Translate the relative "N单位前更新" tag into an absolute
                        # timestamp against the server clock.
                        if re.match(r'\d+秒前更新'.decode("utf-8"), update_tag):
                            deviation = int(
                                update_tag.replace('秒前更新'.decode("utf-8"), ''))
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                        if re.match(r'\d+分钟前更新'.decode("utf-8"), update_tag):
                            deviation = int(
                                update_tag.replace('分钟前更新'.decode("utf-8"),
                                                   '')) * 60
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                        if re.match(r'\d+小时前更新'.decode("utf-8"), update_tag):
                            deviation = int(
                                update_tag.replace('小时前更新'.decode("utf-8"),
                                                   '')) * 3600
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                        if re.match(r'\d+天前更新'.decode("utf-8"), update_tag):
                            deviation = int(
                                update_tag.replace('天前更新'.decode("utf-8"),
                                                   '')) * 3600 * 24
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                    except Exception as e:
                        print Exception, ":", e
                    yield item
            else:
                # 30 or fewer entries: no extra slot, ranks start at dl_index + 1.
                dl_list = sel.xpath(
                    '//div[@class="houseList"]/dl[@class="list rel"]')
                for dl_index in range(0, len(dl_list)):
                    try:
                        item['fang_id'] = fang_id = (re.search(
                            r'\d_\d+',
                            dl_list[dl_index].xpath(
                                './dd[@class="info rel floatr"]/p[@class="title"]/a/@href'
                            ).extract()[0])).group(0)
                        item['rank'] = rank = 30 * (page_index - 1) + dl_index + 1
                        item['update_tag'] = update_tag = dl_list[dl_index].xpath(
                            './dd[@class="info rel floatr"]/p[@class="gray6 mt10"]/span[@class="ml10 gray9"]/text()'
                        ).extract()[0]
                        if re.match(r'\d+秒前更新'.decode("utf-8"), update_tag):
                            deviation = int(
                                update_tag.replace('秒前更新'.decode("utf-8"), ''))
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                        if re.match(r'\d+分钟前更新'.decode("utf-8"), update_tag):
                            deviation = int(
                                update_tag.replace('分钟前更新'.decode("utf-8"),
                                                   '')) * 60
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                        if re.match(r'\d+小时前更新'.decode("utf-8"), update_tag):
                            deviation = int(
                                update_tag.replace('小时前更新'.decode("utf-8"),
                                                   '')) * 3600
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                        if re.match(r'\d+天前更新'.decode("utf-8"), update_tag):
                            deviation = int(
                                update_tag.replace('天前更新'.decode("utf-8"),
                                                   '')) * 3600 * 24
                            item['update_time'] = update_time = str(
                                time.strftime(
                                    "%Y-%m-%d %H:%M:%S",
                                    time.localtime(server_time - deviation)))
                    except Exception as e:
                        print Exception, ":", e
                    yield item
        except Exception as e:
            print Exception, ":", e
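# -----------------------------------------------------------------------------
# The four update-tag branches above are repeated verbatim in both rank loops.
# A sketch of a helper that would collapse them; the unit table mirrors the
# tags the spider matches, and the helper itself is not part of the repo.
#
# import re
#
# _UNITS = [(u'秒前更新', 1), (u'分钟前更新', 60),
#           (u'小时前更新', 3600), (u'天前更新', 86400)]
#
# def update_tag_to_seconds(update_tag):
#     """Return how many seconds ago the listing was updated, or None."""
#     for suffix, factor in _UNITS:
#         if re.match(u'\\d+' + suffix, update_tag):
#             return int(update_tag.replace(suffix, '')) * factor
#     return None
# -----------------------------------------------------------------------------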