def get_city(self, response):
    if not response.m_response:
        logger.error(response.request.url)
        yield response.request
        return
    if '<script>window.location.href=' in response.m_response.content:
        # Anti-crawler redirect page: log it and retry the request.
        logger.error(response.m_response.content + "\n" + response.request.url)
        yield response.request
        return
    if response.m_response.content == "":
        # No city list for this province: search the whole province directly.
        request = Request(
            url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                + response.request.meta["province_id"] + "&",
            callback="get_all_page", priority=1)
        request.meta["city_name"] = ""
        request.meta["city_id"] = ""
        request.meta["province_name"] = response.request.meta["province_name"]
        request.meta["province_id"] = response.request.meta["province_id"]
        yield request
    else:
        soup = bs(response.m_response.content, "lxml")
        city_list = soup.select("a")
        for city in city_list:
            city_name = city.string.strip()
            city_id = city["data-value"].strip()
            request = Request(
                url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                    + response.request.meta["province_id"] + "&city=" + city_id + "&",
                callback="get_all_page", priority=1)
            request.meta["city_name"] = city_name
            request.meta["city_id"] = city_id
            request.meta["province_name"] = response.request.meta["province_name"]
            request.meta["province_id"] = response.request.meta["province_id"]
            yield request
def get_content(self, response):
    if not response.m_response:
        logger.error(response.request.url)
        yield response.request
        return
    if '<script>window.location.href=' in response.m_response.content:
        # Anti-crawler redirect page: log it and retry the request.
        logger.error(response.m_response.content + "\n" + response.request.url)
        yield response.request
        return
    soup = bs(response.m_response.content, "lxml")
    content_list = soup.select("table.m_srchList tbody tr")
    for content in content_list:
        try:
            result_item = dict()
            result_item["province"] = response.request.meta["province_name"]
            result_item["city"] = response.request.meta["city_name"]
            # The second cell holds company name / legal person / phone / address, one per line.
            lines = content.select("td")[1].text.split('\n')
            result_item["company_name"] = lines[0].strip()
            result_item["company_man"] = lines[1].strip().replace("企业法人:", "")
            result_item["company_telephone"] = lines[2].strip().replace("联系方式:", "")
            result_item["company_address"] = lines[3].strip()
            if "地址:" in result_item["company_address"]:
                result_item["company_address"] = result_item["company_address"].replace("地址:", "")
            else:
                result_item["company_address"] = ""
            result_item["company_registered_capital"] = content.select("td")[2].text.strip()
            result_item["company_registered_time"] = content.select("td")[3].text.strip()
            result_item["company_status"] = content.select("td")[4].text.strip()
            result_item["source"] = "企查查"
            result_item["update_time"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            yield result_item
        except Exception:
            print traceback.format_exc()
def process_page_2(self, response):
    soup = bs(response.m_response.content, 'lxml')
    detail_list = soup.select('div.houseList dl')
    for detail in detail_list:
        estate = detail.select('p.mt15 span.spName')[0].text
        detail_str = detail.select('p.mt10')[0].text
        temp_list = detail.select('p.mt10')[0].text.split('/')
        temp_list = [temp.strip() for temp in temp_list]
        # The description line has four layouts, depending on whether the type
        # contains "购物中心/百货" (which adds a '/' of its own) and whether a
        # floor count ("层") is present.
        if '购物中心/百货' not in detail_str and '层' in detail_str:
            m_type = temp_list[0].replace('类型:', '')
            floor = temp_list[1]
            total_floor = temp_list[2].replace('层', '')
        elif '购物中心/百货' not in detail_str and '层' not in detail_str:
            m_type = temp_list[0].strip().replace('类型:', '')
            floor = '未知'
            total_floor = '未知'
        elif '购物中心/百货' in detail_str and '层' not in detail_str:
            m_type = temp_list[0].replace('类型:', '') + temp_list[1]
            floor = '未知'
            total_floor = '未知'
        elif '购物中心/百货' in detail_str and '层' in detail_str:
            m_type = temp_list[0].replace('类型:', '') + temp_list[1]
            floor = temp_list[2]
            total_floor = temp_list[3].replace('层', '')
        else:
            # Unknown layout: log it and skip this entry instead of using undefined fields.
            logger.error('unexpected detail_str: ' + detail_str.strip())
            continue
        area = detail.select('div.area')[0].text.replace('㎡', '').replace('建筑面积', '')
        total_price = detail.select('div.moreInfo p.mt5 span.price')[0].text
        crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        item = dict()
        item['estate'] = estate
        item['floor'] = floor
        item['total_floor'] = total_floor
        item['type'] = m_type
        item['area'] = area
        item['total_price'] = total_price
        item['crawl_date'] = crawl_date
        item['city'] = response.request.meta['city']
        item['district'] = response.request.meta['district']
        item['url'] = response.request.url
        yield item
    next_page = soup.select('a#PageControl1_hlk_next')
    if len(next_page) > 0:
        url = response.nice_join(next_page[0]['href']) + '/'
        request = Request(url=url, priority=2, callback=self.process_page_2)
        request.meta['city'] = response.request.meta['city']
        request.meta['district'] = response.request.meta['district']
        yield request
def process_item(self, item):
    try:
        # Append one GBK-encoded CSV row per item.
        with codecs.open("bendibao.csv", 'a', 'gbk') as f:
            f.write(item["city_name"] + ',' + item["category1_name"] + ',' +
                    item["category2_name"] + ',' + item["result_name"] + ',' +
                    item["result_mobile"] + "\n")
    except Exception:
        logger.error(traceback.format_exc())
def process_item(self, item):
    try:
        # Append one GBK-encoded CSV row per item; ASCII commas in the estate name
        # are mapped to full-width commas so they do not break the comma-delimited layout.
        with codecs.open("fang.csv", 'a', 'gbk') as f:
            f.write(item["province"] + ',' + item["city"] + ',' + item["district"] + ',' +
                    item["avg_price"] + ',' + item["estate"].replace(',', '，') + ',' +
                    item["area"] + ',' + item["layout"] + ',' + item["total_price"] + ',' +
                    item["crawl_date"] + ',' + item["url"] + "\n")
    except Exception:
        logger.error(traceback.format_exc())
def get_all_page(self, response):
    if not response.m_response:
        logger.error(response.request.url)
        yield response.request
        return
    if '<script>window.location.href=' in response.m_response.content:
        # Anti-crawler redirect page: log it and retry the request.
        logger.error(response.m_response.content + "\n" + response.request.url)
        yield response.request
    else:
        soup = bs(response.m_response.content, "lxml")
        try:
            # The last page number sits next to the '>' (next) link in the pager,
            # either in the following or the preceding sibling node.
            temp_page = soup.find(lambda tag: tag.name == 'a' and '>' == tag.text).parent.findNextSibling()
            if temp_page:
                page = temp_page.select_one("a")
                if page:
                    total_page = int(page.string.strip().replace("...", ""))
                else:
                    total_page = 1
            else:
                temp_page = soup.find(lambda tag: tag.name == 'a' and '>' == tag.text).parent.findPreviousSibling()
                if temp_page:
                    page = temp_page.select_one("a")
                    if page:
                        total_page = int(page.string.strip().replace("...", ""))
                    else:
                        total_page = 1
                else:
                    total_page = 1
        except Exception:
            total_page = 1
        # Emit one listing request per result page.
        now_page = 1
        while now_page <= total_page:
            if response.request.meta["city_id"] == "":
                request = Request(
                    url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province="
                        + response.request.meta["province_id"] + "&p=" + str(now_page) + "&",
                    callback="get_content", priority=2)
            else:
                request = Request(
                    url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&p="
                        + str(now_page) + "&province=" + response.request.meta["province_id"]
                        + "&city=" + response.request.meta["city_id"] + "&",
                    callback="get_content", priority=2)
            request.meta["city_name"] = response.request.meta["city_name"]
            request.meta["city_id"] = response.request.meta["city_id"]
            request.meta["province_name"] = response.request.meta["province_name"]
            request.meta["province_id"] = response.request.meta["province_id"]
            yield request
            now_page += 1
def process_item(self, item):
    try:
        # Append one GBK-encoded CSV row per item; U+30FB (katakana middle dot) is not
        # representable in GBK, so it is mapped to '·' before writing.
        with codecs.open("result.csv", 'a', 'gbk') as f:
            f.write(item["province"] + ',' + item["city"] + ',' +
                    item["brand"].replace(u'\u30fb', '·') + ',' +
                    item["cars_line"].replace(u'\u30fb', '·') + ',' +
                    item["car"].replace(u'\u30fb', '·') + ',' +
                    item["mileage"] + ',' + item["first_borad_date"] + ',' +
                    item["gear"] + ',' + item["displacement"] + ',' +
                    item["price"] + ',' + item["crawl_date"] + "\n")
    except Exception:
        logger.error(traceback.format_exc())
def process(self, response):
    if not response.m_response:
        logger.error(response.request.url)
        yield response.request
        return
    if '<script>window.location.href=' in response.m_response.content:
        # Anti-crawler redirect page: log it and retry the request.
        logger.error(response.m_response.content + "\n" + response.request.url)
        yield response.request
        return
    soup = bs(response.m_response.content, "lxml")
    province_list = soup.select_one("dl#provinceOld").select("div.pull-left")[1].select("dd a")
    for province in province_list:
        province_name = province.string.strip()
        province_id = province["data-value"].strip()
        request = Request(
            url="http://www.qichacha.com/search_getCityListHtml?province=" + province_id + "&q_type=1",
            callback="get_city", priority=0)
        request.meta["province_name"] = province_name
        request.meta["province_id"] = province_id
        yield request
def wrapper(self, response):
    if not response.m_response:
        if response.m_response is None:
            logger.error('response.m_response is None and url : ' + response.request.url +
                         ' and request has been pushed to queue again!')
        else:
            logger.error('response.m_response is failed 【' + str(response.m_response.status_code) + '】 and url : ' +
                         response.request.url + ' content:' + response.m_response.content +
                         ' and request has been pushed to queue again!')
        yield response.request
    else:
        process = func(self, response)
        if process is not None:
            try:
                # Time the wrapped callback while draining its generator.
                start = time.clock()
                for callback in process:
                    yield callback
                logger.info(func.__name__ + ' run time: ' + '{:.9f}'.format(time.clock() - start))
            except Exception:
                logger.error('process error: ' + response.request.url + '\r\n' +
                             response.m_response.content + '\r\n' + traceback.format_exc())
def wrapper(self, response):
    if not response.m_response:
        response.request.meta['retry'] += 1
        # Retry a failed request at most 3 times.
        if response.request.meta['retry'] < 4:
            retry_str = '\nrequest has been pushed to queue again!'
            yield response.request
        else:
            retry_str = '\nrequest has reached the max retry count and will not be pushed again!'
        if response.m_response is None:
            logger.error('response.m_response is None' + '\nURL : ' + response.request.url + retry_str)
        else:
            # Dump the returned payload to a file so it can be inspected later.
            log_name = 'log/' + str(uuid.uuid1()) + '_log.txt'
            with open(log_name, 'wb') as f:
                f.write(response.m_response.content)
            logger.error('response.m_response is failed 【' + str(response.m_response.status_code) + '】' +
                         '\nURL : ' + response.request.url + '\nresponse: ' + log_name + retry_str)
    else:
        try:
            process = func(self, response)
            if isinstance(process, types.GeneratorType):
                for callback in process:
                    yield callback
        except Exception:
            # Dump the returned payload to a file so it can be inspected later.
            log_name = 'log/' + str(uuid.uuid1()) + '_log.txt'
            with open(log_name, 'wb') as f:
                f.write(response.m_response.content)
            logger.error('process error: ' + response.request.url + '\nresponse: ' + log_name +
                         '\n' + traceback.format_exc())
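# A minimal sketch of the decorator that produces wrappers like the ones above and
# below: `func` is the wrapped spider callback captured by the closure. The name
# `check_response` and the simplified body are assumptions for illustration only,
# not the framework's actual implementation.
import types


def check_response(func):
    def wrapper(self, response):
        if not response.m_response:
            # Download failed: hand the request back so the scheduler can retry it.
            yield response.request
        else:
            result = func(self, response)
            if isinstance(result, types.GeneratorType):
                for item in result:
                    yield item
    return wrapper


# Usage: a spider callback is declared with the decorator on top, e.g.
#     @check_response
#     def process(self, response): ...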
def wrapper(self, response):
    if not response.m_response:
        if response.m_response is None:
            logger.error('response.m_response is None and url : ' + response.request.url +
                         ' and request has been pushed to queue again!')
        else:
            logger.error('response.m_response is failed 【' + str(response.m_response.status_code) + '】 and url : ' +
                         response.request.url + ' content:' + response.m_response.content +
                         ' and request has been pushed to queue again!')
        yield response.request
    else:
        process = func(self, response)
        if process is not None:
            try:
                for callback in process:
                    yield callback
            except Exception:
                logger.error('process error: ' + response.request.url + '\r\n' +
                             response.m_response.content + '\r\n' + traceback.format_exc())
def exception_handler(request, exception):
    logger.error("%s %s" % (request.url, exception))
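# A minimal usage sketch for the handler above, assuming it is passed to
# grequests-style batch downloads (grequests.map accepts an exception_handler
# callback with this (request, exception) signature). The URLs are placeholders.
import grequests

reqs = [grequests.get(url, timeout=10) for url in ('http://example.com/a', 'http://example.com/b')]
responses = grequests.map(reqs, exception_handler=exception_handler)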
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import traceback

from sasila.system_normal.utils import logger

reload(sys)
sys.setdefaultencoding('utf-8')

try:
    l1 = list()
    # Indexing an empty list raises IndexError; the full traceback goes to the log.
    a = l1[0]
    print a
except Exception:
    logger.error(traceback.format_exc())