Example #1
0
 def get_city(self, response):
     """Parse a province's city-list page and yield follow-up requests.

     Reads ``province_name``/``province_id`` from the originating request's
     meta.  Yields one ``get_all_page`` request per city link, or a single
     province-wide request when the page body is empty.  Failed or
     bot-blocked downloads are re-queued.
     """
     if not response.m_response:
         # Download failed entirely -- requeue and stop: the checks below
         # would dereference a missing m_response.
         logger.error(response.request.url)
         yield response.request
         return
     if '<script>window.location.href=' in response.m_response.content:
         # Anti-crawler redirect page -- requeue the request and stop.
         logger.error(response.m_response.content + "\n" + response.request.url)
         yield response.request
         return
     if response.m_response.content == "":
         # No city breakdown for this province: search the whole province.
         request = Request(
                 url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province=" +
                     response.request.meta["province_id"] + "&",
                 callback="get_all_page", priority=1)
         request.meta["city_name"] = ""
         request.meta["city_id"] = ""
         request.meta["province_name"] = response.request.meta["province_name"]
         request.meta["province_id"] = response.request.meta["province_id"]
         yield request
     else:
         soup = bs(response.m_response.content, "lxml")
         city_list = soup.select("a")
         for city in city_list:
             city_name = city.string.strip()
             city_id = city["data-value"].strip()
             request = Request(
                     url="http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province=" +
                         response.request.meta["province_id"] + "&city=" + city_id + "&",
                     callback="get_all_page", priority=1)
             request.meta["city_name"] = city_name
             request.meta["city_id"] = city_id
             request.meta["province_name"] = response.request.meta["province_name"]
             request.meta["province_id"] = response.request.meta["province_id"]
             yield request
Example #2
0
 def get_content(self, response):
     """Extract company records from a qichacha search-result page.

     Yields one dict per table row, carrying the province/city context
     from the request meta.  Rows that fail to parse are logged and
     skipped; failed or bot-blocked downloads are re-queued.
     """
     if not response.m_response:
         # Download failed -- requeue and stop before touching m_response.
         logger.error(response.request.url)
         yield response.request
         return
     if '<script>window.location.href=' in response.m_response.content:
         # Anti-crawler redirect page -- requeue the request and stop.
         logger.error(response.m_response.content + "\n" + response.request.url)
         yield response.request
         return
     soup = bs(response.m_response.content, "lxml")
     content_list = soup.select("table.m_srchList tbody tr")
     for content in content_list:
         try:
             cells = content.select("td")
             # The second cell packs name / legal person / phone / address
             # on separate lines; split it once instead of re-querying.
             info_lines = cells[1].text.split('\n')
             result_item = dict()
             result_item["province"] = response.request.meta["province_name"]
             result_item["city"] = response.request.meta["city_name"]
             result_item["company_name"] = info_lines[0].strip()
             result_item["company_man"] = info_lines[1].strip().replace("企业法人:", "")
             result_item["company_telephone"] = info_lines[2].strip().replace("联系方式:", "")
             result_item["company_address"] = info_lines[3].strip()
             if "地址:" in result_item["company_address"]:
                 result_item["company_address"] = result_item["company_address"].replace("地址:", "")
             else:
                 # Fourth line is not an address -- leave the field empty.
                 result_item["company_address"] = ""
             result_item["company_registered_capital"] = cells[2].text.strip()
             result_item["company_registered_time"] = cells[3].text.strip()
             result_item["company_status"] = cells[4].text.strip()
             result_item["source"] = "企查查"
             result_item["update_time"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
             yield result_item
         except Exception:
             # Log and continue with the next row; logging (rather than
             # printing to stdout) keeps error reporting consistent with
             # the rest of the spider.
             logger.error(traceback.format_exc())
Example #3
0
    def process_page_2(self, response):
        """Parse one listing page: yield an item per entry, then follow the
        "next page" link when present.

        City/district context is read from the originating request's meta.
        """
        soup = bs(response.m_response.content, 'lxml')
        detail_list = soup.select('div.houseList dl')
        for detail in detail_list:
            estate = detail.select('p.mt15 span.spName')[0].text
            detail_str = detail.select('p.mt10')[0].text

            temp_list = detail.select('p.mt10')[0].text.split('/')
            temp_list = [temp.strip() for temp in temp_list]

            # The slash-separated detail string has four layouts, depending
            # on whether the type itself contains a slash ("购物中心/百货")
            # and whether floor information ("层") is present.
            if '购物中心/百货' not in detail_str and '层' in detail_str:
                m_type = temp_list[0].replace('类型:', '')
                floor = temp_list[1]
                total_floor = temp_list[2].replace('层', '')
            elif '购物中心/百货' not in detail_str and '层' not in detail_str:
                m_type = temp_list[0].strip().replace('类型:', '')
                floor = '未知'
                total_floor = '未知'
            elif '购物中心/百货' in detail_str and '层' not in detail_str:
                m_type = temp_list[0].replace('类型:', '') + temp_list[1]
                floor = '未知'
                total_floor = '未知'
            elif '购物中心/百货' in detail_str and '层' in detail_str:
                m_type = temp_list[0].replace('类型:', '') + temp_list[1]
                floor = temp_list[2]
                total_floor = temp_list[3].replace('层', '')
            else:
                # Unrecognized layout: log and skip this listing -- falling
                # through would raise NameError on the unset variables below.
                logger.error('unexpective detail_str: ' + detail_str.strip())
                continue

            area = detail.select('div.area')[0].text.replace('㎡', '').replace(
                '建筑面积', '')
            total_price = detail.select(
                'div.moreInfo p.mt5 span.price')[0].text
            crawl_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))

            item = dict()
            item['estate'] = estate
            item['floor'] = floor
            item['total_floor'] = total_floor
            item['type'] = m_type
            item['area'] = area
            item['total_price'] = total_price
            item['crawl_date'] = crawl_date

            item['city'] = response.request.meta['city']
            item['district'] = response.request.meta['district']
            item['url'] = response.request.url
            yield item

        next_page = soup.select('a#PageControl1_hlk_next')
        if len(next_page) > 0:
            url = response.nice_join(next_page[0]['href']) + '/'
            request = Request(url=url,
                              priority=2,
                              callback=self.process_page_2)
            request.meta['city'] = response.request.meta['city']
            request.meta['district'] = response.request.meta['district']
            yield request
Example #4
0
 def process_item(self, item):
     """Append one bendibao record as a CSV row to ``bendibao.csv``.

     The file is opened in append mode with GBK encoding; any failure
     (missing key, encoding error, I/O error) is logged and swallowed.
     """
     try:
         row = ','.join((item["city_name"], item["category1_name"],
                         item["category2_name"], item["result_name"],
                         item["result_mobile"]))
         with codecs.open("bendibao.csv", 'a', 'gbk') as out_file:
             out_file.write(row + "\n")
     except:
         logger.error(traceback.format_exc())
Example #5
0
 def process_item(self, item):
     """Append one housing record as a CSV row to ``fang.csv``.

     ASCII commas inside the estate name are swapped for fullwidth commas
     so they do not break the comma-separated layout.  Any failure is
     logged and swallowed.
     """
     try:
         row = ','.join((item["province"], item["city"], item["district"],
                         item["avg_price"], item["estate"].replace(',', '，'),
                         item["area"], item["layout"], item["total_price"],
                         item["crawl_date"], item["url"]))
         with codecs.open("fang.csv", 'a', 'gbk') as out_file:
             out_file.write(row + "\n")
     except:
         logger.error(traceback.format_exc())
Example #6
0
    def _total_pages(self, soup):
        """Best-effort read of the pagination widget; defaults to 1 page.

        The widget marks the current position with a ``>`` anchor; the
        total page count sits in the sibling right after it (or right
        before it when already on the last page).
        """
        try:
            marker = soup.find(lambda tag: tag.name == 'a' and '>' == tag.text).parent
            temp_page = marker.findNextSibling()
            if not temp_page:
                temp_page = marker.findPreviousSibling()
            if temp_page:
                page = temp_page.select_one("a")
                if page:
                    return int(page.string.strip().replace("...", ""))
            return 1
        except:
            # Any parse failure (widget missing, non-numeric text) is
            # treated as a single results page.
            return 1

    def get_all_page(self, response):
        """Fan out one ``get_content`` request per result page.

        Reads province/city identifiers from the originating request's
        meta; an empty ``city_id`` means a province-wide search.  Failed
        or bot-blocked downloads are re-queued.
        """
        if not response.m_response:
            # Download failed -- requeue and stop; the redirect check
            # below would dereference a missing m_response.
            logger.error(response.request.url)
            yield response.request
            return
        if '<script>window.location.href=' in response.m_response.content:
            # Anti-crawler redirect page -- requeue the request and stop.
            logger.error(response.m_response.content + "\n" + response.request.url)
            yield response.request
            return
        soup = bs(response.m_response.content, "lxml")
        total_page = self._total_pages(soup)
        meta = response.request.meta
        for now_page in range(1, total_page + 1):
            if meta["city_id"] == "":
                url = ("http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&province=" +
                       meta["province_id"] + "&p=" + str(now_page) + "&")
            else:
                url = ("http://www.qichacha.com/search_index?key=%25E5%25B0%258F%25E9%25A2%259D%25E8%25B4%25B7%25E6%25AC%25BE&ajaxflag=1&p=" +
                       str(now_page) + "&province=" + meta["province_id"] +
                       "&city=" + meta["city_id"] + "&")
            request = Request(url=url, callback="get_content", priority=2)
            # Propagate the location context to the next callback.
            for key in ("city_name", "city_id", "province_name", "province_id"):
                request.meta[key] = meta[key]
            yield request
Example #7
0
 def process_item(self, item):
     """Append one used-car record as a CSV row to ``result.csv``.

     The Katakana middle dot (U+30FB) in brand/line/car names is replaced
     with the GBK-encodable '·'.  Any failure is logged and swallowed.
     """
     try:
         brand, cars_line, car = (item[key].replace(u'\u30fb', '·')
                                  for key in ("brand", "cars_line", "car"))
         row = ','.join((item["province"], item["city"], brand, cars_line,
                         car, item["mileage"], item["first_borad_date"],
                         item["gear"], item["displacement"], item["price"],
                         item["crawl_date"]))
         with codecs.open("result.csv", 'a', 'gbk') as out_file:
             out_file.write(row + "\n")
     except:
         logger.error(traceback.format_exc())
Example #8
0
 def process(self, response):
     """Entry point: parse the province list and yield one ``get_city``
     request per province.

     Failed or bot-blocked downloads are re-queued.
     """
     if not response.m_response:
         # Download failed -- requeue and stop; the redirect check below
         # would dereference a missing m_response.
         logger.error(response.request.url)
         yield response.request
         return
     if '<script>window.location.href=' in response.m_response.content:
         # Anti-crawler redirect page -- requeue the request and stop.
         logger.error(response.m_response.content + "\n" + response.request.url)
         yield response.request
         return
     soup = bs(response.m_response.content, "lxml")
     province_list = soup.select_one("dl#provinceOld").select("div.pull-left")[1].select("dd a")
     for province in province_list:
         province_name = province.string.strip()
         province_id = province["data-value"].strip()
         request = Request(
                 url="http://www.qichacha.com/search_getCityListHtml?province=" + province_id + "&q_type=1",
                 callback="get_city", priority=0)
         request.meta["province_name"] = province_name
         request.meta["province_id"] = province_id
         yield request
Example #9
0
 def wrapper(self, response):
     """Guard a spider callback: re-queue failed downloads, otherwise
     drive the wrapped generator and log its run time.

     Exceptions raised while consuming the callback are logged together
     with the response body.
     """
     if response.m_response:
         process = func(self, response)
         if process is not None:
             try:
                 start = time.clock()
                 for produced in process:
                     yield produced
                 logger.info(func.__name__ + ' run time: ' +
                             '{:.9f}'.format(time.clock() - start))
             except Exception:
                 logger.error('process error: ' + response.request.url +
                              '\r\n' + response.m_response.content +
                              '\r\n' + traceback.format_exc())
         return
     # Download failed: log why, then push the request back on the queue.
     if response.m_response is None:
         logger.error('response.m_response is None and url : ' +
                      response.request.url +
                      ' and request has been push to queue again!')
     else:
         logger.error('response.m_response is failed 【' +
                      str(response.m_response.status_code) +
                      '】 and url : ' + response.request.url +
                      ' content:' + response.m_response.content +
                      ' and request has been push to queue again!')
     yield response.request
Example #10
0
    def wrapper(self, response):
        """Guard a spider callback with bounded retries and body dumps.

        On download failure, increments the request's ``retry`` counter and
        re-queues the request up to 3 times, writing the (possibly partial)
        response body to a uniquely named file under ``log/`` for later
        inspection.  On success, drives the wrapped callback when it is a
        generator and logs any exception it raises, again with a body dump.

        NOTE(review): assumes ``response.request.meta['retry']`` was
        initialised by the caller -- confirm, otherwise this raises
        KeyError.  Also assumes the ``log/`` directory exists.
        """
        if not response.m_response:
            response.request.meta['retry'] += 1
            # Retry at most 3 times.
            if response.request.meta['retry'] < 4:
                retry_str = '\nrequest has been push to queue again!'
                yield response.request
            else:
                retry_str = '\nrequest has been try max times! will not push again!'

            if response.m_response is None:
                logger.error('response.m_response is None'
                             + '\nURL : ' + response.request.url
                             + retry_str)
            else:
                # Dump the returned body to a per-failure log file.
                log_name = 'log/' + str(uuid.uuid1()) + '_log.txt'
                with open(log_name, 'wb') as f:
                    f.write(response.m_response.content)

                logger.error('response.m_response is failed 【' + str(response.m_response.status_code) + '】'
                             + '\nURL : ' + response.request.url
                             + '\nresponse: ' + log_name
                             + retry_str)
        else:
            try:
                process = func(self, response)
                if isinstance(process, types.GeneratorType):
                    for callback in process:
                        yield callback
            except Exception:
                # Dump the returned body to a per-failure log file.
                log_name = 'log/' + str(uuid.uuid1()) + '_log.txt'
                with open(log_name, 'wb') as f:
                    f.write(response.m_response.content)

                logger.error('process error: ' + response.request.url
                             + '\nresponse: ' + log_name
                             + '\n' + traceback.format_exc())
Example #11
0
 def wrapper(self, response):
     """Guard a spider callback: re-queue failed downloads, otherwise
     drive the wrapped generator, logging any exception it raises along
     with the response body.
     """
     if response.m_response:
         process = func(self, response)
         if process is not None:
             try:
                 for produced in process:
                     yield produced
             except Exception:
                 logger.error('process error: ' + response.request.url +
                              '\r\n' + response.m_response.content +
                              '\r\n' + traceback.format_exc())
         return
     # Download failed: log why, then push the request back on the queue.
     if response.m_response is None:
         logger.error('response.m_response is None and url : ' +
                      response.request.url +
                      ' and request has been push to queue again!')
     else:
         logger.error('response.m_response is failed 【' +
                      str(response.m_response.status_code) +
                      '】 and url : ' + response.request.url +
                      ' content:' + response.m_response.content +
                      ' and request has been push to queue again!')
     yield response.request
Example #12
0
def exception_handler(request, exception):
    """Log a failed request's URL together with the exception it raised."""
    logger.error('{0} {1}'.format(request.url, exception))
Example #13
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import traceback
from sasila.system_normal.utils import logger

# Python 2 only: force the process-wide default encoding to UTF-8 so the
# crawler's Chinese strings survive implicit str/unicode conversions.
# (``reload``/``setdefaultencoding`` do not exist on Python 3.)
reload(sys)
sys.setdefaultencoding('utf-8')

# Demo: indexing the empty list raises IndexError, which is caught below
# and its full traceback routed through the project logger.
try:
    l1 = list()
    a = l1[0]
    print a

except Exception:
    logger.error(traceback.format_exc())