Example #1
    def __init__(self, user):
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""

        self.title = user.get("title", "")
        # print chardet.detect(self.title)
        self.title = urllib.quote(self.title.encode('utf8'))

        self.project_district = user.get("project_district")
        self.project_developer_name_value = user.get("project", "")
        self.date_filter_min = user.get("date_filter_min", "")
        self.date_filter_max = user.get("date_filter_max", "")
        # print self.title,self.project_developer_name_value,self.project_district,self.date_filter_max,self.date_filter_min

        # self.title = urllib.quote(self.title.decode(sys.stdin.encoding).encode('utf-8'))
        # self.project_district = urllib.quote(self.project_district.decode(sys.stdin.encoding).encode('utf8'))
        # self.project_developer_name_value = urllib.quote(self.project_developer_name_value.decode(sys.stdin.encoding).encode('utf8'))
        # self.date_filter_min = urllib.quote(self.date_filter_min.decode(sys.stdin.encoding).encode('utf8'))
        # self.date_filter_max = urllib.quote(self.date_filter_max.decode(sys.stdin.encoding).encode('utf8'))
        #
        self.token = user.get("token", "")
        self.userid = user.get("userid", "")


        self.LoginUrl = "https://newhouse.cnnbfdc.com"


        self.result = user.get("result", "")
        self.GJJInfo = []
        self.bild = []
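
A note on the encoding step above: urllib.quote operates on byte strings, so the unicode title has to be UTF-8 encoded first. A minimal self-contained sketch (Python 2; the title value is made up):

import urllib

title = u"宁波项目"  # hypothetical unicode input
print(urllib.quote(title.encode('utf8')))  # %E5%AE%81%E6%B3%A2%E9%A1%B9%E7%9B%AE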
Example #2
    def __init__(self, user):
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""

        self.fpjy = user.get("fpjy", "")
        self.fpdm = user.get("fpdm", "")
        self.fphm = user.get("fphm", "")
        self.kprq = user.get("kprq", "")
        self.fpje = user.get("fpje", "")
        self.token = user.get("token", "")
        self.userid = user.get("userid", "")
        self.fpdm_area = self.fpdm[0:4]
        self.fpdm_url = AREA.get(self.fpdm_area, "")
        self.suiji = str(int(round(time.time() * 1000)))

        self.codeUrl = (
            self.fpdm_url +
            '/WebQuery/yzmQuery?callback=jQuery110204713398352365614_' + self.suiji +
            '&fpdm=' + self.fpdm +
            '&r=' + str('%.16f' % random.random()) +
            '&v=V1.0.04_001' +
            '&nowtime=' + str(int(round(time.time() * 1000))) +
            '&publickey=B8EE27C2CFEABABBD1DB92F4D84E4EA3' +
            '&_=' + str(int(round(time.time() * 1000))))

        self.result = user.get("result", "")
        self.GJJInfo = []
        self.PerInfo = {}
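
The repeated str(int(round(time.time() * 1000))) is a millisecond timestamp used both as a cache-buster (&nowtime=, &_=) and as part of the JSONP callback name. A small helper would remove the duplication; this is only a sketch, and current_millis is a made-up name:

import time

def current_millis():
    """Milliseconds since the epoch, as a string."""
    return str(int(round(time.time() * 1000)))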
Example #3
    def __init__(self, user):
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""

        self.username = urllib.quote(user.get("name", ""))
        self.idcard = urllib.quote(user.get("idcard", ""))
        self.area = urllib.quote(user.get("area", ""))
        self.token = user.get("token", "")
        self.userid = user.get("userid", "")
        # urllib.quote(a)

        self.LoginUrl = "https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?" + "resource_id=6899&query=%E5%A4%B1%E4%BF%A1%E8%A2%AB%E6%89%A7%E8%A1%8C%E4%BA%BA%E5%90%8D%E5%8D%95&cardNum=" + self.idcard + "&iname=" + self.username + "&areaName=" + self.area + "&ie=utf-8&oe=utf-8&format=json&t=" + str(
            int(round(time.time() *
                      1000))) + "&cb=jQuery110207690611877233657_" + str(
                          int(round(time.time() * 1000))) + "&_=" + str(
                              int(round(time.time() * 1000)))

        self.result = user.get("result", "")
        self.GJJInfo = []
        self.PerInfo = {}
        self.PayRecord = {}
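
This endpoint, like the invoice endpoint above, answers with JSONP: a JSON payload wrapped in the jQuery... callback named in the URL. A hedged sketch of unwrapping such a response (strip_jsonp is a hypothetical helper, not part of the codebase shown):

import json
import re

def strip_jsonp(text):
    """Extract and parse the JSON body from a callback(...) wrapper."""
    match = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(match.group(1)) if match else None

print(strip_jsonp('jQuery110207690611877233657_1({"status": 0})'))  # {'status': 0}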
Example #4
    def __init__(self):
        """
        Initialize.
        """
        # number of threads
        self.thread_num = THREAD_NUM
        # queue size: 1
        self.thread_q_size = THREAD_Q_SIZE
        # Redis helper class
        self.redisUtils = RedisUtils()
Example #5
    def __init__(self, user):
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""

        self.keyword = user.get("keyword", "")
        # self.gjjaccnum = self.username if len(self.username) <= 15 else ""
        # self.pwd = user.get("password", "")
        self.age = FANGAGE.get(user.get('age', ''), '') or FANGAGE.get(user.get('year', ''), '')
        self.token = user.get("token", "")
        self.flower = LOUCENG.get(user.get('flower', ''), '') or LOUCENG.get(user.get('floor', ''), '')
        self.hu_type = HUXING.get(user.get('hu_type', ''), '') or HUXING.get(user.get('housetype', ''), '')
        # self.userid = user.get("userid", "")

        self.startUrl = "http://esf.nb.fang.com/NewSecond/sale_info/searchlist_new2014.aspx"
        self.hostUrl = "http://esf.nb.fang.com/"
        self.result = user.get("result", "")
        self.GJJInfo = []
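
The FANGAGE.get(...) or FANGAGE.get(...) pattern above lets two alternate request keys ('age' or 'year') feed one lookup table, falling back to the second key whenever the first resolves to an empty value. Illustrated with a made-up mapping:

FANGAGE = {"5": "a15", "10": "a510"}  # hypothetical table
user = {"year": "10"}
age = FANGAGE.get(user.get('age', ''), '') or FANGAGE.get(user.get('year', ''), '')
print(age)  # a510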
Example #6
def threadWork(t):
    """
    进程工作,用来启动多个线程
    :param x:
    :return:
    """
    redisUtils = RedisUtils()
    thread_pool = ThreadPool(THREAD_NUM * 10, q_size=THREAD_Q_SIZE)
    dict_json = redisUtils.getCons()
    logger.debug("Now have tasks -> " + str(dict_json))
    if dict_json:
        try:
            for i in dict_json:
                spider_name = re.findall('spider_(.*):task', i)[0]
                # for i in [i.split(":")[0] for i in dict_json]:
                work_requests = makeRequests(imptask, [spider_name])
                thread_pool.putRequest(work_requests[0])
            thread_pool.wait()
        except Exception as e:
            logger.error(e)
    else:
        time.sleep(1)
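
The task keys in Redis are assumed to look like spider_<name>:task, which is what re.findall('spider_(.*):task', i) relies on. A toy illustration with made-up keys:

import re

keys = ["spider_gsxt:task", "spider_unicom:task"]
names = [re.findall('spider_(.*):task', k)[0] for k in keys]
print(names)  # ['gsxt', 'unicom']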
Example #7
    def __init__(self, dict_json, key=[], verifycode_type='png'):
        self.redisUtils = RedisUtils()
        self.damatuWeb = damatuWeb
        self.PROXYADDR = PROXYADDR
        self.dict_json = dict_json
        self.token = self.dict_json['token']
        self.verifycode_type = verifycode_type
        self.status = None
        self.desc = None

        self.current_milli_time = lambda: str(int(round(time.time() * 1000)))
        self.startTime = self.current_milli_time()

        self.realpath = os.path.split(os.path.realpath(__file__))[0]
        filename = 'verifycode/%s_verifycode.%s' % (self.startTime,
                                                    verifycode_type)
        self.code_path = os.path.join(self.realpath, filename)

        logging.config.fileConfig('unicom/logging.config')
        self.logger = logging.getLogger('flow')

        # Whether to keep the user directory: by default it is saved only
        # when the crawler fails, but the flag can also be toggled manually.
        self.rmuserdirFlag = False
        self.mkUserdir(key)
Example #8
class SpiderClient():
    def __init__(self):
        """
        初始化
        :param spider:
        :return:
        """

        # 线程数
        self.thread_num = THREAD_NUM
        # 队列数:1
        self.thread_q_size = THREAD_Q_SIZE
        # redis操作类
        self.redisUtils = RedisUtils()

    def progressWork(self):
        """
        Process worker: starts the thread pool workers.
        """
        thread_pool = ThreadPool(self.thread_num, q_size=self.thread_q_size)
        for i in range(self.thread_num):
            dict_t = {}
            work_requests = makeRequests(self.threadWork, [dict_t])
            thread_pool.putRequest(work_requests[0])
        thread_pool.wait()

    def threadWork(self, t):
        """
        Thread worker: fetches tasks from the task queue.
        :param t:
        :return:
        """
        startTime = datetime.now()
        logger.info('Waiting to fetch tasks')
        while True:
            try:
                # fetch task data from the task queue
                dict_json = self.redisUtils.getCon(SOURCE)
                if dict_json is not None:
                    dict_json = decryptKwargs(dict_json)
                    name = dict_json['token']
                    # put the token into the thread
                    logger.info("Got task: %s" % name)
                    # sendMail(u"邮件log测试",'*****@*****.**')
                    dict_json.update({"result": []})
                    self.taskWork(dict_json)
                    break
                else:
                    finishTime = datetime.now()
                    # abs() returns the absolute value of a number
                    if abs(finishTime.minute - startTime.minute) >= WAITTING:
                        break
                    time.sleep(1)
            except Exception:
                s = traceback.format_exc()
                logger.error(s)

    @Time()
    def taskWork(self, dict_json):
        """
        具体任务的工作方法,主要调用爬虫完成数据爬取
        :param dict_json:
        :return:
        """
        # token = dict_json['token']
        try:
            client = SpiderMain(dict_json)
            f = client.crawl()
            logger.info('Task finished: %s' % dict_json['token'])
            # On success the saved sample is deleted; to keep it, return True
            # from the spider.
            if not f:
                client.rmUserdir(client.userdir)

            # logger.info("不需要抓取图片验证码,token:%s" % token)
            # p1 = threading.Thread(target=client.crawl, args=("user",))
            # p2 = threading.Thread(target=client.crawl)
            # p1.start()
            # p2.start()

        except Exception:
            self.redisUtils.setNotify(token=dict_json['token'],
                                      val='2',
                                      decs='Crawler crawl failed')
            s = traceback.format_exc()
            logger.error(s)
            logger.info('Task finished: %s' % dict_json['token'])
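
One caveat in threadWork: abs(finishTime.minute - startTime.minute) compares only the minute field, so it misbehaves across an hour boundary (10:59 -> 11:01 gives abs(1 - 59) = 58). A sketch of an equivalent wait that survives rollovers, assuming WAITTING is a number of minutes:

from datetime import datetime
import time

WAITTING = 5  # minutes, standing in for the configured constant
startTime = datetime.now()
while True:
    # ... poll the task queue here ...
    if (datetime.now() - startTime).total_seconds() >= WAITTING * 60:
        break
    time.sleep(1)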
Example #9
    def __init__(self, user):
        self.url_area = {
            "北京": "http://bj.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml",
            "天津": "http://tj.gsxt.gov.cn/index.html",
            "河北": "http://he.gsxt.gov.cn/notice/",
            "山西": "http://sx.gsxt.gov.cn/index.jspx",
            "内蒙古": "http://nm.gsxt.gov.cn:58888/",
            "辽宁": "http://ln.gsxt.gov.cn/saicpub/",
            "吉林": "http://jl.gsxt.gov.cn/",
            "黑龙江": "http://hl.gsxt.gov.cn/index.jspx",
            "上海": "http://sh.gsxt.gov.cn/notice",
            "江苏": "http://www.jsgsj.gov.cn:58888/province/",
            "浙江": "http://zj.gsxt.gov.cn/client/entsearch/toEntSearch",
            "安徽": "http://ah.gsxt.gov.cn/index.jspx",
            "福建": "http://fj.gsxt.gov.cn/notice",
            "江西": "http://jx.gsxt.gov.cn/",
            "山东": "http://sd.gsxt.gov.cn/",
            "广东": "http://gd.gsxt.gov.cn/",
            "广西": "http://gx.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml",
            "海南": "http://hi.gsxt.gov.cn/index.jspx",
            "河南": "http://ha.gsxt.gov.cn/index.jspx",
            "湖北": "http://hb.gsxt.gov.cn/index.jspx",
            "湖南": "http://hn.gsxt.gov.cn/notice/",
            "重庆": "http://cq.gsxt.gov.cn/",
            "四川": "http://sc.gsxt.gov.cn/notice/",
            "贵州": "http://gz.gsxt.gov.cn/",
            "云南": "http://yn.gsxt.gov.cn/notice/",
            "西藏": "http://xz.gsxt.gov.cn/index.jspx",
            "陕西": "http://sn.gsxt.gov.cn/ztxy.do?method=index&random=",
            "甘肃": "http://gs.gsxt.gov.cn/gsxygs/",
            "青海": "http://qh.gsxt.gov.cn/index.jspx",
            "宁夏": "http://nx.gsxt.gov.cn/",
            "新疆": "http://xj.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml"
        }

        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""
        self.area = user.get("area", "")

        self.keyword = user.get("idCard", "")
        self.token = user.get("token", "")
        self.LoginUrl = self.url_area.get(
            self.area,
            "") if self.area else "http://www.gsxt.gov.cn/index.html"
        self.result = user.get("result", "")
        self.GJJInfo = []
        self.br = self.get_webdriver("chrome")

        self.br.dc = DriverClean(1, time.time(), self.br.service.process.pid,
                                 self.br)
        self.br.get1 = MethodType(get1, self.br, webdriver.Chrome)
        self.br.find_element_by_xpath1 = MethodType(find_element_by_xpath1,
                                                    self.br, webdriver.Chrome)

        global globallogger
        globallogger = self.logger

        # self.br.maximize_window()
        # self.br.set_window_size(1300,900)
        # self.proxy = self._proxy()
        # proxy=webdriver.Proxy()
        # proxy.proxy_type=ProxyType.MANUAL
        # proxy.http_proxy=self.proxy
        # add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS

        # proxy.add_to_capabilities(self.dcap)
        # self.br.start_session(self.dcap)
        # self.br.get('http://httpbin.org/ip')
        # print self.br.page_source
        self.wait = WebDriverWait(self.br, 10, 0.5)
        self.br.set_page_load_timeout(10)
        self.br.set_script_timeout(15)
        self.br.implicitly_wait(10)
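
The MethodType calls above use Python 2's three-argument form to bind plain functions (get1, find_element_by_xpath1) onto this one driver instance at runtime, presumably wrappers that refresh the DriverClean bookkeeping around each call. A self-contained toy of the same binding trick:

from types import MethodType

class Driver(object):
    pass

def get1(self, url):
    print("fetching " + url)

d = Driver()
d.get1 = MethodType(get1, d, Driver)  # Python 2 three-argument form
d.get1("http://example.com")          # fetching http://example.com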
Example #10
class SpiderMain(hypc_accumulation_fund):

    logger = logging.getLogger()

    def __init__(self, user):
        self.url_area = {
            "北京": "http://bj.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml",
            "天津": "http://tj.gsxt.gov.cn/index.html",
            "河北": "http://he.gsxt.gov.cn/notice/",
            "山西": "http://sx.gsxt.gov.cn/index.jspx",
            "内蒙古": "http://nm.gsxt.gov.cn:58888/",
            "辽宁": "http://ln.gsxt.gov.cn/saicpub/",
            "吉林": "http://jl.gsxt.gov.cn/",
            "黑龙江": "http://hl.gsxt.gov.cn/index.jspx",
            "上海": "http://sh.gsxt.gov.cn/notice",
            "江苏": "http://www.jsgsj.gov.cn:58888/province/",
            "浙江": "http://zj.gsxt.gov.cn/client/entsearch/toEntSearch",
            "安徽": "http://ah.gsxt.gov.cn/index.jspx",
            "福建": "http://fj.gsxt.gov.cn/notice",
            "江西": "http://jx.gsxt.gov.cn/",
            "山东": "http://sd.gsxt.gov.cn/",
            "广东": "http://gd.gsxt.gov.cn/",
            "广西": "http://gx.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml",
            "海南": "http://hi.gsxt.gov.cn/index.jspx",
            "河南": "http://ha.gsxt.gov.cn/index.jspx",
            "湖北": "http://hb.gsxt.gov.cn/index.jspx",
            "湖南": "http://hn.gsxt.gov.cn/notice/",
            "重庆": "http://cq.gsxt.gov.cn/",
            "四川": "http://sc.gsxt.gov.cn/notice/",
            "贵州": "http://gz.gsxt.gov.cn/",
            "云南": "http://yn.gsxt.gov.cn/notice/",
            "西藏": "http://xz.gsxt.gov.cn/index.jspx",
            "陕西": "http://sn.gsxt.gov.cn/ztxy.do?method=index&random=",
            "甘肃": "http://gs.gsxt.gov.cn/gsxygs/",
            "青海": "http://qh.gsxt.gov.cn/index.jspx",
            "宁夏": "http://nx.gsxt.gov.cn/",
            "新疆": "http://xj.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml"
        }

        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""
        self.area = user.get("area", "")

        self.keyword = user.get("idCard", "")
        self.token = user.get("token", "")
        self.LoginUrl = self.url_area.get(
            self.area,
            "") if self.area else "http://www.gsxt.gov.cn/index.html"
        self.result = user.get("result", "")
        self.GJJInfo = []
        self.br = self.get_webdriver("chrome")

        self.br.dc = DriverClean(1, time.time(), self.br.service.process.pid,
                                 self.br)
        self.br.get1 = MethodType(get1, self.br, webdriver.Chrome)
        self.br.find_element_by_xpath1 = MethodType(find_element_by_xpath1,
                                                    self.br, webdriver.Chrome)

        global globallogger
        globallogger = self.logger

        # self.br.maximize_window()
        # self.br.set_window_size(1300,900)
        # self.proxy = self._proxy()
        # proxy=webdriver.Proxy()
        # proxy.proxy_type=ProxyType.MANUAL
        # proxy.http_proxy=self.proxy
        # add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS

        # proxy.add_to_capabilities(self.dcap)
        # self.br.start_session(self.dcap)
        # self.br.get('http://httpbin.org/ip')
        # print self.br.page_source
        self.wait = WebDriverWait(self.br, 10, 0.5)
        self.br.set_page_load_timeout(10)
        self.br.set_script_timeout(15)
        self.br.implicitly_wait(10)
        # attach a proxy for this run
        # self.proxy = self._proxy()

    def _proxy(self):
        proxy = self.session.get(self.PROXYADDR).content
        # return {"http": "http://" + proxy, "https": "http://" + proxy}
        return proxy

    def run(self, keyword):
        content = self.hack_geetest(keyword.decode('utf8'))
        # If every info section is empty, flag the result accordingly.
        info_keys = (
            "base_info", "admin_penalty_info", "operate_abnormal_info",
            "key_person_info", "change_info", "check_info", "chattel_info",
            "branch_info", "equity_pledged_info", "Shareholder_info",
            "judicial_assist_info", "knowledge_info", "brand_info",
            "annual_shareholder_info", "annual_info")
        if not any(content[0].get(key, "") for key in info_keys):
            self.status = PASSWORD_IS_NULL
        self.quit_webdriver()
        return content

    def wait_for(self, by1, by2):
        self.br.dc.setts(time.time())
        self.br.dc.setstatus(0)
        return self.wait.until(EC.presence_of_element_located((by1, by2)))

    def input_params(self, name):
        self.logger.info('Opening the official site URL')
        try:
            self.br.get1(self.LoginUrl)
            # self.br.refresh()
        except Exception as e:
            self.logger.error(e)
            try:
                # To avoid a hang when refreshing again, re-get the URL
                # instead of using a hot refresh.
                self.br.get1(self.br.current_url)
            except Exception as f:
                self.logger.error(f)

        self.logger.info('Official site opened')
        ui.WebDriverWait(self.br, 10).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//*[@id="btn_query"]')))
        element = self.wait_for(By.ID, "keyword")
        element.send_keys(name)
        element = self.wait_for(By.ID, "btn_query")
        element.click()
        self.status = CRAWL_SUCCESS

    def _save_captcha(self, codeurl):
        """
        Download the captcha and return the image base64-encoded.
        """
        self.logger.info("Refreshing captcha")
        try:
            codeContent = self.session.get(codeurl, headers=IMGHEADERS).content
            self.logger.debug("Captcha binary content: {0}".format(codeContent)[:50])
            self.logger.info("Downloading captcha")
            self.status = NEED_BCODE
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 "captcha.png").replace("\\", "/"), 'wb') as f:
                f.write(codeContent)
            self.logger.info("Captcha image saved!")
            bcode = base64.b64encode(codeContent)
            self.status = CRAWL_SUCCESS
            return bcode
        except:
            s = traceback.format_exc()
            self.logger.error("Captcha refresh error: %s" % s)
            self.status, self.desc = BCODE_IS_NULL, BCODE_IS_NULL_DESC
            # return {"error": "timeout or code exception"}

    def quit_webdriver(self):
        self.br.quit()
        self.br.dc.setterm(1)

    def get_webdriver(self, name):
        '''Choose the scraping backend.'''
        try:
            if name.lower() == "phantomjs":
                self.dcap = dict(DesiredCapabilities.PHANTOMJS)
                self.dcap[
                    "phantomjs.page.customHeaders.User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36"
                self.status = CRAWL_SUCCESS
                return webdriver.PhantomJS(desired_capabilities=self.dcap)
            elif name.lower() == "chrome":
                display = Display(visible=0, size=(1920, 1080))
                display.start()
                self.status = CRAWL_SUCCESS
                # return webdriver.Chrome("/usr/local/bin/chromedriver")
                return webdriver.Chrome()
            elif name.lower() == "firefox":
                display = Display(visible=0, size=(1920, 1080))
                display.start()
                self.status = CRAWL_SUCCESS
                return webdriver.Firefox()
        except Exception as e:
            self.logger.error('Failed to run the headless browser')
            self.status, self.desc = PASSWORD_IS_NULL, PASSWORD_IS_NULL_DESC

    def get_bcode(self):
        '''Fetch the click coordinates delivered via redis.'''
        self.startTime = datetime.now()
        while True:
            self.logger.info(u'Waiting for user-supplied coordinates')
            inputValue = self.redisUtils.getNotify(self.token, 'bcode')
            if inputValue:
                coordinate = self.prase_bcode(inputValue)
                self.redisUtils.DelNotify(self.token, "bcode")
                return coordinate
            else:
                self.finishTime = datetime.now()
                tail_time = self.finishTime - self.startTime
                if tail_time.total_seconds() > 120:
                    self.logger.info('Timed out waiting for user input: %s' % self.token)
                    self.desc = 'Timed out waiting for user input'
                    time.sleep(1)
                    break
            time.sleep(1)

    def prase_bcode(self, zuobiao):
        '''Parse the coordinate string into click pairs.'''
        try:
            zuobiao_list1 = zuobiao.split('_')
            zuobiao_list = zuobiao_list1[1:]
            len_zuobiao = len(zuobiao_list) / 2
            coordinate = []
            for num in range(len_zuobiao):
                list_n = []
                list_n.append(zuobiao_list[2 * num])
                list_n.append(zuobiao_list[2 * num + 1])
                coordinate.append(list_n)
            self.status = CRAWL_SUCCESS
            return coordinate
        except Exception as e:
            self.logger.error('Coordinate handling error: %s' % e)
            self.status = IDCARD_ERROR
            return ''

    def element_click(self, coor):
        '''Simulate the clicks in sequence.'''
        try:
            if coor and self.br.find_element_by_class_name("geetest_item_img"):
                for i in range(len(coor)):
                    element = self.br.find_element_by_class_name(
                        "geetest_item_img")
                    ActionChains(self.br).move_to_element_with_offset(
                        to_element=element,
                        xoffset=int(coor[i][0]) - 7,
                        yoffset=int(coor[i][1]) - 5).perform()
                    ActionChains(self.br).click().perform()
                    time.sleep(0.8)
                element_cli = self.wait_for(By.CLASS_NAME,
                                            "geetest_commit_tip")
                element_cli.click()
                time.sleep(0.5)
                element = self.wait_for(By.CLASS_NAME, "geetest_result_tip")
                ans = element.text.encode("utf-8")
                self.status = CRAWL_SUCCESS
                return ans
            else:
                self.logger.info('No image to click yet')
                return '失败'
        except Exception as e:
            self.logger.error('Failed to solve the captcha')
            self.status = PASSWORD_ERROR
            return '失败'

    def click_pic(self, info):
        '''Fetch the captcha image and send its base64 to a service or third party for solving.'''
        self.logger.info('User click input required')
        try:
            ima_url = ifNotEmptyGetIndex(
                info.xpath("//*[@class='geetest_item_img']/@src"))
            card_base = self._save_captcha(ima_url)
            Data = {
                'token': self.token,
                'img_base64': card_base,
                'spider_name': 'gsxt',
                'userid': 'yinzhouyinhang'
            }
            content = self.session.post(url='http://127.0.0.1:8000/img',
                                        data=Data).content
            content = eval(content)
            url_imag = content.get("result", "")
            self.desc = 'Please fetch the image via the URL and click'
            redis_dict = {
                "image_url": url_imag,
                "image_base64": "data:image/jpg;base64," + card_base,
                "token": self.token
            }
            self.redisUtils.setNotify(token=self.token,
                                      val="1",
                                      decs=self.desc,
                                      result=redis_dict)
            self.br.dc.setstatus(1)
            self.br.dc.setts(time.time())
            self.logger.info("begin wait bcode....")
            co_ordinate = self.get_bcode()
            self.logger.info("begin set status....")
            self.br.dc.setstatus(0)
            self.br.dc.setts(time.time())
            ans = self.element_click(co_ordinate)
            return ans
        except Exception as e:
            self.logger.error(e)
            self.status = CRAWL_SUCCESS

    def hack_geetest(self, company='大连火眼征信管理有限公司北京'):
        '''Main crawl flow.'''
        try:
            self.input_params(company)
            for i in range(10):
                self.logger.info(u'Determining the captcha type')
                time.sleep(1)
                info = etree.HTML(str(self.br.page_source))
                if info.xpath("//*[@class='geetest_item_img']/@src"):
                    for j in range(3):
                        info2 = etree.HTML(str(self.br.page_source))
                        ans = self.click_pic(info2)
                        self.logger.info('Captcha solve result: %s' % ans)
                        # ans may be None if click_pic failed outright
                        if ans and '成功' in ans:
                            ui.WebDriverWait(self.br, 10).until(
                                EC.visibility_of_element_located(
                                    (By.XPATH,
                                     '//*[@class="search_result_span1"]')))
                            self.status = CRAWL_SUCCESS
                            return country1(self.br).data()
                        elif ans and '失败' in ans and j < 2:
                            time.sleep(3)
                            self.logger.info('Image captcha click failed')
                            self.status = BCODE_ERROR
                        elif ans and '失败' in ans and j == 2:
                            self.logger.info('Last image captcha click failed')
                            self.status = BCODE_ERROR
                            return
                        else:
                            self.logger.info('Image captcha click failed; requesting again')
                            self.status = BCODE_ERROR

                elif info.xpath(
                        "//*[@class='geetest_slider_track']/div/text()"):
                    self.logger.info('The system is retrying')
                    self.input_params(company)

                else:
                    time.sleep(2)
                    info = etree.HTML(str(self.br.page_source))
                    if info.xpath("//*[@class='ads-right']/div[1]/div/text()"):
                        self.status = CRAWL_SUCCESS
                        return country1(self.br).data()
                    else:
                        self.logger.info('Trying the page again')
                        time.sleep(1)
                        self.input_params(company)
        except Exception as e:
            self.logger.error('Detection flow error: %s' % e)
            self.status = CRAWL_FAIL
            # return e

    def login(self, flag):
        if self.area == '陕西':
            millis = int(round(time.time() * 1000))
            self.LoginUrl = self.LoginUrl + str(millis)
        try:
            content = self.run(self.keyword)
            return content
        except Exception as e:
            self.status = CRAWL_FAIL
            self.logger.error('Crawl error: %s' % e)

    @Time()
    def crawl(self, flag=""):
        CurTime = datetime.now().strftime("%Y-%m-%d")
        PastTime = (datetime.now() - timedelta(days=729)).strftime("%Y-%m-%d")
        try:

            content = self.login(flag)
            self.GJJInfo.append(content)
            self.result.append(self.GJJInfo[0])

        except:
            s = traceback.format_exc()
            self.logger.error("Crawl error: %s" % s)
            self.desc = PROGRAM_ERROR_DESC
        finally:
            try:
                if len(self.result) == 1 and self.status == CRAWL_SUCCESS:
                    self.desc = CRAWL_SUCCESS_DESC
                    result_json = json.dumps(self.result[0],
                                             ensure_ascii=False)
                    print result_json
                    self.redisUtils.setNotify(type=TYPEVALUE,
                                              token=self.token,
                                              val="1",
                                              decs="Crawl succeeded!",
                                              result=result_json)
                elif self.status == CRAWL_FAIL:
                    self.desc = CRAWL_FAIL_DESC

                elif self.status == CRAWL_TIMEOUT:
                    self.desc = CRAWL_TIMEOUT_DESC

                elif self.status == IDCARD_ERROR:
                    self.desc = IDCARD_ERROR_DESC

                elif self.status == PASSWORD_ERROR:
                    self.desc = PASSWORD_ERROR_DESC

                elif self.status == BCODE_ERROR:
                    self.desc = BCODE_ERROR_DESC

                elif self.status == PASSWORD_IS_NULL:
                    self.desc = PASSWORD_IS_NULL_DESC
                else:
                    self.desc = PROGRAM_ERROR_DESC

            except Exception as e:
                s = traceback.format_exc()
                self.logger.error(s)

            finally:
                try:
                    self.redisUtils.setNotify(type=TYPEVALUE,
                                              token=self.token,
                                              val=self.status,
                                              decs=self.desc)
                except Exception:
                    s = traceback.format_exc()
                    self.logger.error(s)

    def zipToStr(self, content):
        '''
        Content fetched via urllib2 arrives compressed and needs to be
        decompressed.
        :param content: the content to decompress
        :return:
        '''
        try:
            conn = zlib.decompress(content, 16 + zlib.MAX_WBITS)
            return conn
        except:
            self.logger.error('Error decompressing response: %s' % traceback.format_exc())
            raise Exception("Error decompressing response: %s" % traceback.format_exc())
Example #11
class SpiderClient():

    def __init__(self):
        """
        初始化
        :param spider:
        :return:
        """

        # 线程数
        self.thread_num = THREAD_NUM
        # 队列数:1
        self.thread_q_size = THREAD_Q_SIZE
        # redis操作类
        self.redisUtils = RedisUtils()


    def progressWork(self):
        """
        进程工作,用来启动多个线程
        :param x:
        :return:
        """
        thread_pool = ThreadPool(self.thread_num, q_size=self.thread_q_size)
        for i in range(self.thread_num):
            dict_t = {}
            requests = makeRequests(self.threadWork, [dict_t])
            thread_pool.putRequest(requests[0])
        thread_pool.wait()


    def threadWork(self, t):
        """
        线程工作的方法,主要用户获取任务
        :param t:
        :return:
        """
        startTime = datetime.now()
        while True:
            try:
                # fetch task data from the task queue
                dict_json = self.redisUtils.getCon(SOURCE)
                # print dict_json,'1234'
                if dict_json is not None:
                    dict_json = decryptKwargs(dict_json)
                    name = dict_json.get('keyword', '')
                    # put the token into the thread
                    logger.info("Got task: %s" % name)
                    # sendMail(u"邮件log测试",'*****@*****.**')
                    dict_json.update({"result": []})
                    self.taskWork(dict_json)
                else:

                    finishTime = datetime.now()
                    if abs(finishTime.minute - startTime.minute) >= WAITTING:
                        break
                    time.sleep(1)
            except Exception:
                s = traceback.format_exc()
                logger.error(s)

    @Time()
    def taskWork(self, dict_json):
        """
        具体任务的工作方法,主要调用爬虫完成数据爬取
        :param dict_json:
        :return:
        """
        token = dict_json['token']
        try:
            client = SpiderMain(dict_json)
            logger.info("不需要抓取图片验证码,token:%s" % token)
            p1 = threading.Thread(target=client.crawl, args=("user",))
            # p2 = threading.Thread(target=client.crawl, args=("auto",))
            p1.start()
            # p2.start()

        except Exception:
            s = traceback.format_exc()
            logger.error(s)
Example #12
class SpiderMain(hypc_translate):

    logger = logging.getLogger()

    def __init__(self, user):
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""

        self.title = user.get("title", "")
        # print chardet.detect(self.title)
        self.title = urllib.quote(self.title.encode('utf8'))

        self.project_district = user.get("project_district")
        self.project_developer_name_value = user.get("project", "")
        self.date_filter_min = user.get("date_filter_min", "")
        self.date_filter_max = user.get("date_filter_max", "")
        # print self.title,self.project_developer_name_value,self.project_district,self.date_filter_max,self.date_filter_min

        # self.title = urllib.quote(self.title.decode(sys.stdin.encoding).encode('utf-8'))
        # self.project_district = urllib.quote(self.project_district.decode(sys.stdin.encoding).encode('utf8'))
        # self.project_developer_name_value = urllib.quote(self.project_developer_name_value.decode(sys.stdin.encoding).encode('utf8'))
        # self.date_filter_min = urllib.quote(self.date_filter_min.decode(sys.stdin.encoding).encode('utf8'))
        # self.date_filter_max = urllib.quote(self.date_filter_max.decode(sys.stdin.encoding).encode('utf8'))
        #
        self.token = user.get("token", "")
        self.userid = user.get("userid", "")


        self.LoginUrl = "https://newhouse.cnnbfdc.com"


        self.result = user.get("result", "")
        self.GJJInfo = []
        self.bild = []


        # attach a proxy for this run
        # self.proxy = self._proxy()
    def ifNotEmptyGetIndex(self, somelist, index=0):
        """Return somelist[index] if somelist is non-empty, else ''."""
        if somelist:
            return somelist[index]
        else:
            return ''

    def _proxy(self):
        proxy = self.session.get(self.PROXYADDR).content
        return {"http": "http://" + proxy, "https": "http://" + proxy}

    def _errhtmlRecord(self, content):
        '''
        Save the error page.
        '''
        self.logger.info("Saving error page content")
        try:
            filename = str(uuid.uuid1()) + ".html"
            sampleDir = os.path.join(os.path.dirname(__file__), "errorHtml").replace("\\", "/")
            os.path.exists(sampleDir) or os.mkdir(sampleDir)
            with open("%s/%s" % (sampleDir, filename), 'w') as f:
                f.write(str(content))
            self.logger.debug("Error page saved to {0}".format(filename))
        except Exception:
            self.status = PROGRAM_ERROR
            s = traceback.format_exc()
            self.logger.info("Failed to save the error page")
            self.logger.warn("{0}".format(s))

    def _sampleRecord(self, filename, content):
        '''
        Save the page content.
        '''
        self.logger.info("Saving page content")
        try:
            sampleDir = os.path.join(os.path.dirname(__file__), "sample/").replace("\\", "/")
            os.path.exists(sampleDir) or os.mkdir(sampleDir)
            with open("%s/%s" % (sampleDir, filename), 'w') as f:
                f.write(content)
            self.logger.debug("Page content saved to {0}".format(sampleDir))
        except Exception:
            self.status = PROGRAM_ERROR
            s = traceback.format_exc()
            self.logger.info("Failed to save the page")
            self.logger.warn("{0}".format(s))

    def _fetchUrl(self, url, data=None, header=None, timeout=TIMEOUT, fileName=None, proxy=None):
        '''
        Fetch helper.
        '''
        self.logger.info("Fetching {0}".format(url))
        if header:
            headers = header
            self.logger.debug("Spoofed headers: {0}".format(headers))
        else:
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0"}
            self.logger.debug("Spoofed headers: {0}".format(headers))
        for ir in range(REQUEST_RETRY):
            try:
                self.logger.debug("Fetch attempt {0}".format(ir))
                if data:
                    if proxy:
                        content = self.session.post(url, data=data, headers=headers, timeout=timeout, allow_redirects=False, proxies=proxy)
                        self.logger.debug("POST url:{0}, data:{1}, proxy: {2}".format(url, data, proxy))
                    else:
                        content = self.session.post(url, data=data, headers=headers, timeout=timeout, allow_redirects=False)
                        self.logger.debug("POST url:{0}, data:{1}".format(url, data))
                else:
                    if proxy:
                        content = self.session.get(url, headers=headers, timeout=timeout, allow_redirects=False, proxies=proxy)
                        self.logger.debug("GET url:{0}, proxy: {1}".format(url, proxy))
                    else:
                        content = self.session.get(url, headers=headers, timeout=timeout, allow_redirects=False)
                        # print content.encoding
                        self.logger.debug("GET url:{0}".format(url))
                if fileName and SAMPLEFLAG:
                    self._sampleRecord(fileName, content.content)
                return content
            except:
                self.logger.error(traceback.format_exc())
        self.logger.error("request url {0} failed, check pls".format(url))
        self.status = CRAWL_TIMEOUT
        raise Exception("Failed to load url (%s)" % url)

    def login(self, flag):
        first_url = "https://newhouse.cnnbfdc.com/publicity/project-licenses?title=" + str(self.title) + "&project_district=" + str(self.project_district) + "&project_developer_name_value=" + str(self.project_developer_name_value) + "&date_filter%5Bmin%5D%5Bdate%5D=" + str(self.date_filter_min) + "&date_filter%5Bmax%5D%5Bdate%5D=" + str(self.date_filter_max)
        content = self._fetchUrl(url=first_url, header=LOGINHEADERS, fileName="login.html")

        return str(content.text)

    @Time()
    def crawl(self, flag=""):
        CurTime = datetime.now().strftime("%Y-%m-%d")
        PastTime = (datetime.now() - timedelta(days=729)).strftime("%Y-%m-%d")
        try:
            # login
            content = self.login(flag)
            url_num = re.compile(r'/project_license_view/([0-9]+)">')
            url_num_list = url_num.findall(content)
            if len(url_num_list) > 0:
                for url_num in url_num_list:
                    url_detail = self.LoginUrl + '/project_license_view/' + str(url_num)
                    self.logger.info("可查询到您查的信息:%s" % self.title)
                    # 项目详情信息
                    content = self._fetchUrl(url=url_detail, header=PERHEADERS, fileName="person.html")
                    detail = etree.HTML(content.text)
                    project_name = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='project-detail__info']/div[1]/h1/text()")) # project name
                    alias_name = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='project-detail__info']/div[1]/div/text()")) # alias
                    positioning = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='project-detail__info']/div[2]//span/text()")) # positioning
                    company_name = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='project-detail__info']/div[3]//span/text()")) # company name
                    project_id = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='project-detail__info']/div[4]/span/text()")) # project number
                    counts = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-entity-view pane-node']/div[2]/div[2]/div/div[1]/div/text()")) or self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-entity-view pane-node']/div[2]/div/div[2]/div[1]/div/strong/text()")) # number of units
                    area = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-entity-view pane-node']/div[2]/div[2]/div/div[2]/div/text()")) or self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-entity-view pane-node']/div[2]/div/div[2]/div[2]/div/strong/text()")) # area
                    # data summary
                    marketable_area = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-panel panel-col-first']/div/div[1]/div/text()")) # sellable area
                    sales_area = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-panel panel-col-first']/div/div[2]/div/text()")) # sold area
                    has_sold_area = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-panel panel-col-first']/div/div[3]/div/text()")) # sold non-residential area
                    number_sellable_households = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-panel panel-col-last']/div/div[1]/div/text()")) # sellable households
                    has_sold_number = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-panel panel-col-last']/div/div[2]/div/text()")) # sold households
                    has_sold_households = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-panel panel-col-last']/div/div[3]/div/text()")) # sold non-residential households
                    # detailed parameters
                    permit_number = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[1]/td[2]/text()")) # permit number
                    permission_date = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[1]/td[4]/span/text()")) # permit date
                    sales_address = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[2]/td[2]/text()")) # sales office address
                    sales_call = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[2]/td[4]/text()")) # sales office phone
                    number_buildings = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[3]/td[2]/text()")) # number of buildings
                    construction_area = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[3]/td[4]/text()")) # floor area
                    opening_time = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[4]/td[2]/span/text()")) # opening date
                    supervision_account = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[4]/td[4]/text()")) # fund supervision account
                    document_authority = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[5]/td[2]/text()")) # issuing authority
                    financial_bank = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='parameter-table']/tr[5]/td[4]/text()")) # fund supervision bank
                    # building info
                    loudong_list = detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div")
                    if len(loudong_list) > 0:
                        for i in range(len(loudong_list)):
                            i = i + 1
                            num_floors = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[1]/div/text()")) # building number
                            total_floors = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[2]/div/text()")) # total floors
                            # total_houses = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[3]/div/text()")) # total households
                            total_houses = ''
                            permitted_households = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[3]/div/text()")) # permitted households
                            has_sold_number_households = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[4]/div/text()")) # sold households
                            has_sold_residential_households = self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[5]/div/text()")) # sold non-residential households

                            wangqian_list = []
                            wangqian = str(self.ifNotEmptyGetIndex(detail.xpath("//*[@class='panel-pane pane-views-panes pane-project-license-buildings-panel-pane-1']/div/div/div["+str(i)+"]/div[6]/span/a/@href"))) # online-signing URL
                            if wangqian:
                                lou_numb = re.compile(r'buildingId=([0-9]+)')
                                wang_numb = self.ifNotEmptyGetIndex(lou_numb.findall(wangqian))
                                wangqian_url = "https://newhouse.cnnbfdc.com//map-api-v1/building_units?args[]=" + wang_numb
                                wang_content = self._fetchUrl(url=wangqian_url, header=IMGHEADERS, fileName="wangqian.html")
                                wangqian_info = etree.HTML(str(wang_content.text))
                                
                                for j in range(len(wangqian_info.xpath("//result/item"))):
                                    j = j + 1
                                    if self.ifNotEmptyGetIndex(wangqian_info.xpath("//result/item["+str(j)+"]/state/text()")) == '3':
                                        number = self.ifNotEmptyGetIndex(wangqian_info.xpath("//result/item["+str(j)+"]/number/text()"))
                                        wangqian_list.append(number)
                            
                            self.bild.append(hypc_translate.detail_building(
                                num_floors=num_floors,
                                total_floors=total_floors,
                                total_houses=total_houses,
                                permitted_households=permitted_households,
                                has_sold_number_households=has_sold_number_households,
                                has_sold_residential_households=has_sold_residential_households,
                                wangqian_nubm=wangqian_list
                                ))
                    else:
                        self.bild.append(hypc_translate.detail_building(
                            num_floors='',
                            total_floors='',
                            total_houses='',
                            permitted_households='',
                            has_sold_number_households='',
                            has_sold_residential_households='',
                            ))


                    self.GJJInfo.append(hypc_translate.baseinfo(
                        project_name=project_name,
                        alias_name=alias_name,  # alias
                        positioning=positioning,  # positioning
                        company_name=company_name,  # company name
                        project_id=project_id,  # project number
                        counts=counts,  # number of units
                        area=area,  # area
                        marketable_area=marketable_area,  # sellable area
                        sales_area=sales_area,  # sold area
                        has_sold_area=has_sold_area,  # sold non-residential area
                        number_sellable_households=number_sellable_households,  # sellable households
                        has_sold_number=has_sold_number,  # sold households
                        has_sold_households=has_sold_households,  # sold non-residential households
                        permit_number=permit_number,  # permit number
                        permission_date=permission_date,  # permit date
                        sales_address=sales_address,  # sales office address
                        sales_call=sales_call,  # sales office phone
                        number_buildings=number_buildings,  # number of buildings
                        construction_area=construction_area,  # floor area
                        opening_time=opening_time,  # opening date
                        supervision_account=supervision_account,  # fund supervision account
                        document_authority=document_authority,  # issuing authority
                        financial_bank=financial_bank,  # fund supervision bank
                        bulding=self.bild
                        ))
                self.logger.info("解析完成")
                self.status= CRAWL_SUCCESS
                self.result.append(self.GJJInfo)
            else:
                self.logger.info("暂无您查询的信息:%s" % IDCARD_ERROR_DESC)
                self.status= CRAWL_SUCCESS
                self.result.append(self.GJJInfo)

        except:
            s = traceback.format_exc()
            self.logger.error("Crawl error: %s" % s)
            self.status, self.desc = CRAWL_FAIL, PROGRAM_ERROR_DESC
        finally:
            try:
                if len(self.result) == 1 and self.status == CRAWL_SUCCESS:
                    self.desc = CRAWL_SUCCESS_DESC
                    # print self.result
                    result_json = json.dumps(self.result[0], ensure_ascii=False)
                    # print result_json
                    self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val="1", decs="Crawl succeeded!", result=result_json)
                    # self.push_data(TYPEVALUE, self.userid, result_json)

                elif self.status == CRAWL_FAIL:
                    self.desc = CRAWL_FAIL_DESC

                elif self.status == CRAWL_TIMEOUT:
                    self.desc = CRAWL_TIMEOUT_DESC

                elif self.status == IDCARD_ERROR:
                    self.desc = IDCARD_ERROR_DESC

                elif self.status == PASSWORD_ERROR:
                    self.desc = PASSWORD_ERROR_DESC

                elif self.status == BCODE_ERROR:
                    self.desc = BCODE_ERROR_DESC

                else:
                    self.desc = PROGRAM_ERROR_DESC

            except Exception as e:
                s = traceback.format_exc()
                self.logger.error(s)

            finally:
                try:
                    self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val=self.status, decs=self.desc)
                except Exception:
                    s = traceback.format_exc()
                    self.logger.error(s)


    def zipToStr(self, content):
        '''
        Content fetched via urllib2 arrives compressed and needs to be
        decompressed.
        :param content: the content to decompress
        :return:
        '''
        try:
            conn = zlib.decompress(content, 16 + zlib.MAX_WBITS)
            return conn
        except:
            self.logger.error('Error decompressing response: %s' % traceback.format_exc())
            raise Exception("Error decompressing response: %s" % traceback.format_exc())
Example #13
class CrawlBase(object):
    def __init__(self, dict_json, key=[], verifycode_type='png'):
        self.redisUtils = RedisUtils()
        self.damatuWeb = damatuWeb
        self.PROXYADDR = PROXYADDR
        self.dict_json = dict_json
        self.token = self.dict_json['token']
        self.verifycode_type = verifycode_type
        self.status = None
        self.desc = None

        self.current_milli_time = lambda: str(int(round(time.time() * 1000)))
        self.startTime = self.current_milli_time()

        self.realpath = os.path.split(os.path.realpath(__file__))[0]
        filename = 'verifycode/%s_verifycode.%s' % (self.startTime,
                                                    verifycode_type)
        self.code_path = os.path.join(self.realpath, filename)

        logging.config.fileConfig('unicom/logging.config')
        self.logger = logging.getLogger('flow')

        # Whether to keep the user directory: by default it is saved only
        # when the crawler fails, but the flag can also be toggled manually.
        self.rmuserdirFlag = False
        self.mkUserdir(key)

    ''' User info and crawled-page records: deleted when the crawl succeeds, kept on failure to ease debugging. '''

    def mkUserdir(self, key=[]):
        # create the user directory
        fn = os.path.join(self.realpath, 'sample',
                          '%s_%s' % (self.startTime, self.dict_json['token']))
        os.mkdir(fn)
        # mask sensitive fields
        info_dict = copy.deepcopy(self.dict_json)
        for k in key:
            info_dict[k] = u'******'
        with open(os.path.join(fn, 'dict_json.txt'), 'w') as f:
            f.write(str(info_dict))
        self.userdir = fn
        return

    def rmUserdir(self, fn):
        shutil.rmtree(fn)

    def takePage(self, n, content, msg=None):
        fn = os.path.join(self.userdir, n)
        with open(fn, 'w') as f:
            f.write(content)
        if msg:
            with open(fn, 'a') as f:
                f.write('\n' * 5 + '#' * 60 + '\n' * 3 + msg)

    ''' Captcha interaction helpers '''

    def get_verifycode(self, codeUrl=None):
        # Fetch the image captcha and send a notification; for SMS codes, or
        # flows that must use the driver, override this function.
        if callable(codeUrl):
            codeUrl = codeUrl()
        codeContent = self.session.get(codeUrl).content
        bcode = base64.b64encode(codeContent)
        self.redisUtils.setNotify(token=self.token,
                                  val=NEED_MORE,
                                  decs='Image captcha required',
                                  result='data:image/jpg;base64,' + bcode)
        self.logger.info('Captcha sent')

    def judge_verifycode(self, inputValue, ResetCode):
        # verifycode_handler judges the captcha by this function's return
        # value. Spider-specific: if the captcha is not wrong, the return
        # value is passed through by verifycode_handler().
        pass

    def get_input(self):
        # wait for the user's input
        stime = datetime.now()
        self.logger.info('Waiting for user input')
        while True:
            inputValue = self.redisUtils.getNotify(self.token, 'bcode')
            if inputValue:
                return inputValue
            else:
                eclipseTimes = datetime.now() - stime
                if eclipseTimes.total_seconds() > WAITTIME:
                    self.logger.info('Timed out waiting for user input: %s' % self.token)
                    self.status = INPUT_ERROR
                    self.desc = 'Timed out waiting for user input'
                    time.sleep(1)
                    return

    def verifycode_handler(self, codeUrl=None, ResetCode=False):
        # Interaction loop: accept the user's response, which may refresh the
        # captcha or submit a value. ResetCode controls whether the session is
        # kept alive and refreshing is allowed (off by default).
        self.logger.info('Captcha required')
        self.get_verifycode(codeUrl)
        while True:
            inputValue = self.get_input()
            if inputValue == 'reset':
                if ResetCode:
                    self.logger.info('User refreshed the captcha')
                    self.redisUtils.DelNotify(self.token, 'bcode')
                    self.get_verifycode(codeUrl)
                    continue
            elif inputValue is None:
                return
            else:
                # check whether the captcha input is correct
                result = self.judge_verifycode(inputValue, ResetCode)
                if result:
                    return result
                else:
                    if ResetCode:
                        self.redisUtils.DelNotify(self.token, 'bcode')
                        self.redisUtils.setNotify(token=self.token,
                                                  val=INPUT_ERROR,
                                                  decs='Captcha incorrect')
                        self.logger.info('Captcha incorrect')
                        continue
                    self.status = INPUT_ERROR
                    self.desc = 'Captcha incorrect'
                    return
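
CrawlBase leaves judge_verifycode as the hook: verifycode_handler treats a falsy return as a wrong captcha (refreshing when ResetCode allows it) and hands any truthy value straight back to the caller. A minimal sketch of a concrete crawler plugging into that loop; the login URL, form field, and success marker are all hypothetical:

import requests

class DemoCrawler(CrawlBase):
    def __init__(self, dict_json):
        CrawlBase.__init__(self, dict_json)
        self.session = requests.session()  # get_verifycode expects a session

    def judge_verifycode(self, inputValue, ResetCode):
        # Submit the captcha; return the response on success so that
        # verifycode_handler passes it back, or None to signal a wrong code.
        resp = self.session.post("https://example.com/login",  # hypothetical
                                 data={"captcha": inputValue})
        return resp if "welcome" in resp.text else None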
Example #14
class SpiderMain(craw_dishonest):

    logger = logging.getLogger()

    def __init__(self, user):
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""

        self.username = urllib.quote(user.get("name", ""))
        self.idcard = urllib.quote(user.get("idcard", ""))
        self.area = urllib.quote(user.get("area", ""))
        self.token = user.get("token", "")
        self.userid = user.get("userid", "")
        # urllib.quote(a)

        self.LoginUrl = "https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?" + "resource_id=6899&query=%E5%A4%B1%E4%BF%A1%E8%A2%AB%E6%89%A7%E8%A1%8C%E4%BA%BA%E5%90%8D%E5%8D%95&cardNum=" + self.idcard + "&iname=" + self.username + "&areaName=" + self.area + "&ie=utf-8&oe=utf-8&format=json&t=" + str(
            int(round(time.time() *
                      1000))) + "&cb=jQuery110207690611877233657_" + str(
                          int(round(time.time() * 1000))) + "&_=" + str(
                              int(round(time.time() * 1000)))

        self.result = user.get("result", "")
        self.GJJInfo = []
        self.PerInfo = {}
        self.PayRecord = {}

        # enable the per-run proxy
        # self.proxy = self._proxy()

    def _proxy(self):
        proxy = self.session.get(self.PROXYADDR).content
        return {"http": "http://" + proxy, "https": "http://" + proxy}

    def _errhtmlRecord(self, content):
        '''
        Save the error page.
        '''
        self.logger.info("保存错页内容")
        try:
            filename = str(uuid.uuid1()) + ".html"
            sampleDir = os.path.join(os.path.dirname(__file__),
                                     "errorHtml").replace("\\", "/")
            os.path.exists(sampleDir) or os.mkdir(sampleDir)
            with open("%s/%s" % (sampleDir, filename), 'w') as f:
                f.write(content)
            self.logger.debug("已保存错页内容到{0}".format(filename))
        except Exception:
            self.status = PROGRAM_ERROR
            s = traceback.format_exc()
            self.logger.info("保存错页出错")
            self.logger.warn("{0}".format(s))

    def _sampleRecord(self, filename, content):
        '''
        Save the page content.
        '''
        self.logger.info("保存网页内容")
        try:
            sampleDir = os.path.join(os.path.dirname(__file__),
                                     "sample/").replace("\\", "/")
            os.path.exists(sampleDir) or os.mkdir(sampleDir)
            with open("%s/%s" % (sampleDir, filename), 'w') as f:
                f.write(content)
            self.logger.debug("已保存网页内容到{0}".format(sampleDir))
        except Exception:
            self.status = PROGRAM_ERROR
            s = traceback.format_exc()
            self.logger.info("保存网页出错")
            self.logger.warn("{0}".format(s))

    def _fetchUrl(self,
                  url,
                  data=None,
                  header=None,
                  timeout=TIMEOUT,
                  fileName=None,
                  proxy=None):
        '''
        Fetch helper with retries.
        '''
        self.logger.info("开始抓取 {0}".format(url))
        if header:
            headers = header
            self.logger.debug("伪装头:{0}".format(headers))
        else:
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0"
            }
            self.logger.debug("伪装头:{0}".format(headers))
        for ir in range(REQUEST_RETRY):
            try:
                self.logger.debug("第{0}次 抓取".format(ir))
                if data:
                    if proxy:
                        content = self.session.post(url,
                                                    data=data,
                                                    headers=headers,
                                                    timeout=timeout,
                                                    allow_redirects=False,
                                                    proxies=proxy)
                        self.logger.debug(
                            "POST url:{0}, data:{1}, proxy: {2}".format(
                                url, data, proxy))
                    else:
                        content = self.session.post(url,
                                                    data=data,
                                                    headers=headers,
                                                    timeout=timeout,
                                                    allow_redirects=False)
                        self.logger.debug("POST url:{0}, data:{1}".format(
                            url, data))
                else:
                    if proxy:
                        content = self.session.get(url,
                                                   headers=headers,
                                                   timeout=timeout,
                                                   allow_redirects=False,
                                                   proxies=proxy)
                        self.logger.debug("Get url:{0}, proxy: {1}".format(
                            url, proxy))
                    else:
                        content = self.session.get(url,
                                                   headers=headers,
                                                   timeout=timeout,
                                                   allow_redirects=False)
                        self.logger.debug("Get url:{0}".format(url))
                if fileName and SAMPLEFLAG:
                    self._sampleRecord(fileName, content.content)
                return content
            except:
                self.logger.error(traceback.format_exc())
        self.logger.error("request url {0} failed ,check pls".format(url))
        self.status = CRAWL_TIMEOUT
        raise Exception("Failed to load url (%s)" % url)

    def _save_captcha(self):
        """
        下载验证码,返回图片b64编码,
        """
        self.logger.info("刷新验证码")
        try:
            codeContent = self.session.get(self.codeUrl,
                                           headers=IMGHEADERS).content
            self.logger.debug("验证码二进制内容:{0}".format(codeContent)[:50])
            self.logger.info("下载验证码")
            self.status = NEED_BCODE
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 "captcha.png").replace("\\", "/"), 'wb') as f:
                f.write(codeContent)
            self.logger.info("验证码图片已保存!")
            bcode = base64.b64encode(codeContent)
            # self.logger.debug("{}".format(bcode))
            return bcode
        except:
            s = traceback.format_exc()
            self.logger.error("刷新验证码错误:%s" % s)
            return PROGRAM_ERROR, {"error": "超时或代码异常"}

    def _captcha_recognize(self, imgpath):
        '''
        Recognize the captcha automatically.
        :param imgpath:
        :return:
        '''
        img = Image.open(imgpath)
        for i in range(10):
            code = image_to_string(img, lang='eng').encode('utf-8')
            if code.isalnum() and len(code) == 4:
                self.logger.info(code)
                return code
            self._save_captcha()
            time.sleep(0.05)

    def _ChioceIdent(self, flag):
        '''
        Choose the recognition mode.
        :param flag:
        :return:
        '''
        if flag == 'dmt':
            self._save_captcha()
            self.startTime = str(datetime.now())
            dmt = damatuWeb.DamatuApi("huoyan2016", "123456")
            # self.imageCode = dmt.decodeUrl(self.captchaId_url, 200)
            self.imageCode = dmt.decode(
                os.path.join(os.path.dirname(__file__),
                             "captcha.png").replace("\\", "/"), 200)
            self.finishTime = str(datetime.now())
        elif flag == 'input':
            self._save_captcha()
            pngPath = os.path.join(os.path.dirname(__file__),
                                   "captcha.png").replace("\\", "/")
            self.logger.info("验证码路径:{0}".format(pngPath))
            self.imageCode = raw_input("请输入验证码:")
        elif flag == 'auto':
            self.startTime = str(datetime.now())
            self._save_captcha()
            self.logger.info("识别验证码")
            pngPath = os.path.join(os.path.dirname(__file__),
                                   "captcha.png").replace("\\", "/")
            self.imageCode = self._captcha_recognize(pngPath)
            self.logger.debug("验证码内容:{0}".format(self.imageCode))
            self.finishTime = str(datetime.now())
        # hand off to the user: notify redis with the base64 image
        elif flag == 'user':
            self.startTime = datetime.now()
            bcode64 = self._save_captcha()
            self.redisUtils.setNotify(token=self.token,
                                      val="10",
                                      decs="需要图片验证码",
                                      result="data:image/jpg;base64," +
                                      bcode64)
            # put the data into the session
            while True:
                # wait for the captcha value entered by the user
                dict_image_code = self.redisUtils.getNotify(
                    self.token, "bcode")
                if dict_image_code is not None:
                    self.imageCode = dict_image_code
                    return
                else:
                    self.finishTime = datetime.now()
                    # minute arithmetic wraps at the hour boundary; compare
                    # elapsed seconds instead
                    if (self.finishTime -
                            self.startTime).total_seconds() >= 180:
                        # the spider timed out waiting for the captcha input
                        self.logger.warn("爬虫等待用户输入图片验证码超时:%s" % self.token)
                        break
                    time.sleep(1)
        else:
            self.status = NOT_NEED_BCODE
            self.logger.info(NOT_NEED_BCODE_DESC)

    def login(self, flag):
        # self._ChioceIdent(flag)
        if self.username or self.idcard:
            content = self._fetchUrl(url=self.LoginUrl,
                                     header=LOGINHEADERS,
                                     fileName="login.html")
            return content
        else:
            return ''

    @Time()
    def crawl(self, flag=""):
        CurTime = datetime.now().strftime("%Y-%m-%d")
        PastTime = (datetime.now() - timedelta(days=729)).strftime("%Y-%m-%d")
        try:
            # login
            # for i in range(10):
            content = self.login(flag)
            if content:
                # strip the JSONP wrapper /**/jQueryXXX_YYY(...) and eval
                # the payload
                info_re = re.compile(r'\/\*\*\/jQuery[0-9]+_[0-9]+\((.*)\)')
                info_detail = ifNotEmptyGetIndex(
                    info_re.findall(content.content))
                info_dict = eval(info_detail)
                date_info = ifNotEmptyGetIndex(info_dict.get('data', ''))
                if not date_info:
                    date_info = {'result': ''}
                info_result = date_info.get('result', '')
                for detail in info_result:
                    re_id = re.compile(r'id=([0-9]*)')
                    shixinid = ifNotEmptyGetIndex(
                        re_id.findall(detail.get("loc", "")))
                    self.GJJInfo.append(
                        craw_dishonest.gaofa(
                            unperformPart=detail.get("unperformPart",
                                                     ""),  # unfulfilled part of the obligation
                            shixinid=shixinid,  # dishonest-person ID
                            sexy=detail.get("sexy", ""),  # gender
                            regDate=detail.get("regDate", ""),  # filing date
                            publishDate=detail.get("publishDate", ""),  # publication date
                            performedPart=detail.get("performedPart",
                                                     ""),  # fulfilled part of the obligation
                            performance=detail.get("performance",
                                                   ""),  # fulfillment status
                            partyTypeName=detail.get("partyTypeName",
                                                     ""),  # party type
                            iname=detail.get("iname", ""),  # name of the person subject to enforcement
                            disruptTypeName=detail.get("disruptTypeName",
                                                       ""),  # specific circumstances of the dishonest conduct
                            courtName=detail.get("courtName", ""),  # enforcing court
                            caseCode=detail.get("caseCode", ""),  # case number
                            cardNum=detail.get("cardNum", ""),  # ID-card number / organization code
                            businessEntity=detail.get("businessEntity",
                                                      ""),  # name of the legal representative or person in charge
                            areaName=detail.get("areaName", ""),  # province
                            age=detail.get("age", ""),  # age (0 for companies)
                            duty=detail.get("duty", ""),  # obligations fixed by the effective legal instrument
                            gistId=detail.get("gistId", ""),  # enforcement basis document number
                            gistUnit=detail.get("gistUnit", ""),  # issuing authority of the enforcement basis
                        ))
                self.status = CRAWL_SUCCESS
                self.result.append(self.GJJInfo)
            else:
                self.status = CRAWL_SUCCESS
                self.result.append(self.GJJInfo)
        except:
            s = traceback.format_exc()
            self.logger.error("抓取错误:%s" % s)
            self.status, self.desc = EXEMPLE_IS_NOT_FULL, EXEMPLE_IS_NOT_FULL_DESC
        finally:
            try:
                if len(self.result) == 1 and self.status == CRAWL_SUCCESS:
                    self.desc = CRAWL_SUCCESS_DESC
                    # print self.result
                    result_json = json.dumps(self.result[0],
                                             ensure_ascii=False)
                    # print result_json
                    self.redisUtils.setNotify(type=TYPEVALUE,
                                              token=self.token,
                                              val="1",
                                              decs="抓取成功!",
                                              result=result_json)
                    # self.push_data(TYPEVALUE, self.userid, result_json)

                elif self.status == CRAWL_FAIL:
                    self.desc = CRAWL_FAIL_DESC

                elif self.status == CRAWL_TIMEOUT:
                    self.desc = CRAWL_TIMEOUT_DESC

                elif self.status == IDCARD_ERROR:
                    self.desc = IDCARD_ERROR_DESC

                elif self.status == PASSWORD_ERROR:
                    self.desc = PASSWORD_ERROR_DESC

                elif self.status == BCODE_ERROR:
                    self.desc = BCODE_ERROR_DESC

                else:
                    self.desc = PROGRAM_ERROR_DESC

            except Exception as e:
                s = traceback.format_exc()
                self.logger.error(s)

            finally:
                try:
                    self.redisUtils.setNotify(type=TYPEVALUE,
                                              token=self.token,
                                              val=self.status,
                                              decs=self.desc)
                except Exception:
                    s = traceback.format_exc()
                    self.logger.error(s)

    def zipToStr(self, content):
        '''
        Content fetched with urllib2 arrives compressed and must be
        decompressed.
        :param content: the content to decompress
        :return:
        '''
        try:
            conn = zlib.decompress(content, 16 + zlib.MAX_WBITS)
            return conn
        except:
            self.logger.error('解压缩响应内容出错%s' % traceback.format_exc())
            raise Exception("解压缩响应内容出错%s" % traceback.format_exc())
Example #15
0
class SpiderMain(craw_taxpayer_qualification):

    logger = logging.getLogger()

    def __init__(self, user):
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""

        self.fpjy = user.get("fpjy", "")
        self.fpdm = user.get("fpdm", "")
        self.fphm = user.get("fphm", "")
        self.kprq = user.get("kprq", "")
        self.fpje = user.get("fpje", "")
        self.token = user.get("token", "")
        self.userid = user.get("userid", "")
        self.fpdm_area = self.fpdm[0:4]
        self.fpdm_url = AREA.get(self.fpdm_area, "")
        self.suiji = str(int(round(time.time() * 1000)))

        self.codeUrl = self.fpdm_url + '/WebQuery/yzmQuery?callback=jQuery110204713398352365614_' + self.suiji + '&fpdm=' + self.fpdm + '&r=' + str(
            '%.16f' %
            (random.random())) + '&v=V1.0.04_001' + '&nowtime=' + str(
                int(round(time.time() * 1000))
            ) + '&publickey=B8EE27C2CFEABABBD1DB92F4D84E4EA3&_=' + str(
                int(round(time.time() * 1000)))

        self.result = user.get("result", "")
        self.GJJInfo = []
        self.PerInfo = {}

        # enable the per-run proxy
        # self.proxy = self._proxy()

    def _proxy(self):
        proxy = self.session.get(self.PROXYADDR).content
        return {"http": "http://" + proxy, "https": "http://" + proxy}

    def _errhtmlRecord(self, content):
        '''
        Save the error page.
        '''
        self.logger.info("保存错页内容")
        try:
            filename = str(uuid.uuid1()) + ".html"
            sampleDir = os.path.join(os.path.dirname(__file__),
                                     "errorHtml").replace("\\", "/")
            os.path.exists(sampleDir) or os.mkdir(sampleDir)
            with open("%s/%s" % (sampleDir, filename), 'w') as f:
                f.write(content)
            self.logger.debug("已保存错页内容到{0}".format(filename))
        except Exception:
            self.status = PROGRAM_ERROR
            s = traceback.format_exc()
            self.logger.info("保存错页出错")
            self.logger.warn("{0}".format(s))

    def _sampleRecord(self, filename, content):
        '''
        Save the page content.
        '''
        self.logger.info("保存网页内容")
        try:
            sampleDir = os.path.join(os.path.dirname(__file__),
                                     "sample/").replace("\\", "/")
            os.path.exists(sampleDir) or os.mkdir(sampleDir)
            with open("%s/%s" % (sampleDir, filename), 'w') as f:
                f.write(content)
            self.logger.debug("已保存网页内容到{0}".format(sampleDir))
        except Exception:
            self.status = PROGRAM_ERROR
            s = traceback.format_exc()
            self.logger.info("保存网页出错")
            self.logger.warn("{0}".format(s))

    def _fetchUrl(self,
                  url,
                  data=None,
                  header=None,
                  timeout=TIMEOUT,
                  fileName=None,
                  proxy=None):
        '''
        Fetch helper with retries.
        '''
        self.logger.info("开始抓取 {0}".format(url))
        if header:
            headers = header
            self.logger.debug("伪装头:{0}".format(headers))
        else:
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0"
            }
            self.logger.debug("伪装头:{0}".format(headers))
        for ir in range(REQUEST_RETRY):
            try:
                self.logger.debug("第{0}次 抓取".format(ir))
                if data:
                    if proxy:
                        content = self.session.post(url,
                                                    data=data,
                                                    headers=headers,
                                                    timeout=timeout,
                                                    allow_redirects=False,
                                                    proxies=proxy)
                        self.logger.debug(
                            "POST url:{0}, data:{1}, proxy: {2}".format(
                                url, data, proxy))
                    else:
                        content = self.session.post(url,
                                                    data=data,
                                                    headers=headers,
                                                    timeout=timeout,
                                                    allow_redirects=False)
                        self.logger.debug("POST url:{0}, data:{1}".format(
                            url, data))
                else:
                    if proxy:
                        content = self.session.get(url,
                                                   headers=headers,
                                                   timeout=timeout,
                                                   allow_redirects=False,
                                                   proxies=proxy)
                        self.logger.debug("POST url:{0}, proxy: {1}".format(
                            url, proxy))
                    else:
                        content = self.session.get(url,
                                                   headers=headers,
                                                   timeout=timeout,
                                                   allow_redirects=False)
                        self.logger.debug("GET url:{0}".format(url))
                if fileName and SAMPLEFLAG:
                    self._sampleRecord(fileName, content.content)
                # the endpoint answers with JSONP; strip the jQuery callback
                # and eval the payload into a dict
                info = re.compile(r'jQuery[0-9]+_[0-9]+\((.*)\)')
                info_list = info.findall(content.content)[0]
                content_dic = eval(info_list)
                return content_dic
            except:
                self.logger.error(traceback.format_exc())
        self.logger.error("request url {0} failed ,check pls".format(url))
        self.status = CRAWL_TIMEOUT
        raise Exception("Failed to load url (%s)" % url)

    def _save_captcha(self):
        """
        下载验证码,返回图片b64编码,
        """
        self.logger.info("刷新验证码")
        try:
            content = self.session.get(self.codeUrl,
                                       headers=IMGHEADERS,
                                       verify=False)
            info = re.compile(r'jQuery[0-9]+_[0-9]+\((.*)\)')
            info_list = ifNotEmptyGetIndex(info.findall(content.content))
            dic = eval(info_list)

            codeContent = dic.get('key1', '')
            self.logger.debug("验证码二进制内容:{0}".format(codeContent)[:50])
            self.logger.info("下载验证码")
            self.status = NEED_BCODE
            chose_id = dic.get('key4', '')
            chose_info = COULOR.get(chose_id, '')
            codeContent1 = base64.b64decode(codeContent)
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 "captcha.png").replace("\\", "/"), 'wb') as f:
                f.write(codeContent1)
            bcode = codeContent
            self.logger.info("验证码图片已保存!")

            if chose_info != '请输入验证码内容':
                im = Image.open(
                    os.path.join(os.path.dirname(__file__), "captcha.png"))
                box = im.copy()
                u = Image.new('RGB', (90, 55))
                u.paste(box, (0, 0))
                key_id = Image.open(
                    os.path.join(os.path.dirname(__file__), chose_info))
                key_box = key_id.copy()
                u.paste(key_box, (0, 35))
                u.save(os.path.join(os.path.dirname(__file__), "card.png"))
                with open(os.path.join(os.path.dirname(__file__),
                                       "card.png"), 'rb') as c:
                    bcode = base64.b64encode(c.read())

            self.data = urllib.quote(dic.get('key2',
                                             '').encode('utf8')).replace(
                                                 '%20', '+')
            self.index = dic.get('key3', '')

            return bcode

        except:
            s = traceback.format_exc()
            self.logger.error("刷新验证码错误:%s" % s)
            return PROGRAM_ERROR, {"error": "超时或代码异常"}

    def _captcha_recognize(self, imgpath):
        '''
        Recognize the captcha automatically.
        :param imgpath:
        :return:
        '''
        img = Image.open(imgpath)
        for i in range(10):
            code = image_to_string(img, lang='eng').encode('utf-8')
            if code.isalnum() and len(code) == 4:
                self.logger.info(code)
                return code
            self._save_captcha()
            time.sleep(0.05)

    def _ChioceIdent(self, flag):
        '''
        Choose the recognition mode.
        :param flag:
        :return:
        '''

        if flag == 'dmt':
            self._save_captcha()
            self.startTime = str(datetime.now())
            dmt = damatuWeb.DamatuApi("huoyan2016", "123456")
            self.imageCode = dmt.decode(
                os.path.join(os.path.dirname(__file__),
                             "captcha.png").replace("\\", "/"), 200)
            self.finishTime = str(datetime.now())
        elif flag == 'input':
            self._save_captcha()
            pngPath = os.path.join(os.path.dirname(__file__),
                                   "captcha.png").replace("\\", "/")
            self.logger.info("验证码路径:{0}".format(pngPath))
            self.imageCode = raw_input("请输入验证码:")
        elif flag == 'auto':
            self.startTime = str(datetime.now())
            self._save_captcha()
            self.logger.info("识别验证码")
            pngPath = os.path.join(os.path.dirname(__file__),
                                   "captcha.png").replace("\\", "/")
            self.imageCode = self._captcha_recognize(pngPath)
            self.logger.debug("验证码内容:{0}".format(self.imageCode))
            self.finishTime = str(datetime.now())
        # hand off to the user: notify redis with the base64 image
        elif flag == 'user':
            self.startTime = datetime.now()
            bcode64 = self._save_captcha()
            self.redisUtils.setNotify(token=self.token,
                                      val="10",
                                      decs="需要图片验证码",
                                      result="data:image/jpg;base64," +
                                      bcode64)
            # put the data into the session
            while True:
                # wait for the captcha value entered by the user
                dict_image_code = self.redisUtils.getNotify(
                    self.token, "bcode")
                # print dict_image_code
                if dict_image_code is not None:
                    # print dict_image_code
                    self.redisUtils.DelNotify(self.token, "bcode")
                    self.imageCode = dict_image_code
                    break
                else:
                    self.finishTime = datetime.now()
                    # minute arithmetic wraps at the hour boundary; compare
                    # elapsed seconds instead
                    if (self.finishTime -
                            self.startTime).total_seconds() >= 120:
                        break
                    time.sleep(1)
        else:
            self.status = NOT_NEED_BCODE
            self.logger.info(NOT_NEED_BCODE_DESC)

    def login(self, flag):
        self._ChioceIdent(flag)
        if self.fpje:
            # LoginUrl = self.fpdm_url+'/WebQuery/query?callback=jQuery110204713398352365614_'+self.suiji+'&fpdm='+self.fpdm+'&fphm='+self.fphm+'&kprq='+self.kprq+'&fpje='+self.fpje+'&fplx=01&yzm='+self.imageCode+'&yzmSj='+self.data+'&index='+self.index+'&iv=31205b0a9543d0cf808f6a3a19915858'+'&salt=bc1792b6b19a7ceb8f124fc75e658cfe'+'&publickey=89FF3E78F5B40654133317B104D81634&_='+str(int(round(time.time() * 1000)))
            LoginUrl = self.fpdm_url + '/WebQuery/invQuery?callback=jQuery110204713398352365614_' + self.suiji + '&fpdm=' + self.fpdm + '&fphm=' + self.fphm + '&kprq=' + self.kprq + '&fpje=' + self.fpje + '&fplx=01&yzm=' + self.imageCode + '&yzmSj=' + self.data + '&index=' + self.index + '&iv=31205b0a9543d0cf808f6a3a19915858' + '&salt=bc1792b6b19a7ceb8f124fc75e658cfe' + '&publickey=89FF3E78F5B40654133317B104D81634&_=' + str(
                int(round(time.time() * 1000)))
            content = self._fetchUrl(url=LoginUrl,
                                     header=IMGHEADERS,
                                     fileName="login.html")

        elif self.fpjy:
            LoginUrl = self.fpdm_url + '/WebQuery/invQuery?callback=jQuery110204713398352365614_' + self.suiji + '&fpdm=' + self.fpdm + '&fphm=' + self.fphm + '&kprq=' + self.kprq + '&fpje=' + self.fpjy + '&fplx=04&yzm=' + self.imageCode + '&yzmSj=' + self.data + '&index=' + self.index + '&iv=31205b0a9543d0cf808f6a3a19915858' + '&salt=bc1792b6b19a7ceb8f124fc75e658cfe' + '&publickey=89FF3E78F5B40654133317B104D81634&_=' + str(
                int(round(time.time() * 1000)))
            content = self._fetchUrl(url=LoginUrl,
                                     header=IMGHEADERS,
                                     fileName="login.html")

        else:
            self.logger.debug("没有您查询的方式")
            content = {"key1": "009"}

        return content

    @Time()
    def crawl(self, flag=""):
        CurTime = datetime.now().strftime("%Y-%m-%d")
        PastTime = (datetime.now() - timedelta(days=729)).strftime("%Y-%m-%d")
        try:
            # login
            for i in range(5):
                content = self.login(flag)
                if content['key1'] == "007":
                    self.logger.info("验证码失效")
                    self.status, self.desc = BCODE_ERROR, CARD_OUT_DESC
                    continue

                elif content['key1'] == "008":
                    self.logger.info("验证码错误")
                    self.status, self.desc = BCODE_ERROR, BCODE_ERROR_DESC
                    continue

                elif content["key1"] == "002":
                    self.logger.info("当日查询次数已超过5次")
                    self.logger.info(PASSWORD_ERROR_DESC)
                    self.status, self.desc = PASSWORD_ERROR, PASSWORD_ERROR_DESC
                    break
                elif content['key1'] == "009":
                    self.logger.info("查无此票")
                    self.logger.info(EXEMPLE_IS_NOT_FULL)
                    self.status, self.desc = EXEMPLE_IS_NOT_FULL, EXEMPLE_IS_NOT_FULL_DESC
                    break

                elif content["key1"] == "001":
                    self.logger.info("登陆成功:%s" % content['key1'])
                    a_json = json.dumps(content, ensure_ascii=False)
                    bbb = json.loads(a_json, encoding="gbk")

                    sales_name = bbb.get(
                        'key2', '').encode('utf8').split('≡')[6]  # seller name [6]
                    purchaser_taxpayer_id = bbb.get(
                        'key2',
                        '').encode('utf8').split('≡')[3]  # buyer taxpayer ID [3]
                    purchaser_bank_account = bbb.get(
                        'key2',
                        '').encode('utf8').split('≡')[5]  # buyer's bank and account number [5]
                    sales_taxpayer_id = bbb.get(
                        'key2', '').encode('utf8').split('≡')[7]  # seller taxpayer ID [7]
                    sales_add_phone = bbb.get(
                        'key2', '').encode('utf8').split('≡')[8]  # seller address and phone [8]
                    check_number = bbb.get(
                        'key2', '').encode('utf8').split('≡')[17]  # check code
                    sales_bank_account = bbb.get(
                        'key2',
                        '').encode('utf8').split('≡')[9]  # seller's bank and account number [9]
                    purchaser_add_phone = bbb.get(
                        'key2', '').encode('utf8').split('≡')[4]  # buyer address and phone [4]
                    purchaser_name = bbb.get(
                        'key2', '').encode('utf8').split('≡')[2]  # buyer name [2]

                    service_name = bbb.get(
                        'key3', '').encode('utf8').split('█')[0]  # service name
                    specification = bbb.get(
                        'key3', '').encode('utf8').split('█')[1]  # specification
                    unit = bbb.get('key3',
                                   '').encode('utf8').split('█')[2]  # unit
                    quantity = bbb.get('key3',
                                       '').encode('utf8').split('█')[3]  # quantity
                    unit_price = bbb.get('key3',
                                         '').encode('utf8').split('█')[4]  # unit price
                    amount = bbb.get('key3',
                                     '').encode('utf8').split('█')[5]  # amount
                    tax_rate = bbb.get('key3',
                                       '').encode('utf8').split('█')[6]  # tax rate
                    tax = bbb.get('key3',
                                  '').encode('utf8').split('█')[7]  # tax

                    if self.fpjy:
                        machine_code = bbb.get(
                            'key2', '').encode('utf8').split('≡')[15]  # machine code
                    else:
                        machine_code = ''

                    self.PerInfo = craw_taxpayer_qualification.baseinfo(
                        sales_name=sales_name,
                        purchaser_taxpayer_id=purchaser_taxpayer_id,
                        purchaser_bank_account=purchaser_bank_account,
                        sales_taxpayer_id=sales_taxpayer_id,
                        sales_add_phone=sales_add_phone,
                        sales_bank_account=sales_bank_account,
                        purchaser_add_phone=purchaser_add_phone,
                        purchaser_name=purchaser_name,
                        service_name=service_name,
                        specification=specification,
                        unit=unit,
                        quantity=quantity,
                        unit_price=unit_price,
                        amount=amount,
                        tax_rate=tax_rate,
                        tax=tax,
                        invoice_code=self.fpdm,  # invoice code
                        invoice_number=self.fphm,  # invoice number
                        billing_date=self.kprq,  # billing date
                        check_number=check_number,  # check code
                        machine_code=machine_code,  # machine code
                        before_tax=self.fpje,  # pre-tax amount
                        total_tax='%.2f' %
                        (float(amount) + float(tax)),  # amount plus tax
                    )

                    self.GJJInfo.append(self.PerInfo)
                    self.status = CRAWL_SUCCESS
                    self.result.append(self.GJJInfo)
                    break
                else:
                    self.logger.info("查询失败:%s" % content["key1"])
                    self.logger.info(IDCARD_ERROR_DESC)
                    self.status, self.desc = IDCARD_ERROR, IDCARD_ERROR_DESC
                    break

        except:
            s = traceback.format_exc()
            self.logger.error("抓取错误:%s" % s)
            self.status, self.desc = self.status, PROGRAM_ERROR_DESC
        finally:
            try:
                if len(self.result) == 1 and self.status == CRAWL_SUCCESS:
                    self.desc = CRAWL_SUCCESS_DESC
                    # print self.result
                    result_json = json.dumps(self.result[0],
                                             ensure_ascii=False)
                    # print result_json
                    self.redisUtils.setNotify(type=TYPEVALUE,
                                              token=self.token,
                                              val="1",
                                              decs="抓取成功!",
                                              result=result_json)
                    # self.push_data(TYPEVALUE, self.userid, result_json)

                elif self.status == CRAWL_FAIL:
                    self.desc = CRAWL_FAIL_DESC

                elif self.status == CRAWL_TIMEOUT:
                    self.desc = CRAWL_TIMEOUT_DESC

                elif self.status == IDCARD_ERROR:
                    self.desc = IDCARD_ERROR_DESC

                elif self.status == PASSWORD_ERROR:
                    self.desc = PASSWORD_ERROR_DESC

                elif self.status == BCODE_ERROR:
                    self.desc = BCODE_ERROR_DESC

                else:
                    self.desc = PROGRAM_ERROR_DESC

            except Exception as e:
                s = traceback.format_exc()
                self.logger.error(s)

            finally:
                try:
                    self.redisUtils.setNotify(type=TYPEVALUE,
                                              token=self.token,
                                              val=self.status,
                                              decs=self.desc)
                except Exception:
                    s = traceback.format_exc()
                    self.logger.error(s)

    def zipToStr(self, content):
        '''
        Content fetched with urllib2 arrives compressed and must be
        decompressed.
        :param content: the content to decompress
        :return:
        '''
        try:
            conn = zlib.decompress(content, 16 + zlib.MAX_WBITS)
            return conn
        except:
            self.logger.error('解压缩响应内容出错%s' % traceback.format_exc())
            raise Exception("解压缩响应内容出错%s" % traceback.format_exc())
Example #16
0
class SpiderMain(hypc_soufun):

    logger = logging.getLogger()

    def __init__(self, user):
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""

        self.keyword = user.get("keyword", "")
        # self.gjjaccnum = self.username if len(self.username) <= 15 else ""
        # self.pwd = user.get("password", "")
        self.age = FANGAGE.get(user.get('age',''),'') or FANGAGE.get(user.get('year',''), '')
        self.token = user.get("token", "")
        self.flower = LOUCENG.get(user.get('flower',''),'') or LOUCENG.get(user.get('floor',''),'')
        self.hu_type = HUXING.get(user.get('hu_type',''),'') or HUXING.get(user.get('housetype',''),'')
        # self.userid = user.get("userid", "")

        self.startUrl = "http://esf.nb.fang.com/NewSecond/sale_info/searchlist_new2014.aspx"
        self.hostUrl = "http://esf.nb.fang.com/"
        self.result = user.get("result", "")
        self.GJJInfo = []

        # self.proxy = {'http':'http://143.0.188.8:80','https':'https://143.0.188.8:80'}
        # enable the per-run proxy
        # self.proxy = self._proxy()

    def _proxy(self):
        proxy = self.session.get(self.PROXYADDR).content
        # proxy = self.session.get('http://192.168.30.185:13579/ip').content
        return {"http": "http://" + proxy, "https": "http://" + proxy}

    def ifNotEmptyGetIndex(self,somelist,index=0):
        """check to see it's not empty"""
        if somelist: 
            return somelist[index]
        else:
            return ''

    def _errhtmlRecord(self, content):
        '''
        Save the error page.
        '''
        self.logger.info("保存错页内容")
        try:
            filename = str(uuid.uuid1()) + ".html"
            sampleDir = os.path.join(os.path.dirname(__file__), "errorHtml").replace("\\", "/")
            os.path.exists(sampleDir) or os.mkdir(sampleDir)
            with open("%s/%s" % (sampleDir, filename), 'w') as f:
                f.write(content)
            self.logger.debug("已保存错页内容到{0}".format(filename))
        except Exception:
            self.status = PROGRAM_ERROR
            s = traceback.format_exc()
            self.logger.info("保存错页出错")
            self.logger.warn("{0}".format(s))

    def _sampleRecord(self, filename, content):
        '''
        Save the page content.
        '''
        self.logger.info("保存网页内容")
        try:
            sampleDir = os.path.join(os.path.dirname(__file__), "sample/").replace("\\", "/")
            os.path.exists(sampleDir) or os.mkdir(sampleDir)
            with open("%s/%s" % (sampleDir, filename), 'w') as f:
                f.write(content)
            self.logger.debug("已保存网页内容到{0}".format(sampleDir))
        except Exception:
            self.status = PROGRAM_ERROR
            s = traceback.format_exc()
            self.logger.info("保存网页出错")
            self.logger.warn("{0}".format(s))

    def _fetchUrl(self, url, data=None, header=None, timeout=TIMEOUT, fileName=None, proxy=None):
        '''
        Fetch helper with retries.
        '''
        self.logger.info("开始抓取 {0}".format(url))
        if header:
            headers = header
            self.logger.debug("伪装头:{0}".format(headers))
        else:
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0"}
            self.logger.debug("伪装头:{0}".format(headers))
        for ir in range(REQUEST_RETRY):
            try:
                self.logger.debug("第{0}次 抓取".format(ir))
                if data:
                    if proxy:
                        content = self.session.post(url, data=data, headers=headers, timeout=timeout, allow_redirects=False, proxies=proxy)
                        self.logger.debug("POST url:{0}, data:{1}, proxy: {2}".format(url, data, proxy))
                    else:
                        content = self.session.post(url, data=data, headers=headers, timeout=timeout, allow_redirects=False)
                        self.logger.debug("POST url:{0}, data:{1}".format(url, data))
                else:
                    if proxy:
                        content = self.session.get(url, headers=headers, timeout=timeout, allow_redirects=False, proxies=proxy)
                        self.logger.debug("GET url:{0}, proxy: {1}".format(url, proxy))
                    else:
                        content = self.session.get(url, headers=headers, timeout=timeout, allow_redirects=False)
                        self.logger.debug("GET url:{0}".format(url))
                if fileName and SAMPLEFLAG:
                    self._sampleRecord(fileName, content.content)
                return content
            except:
                self.logger.error(traceback.format_exc())
        self.logger.error("request url {0} failed ,check pls".format(url))
        self.status = CRAWL_TIMEOUT
        raise Exception("Failed to load url (%s)" % url)

    def _save_captcha(self):
        """
        下载验证码,返回图片b64编码,
        """
        self.logger.info("刷新验证码")
        try:
            codeContent = self.session.get(self.codeUrl, headers=IMGHEADERS).content
            self.logger.debug("验证码二进制内容:{0}".format(codeContent)[:50])
            self.logger.info("下载验证码")
            self.status = NEED_BCODE
            with open(os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/"), 'wb') as f:
                f.write(codeContent)
            self.logger.info("验证码图片已保存!")
            bcode = base64.b64encode(codeContent)
            # self.logger.debug("{}".format(bcode))
            return bcode
        except:
            s = traceback.format_exc()
            self.logger.error("刷新验证码错误:%s" % s)
            return PROGRAM_ERROR, {"error": "超时或代码异常"}

    def _captcha_recognize(self,imgpath):
        '''
        Recognize the captcha automatically.
        :param imgpath:
        :return:
        '''
        img = Image.open(imgpath)
        for i in range(10):
            code = image_to_string(img, lang='eng').encode('utf-8')
            if code.isalnum() and len(code) == 4:
                self.logger.info(code)
                return code
            self._save_captcha()
            time.sleep(0.05)

    def _ChioceIdent(self, flag):
        '''
        Choose the recognition mode.
        :param flag:
        :return:
        '''
        if flag == 'dmt':
            self._save_captcha()
            self.startTime = str(datetime.now())
            dmt = damatuWeb.DamatuApi("huoyan2016", "123456")
            # self.imageCode = dmt.decodeUrl(self.captchaId_url, 200)
            self.imageCode = dmt.decode(os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/"), 200)
            self.finishTime = str(datetime.now())
        elif flag == 'input':
            self._save_captcha()
            pngPath = os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/")
            self.logger.info("验证码路径:{0}".format(pngPath))
            self.imageCode = raw_input("请输入验证码:")
        elif flag == 'auto':
            self.startTime = str(datetime.now())
            self._save_captcha()
            self.logger.info("识别验证码")
            pngPath = os.path.join(os.path.dirname(__file__), "captcha.png").replace("\\", "/")
            self.imageCode = self._captcha_recognize(pngPath)
            self.logger.debug("验证码内容:{0}".format(self.imageCode))
            self.finishTime = str(datetime.now())
        # hand off to the user: notify redis with the base64 image
        elif flag == 'user':
            self.startTime = datetime.now()
            bcode64 = self._save_captcha()
            self.redisUtils.setNotify(token=self.token, val="10",decs="需要图片验证码",result="data:image/jpg;base64,"+bcode64)
            # put the data into the session
            while True:
                # wait for the captcha value entered by the user
                dict_image_code = self.redisUtils.getNotify(self.token, "bcode")
                if dict_image_code is not None:
                    self.imageCode = dict_image_code
                    return
                else:
                    self.finishTime = datetime.now()
                    # minute arithmetic wraps at the hour boundary; compare
                    # elapsed seconds instead
                    if (self.finishTime - self.startTime).total_seconds() >= 180:
                        # the spider timed out waiting for the captcha input
                        self.logger.warn("爬虫等待用户输入图片验证码超时:%s" % self.token)
                        break
                    time.sleep(1)
        else:
            self.status = NOT_NEED_BCODE
            self.logger.info(NOT_NEED_BCODE_DESC)


    def prase_detail(self, detail_content):
        detail_info = etree.HTML(detail_content.text)
        sum_price = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='trl-item_top']/div/i/text()"))# total price
        first_pay = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div/div[@class='trl-item_top']/div[2]/text()"))# down payment
        # month_pay = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[2]/div[3]/a/div/span/i/text()"))# monthly payment
        house_type = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[2]/div[1]/div[1]/text()")).strip()# layout
        construction_area = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[2]/div[2]/div[1]/text()"))# floor area
        unit_price = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[2]/div[3]/div[1]/text()"))# unit price
        orientation = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[3]/div[1]/div[1]/text()"))# orientation
        floor = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[3]/div[2]/div[1]/text()"))+self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[3]/div[2]/div[2]/text()"))# floor
        decoration = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[3]/div[3]/div[1]/text()"))# decoration
        district = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='tab-cont-right']/div[4]/div[1]/div[2]/a[1]/text()"))# residential complex

        quyu = detail_info.xpath("//div[@id='address']/a")# district links
        quyu_list = []
        for qu in quyu:
            quyu_list.append(self.ifNotEmptyGetIndex(qu.xpath("text()")).strip())
        area = ','.join(quyu_list) # district

        contact_person = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@id='agantesfxq_C04_02']/text()"))# contact person
        economic_company = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='tjcont-list-cline2']/span[2]/text()"))# brokerage
        phone = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='tjcont-list-cline3 font16']/span/text()"))# phone
        # listing details
        build_age_list = re.compile(r'<span class=\"lab\">建筑年代</span>[\s]*<span class=\"rcont\">(.*)</span>')
        build_age = self.ifNotEmptyGetIndex(build_age_list.findall(str(detail_content.text)))
        elevator_list = re.compile(r'<span class=\"lab\">有无电梯</span>[\s]*<span class=\"rcont\">(.*)</span>')
        elevator = self.ifNotEmptyGetIndex(elevator_list.findall(str(detail_content.text)))
        property_right_list = re.compile(r'<span class="lab">产权性质</span>[\s]*<span class="rcont">(.*)</span>')
        property_right = self.ifNotEmptyGetIndex(property_right_list.findall(str(detail_content.text)))
        category_list = re.compile(r'<span class="lab">住宅类别</span>[\s]*<span class="rcont">(.*)</span>')
        category = self.ifNotEmptyGetIndex(category_list.findall(str(detail_content.text)))
        build_structure_list = re.compile(r'<span class="lab">建筑结构</span>[\s]*<span class="rcont">(.*)</span>')
        build_structure = self.ifNotEmptyGetIndex(build_structure_list.findall(str(detail_content.text)))
        build_category_list = re.compile(r'<span class="lab">建筑类别</span>[\s]*<span class="rcont">(.*)</span>')
        build_category = self.ifNotEmptyGetIndex(build_category_list.findall(str(detail_content.text)))
        list_time_list = re.compile(r'<span class="lab">挂牌时间</span>[\s]*<span class="rcont">[\s]*(.*)[\s]*.*</span>')
        list_time = self.ifNotEmptyGetIndex(list_time_list.findall(str(detail_content.text))).strip()
        fang_info = self.ifNotEmptyGetIndex(detail_info.xpath("//div[@class='content-item'][2]/div[2]/div/div/div/text()")) # listing description
        # complex information
        reference_price = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div/div[1]/span[2]/i/text()"))# reference average price
        district_than_year = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div/div[2]/span[2]/em/span/text()")) # year-on-year change
        district_than_month = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div/div[3]/span[2]/em/span/text()")) # month-on-month change
        district_property_type = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[1]/span[2]/text()")).strip() # property type
        district_property_costs = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[2]/span[2]/text()")).strip() # property fee
        district_build_type = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[3]/span[2]/text()")).strip() # building type
        district_build_age = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[4]/span[2]/text()")).strip() # construction year
        district_green_rate = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[5]/span[2]/text()")).strip() # greening rate
        district_volume_tate = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[6]/span[2]/text()")).strip() # plot ratio
        district_diversion = self.ifNotEmptyGetIndex(detail_info.xpath("//*[@class='cont pt30']/div[2]/div[7]/span[2]/text()")).strip() # pedestrian-vehicle separation
        self.GJJInfo.append(hypc_soufun.baseinfo(
            sum_price=sum_price,
            first_pay=first_pay,
            house_type=house_type,
            construction_area=construction_area,
            unit_price=unit_price,
            orientation=orientation,
            floor=floor,
            decoration=decoration,
            district=district,
            area=area,
            contact_person=contact_person,
            economic_company=economic_company,
            phone=phone,
            build_age=build_age,
            elevator=elevator,
            property_right=property_right,
            category=category,
            build_structure=build_structure,
            build_category=build_category,
            list_time=list_time,
            fang_info=fang_info,
            reference_price=reference_price,
            district_than_year=district_than_year,
            district_than_month=district_than_month,
            district_property_type=district_property_type,
            district_property_costs=district_property_costs,
            district_build_type=district_build_type,
            district_build_age=district_build_age,
            district_green_rate=district_green_rate,
            district_volume_tate=district_volume_tate,
            district_diversion=district_diversion,
            ))

    def login(self,flag):
        # self._ChioceIdent(flag)
        '''queries are divided into bulk and real-time'''
        if self.keyword:
            LoginData = {
                'input_keyw1':self.keyword,
                'city':'宁波',
                'district':'',
                'purpose':'סլ',
                'room':'',
                'pricemin':'',
                'pricemax':'',
                'trackLine':'',
                'keyword':self.keyword,
                'renttype':'',
                'strCity':'宁波',
                'strDistrict':'',
                'Strprice':'',
                'StrNameKeyword':self.keyword,
                'houseType':'',
                'isnewhouse':0,
                'isFinder':0,
                'fromdianshang':'',
                'fromhouseprom':'',
                'fromesfchengjiao':''
            }
            # content = self._fetchUrl(url=self.startUrl, header=LOGINHEADERS, proxy=self.proxy, data=LoginData, fileName="login.html")
            content = self._fetchUrl(url=self.startUrl, header=LOGINHEADERS, data=LoginData, fileName="login.html")
            return content
        else:
            # content = self._fetchUrl(url=self.hostUrl, header=PERHEADERS, proxy=self.proxy, fileName="login.html")
            content = self._fetchUrl(url=self.hostUrl, header=PERHEADERS, fileName="login.html")
            return content

    @Time()
    def crawl(self, flag=""):
        CurTime = datetime.now().strftime("%Y-%m-%d")
        PastTime = (datetime.now() - timedelta(days=729)).strftime("%Y-%m-%d")
        try:
            # login
            # for i in range(10):
            content = self.login(flag)
            # try:
            if 'keyword=' not in content.text.encode('utf8') and content.text.encode('utf8'):
                self.logger.info("获取信息成功:%s"%'good info')
                secondUrl = 'http://esf.nb.fang.com/house/'+self.hu_type+self.flower+self.age+'kw'+'/'
                # content = self._fetchUrl(url=secondUrl, header=PERHEADERS, proxy=self.proxy, fileName="person.html")
                content = self._fetchUrl(url=secondUrl, header=PERHEADERS, fileName="person.html")
                infohtml = etree.HTML(content.text)
                num_info = str(self.ifNotEmptyGetIndex(infohtml.xpath("//div[@class='fanye gray6']/span/text()")).encode('utf8'))
                if num_info:
                    zong = re.compile(r'共(\d*)页')
                    num = zong.search(num_info).group(1)  # extract the page count
                    for i in range(int(num)):
                        # proxy1 = self._proxy()
                        fang_url = 'http://esf.nb.fang.com/house/'+self.hu_type+self.flower+self.age+'i3'+str(i+1)+'-'+'kw'+'/'
                        # list_content = self._fetchUrl(url=fang_url, header=PERHEADERS, proxy=self.proxy, fileName="list.html")
                        list_content = self._fetchUrl(url=fang_url, header=PERHEADERS, fileName="list.html")
                        list_info = etree.HTML(list_content.text)
                        html = list_info.xpath("//div[@class='houseList']/dl")
                        for ht in html[:-1]:
                            a = self.ifNotEmptyGetIndex(ht.xpath("dd[@class='info rel floatr']/p[1]/a/@href"))
                            if a:
                                detail_url = 'http://esf.nb.fang.com' + str(a)
                                # fetch the listing's detail page
                                detail_content = self._fetchUrl(url=detail_url, header=PERHEADERS, fileName="detail.html")
                                self.prase_detail(detail_content)
                    self.status= CRAWL_SUCCESS
                self.result.append(self.GJJInfo)
                # a = json.dumps(self.result[0],ensure_ascii=False) 
                # print a
            # except:
            elif not content.text.encode('utf8'):   
                self.logger.info("获取信息成功:%s"%content)

                # data = urllib.quote(self.keyword.decode(sys.stdin.encoding).encode('gb2312'))
                data = urllib.quote(self.keyword.decode('utf-8').encode('gb2312'))
                secondUrl = 'http://esf.nb.fang.com/house/'+self.hu_type+self.flower+self.age+'kw'+data.lower()+'/'
                # content = self._fetchUrl(url=secondUrl, header=PERHEADERS, proxy=self.proxy, fileName="person.html")
                content = self._fetchUrl(url=secondUrl, header=PERHEADERS, fileName="person.html")
                infohtml = etree.HTML(content.text)
                num_info = str(self.ifNotEmptyGetIndex(infohtml.xpath("//div[@class='fanye gray6']/span/text()")).encode('utf8'))
                if num_info:
                    zong = re.compile(r'共(\d*)页')
                    num = zong.search(num_info).group(1)  # extract the page count
                    for i in range(int(num)):
                        # proxy1 = self._proxy()
                        fang_url = 'http://esf.nb.fang.com/house/'+self.hu_type+self.flower+self.age+'i3'+str(i+1)+'-'+'kw'+data.lower()+'/'
                        # list_content = self._fetchUrl(url=fang_url, header=PERHEADERS, proxy=self.proxy, fileName="list.html")
                        list_content = self._fetchUrl(url=fang_url, header=PERHEADERS, fileName="list.html")
                        list_info = etree.HTML(list_content.text)
                        html = list_info.xpath("//div[@class='houseList']/dl")
                        for ht in html:
                            a = self.ifNotEmptyGetIndex(ht.xpath("dd[@class='info rel floatr']/p[1]/a/@href"))
                            if a:
                                detail_url = 'http://esf.nb.fang.com' + str(a)
                                # fetch the concrete property detail page
                                detail_content = self._fetchUrl(url=detail_url, header=PERHEADERS, fileName="detail.html")
                                self.prase_detail(detail_content)
                    self.status = CRAWL_SUCCESS
                else:
                    self.status = CRAWL_SUCCESS
                    self.GJJInfo.append(hypc_soufun.baseinfo())
                self.result.append(self.GJJInfo)
            else:
                self.logger.info("信息失败:%s" % 'bad info')
                self.logger.info(IDCARD_ERROR_DESC)
                self.status, self.desc = IDCARD_ERROR, IDCARD_ERROR_DESC
                self.result.append(self.GJJInfo)
        except Exception:
            s = traceback.format_exc()
            self.logger.error("crawl error: %s" % s)
            self.desc = PROGRAM_ERROR_DESC
        finally:
            try:
                if len(self.result) == 1 and self.status == CRAWL_SUCCESS:
                    self.desc = CRAWL_SUCCESS_DESC
                    result_json = json.dumps(self.result[0], ensure_ascii=False)
                    self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val="1", decs="crawl succeeded!", result=result_json)
                    # self.push_data(TYPEVALUE, self.userid, result_json)
                elif self.status == CRAWL_FAIL:
                    self.desc = CRAWL_FAIL_DESC
                elif self.status == CRAWL_TIMEOUT:
                    self.desc = CRAWL_TIMEOUT_DESC
                elif self.status == IDCARD_ERROR:
                    self.desc = IDCARD_ERROR_DESC
                elif self.status == PASSWORD_ERROR:
                    self.desc = PASSWORD_ERROR_DESC
                elif self.status == BCODE_ERROR:
                    self.desc = BCODE_ERROR_DESC
                else:
                    self.desc = PROGRAM_ERROR_DESC
            except Exception:
                s = traceback.format_exc()
                self.logger.error(s)
            finally:
                try:
                    self.redisUtils.setNotify(type=TYPEVALUE, token=self.token, val=self.status, decs=self.desc)
                except Exception:
                    s = traceback.format_exc()
                    self.logger.error(s)
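
    # A minimal sketch (not in the original source): the "共N页" page-count
    # extraction that crawl() does inline, with the missing-match case handled
    # explicitly. The helper name _total_pages is hypothetical.
    def _total_pages(self, num_info):
        match = re.search(r'共(\d*)页', num_info)
        # default to 0 pages when the pager text is absent or malformed
        return int(match.group(1)) if match else 0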


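    # A refactoring sketch (not in the original source): the status -> desc
    # elif chain in crawl()'s finally block expressed as a dict lookup. The
    # helper name _desc_for_status is hypothetical; the constants are the
    # ones already used above.
    def _desc_for_status(self, status):
        mapping = {
            CRAWL_SUCCESS: CRAWL_SUCCESS_DESC,
            CRAWL_FAIL: CRAWL_FAIL_DESC,
            CRAWL_TIMEOUT: CRAWL_TIMEOUT_DESC,
            IDCARD_ERROR: IDCARD_ERROR_DESC,
            PASSWORD_ERROR: PASSWORD_ERROR_DESC,
            BCODE_ERROR: BCODE_ERROR_DESC,
        }
        return mapping.get(status, PROGRAM_ERROR_DESC)
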
    def zipToStr(self, content):
        '''
        Content fetched via urllib2 may be gzip-compressed and needs decompressing.
        :param content: the compressed response bytes
        :return: the decompressed bytes
        '''
        try:
            # 16 + zlib.MAX_WBITS tells zlib to expect a gzip wrapper
            conn = zlib.decompress(content, 16 + zlib.MAX_WBITS)
            return conn
        except Exception:
            s = traceback.format_exc()
            self.logger.error('failed to decompress response content: %s' % s)
            raise Exception('failed to decompress response content: %s' % s)