Ejemplo n.º 1
0
    def parse_detail(self, response):
        """Parse one land-listing announcement page and append it as a CSV row.

        Header metadata comes from XPath queries. The per-parcel figures come
        either from the free text of section "一" (see
        ``_parse_text_parcels``) or, when the page mentions "主要规划指标" and
        contains a ``<table>``, from that table. Deadlines and contact details
        are cut out of the numbered sections "四", "五" and "八". The row is
        appended to the file named by ``self.pathDetail``.

        Any exception is logged and swallowed, so a malformed page produces
        no row instead of stopping the crawl.

        Args:
            response: Response for the detail page; its ``body`` (decoded as
                UTF-8) and ``url`` are used.

        Yields:
            ``None`` once, after the row has been written.
        """
        try:
            html = response.body.decode('utf-8')
            data = Selector(text=html)
            # Flattened page text with non-breaking and ideographic spaces
            # removed; every regular expression below runs against it.
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            # self.reStr is interpolated inside "[...]" as the set of
            # characters a captured value may contain.
            chars = self.reStr
            # NOTE(review): reFunction(pattern, text) is a project helper not
            # visible here; from its use it appears to return the captured or
            # matched text and to tolerate a missing text -- confirm.

            # Fields that only one of the two parsing branches fills in.
            SYNX_15 = ''
            CRNX_16 = TZQD_22 = GJBGBAH_24 = ''
            JJFD_26 = GPKSSJ_27 = GPJZSJ_28 = ''

            # ---- Header metadata shared by every page layout ----
            top = '//div[@class="xxgk_xl_top"]/ul'
            # Document title
            WJBT_1 = data.xpath(
                '//div[@class="title"]/h1/text()').extract_first()
            # Publication time (embedded in an inline script)
            FBSJ_2 = reFunction(
                r"(\d{4}年\d{2}月\d{2}日 \d{2}:\d{2})';",
                data.xpath(
                    '//div[@class="toolbar"]/script[1]/text()').extract_first())
            # Article source
            WZLY_3 = reFunction(
                rf"document.write\('文章来源:([{chars}]*)'\);",
                data.xpath(
                    '//div[@class="toolbar"]/script[2]/text()').extract_first())
            # Index number
            SYH_4 = data.xpath(top + '/li[1]/span/text()').extract_first()
            # Information category
            XXFL_5 = data.xpath(top + '/li[2]/span/text()').extract_first()
            # Issuing agency
            FBJG_6 = reFunction(
                rf'str_1 = "([{chars}]*)";',
                data.xpath(top + '/li[3]/span/script/text()').extract_first())
            # Issue date
            FBRQ_7 = reFunction(
                rf'str_1 = "([{chars}]*)";',
                data.xpath(top + '/li[4]/span/script/text()').extract_first())
            # Document number
            WH_8 = data.xpath(top + '/li[5]/span/text()').extract_first()
            # Validity flag
            SFYX_9 = reFunction(
                rf"var  isok='([{chars}]*)';",
                data.xpath(top + '/li[6]/script/text()').extract_first())
            # Information name
            XXMC_10 = data.xpath(top + '/li[7]/span/text()').extract_first()
            # Body title
            ZWBT_11 = data.xpath(
                '//tr[@class="firstRow"]/td/text()').extract_first()

            # Numbered sections of the announcement, sliced once and reused.
            section_four = reFunction(r'四、[\s\S]*五、', items)
            section_five = reFunction(r'五、[\s\S]*六、', items)
            section_eight = reFunction(r'八、[\s\S]*', items)

            # ---- Per-parcel figures ----
            table = None
            if '主要规划指标' in items:
                # The parser is pinned so the result does not depend on which
                # parsers happen to be installed; lxml is the one Beautiful
                # Soup prefers by default when it is available.
                # NOTE(review): assumes lxml is installed -- confirm.
                table = BeautifulSoup(html, 'lxml').find('table')

            if not table:
                # Free-text layout. The contact phone no longer depends on at
                # least one parcel having been found.
                LXDH_37 = reFunction(
                    rf'联系电话:\s*([{chars}]*)\s*开户单位', section_eight)
                parcel = self._parse_text_parcels(items)
                ZDBH_12 = parcel['ZDBH_12']
                ZDZMJ_13 = parcel['ZDZMJ_13']
                ZDZL_14 = parcel['ZDZL_14']
                CRNX_16 = parcel['CRNX_16']
                RJL_17 = parcel['RJL_17']
                JZMD_18 = parcel['JZMD_18']
                LDL_19 = parcel['LDL_19']
                JZXG_20 = parcel['JZXG_20']
                TDYT_21 = parcel['TDYT_21']
                TZQD_22 = parcel['TZQD_22']
                BZJ_23 = parcel['BZJ_23']
                GJBGBAH_24 = parcel['GJBGBAH_24']
                QSJ_25 = parcel['QSJ_25']
                JJFD_26 = parcel['JJFD_26']
                GPKSSJ_27 = parcel['GPKSSJ_27']
                GPJZSJ_28 = parcel['GPJZSJ_28']
            else:
                # Table layout. The section may be numbered "八" or "七".
                # The former pattern '八|七、[\s\S]*' also matched a bare "八"
                # anywhere in the text, which then hid the phone number.
                LXDH_37 = reFunction(
                    rf'联系电话:\s*([{chars}]*)\s',
                    reFunction(r'[七八]、[\s\S]*', items))
                htmlTable = htmlTableTransformer()
                tdData = htmlTable.tableTrTdRegulation(table)
                # Parcel number
                ZDBH_12 = tdData.get('地块编号')
                # Listed area (m2)
                ZDZMJ_13 = tdData.get(r'挂牌面积(m2)')
                # Parcel location
                ZDZL_14 = tdData.get('土地坐落')
                # Term of use
                SYNX_15 = tdData.get('使用年限')
                # Starting price (10k yuan)
                QSJ_25 = tdData.get('起始价(万元)')
                # Land use
                TDYT_21 = tdData.get('土地用途')
                # Deposit (10k yuan)
                BZJ_23 = tdData.get('保证金(万元)')
                # "Main planning indicators" cell; the four ratios below are
                # cut out of its text.
                ZYGHZB = tdData.get('主要规划指标')
                metric = r'[:]*\s*([()\w\.:: \(\)〔〕≤≥\-\/\%,、\.﹪]*)'
                # Plot ratio
                RJL_17 = reFunction('容积率' + metric + '[;。,]?', ZYGHZB)
                # Building density
                JZMD_18 = reFunction('建筑密度' + metric + '容积率', ZYGHZB)
                # Green-space ratio
                LDL_19 = reFunction('绿地率' + metric + '[;。,]?', ZYGHZB)
                # Building height limit
                JZXG_20 = reFunction('建筑限高' + metric + '[;。,]?', ZYGHZB)

            # ---- Deadlines (sections "四" and "五") ----
            # Time window for obtaining the grant documents
            HQCRWJSJ_29 = reFunction(
                rf'申请人可于(?:[\s]*)([{chars}]*)到', section_four)
            # Place for obtaining the grant documents
            HQCRWJDD_30 = reFunction(
                rf'申请人可于(?:[\s]*)(?:[{chars}]*)到\s*([{chars}]*)获取 挂牌',
                section_four)
            # Registration time
            BMSJ_31 = reFunction(
                rf'申请人可于(?:[\s]*)([{chars}]*)到', section_five)
            # Registration place
            BMDD_32 = reFunction(
                rf'申请人可于(?:[\s]*)(?:[{chars}]*)到\s*([{chars}]*)向我局提交书面申请',
                section_five)
            # Deposit deadline
            BZJJZSJ_33 = reFunction(
                rf'截止时间为(?:[\s]*)([{chars}]*)\s*。经审', section_five)
            # Time by which bidding eligibility is confirmed
            QRJMZGSJ_34 = reFunction(
                rf'我局将在\s*([{chars}]*)\s*前确认其竞买资格', section_five)

            # ---- Contact and bank details (section "八") ----
            # Contact address
            LXDZ_35 = reFunction(
                rf'联系地址:\s*([{chars}]*)\s*联 系', section_eight)
            # Contact person
            LXR_36 = reFunction(
                rf'联 系\s*人:\s*([{chars}]*)\s*联系电话', section_eight)
            # Account holder
            KHDW_38 = reFunction(
                rf'开户单位:\s*([{chars}]*)\s*开户银行', section_eight)
            # Bank
            KHYH_39 = reFunction(
                rf'开户银行:\s*([{chars}]*)\s*银行帐号', section_eight)
            # Bank account number: only the leading 17 digits are kept.
            YHZH_40 = reFunction(
                r'^\d{17}',
                reFunction(rf'银行帐号:\s*([{chars}]*)\s*', section_eight))

            # Crawl timestamp
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # Page URL
            url = response.url
            # Unique key derived from the URL
            md5Mark = encrypt_md5(url)

            # Column order of the output file.
            csvFile = [
                WJBT_1,
                FBSJ_2,
                WZLY_3,
                SYH_4,
                XXFL_5,
                FBJG_6,
                FBRQ_7,
                WH_8,
                SFYX_9,
                XXMC_10,
                ZWBT_11,
                ZDBH_12,
                ZDZMJ_13,
                ZDZL_14,
                SYNX_15,
                CRNX_16,
                RJL_17,
                JZMD_18,
                LDL_19,
                JZXG_20,
                TDYT_21,
                TZQD_22,
                BZJ_23,
                GJBGBAH_24,
                QSJ_25,
                JJFD_26,
                GPKSSJ_27,
                GPJZSJ_28,
                HQCRWJSJ_29,
                HQCRWJDD_30,
                BMSJ_31,
                BMDD_32,
                BZJJZSJ_33,
                QRJMZGSJ_34,
                LXDZ_35,
                LXR_36,
                LXDH_37,
                KHDW_38,
                KHYH_39,
                YHZH_40,
                crawlingTime,
                url,
                md5Mark,
            ]
            results = ''
            for value in csvFile:
                try:
                    # Empty values and values made only of '|' separators
                    # become an empty cell; commas and line breaks are removed
                    # so the value cannot break the row.
                    if value and value != '|' * len(value):
                        results += value.replace(',', ' ').replace(
                            '\n', '').replace('\r', '') + ','
                    else:
                        results += ','
                except Exception as e:
                    # A non-string value (e.g. from the table) must not lose
                    # the whole row: write an empty cell and log the problem.
                    results += ','
                    self.log(
                        f'{getVariableName(value).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            with open(self.pathDetail, 'a+') as fp:
                fp.write(results)
                fp.write('\n')
            self.log('数据获取成功', level=logging.INFO)
            yield
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)

    def _parse_text_parcels(self, items):
        """Extract the per-parcel figures from the free text of section "一".

        The text between "一" and "二" is split on "宗地编号" into one chunk
        per parcel. For every chunk each field contributes ``'|' + value`` to
        its accumulated string, so the n-th '|'-separated entry of every field
        belongs to the n-th parcel.

        This replaces two nearly identical copies of the loop. One copy
        appended the parcel number and area without a separator when the
        primary pattern matched; both layouts now always use the separator.

        Args:
            items: Flattened page text.

        Returns:
            dict mapping the CSV field names ``'ZDBH_12'`` ... ``'GPJZSJ_28'``
            (without ``'SYNX_15'``) to the accumulated strings.

        Raises:
            IndexError: If the text contains no "一 ... 二" section.
        """
        chars = self.reStr

        def bounded(start, end):
            # Value that follows "<start>:" and runs up to "<end>".
            return re.compile(rf'{start}:(?:[\s]*)([{chars}]*)\s*{end}')

        def joined(pattern, text):
            return '|'.join(pattern.findall(text))

        # Fields that are simply bracketed by two labels.
        simple = {
            # Parcel location
            'ZDZL_14': bounded('宗地坐落', '出让年限'),
            # Plot ratio
            'RJL_17': bounded('容积率', r'建筑密度\(%\)'),
            # Building density (%)
            'JZMD_18': bounded(r'建筑密度\(%\)', '绿化率'),
            # Green-space ratio (%)
            'LDL_19': bounded(r'绿化率\(%\)', '建筑限高'),
            # Building height limit (m)
            'JZXG_20': bounded(r'建筑限高\(米\)', '土地用途明细'),
            # Land use details
            'TDYT_21': bounded('土地用途明细', '投资强度'),
            # Investment intensity
            'TZQD_22': bounded('投资强度', '保证金'),
            # Deposit
            'BZJ_23': bounded('保证金', '估价报告备案号'),
            # Starting price
            'QSJ_25': bounded('起始价', '加价幅度'),
            # Bid increment
            'JJFD_26': bounded('加价幅度', '挂牌开始时间'),
            # Listing start time
            'GPKSSJ_27': bounded('挂牌开始时间', '挂牌截止时间'),
            # Listing end time
            'GPJZSJ_28': bounded('挂牌截止时间', '(?:宗地编号|二)'),
        }
        # Fields with a primary pattern and a looser fallback.
        number_primary = re.compile(
            rf'[宗地块]*(?:[\s]*)编号:(?:[\s]*)([{chars}]*)宗地总面积')
        number_fallback = re.compile(
            rf'[宗地块]*(?:[\s]*)编号:(?:[\s]*)([{chars}]*)\s*')
        area_primary = re.compile(
            rf'宗地总面积:(?:[\s]*)([{chars}]*)宗地坐落')
        area_fallback = re.compile(
            rf'宗地总面积:(?:[\s]*)([{chars}]*)\s*')
        term = bounded('出让年限', '容积率')
        filing_primary = re.compile(
            rf'估价报告备案号(?:[\s]*)([{chars}]*)\s*现状土地条件')
        filing_fallback = re.compile(
            rf'估价报告备案号(?:[\s]*)([{chars}]*)\s*起始价')

        out = dict.fromkeys(
            ['ZDBH_12', 'ZDZMJ_13', 'CRNX_16', 'GJBGBAH_24', *simple], '')

        section = re.findall(r'一([\s\S]*)二', items)[0]
        for piece in section.split('宗地编号')[1:]:
            chunk = '宗地编号' + piece

            # Parcel number
            out['ZDBH_12'] += '|' + (joined(number_primary, chunk)
                                     or joined(number_fallback, chunk))
            # Total parcel area
            out['ZDZMJ_13'] += '|' + (joined(area_primary, chunk)
                                      or joined(area_fallback, chunk))
            # Grant term: keep only the leading "<n>年" part.
            out['CRNX_16'] += '|' + reFunction(
                r'^[|]*\d{1,3}年', joined(term, chunk))
            # Valuation-report filing number: keep the leading 10-16 word
            # characters. The former pattern '^\w{10, 16}' had a space inside
            # the braces, which Python's re treats as literal text, and a '|'
            # was prepended before the '^\w' filter; either defect alone made
            # this column always empty.
            out['GJBGBAH_24'] += '|' + reFunction(
                r'^\w{10,16}',
                joined(filing_primary, chunk)
                or joined(filing_fallback, chunk))

            for name, pattern in simple.items():
                out[name] += '|' + joined(pattern, chunk)
        return out
Ejemplo n.º 2
0
    def parse_detail(self, response):
        """Parse a yearly land-supply plan page and append it as a CSV row.

        For each of the four land categories (housing, commercial, industrial
        and storage, infrastructure and public-welfare allocation) the planned
        supply area is looked up in three places, and the first non-empty
        result wins:

        1. the "指标安排 ... 布局" section of the text,
        2. the first ``MsoNormalTable`` table, or else the first table with
           ``border="1"``,
        3. the whole page text.

        The row is appended to the file named by ``self.pathDetail``. Any
        exception is logged and swallowed, so a malformed page produces no
        row instead of stopping the crawl.

        Args:
            response: Response for the detail page; its ``body`` (decoded as
                GBK), ``url`` and ``meta['ND']`` are used.

        Yields:
            ``None`` once, after the row has been written.
        """
        try:
            html = response.body.decode('gbk')
            data = Selector(text=html)
            # Flattened page text with non-breaking and ideographic spaces
            # removed; every regular expression below runs against it.
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            # NOTE(review): reFunction(pattern, text) is a project helper not
            # visible here; from its use it appears to return the captured
            # text or an empty value -- confirm.

            # Year: the part of meta['ND'] before the first '-'.
            year = response.meta.get('ND')
            ND_1 = year.split('-')[0] if year else ''

            # "<number>公顷" followed by an ASCII or full-width comma. One
            # pattern used to spell the class as [\d\万.]; that is the same
            # character set.
            area = r'\s*([\d\.万]*)公顷[,,]'

            # ---- 1. Structured "指标安排 ... 布局" section ----
            ZFYD_2_0 = SFYD_3_0 = GKCC_4_0 = JCSS_5_0 = ''
            if '指标安排' in items:
                plan = reFunction(r'指标安排[\s\S]*布局', items)
                # Housing land (incl. affordable and commodity housing)
                ZFYD_2_0 = reFunction('住房用地' + area, plan)
                # Commercial and service land
                SFYD_3_0 = reFunction('商服用地' + area, plan)
                # Industrial, mining and storage land
                GKCC_4_0 = reFunction('工矿仓储用地' + area, plan)
                # Allocated land for infrastructure and public welfare
                JCSS_5_0 = reFunction('基础设施及公益事业类划拨用地' + area, plan)

            # ---- 2. Table ----
            ZFYD_2_1 = SFYD_3_1 = GKCC_4_1 = JCSS_5_1 = ''
            if data.xpath(
                    '//table[@class="MsoNormalTable"][1] | //table[@border="1"]'
            ):
                # The parser is pinned so the result does not depend on which
                # parsers happen to be installed; lxml is the one Beautiful
                # Soup prefers by default when it is available.
                # NOTE(review): assumes lxml is installed -- confirm.
                soup = BeautifulSoup(html, 'lxml')
                # find() instead of find_all(...)[0]: a table that the XPath
                # saw but Beautiful Soup does not find now yields no table
                # data instead of an IndexError that dropped the whole row.
                table = (soup.find('table', attrs={'class': 'MsoNormalTable'})
                         or soup.find('table', border="1"))
                htmlTable = htmlTableTransformer()
                tdData = {}
                if table is not None:
                    try:
                        tdData = htmlTable.tableTrTdRegulation(table)
                    except Exception:
                        # Deliberate best effort: retry once without the first
                        # row, and fall back to the text patterns if that
                        # fails as well.
                        try:
                            table.tbody.find_all('tr')[0].extract()
                            tdData = htmlTable.tableTrTdRegulation(table)
                        except Exception:
                            tdData = {}
                # Housing land (incl. affordable and commodity housing)
                ZFYD_2_1 = tdData.get('住房用地(含保障性住房用地、商品住房用地)')
                # Commercial and service land
                SFYD_3_1 = tdData.get('商服用地')
                # Industrial, mining and storage land
                GKCC_4_1 = tdData.get('工矿仓储用地')
                # Allocated land for infrastructure and public welfare
                JCSS_5_1 = tdData.get('基础设施及公益事业等划拨用地')

            # ---- 3. Whole page text ----
            # Housing land: first candidate longer than one character.
            housing_candidates = [
                reFunction('全国住房用地计划供应*' + area, items),
                reFunction('住房用地计划供应*' + area, items),
                reFunction('住宅用地*' + area, items),
            ]
            ZFYD_2_2 = next(
                (found for found in housing_candidates if len(found) > 1), '')

            # Commercial land: three wordings, tried in order. The second
            # wording used to be gated on a narrower pattern ("公顷,") than the
            # one whose result was returned ("公顷[,,]"), so a match ending
            # in a full-width comma was skipped.
            SFYD_3_2 = (reFunction('商服用地' + area, items)
                        or reFunction('商业用地计划出让' + area, items)
                        or reFunction('商业住房用地计划供应' + area, items))

            # Industrial, mining and storage land
            GKCC_4_2 = reFunction('工矿仓储用地' + area, items)
            # Allocated land for infrastructure and public welfare
            JCSS_5_2 = reFunction('基础设施及公益事业类划拨用地' + area, items)

            # First non-empty source wins: section, then table, then text.
            ZFYD_2 = next(filter(None, [ZFYD_2_0, ZFYD_2_1, ZFYD_2_2]), '')
            SFYD_3 = next(filter(None, [SFYD_3_0, SFYD_3_1, SFYD_3_2]), '')
            GKCC_4 = next(filter(None, [GKCC_4_0, GKCC_4_1, GKCC_4_2]), '')
            JCSS_5 = next(filter(None, [JCSS_5_0, JCSS_5_1, JCSS_5_2]), '')

            # Crawl timestamp
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # Page URL
            url = response.url
            # Unique key derived from the URL
            md5Mark = encrypt_md5(url)

            # Column order of the output file.
            csvFile = [
                ND_1,
                ZFYD_2,
                SFYD_3,
                GKCC_4,
                JCSS_5,
                crawlingTime,
                url,
                md5Mark,
            ]
            results = ''
            for value in csvFile:
                try:
                    # Empty values and values made only of '|' separators
                    # become an empty cell; commas, line breaks and
                    # non-breaking spaces (real or spelled out as the four
                    # characters "\xa0") are removed.
                    if value and value != '|' * len(value):
                        results += value.replace(',', ' ').replace(
                            '\n', '').replace('\r', '').replace(
                                r'\xa0', '').replace('\xa0', '') + ','
                    else:
                        results += ','
                except Exception as e:
                    # A non-string value (e.g. from the table) must not lose
                    # the whole row: write an empty cell and log the problem.
                    results += ','
                    self.log(
                        f'{getVariableName(value).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            with open(self.pathDetail, 'a+') as fp:
                fp.write(results)
                fp.write('\n')
            self.log('数据获取成功', level=logging.INFO)
            yield
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
Ejemplo n.º 3
0
    def parse_detail(self, response):
        try:
            # 数据获取不全
            categorynum = response.meta.get('categorynum')
            infoid = response.meta.get('infoid')
            targetUrl = "https://www.cqggzy.com/tiaozhuan.html?infoid=" + infoid + "&categorynum=" + categorynum
            results = ''
            for _ in range(5):
                try:
                    self.session.get(targetUrl,
                                     headers=self.header,
                                     allow_redirects=False,
                                     timeout=60)
                    redirectUrl = 'https://www.cqggzy.com/EpointWebBuilderService/getInfoListAndCategoryList.action?cmd=pageRedirect'
                    data = {'categorynum': categorynum, 'infoid': infoid}
                    response_ = self.session.post(redirectUrl,
                                                  headers=self.header,
                                                  data=data,
                                                  allow_redirects=False,
                                                  timeout=60)
                    url = 'https://www.cqggzy.com' + response_.json().get(
                        'custom') if 'http' not in response_.json().get(
                            'custom') else response_.json().get('custom')
                    results = self.session.get(url,
                                               headers=self.header,
                                               allow_redirects=False,
                                               timeout=60)
                    break
                except Exception as e:
                    pass

            data = Selector(text=results.content.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            WJBT_1 = ''
            XXSJ_2 = ''
            TDWZ_3 = ''
            YT_4 = ''
            TDMJ_5 = ''
            ZJRJZMJ_6 = ''
            ZDJZMD_7 = ''
            LDL_9 = ''
            CRJKQSJ_8 = ''
            JMBZJ_11 = ''
            BH_12 = ''
            CYNB_13 = ''
            KJMJ_14 = ''
            TZQD_15 = ''
            CCYQ_16 = ''
            BZ_17 = ''
            HQCRWJSJ_18 = ''
            HQCRWJDD_19 = ''
            BMSJ_20 = ''
            BMDD_21 = ''
            BZJJZSJ_22 = ''
            QRJMZGSJ_23 = ''
            LXDZ_24 = ''
            LXDH_25 = ''
            LXR_26 = ''

            # 共有字段
            # 文件标题
            WJBT_1 = data.xpath(
                '//*[@class="article-title"]/text()').extract_first()
            # 信息时间
            XXSJ_2 = reFunction(
                '(\d{4}-\d{1,2}-\d{1,2})',
                data.xpath(
                    '//*[@class="info-source"]/text()[1]').extract_first())
            if (('总计容建筑面' in items and '序号' in items)
                    or data.xpath('//table')) and '宗地编号' not in items:
                # TODO
                soup = BeautifulSoup(results.content.decode('utf-8'))
                tableMso = soup.find('table', 'MsoTableGrid')
                table = soup.find('table')
                htmlTable = htmlTableTransformer()
                try:
                    if tableMso:
                        tdData = htmlTable.table_tr_td(table)
                    else:
                        tdData = htmlTable.tableTrTdRegulation(table)
                    sourceTdData = tdData
                    for key, value in tdData.items():
                        tdData[key] = value.replace(str(key),
                                                    '') if value else value
                    # 土地位置   //table[@class="MsoNormalTable"]
                    TDWZ_3 = tdData.get('土地位置')
                    # 用途
                    YT_4 = tdData.get('土地用途') if tdData.get(
                        '土地用途') else tdData.get('用途')
                    # 土地面积(m)
                    TDMJ_5 = tdData.get('土地面积(m)') if tdData.get(
                        '土地面积 (m)') else tdData.get('土地面积 (㎡)')
                    # 总计容建筑面积(m2)
                    ZJRJZMJ_6 = tdData.get('总计容建筑面积(㎡)')
                    # 最大建筑密度
                    ZDJZMD_7 = tdData.get('最大建筑密度')
                    # 绿地率
                    LDL_9 = tdData.get('绿地率')
                    # TODO 正则匹配
                    if not ZDJZMD_7 and not LDL_9:
                        # sourceTdData
                        for value in sourceTdData.values():
                            if '最大建筑密度' in value:
                                ZDJZMD_7 = value.replace('最大建筑密度', '')
                            if '绿地率' in value:
                                LDL_9_ = value.replace('绿地率', '')
                                LDL_9 = LDL_9_ if len(
                                    LDL_9_
                                ) < 10 else reFunction(
                                    f'绿地率[:]*\s*([()\w\.:: \(\)〔〕≤≥\-\/\%,、\.﹪]*)[;。,]?',
                                    value)
                            if '总计容建筑面积' in value:
                                LDL_9 = value.replace('总计容建筑面积(㎡)', '')
                    # 出让价款起始价(万元)
                    CRJKQSJ_8 = tdData.get('出让价款起始价(万元)')
                    # 投标竞买保证金(万元)  保证金(万元)
                    JMBZJ_11 = tdData.get('保证金(万元)') if tdData.get(
                        '保证金(万元)') else tdData.get('投标、竞买保证金(万元)')
                    # 编号
                    BH_12 = tdData.get('编号')
                    # 产业类别
                    CYNB_13 = tdData.get('产业类别')
                    # 可建面积(m2)或容积率
                    KJMJ_14 = tdData.get('可建面积(㎡)或容积率')
                    # 投资强度(万元 / 公顷)
                    TZQD_15 = tdData.get('投资强度(万元/公顷)')
                    # 产出要求(万元 / 公顷)
                    CCYQ_16 = tdData.get('产出要求(万元/公顷)')
                    # 备注  其他需要说明的宗地情况:
                    BZ_17_ = tdData.get('序号').split(
                        '备注:')[-1] if '备注' in tdData.get('序号') else tdData.get(
                            '备注:')
                    other = tdData.get('序号').split(
                        '其他需要说明的宗地情况:')[-1] if '其他需要说明的宗地情况:' in tdData.get(
                            '序号') else tdData.get('其他需要说明的宗地情况:')
                    BZ_17 = other if not BZ_17_ else BZ_17_
                    # 获取出让文件时间
                    HQCRWJSJ_18 = reFunction(
                        '竞买申请人可在([\w :\.\-\s\/\%,、]*)。',
                        reFunction('二、([\s\S]*)三、', items))
                    # 获取出让文件地点
                    HQCRWJDD_19 = reFunction(
                        '网址:([\w :\.\-\s\/\%,、]*)(?:[\)\s]*)',
                        reFunction('二、([\s\S]*)三、', items))
                    # 报名时间
                    BMSJ_20 = reFunction(
                        '竞买申请人可在([\w \.:\-\s\/\%,、]*)\(报名时间\)',
                        reFunction('三、([\s\S]*)四、', items))
                    # 保证金截止时间
                    BZJJZSJ_22 = reFunction(
                        '竞买保证金到账截止时间为([\w \.:\-\s\/\%,、]*)。',
                        reFunction('三、([\s\S]*)四、', items))
                    # 确认竞买资格时间
                    QRJMZGSJ_23 = BZJJZSJ_22
                    # 联系地址
                    LXDZ_24 = '|'.join(
                        re.findall('联系地址:([\w 、\.:\-\/\%,、()]*)(?:[,\n])',
                                   reFunction('七、([\s\S]*)', items)))
                    # 联系电话
                    LXDH_25 = '|'.join(
                        re.findall(
                            '[联系]*电话[::]([\w 、\.:\-\/\%,、()]*)(?:[\n。])',
                            reFunction('七、([\s\S]*)', items)))
                    # 联系人
                    LXR_26 = '|'.join(
                        re.findall('联系人[::]([\w 、\.:\-\/\%,、()]*)(?:[ ,]*)',
                                   reFunction('七、([\s\S]*)', items)))
                except:
                    for item in [
                            '宗地编号' + _ for _ in re.findall(
                                '一、([\s\S]*)二、', items)[0].split('宗地编号')[1:]
                    ]:
                        # 土地位置
                        TDWZ_3 += '|' + reFunction(
                            '宗地坐落:([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                        # 用途
                        YT_4_1 = reFunction(
                            '主要用途:(?:[\s]*)([\w :\.\- \/\%,、]*)(?:\s)', item)
                        YT_4_2 = reFunction(
                            '土地用途[:](?:[\s]*)([\w ::\.\- \/\%,、]*)(?:\s)',
                            item)
                        YT_4 += '|' + YT_4_1 + YT_4_2
                        # 土地面积(m)
                        TDMJ_5 += '|' + reFunction(
                            '宗地总面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)', item
                        ) if reFunction(
                            '宗地总面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)', item
                        ) else '|' + reFunction(
                            '宗地面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)', item)
                        # 最大建筑密度
                        ZDJZMD_7 += '|' + reFunction(
                            '建筑密度\(%\):([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                        # 绿地率
                        LDL_9 += '|' + reFunction(
                            '绿地率\(%\)[:]([\w :\.\-\s\/\%,、≤;≥]*)(?:\s)', item)
                        # 编号
                        BH_12 += '|' + reFunction(
                            '宗地编号[:]([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                        # 投资强度(万元 / 公顷)
                        TZQD_15 += '|' + reFunction(
                            '投资强度[:]([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                        # 备注
                        BZ_17 += '|' + reFunction('备注:([\s\S]*)', item)
                    # TODO 获取出让文件时间
                    HQCRWJSJ_18 = reFunction(
                        '申请人可于([\w :\.\-\s\/\%,、]*)到',
                        reFunction('四、([\s\S]*)五、', items))
                    # 获取出让文件地点
                    HQCRWJDD_19 = reFunction(
                        '申请人可于(?:[\w :\.\-\s\/\%,、]*)到([\w :\.\-\s\/\%,、]*)获取',
                        reFunction('四、([\s\S]*)五、', items))
                    # 报名时间
                    BMSJ_20 = reFunction('申请人可于([\w \.:\-\s\/\%,、]*)到',
                                         reFunction('五、([\s\S]*)六、', items))
                    # 保证金截止时间
                    BZJJZSJ_22 = reFunction(
                        '竞买保证金的截止时间为([\w \d\.:\-\s\/\%,、 ]*)。',
                        reFunction('五、([\s\S]*)六、', items))
                    # 确认竞买资格时间
                    QRJMZGSJ_23 = BZJJZSJ_22
                    # 联系地址
                    LXDZ_24 = '|'.join(
                        re.findall('联系地址:([\w 、\.:\-\/\%,、()]*)(?:[,\n])',
                                   reFunction('八|七、([\s\S]*)', items)))
                    # 联系电话
                    LXDH_25 = '|'.join(
                        re.findall(
                            '[联系]*电话[::]([\w 、\.:\-\/\%,、()]*)(?:[\n。])',
                            reFunction('八|七、([\s\S]*)', items)))
                    # 联系人
                    LXR_26 = '|'.join(
                        re.findall('联 系 人[::]([ \w]*)(?:[\n]*)',
                                   reFunction('八|七、([\s\S]*)', items)))
            else:
                for item in [
                        '宗地编号' + _ for _ in re.findall('一、([\s\S]*)二、', items)
                    [0].split('宗地编号')[1:]
                ]:
                    # 土地位置
                    TDWZ_3 += '|' + reFunction(
                        '宗地坐落:([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                    # 用途
                    YT_4_1 = reFunction(
                        '主要用途:(?:[\s]*)([\w :\.\- \/\%,、]*)(?:\s)', item)
                    YT_4_2 = reFunction(
                        '土地用途[:](?:[\s]*)([\w ::\.\- \/\%,、]*)(?:\s)', item)
                    YT_4 += '|' + YT_4_1 + YT_4_2
                    # 土地面积(m)
                    TDMJ_5 += '|' + reFunction(
                        '宗地总面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)',
                        item) if reFunction(
                            '宗地总面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)', item
                        ) else '|' + reFunction(
                            '宗地面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)', item)
                    # 最大建筑密度
                    ZDJZMD_7 += '|' + reFunction(
                        '建筑密度:([\w :\.\-\s\/\%,、≦;≥]*)(?:\s)', item)
                    # 绿地率
                    LDL_9 += '|' + reFunction(
                        '绿地率\(%\)[:]([\w :\.\-\s\/\%,、≤;≥]*)(?:\s)', item)
                    # 编号
                    BH_12 += '|' + reFunction(
                        '宗地编号[:]([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                    # 投资强度(万元 / 公顷)
                    TZQD_15 += '|' + reFunction(
                        '投资强度[:]([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                    # 备注
                    BZ_17 += '|' + reFunction('备注:([\s\S]*)', item)
                # TODO 获取出让文件时间
                HQCRWJSJ_18 = reFunction('申请人可于([\w :\.\-\s\/\%,、]*)到',
                                         reFunction('四、([\s\S]*)五、', items))
                # 获取出让文件地点
                HQCRWJDD_19 = reFunction(
                    '申请人可于(?:[\w :\.\-\s\/\%,、]*)到([\w :\.\-\s\/\%,、]*)获取',
                    reFunction('四、([\s\S]*)五、', items))
                # 报名时间
                BMSJ_20 = reFunction('申请人可于([\w \.:\-\s\/\%,、]*)到',
                                     reFunction('五、([\s\S]*)六、', items))
                # 保证金截止时间
                BZJJZSJ_22 = reFunction('竞买保证金的截止时间为([\w \d\.:\-\s\/\%,、 ]*)。',
                                        reFunction('五、([\s\S]*)六、', items))
                # 确认竞买资格时间
                QRJMZGSJ_23 = BZJJZSJ_22
                # 联系地址
                LXDZ_24 = '|'.join(
                    re.findall('联系地址:([\w 、\.:\-\/\%,、()]*)(?:[,\n])',
                               reFunction('八|七、([\s\S]*)', items)))
                # 联系电话
                LXDH_25 = '|'.join(
                    re.findall('[联系]*电话[::]([\w 、\.:\-\/\%,、()]*)(?:[\n。])',
                               reFunction('八|七、([\s\S]*)', items)))
                # 联系人
                LXR_26 = '|'.join(
                    re.findall('联 系 人[::]([ \w]*)(?:[\n]*)',
                               reFunction('八|七、([\s\S]*)', items)))
            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # 爬取地址url
            url = url if url else response.url
            # 唯一标识
            md5Mark = encrypt_md5(url)

            # 存储数据
            csvFile = [
                WJBT_1,
                XXSJ_2,
                TDWZ_3,
                YT_4,
                TDMJ_5,
                ZJRJZMJ_6,
                ZDJZMD_7,
                LDL_9,
                CRJKQSJ_8,
                JMBZJ_11,
                BH_12,
                CYNB_13,
                KJMJ_14,
                TZQD_15,
                CCYQ_16,
                BZ_17,
                HQCRWJSJ_18,
                HQCRWJDD_19,
                BMSJ_20,
                BMDD_21,
                BZJJZSJ_22,
                QRJMZGSJ_23,
                LXDZ_24,
                LXDH_25,
                LXR_26,
                crawlingTime,
                url,
                md5Mark,
            ]
            results = ''
            for _ in csvFile:
                try:
                    if _ and _ != '|' * len(_):
                        results += _.replace(',', ' ').replace(
                            '\n', '').replace('\r', '') + ','
                    else:
                        results += ','
                except Exception as e:
                    results += ','
                    self.log(
                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            with open(self.pathDetail, 'a+') as fp:
                fp.write(results)
                fp.write('\n')
            yield
        except Exception as e:
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
Ejemplo n.º 4
0
    def parse_detail(self, response):
        """Extract the parcel fields of one detail page and append a CSV row.

        The layout of the page is chosen from marker phrases in the page text:

        * no '宗地编号' and no '配套建筑规划用地': up to three table layouts
          are tried in turn (a ``border="0"`` table without '主要规划指标',
          a '规划指标要求' table, a '标的序号' table); later matches
          overwrite the fields set by earlier ones;
        * otherwise: a free-text layout split on '宗地编号' (one
          '|'-prefixed value per parcel) and/or a '配套建筑规划用地' table.

        The resulting row (comma separated, trailing comma kept) is appended
        to ``self.pathDetail``.  Any exception is logged and swallowed, so a
        page that cannot be parsed produces no row.

        Args:
            response: the detail-page response; ``response.meta['title']``
                supplies the title column and ``response.url`` the URL
                column and the md5 mark.

        Yields:
            ``None`` once, after the row has been written.
        """

        def first_value(mapping, *keys):
            """Return the first truthy ``mapping.get(key)``.

            When no key yields a truthy value, the value of the LAST key is
            returned (``None`` or an empty value), which is what the chained
            ``a if a else b`` lookups produce.
            """
            value = None
            for key in keys:
                value = mapping.get(key)
                if value:
                    return value
            return value

        def tbody_row(table, index):
            """Return the ``index``-th <tr> under ``table.tbody``.

            The row list is queried afresh on every call on purpose: the
            callers mutate the tree between look-ups.
            """
            return table.tbody.find_all('tr')[index]

        def fold_extra_header_rows(table):
            """Append the cells of rows 2 and 3 to rows 0 and 1, drop row 2.

            The cells are inserted right after the last <td> of the target
            row.  Raises IndexError when the table has fewer than four rows
            or a target row has no <td>.
            """
            last_cell0 = tbody_row(table, 0).find_all('td')[-1]
            last_cell1 = tbody_row(table, 1).find_all('td')[-1]
            anchor0 = tbody_row(table, 0).index(last_cell0)
            anchor1 = tbody_row(table, 1).index(last_cell1)
            cells2 = tbody_row(table, 2).find_all('td')
            cells3 = tbody_row(table, 3).find_all('td')
            for offset, cell in enumerate(cells2, start=1):
                tbody_row(table, 0).insert(anchor0 + offset, cell)
            for offset, cell in enumerate(cells3, start=1):
                tbody_row(table, 1).insert(anchor1 + offset, cell)
            tbody_row(table, 2).extract()

        try:
            # Decode once; every parser below works on the same text.
            html = response.body.decode('utf-8')
            data = Selector(text=html)
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')

            # Every per-parcel column starts empty; the free-text layout
            # appends to these with '+='.
            ZDBH_4 = ZDZL_5 = MJ_6 = TDYT_7 = CRNX_8 = RJL_9 = ''
            LDL_10 = JZMD_11 = JZXG_12 = JMBZJ_13 = QSJ_14 = ZJFD_15 = ''

            # ---- fields common to every layout ----
            # Title
            WJBT_1 = response.meta.get('title')
            # Article source; a missing <span> yields '' instead of aborting
            # the whole record with AttributeError.
            WZLY_2 = (data.xpath('//div[@class="news_time"]/span[1]/text()'
                                 ).extract_first() or '').replace('文章来自:', '')
            # Update time (same None guard as above)
            GXSJ_3 = (data.xpath('//div[@class="news_time"]/span[2]/text()'
                                 ).extract_first() or '').replace('更新时间:', '')
            # Remarks
            QTSM_17 = reFunction(rf'备注(?:[\s]*)([{self.reStr}]*)\s', items)

            if '宗地编号' not in items and '配套建筑规划用地' not in items:
                # ---- table layouts ----
                if data.xpath(
                        '//table[@border="0"]') and '主要规划指标' not in items:
                    soup = BeautifulSoup(html)
                    table = soup.find('table')
                    htmlTable = htmlTableTransformer()
                    # presumably maps header text -> cell text; defined
                    # elsewhere in the project
                    tdData = htmlTable.tableTrTdRegulation(table)
                    # Parcel number
                    ZDBH_4 = tdData.get('地块编号')
                    # Parcel location
                    ZDZL_5 = tdData.get('土地位置')
                    # Area
                    MJ_6 = tdData.get('土地面积(平方米)')
                    # Land use
                    TDYT_7 = tdData.get('土地用途')
                    # Lease term
                    CRNX_8 = first_value(tdData, '出让年限(年)', '出让年限')
                    # Plot ratio
                    RJL_9 = first_value(tdData, '容积率', '容积率(不大于)')
                    # Green-space ratio
                    LDL_10 = first_value(tdData, '绿地率', '绿地率(不小于)')
                    # Building density
                    JZMD_11 = tdData.get('建筑密度')
                    # Building height limit
                    JZXG_12 = tdData.get('建筑高度')
                    # Bid deposit
                    JMBZJ_13 = first_value(tdData, '竞买保证金(万元)',
                                           '竞买保证金(元)')
                    # Starting price
                    QSJ_14 = tdData.get('起始价(万元)')
                    # Bid increment
                    ZJFD_15 = first_value(tdData, '增价幅度(万元)', '加价幅度')

                if '规划指标要求' in items:
                    soup = BeautifulSoup(html)
                    table = soup.find('table')
                    # The first row has a spanning header cell (colspan 4,
                    # else 3) whose sub-headers sit in the second row.
                    spanning_cell = tbody_row(table, 0).find(
                        'td', colspan='4') or tbody_row(table, 0).find(
                            'td', colspan='3')
                    try:
                        # Replace the spanning cell by the second row's cells
                        # so the table has a single header row.
                        anchor = tbody_row(table, 0).index(spanning_cell)
                        sub_cells = tbody_row(table, 1).find_all('td')
                        for offset, cell in enumerate(sub_cells, start=1):
                            tbody_row(table, 0).insert(anchor + offset, cell)
                        spanning_cell.extract()
                        tbody_row(table, 1).extract()
                    except Exception as e:
                        # Best effort: fall through and parse the table as it
                        # is.  (Was a bare ``except`` that also swallowed
                        # KeyboardInterrupt / SystemExit.)
                        self.log(f'表头合并失败, 已跳过: {e}',
                                 level=logging.DEBUG)
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.tableTrTdRegulation(table)
                    # Parcel number
                    ZDBH_4 = tdData.get('地块编号')
                    # Parcel location, with the remark text removed from it.
                    # A missing location no longer raises AttributeError.
                    location = first_value(tdData, '土地位置', '地块位置/名称')
                    if location:
                        remark = reFunction(
                            rf'备注(?:[\s]*)([{self.reStr}]*)\s',
                            reFunction(r'一([\s\S]*)二', items))
                        ZDZL_5 = location.replace(
                            remark, '') if remark else location
                    else:
                        ZDZL_5 = location
                    # Area
                    MJ_6 = first_value(tdData, '土地面积(m2)', '土地面积(平方米)')
                    # Land use
                    TDYT_7 = first_value(tdData, '土地用途', '规划地性质')
                    # Lease term.  The header may contain an ideographic
                    # space; the original raw-string key (a literal
                    # backslash-u sequence) is kept as a fallback.
                    CRNX_8 = first_value(tdData, '出让\u3000年限',
                                         r'出让\u3000年限', '出让年限',
                                         '出让年限(年)')
                    # Plot ratio
                    RJL_9 = first_value(tdData, '容积率', '容积率(不大于)')
                    # Green-space ratio
                    LDL_10 = first_value(tdData, '绿地率', '绿地率(%)',
                                         '绿地率(不小于)')
                    # Building density
                    JZMD_11 = first_value(tdData, '建筑\u3000密度', '建筑密度',
                                          '建筑密度(%)', '建筑\u3000密度(不大于)')
                    # Building height limit
                    JZXG_12 = first_value(tdData, '建筑限高', '建筑高度(m)',
                                          '建筑高度', '建筑限高(不高于)')
                    # Bid deposit
                    JMBZJ_13 = first_value(tdData, '竞买保证金(元)',
                                           '竞买保证金(万元)')
                    # Starting price
                    QSJ_14 = first_value(tdData, '起始价(元)', '挂牌出让起始价(元)',
                                         '起始价(万元)')
                    # Bid increment
                    ZJFD_15 = first_value(tdData, '增价幅度(万元)', '加价幅度')
                    # NOTE(review): this only fires when all three values are
                    # the empty string; a missing key gives None and skips
                    # the fallback -- confirm that this is intended.
                    if ZJFD_15 == '' and QSJ_14 == '' and JMBZJ_13 == '':
                        # Price columns live in extra header rows: re-parse
                        # (the tree above was mutated) and fold those rows.
                        soup = BeautifulSoup(html)
                        table = soup.find('table')
                        fold_extra_header_rows(table)

                        htmlTable = htmlTableTransformer()
                        tdDataCopy = htmlTable.tableTrTdRegulation(table)
                        # Bid deposit
                        JMBZJ_13 = first_value(tdDataCopy, '竞买保证金(元)',
                                               '竞买保证金(万元)')
                        # Starting price
                        QSJ_14 = first_value(tdDataCopy, '起始价(元)',
                                             '挂牌出让起始价(元)', '起始价(万元)')
                        # Bid increment
                        ZJFD_15 = first_value(tdDataCopy, '增价幅度(万元)',
                                              '加价幅度')

                if '标的序号' in items:
                    soup = BeautifulSoup(html)
                    table = soup.find('table', border='0')
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.table_tr_td(table)
                    # Parcel location
                    ZDZL_5 = tdData.get('标的位置')
                    # Area
                    MJ_6 = first_value(tdData, '土地面积', '土地面积(平方米)')
                    # Starting price
                    QSJ_14 = first_value(tdData, '起始价(元)', '拍卖参考价(万元)',
                                         '起始价(万元)')
                    # Lease term
                    CRNX_8 = first_value(tdData, '土地性质(年限)', '出让年限(年)')
            else:
                if '宗地编号' in items:
                    # ---- free-text layout: one chunk per parcel ----
                    section = re.findall(r'一([\s\S]*)二', items)[0]
                    for item in [
                            '宗地编号' + _ for _ in section.split('宗地编号')[1:]
                    ]:

                        def grab(label, chunk=item):
                            """Return the text captured after ``label:``."""
                            return reFunction(
                                rf'{label}:(?:[\s]*)([{self.reStr}]*)\s',
                                chunk)

                        # Parcel number
                        ZDBH_4 += '|' + grab('宗地编号')
                        # Parcel location
                        ZDZL_5 += '|' + grab('宗地坐落')
                        # Area
                        MJ_6 += '|' + grab('宗地面积')
                        # Lease term
                        CRNX_8 += '|' + grab('出让年限')
                        # Plot ratio
                        RJL_9 += '|' + grab('容积率')
                        # Green-space ratio
                        LDL_10 += '|' + grab(r'绿地率\(%\)')
                        # Building density
                        JZMD_11 += '|' + grab(r'建筑密度\(%\)')
                        # Building height limit
                        JZXG_12 += '|' + grab(r'建筑限高\(米\)')
                        # Bid deposit
                        JMBZJ_13 += '|' + grab('保证金')
                        # Starting price
                        QSJ_14 += '|' + grab('起始价')
                        # Bid increment
                        ZJFD_15 += '|' + grab('加价幅度')
                        # Other remarks
                        QTSM_17 += '|' + grab('备注')

                if '配套建筑规划用地' in items:
                    soup = BeautifulSoup(html)
                    table = soup.find('table')
                    fold_extra_header_rows(table)
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.tableTrTdRegulation(table)
                    # Parcel number
                    ZDBH_4 = tdData.get('地块编号')
                    # Parcel location
                    ZDZL_5 = tdData.get('地块位置/名称')
                    # Area
                    MJ_6 = first_value(tdData, '配套设施出让面积(m2)',
                                       '土地面积(平方米)')
                    # Land use
                    TDYT_7 = tdData.get('配套建筑规划用地性质')
                    # Lease term
                    CRNX_8 = first_value(tdData, '出让年限', '出让年限(年)')
                    # Plot ratio
                    RJL_9 = first_value(tdData, '容积率', '容积率(不大于)')
                    # Green-space ratio
                    LDL_10 = first_value(tdData, '公园整体绿地率(%)',
                                         '绿地率(不小于)')
                    # Building density
                    JZMD_11 = tdData.get('公园整体建筑密度(%)')
                    # Building height limit
                    JZXG_12 = first_value(tdData, '建筑限高', '建筑高度(m)',
                                          '建筑高度')
                    # Bid deposit
                    JMBZJ_13 = first_value(tdData, '竞买保证金(元)',
                                           '竞买保证金(万元)')
                    # Starting price
                    QSJ_14 = first_value(tdData, '起始价(元)',
                                         '配套设施用地挂牌出让起始价(元)',
                                         '起始价(万元)')
                    # Bid increment
                    ZJFD_15 = first_value(tdData, '增价幅度(万元)', '加价幅度')

            # Crawl time
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # Crawled URL
            url = response.url
            # Unique mark of the record
            md5Mark = encrypt_md5(url)

            # Column order of the output row
            csvFile = [
                WJBT_1,
                WZLY_2,
                GXSJ_3,
                ZDBH_4,
                ZDZL_5,
                MJ_6,
                TDYT_7,
                CRNX_8,
                RJL_9,
                LDL_10,
                JZMD_11,
                JZXG_12,
                JMBZJ_13,
                QSJ_14,
                ZJFD_15,
                QTSM_17,
                crawlingTime,
                url,
                md5Mark,
            ]
            cells = []
            # The loop variable keeps its original name: getVariableName() is
            # defined elsewhere and may resolve names from this frame --
            # TODO confirm.
            for _ in csvFile:
                try:
                    # A value made only of '|' separators counts as empty.
                    if _ and _ != '|' * len(_):
                        cells.append(
                            _.replace(',', ' ').replace('\n', '').replace(
                                '\r', '').replace(r'\xa0',
                                                  '').replace('\xa0', ''))
                    else:
                        cells.append('')
                except Exception as e:
                    cells.append('')
                    self.log(
                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            # Every cell, including the last, is followed by a comma.
            results = ','.join(cells) + ','
            with open(self.pathDetail, 'a+') as fp:
                fp.write(results + '\n')
            self.log('数据获取成功', level=logging.INFO)
            yield
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
Ejemplo n.º 5
0
    def parse_detail(self, response):
        # TODO 主动关闭爬虫问题
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            WJBT_27 = ''
            SJ_28 = ''
            LY_29 = ''
            WJBT_30 = ''
            ZDBH_31 = ''
            BH_32 = ''
            DKWZ_33 = ''
            TDWZ_34 = ''
            TDMJM_35 = ''
            TDMJPFM_36 = ''
            TDYT_37 = ''
            CJJ_38 = ''
            JDR_39 = ''
            GSQ_40 = ''
            LXDW_41 = ''
            DWDZ_42 = ''
            YZBM_43 = ''
            LXDH_44 = ''
            # TODO 共有字段  reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # 文件标题
            WJBT_27 = response.meta.get('title')
            # 时间
            SJ_28 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()'
            ).extract_first()
            # 来源
            LY_29 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()'
            ).extract_first()
            # 文件编号
            WJBT_30 = data.xpath(
                '//div[@class="ztzx_frame_content"]/div[1]/text()'
            ).extract_first()
            # 公示期
            GSQ_40 = reFunction(
                f'公示期:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)。', items)
            # 联系单位
            LXDW_41 = reFunction(
                '联系单位:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 单位地址
            DWDZ_42 = reFunction(
                '单位地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 邮政编码
            YZBM_43 = reFunction(
                '邮政编码:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 联系电话
            LXDH_44 = reFunction(
                '联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # 爬取地址url
            url = response.url
            # 唯一标识
            md5Mark = encrypt_md5(url + WJBT_27 + SJ_28)

            soup = BeautifulSoup(
                response.body.decode('utf-8').replace('thead', 'tbody'))
            table = soup.find('table')
            htmlTable = htmlTableTransformer()
            if table:
                if '竣工时间' in items:
                    try:
                        tdData = htmlTable.tableTrTdUNregulationToList(table)
                        for _ in range(len(list(tdData.values())[0])):
                            # 宗地编号
                            ZDBH_31 = tdData.get('地块编号')[_] if tdData.get(
                                '地块编号') else ''
                            # 地块位置
                            DKWZ_33 = tdData.get('位置')[_] if tdData.get(
                                '位置') else ''
                            # 土地位置
                            TDWZ_34 = tdData.get('位置')[_] if tdData.get(
                                '位置') else ''
                            # 土地面积(亩)
                            TDMJM_35 = tdData.get(
                                '出让面积平方米/亩')[_] if tdData.get(
                                    '出让面积平方米/亩') else ''
                            # 土地面积(平方米)
                            TDMJPFM_36 = tdData.get(list(
                                tdData.keys())[7])[_] if tdData.get(
                                    list(tdData.keys())[7]) else ''
                            # 土地用途
                            TDYT_37 = tdData.get('用途')[_] if tdData.get(
                                '用途') else ''
                            # 成交价(万元)
                            CJJ_38 = tdData.get('成交价(万元)')[_] if tdData.get(
                                '成交价(万元)') else tdData.get(
                                    '成交价(万元)')[_] if tdData.get(
                                        '成交价(万元)') else ''
                            # 竞得人
                            JDR_39 = tdData.get('受让人')[_] if tdData.get(
                                '受让人') else ''
                            # 写入数据
                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(
                                        md5Mark):  # 存在, 去重计数
                                    self.duplicateUrl += 1

                            if self.duplicateUrl < 50:
                                if TDYT_37:
                                    # 重复效验通过, 存储数据
                                    csvFile = [
                                        WJBT_27,
                                        SJ_28,
                                        LY_29,
                                        WJBT_30,
                                        ZDBH_31,
                                        BH_32,
                                        DKWZ_33,
                                        TDWZ_34,
                                        TDMJM_35,
                                        TDMJPFM_36,
                                        TDYT_37,
                                        CJJ_38,
                                        JDR_39,
                                        GSQ_40,
                                        LXDW_41,
                                        DWDZ_42,
                                        YZBM_43,
                                        LXDH_44,
                                        crawlingTime,
                                        url,
                                        md5Mark,
                                    ]
                                    results = ''
                                    for _ in csvFile:
                                        try:
                                            if _ and _ != '|' * len(_):
                                                results += _.replace(
                                                    ',', ' '
                                                ).replace('\n', '').replace(
                                                    '\r', '').replace(
                                                        r'\xa0', '').replace(
                                                            '\xa0', '') + ','
                                            else:
                                                results += ','
                                        except Exception as e:
                                            results += ','
                                            self.log(
                                                f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                level=logging.ERROR)
                                    with open(self.pathDetail, 'a+') as fp:
                                        fp.write(results)
                                        fp.write('\n')
                                    self.log(f'数据获取成功', level=logging.INFO)
                                    yield
                    except:
                        for tdData in table.find_all('tr')[2:]:
                            # 宗地编号
                            ZDBH_31 = tdData.find_all('td')[4].string.strip()
                            # 地块位置
                            DKWZ_33 = tdData.find_all('td')[5].string.strip()
                            # 土地位置
                            TDWZ_34 = tdData.find_all('td')[5].string.strip()
                            # 土地面积(亩)
                            TDMJM_35 = tdData.find_all('td')[6].string.strip()
                            # 土地面积(平方米)
                            TDMJPFM_36 = tdData.find_all(
                                'td')[7].string.strip()
                            # 土地用途
                            TDYT_37 = tdData.find_all('td')[8].string.strip()
                            # 成交价(万元)
                            CJJ_38 = tdData.find_all('td')[9].string.strip()
                            # 竞得人
                            JDR_39 = tdData.find_all('td')[3].string.strip()
                            # 写入数据
                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(
                                        md5Mark):  # 存在, 去重计数
                                    self.duplicateUrl += 1

                            if self.duplicateUrl < 50:
                                if TDYT_37:
                                    # 重复效验通过, 存储数据
                                    csvFile = [
                                        WJBT_27,
                                        SJ_28,
                                        LY_29,
                                        WJBT_30,
                                        ZDBH_31,
                                        BH_32,
                                        DKWZ_33,
                                        TDWZ_34,
                                        TDMJM_35,
                                        TDMJPFM_36,
                                        TDYT_37,
                                        CJJ_38,
                                        JDR_39,
                                        GSQ_40,
                                        LXDW_41,
                                        DWDZ_42,
                                        YZBM_43,
                                        LXDH_44,
                                        crawlingTime,
                                        url,
                                        md5Mark,
                                    ]
                                    results = ''
                                    for _ in csvFile:
                                        try:
                                            if _ and _ != '|' * len(_):
                                                results += _.replace(
                                                    ',', ' '
                                                ).replace('\n', '').replace(
                                                    '\r', '').replace(
                                                        r'\xa0', '').replace(
                                                            '\xa0', '') + ','
                                            else:
                                                results += ','
                                        except Exception as e:
                                            results += ','
                                            self.log(
                                                f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                level=logging.ERROR)
                                    with open(self.pathDetail, 'a+') as fp:
                                        fp.write(results)
                                        fp.write('\n')
                                    self.log(f'数据获取成功', level=logging.INFO)
                                    yield
                elif '转让方' not in items:
                    if len(table.find_all('tr')[1].find_all('td')) < 5:
                        table.find_all('tr')[1].extract()
                        table.find_all('tr')[0].find_all('td')[-1].extract()
                    tdData = htmlTable.tableTrTdRegulationToList(table)
                    for _ in range(len(list(tdData.values())[0])):
                        # 宗地编号
                        ZDBH_31 = tdData.get('宗地编号')[_] if tdData.get(
                            '宗地编号') else ''
                        # 编号
                        BH_32 = tdData.get('编号')[_] if tdData.get('编号') else ''
                        # 地块位置
                        DKWZ_33 = tdData.get('地块位置')[_] if tdData.get(
                            '地块位置') else ''
                        # 土地位置
                        TDWZ_34 = tdData.get('土地位置')[_] if tdData.get(
                            '土地位置') else ''
                        # 土地面积(亩)
                        TDMJM_35 = tdData.get('土地面积(亩)')[_] if tdData.get(
                            '土地面积(亩)') else ''
                        # 土地面积(平方米)
                        TDMJPFM_36 = tdData.get('土地面积(平方米)')[_] if tdData.get(
                            '土地面积(平方米)') else ''
                        # 土地用途
                        TDYT_37 = tdData.get('土地用途')[_] if tdData.get(
                            '土地用途') else ''
                        # 成交价(万元)
                        CJJ_38 = tdData.get('成交价(万元)')[_] if tdData.get(
                            '成交价(万元)') else tdData.get(
                                '成交价(万元)')[_] if tdData.get('成交价(万元)') else ''
                        # 竞得人
                        JDR_39 = tdData.get('竞得人')[_] if tdData.get(
                            '竞得人') else ''

                        # 写入数据
                        if self.name in DUPLICATE_SWITCH_LIST:
                            if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                self.duplicateUrl += 1

                        if self.duplicateUrl < 50:
                            if TDYT_37:
                                # 重复效验通过, 存储数据
                                csvFile = [
                                    WJBT_27,
                                    SJ_28,
                                    LY_29,
                                    WJBT_30,
                                    ZDBH_31,
                                    BH_32,
                                    DKWZ_33,
                                    TDWZ_34,
                                    TDMJM_35,
                                    TDMJPFM_36,
                                    TDYT_37,
                                    CJJ_38,
                                    JDR_39,
                                    GSQ_40,
                                    LXDW_41,
                                    DWDZ_42,
                                    YZBM_43,
                                    LXDH_44,
                                    crawlingTime,
                                    url,
                                    md5Mark,
                                ]
                                results = ''
                                for _ in csvFile:
                                    try:
                                        if _ and _ != '|' * len(_):
                                            results += _.replace(
                                                ',',
                                                ' ').replace('\n', '').replace(
                                                    '\r', '').replace(
                                                        r'\xa0', '').replace(
                                                            '\xa0', '') + ','
                                        else:
                                            results += ','
                                    except Exception as e:
                                        results += ','
                                        self.log(
                                            f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                            level=logging.ERROR)
                                with open(self.pathDetail, 'a+') as fp:
                                    fp.write(results)
                                    fp.write('\n')
                                self.log(f'数据获取成功', level=logging.INFO)
                                yield
                elif '地块基本情况' in items:
                    # 宗地编号
                    ZDBH_31 = reFunction(
                        '宗地编号\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        items)
                    # 地块位置
                    DKWZ_33 = reFunction(
                        '地块位置\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        items)
                    # 土地面积(亩)
                    TDMJM_35 = reFunction(
                        '土地面积\(公顷\)\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        items)
                    # 土地用途
                    TDYT_37 = reFunction(
                        '土地用途\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        items)
                    # 成交价(万元)
                    CJJ_38 = reFunction(
                        '成交价\(万元\)\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        items)
                    # 竞得人
                    JDR_39 = reFunction(
                        '受让单位\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        items)

                    # 写入数据
                    if self.name in DUPLICATE_SWITCH_LIST:
                        if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        if TDYT_37:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                WJBT_27,
                                SJ_28,
                                LY_29,
                                WJBT_30,
                                ZDBH_31,
                                BH_32,
                                DKWZ_33,
                                TDWZ_34,
                                TDMJM_35,
                                TDMJPFM_36,
                                TDYT_37,
                                CJJ_38,
                                JDR_39,
                                GSQ_40,
                                LXDW_41,
                                DWDZ_42,
                                YZBM_43,
                                LXDH_44,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' %
                            response.url)
            elif '转让方' in items:
                # 编号
                BH_32 = reFunction(
                    '不动产权登记证号:([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 地块位置
                DKWZ_33 = reFunction(
                    '宗地位置:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # 土地面积(平方米)
                TDMJPFM_36 = reFunction(
                    '面\s*积:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # 土地用途
                TDYT_37 = reFunction(
                    '土地用途:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # 成交价(万元)
                # CJJ_38
                # 竞得人
                JDR_39 = reFunction(
                    '受让方:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # 写入数据
                if self.name in DUPLICATE_SWITCH_LIST:
                    if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                        self.duplicateUrl += 1

                if self.duplicateUrl < 50:
                    if TDYT_37:
                        # 重复效验通过, 存储数据
                        csvFile = [
                            WJBT_27,
                            SJ_28,
                            LY_29,
                            WJBT_30,
                            ZDBH_31,
                            BH_32,
                            DKWZ_33,
                            TDWZ_34,
                            TDMJM_35,
                            TDMJPFM_36,
                            TDYT_37,
                            CJJ_38,
                            JDR_39,
                            GSQ_40,
                            LXDW_41,
                            DWDZ_42,
                            YZBM_43,
                            LXDH_44,
                            crawlingTime,
                            url,
                            md5Mark,
                        ]
                        results = ''
                        for _ in csvFile:
                            try:
                                if _ and _ != '|' * len(_):
                                    results += _.replace(',', ' ').replace(
                                        '\n', '').replace('\t', '').replace(
                                            '\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                else:
                                    results += ','
                            except Exception as e:
                                results += ','
                                self.log(
                                    f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                    level=logging.ERROR)
                        with open(self.pathDetail, 'a+') as fp:
                            fp.write(results)
                            fp.write('\n')
                        self.log(f'数据获取成功', level=logging.INFO)
                        yield
                else:
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)

        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
Ejemplo n.º 6
0
    def parse_detail(self, response):
        """Parse a detail page and append one CSV row per ``<table>`` found.

        For every table in the page, leading rows are discarded until the row
        containing the "公告时间" header cell is reached.  The first two
        remaining rows supply the announcement-level columns and the rows
        after them supply the parcel-level columns; both groups are decoded
        with ``htmlTableTransformer.tableTrTdRegulation``.  Each resulting
        row is appended to the file at ``self.pathDetail``.

        Args:
            response: response object of the detail page.  Its ``meta`` is
                read for the ``'title'`` key, and ``body`` / ``url`` are used
                for parsing and for the row identifier.

        Yields:
            ``None`` once for every row written.

        Any exception raised while parsing is caught here: the URL is printed
        and the failure is logged at ERROR level.
        """
        try:
            html = response.body.decode('utf-8')
            data = Selector(text=html)

            # Shared fields
            # Document title
            WJBT_18 = response.meta.get('title')
            # Both values below are taken from the same <span>.
            # NOTE(review): the update time may live in a different span;
            # confirm against the page markup before changing the XPath.
            news_time = data.xpath(
                '//div[@class="news_time"]/span[1]/text()').extract_first()
            # Article source.  It must be stored in LY_19 because that is the
            # name written to the CSV row; assigning it to any other name
            # leaves the column permanently blank.
            LY_19 = news_time.replace('文章来自:', '')
            # Update time
            GXSJ_20 = news_time.replace('更新时间:', '')

            # Source URL of the page
            url = response.url
            header_pattern = re.compile("公告时间")

            # The page is parsed twice so that every table has an independent
            # twin: one tree is trimmed down to the header rows, the other to
            # the data rows.
            # NOTE(review): no parser is named, so bs4 picks whichever one is
            # installed; left unchanged to keep the current parsing result.
            soup = BeautifulSoup(html)
            tables = soup.find_all('table')
            tablesCopy = BeautifulSoup(html).find_all('table')

            for table, tableCopy in zip(tables, tablesCopy):
                # Drop leading rows until the one holding the header cell.
                # The rows are walked from a snapshot, while the row removed
                # is always the current first row of the live table.
                for candidate in (table, tableCopy):
                    for row in candidate.tbody.find_all('tr'):
                        if (row.find_all('td', text=header_pattern)
                                or row.find_all('p', text=header_pattern)):
                            break
                        candidate.tbody.find_all('tr')[0].extract()

                # Keep only the first two rows of the copy.
                for _ in range(2, len(tableCopy.tbody.find_all('tr'))):
                    try:
                        tableCopy.tbody.find_all('tr')[2].extract()
                    except IndexError:
                        # Fewer rows are left than were counted up front;
                        # nothing more to remove.
                        break
                tdDataCopy = htmlTableTransformer().tableTrTdRegulation(
                    tableCopy)
                # Announcement time
                GGSJ_21 = tdDataCopy.get('公告时间')
                # Announcement media
                GGMT_22 = tdDataCopy.get('公告媒体')
                # Announcement number
                GGH_23 = tdDataCopy.get('公告号')
                # Transfer method
                CRFS_24 = tdDataCopy.get('出让方式')
                # Transaction time
                CJSJ_25 = tdDataCopy.get('成交时间')
                # Transaction place
                CJDD_26 = tdDataCopy.get('成交地点')

                # Remove those same two rows from the working table so that
                # only the second group of rows is left to decode.
                for _ in range(2):
                    try:
                        table.tbody.find_all('tr')[0].extract()
                    except IndexError:
                        # The table had fewer than two rows left.
                        break
                tdData = htmlTableTransformer().tableTrTdRegulation(table)
                # Parcel number
                DKBH_27 = tdData.get('地块编号')
                # Parcel location
                DKWZ_28 = tdData.get('地块位置')
                # Land use
                TDYT_29 = tdData.get('土地用途')
                # Listing start price: first non-empty header variant wins
                GPQSJ_30 = (tdData.get('挂牌起始价(万元)')
                            or tdData.get('挂牌起始价(元)'))
                # Winning bidder
                JDDW_31 = tdData.get('竞得人(单位)') or tdData.get('竞得单位')
                # Transaction amount
                CJJE_32 = (tdData.get('成交价(万元)')
                           or tdData.get('成交金额(元)')
                           or tdData.get('成交价'))
                # Crawl time
                crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                # Row identifier (same value for every table of one page)
                md5Mark = encrypt_md5(url + WJBT_18 + GXSJ_20)

                # Store the row
                csvFile = [
                    WJBT_18,
                    LY_19,
                    GXSJ_20,
                    GGSJ_21,
                    GGMT_22,
                    GGH_23,
                    CRFS_24,
                    CJSJ_25,
                    CJDD_26,
                    DKBH_27,
                    DKWZ_28,
                    TDYT_29,
                    GPQSJ_30,
                    JDDW_31,
                    CJJE_32,
                    crawlingTime,
                    url,
                    md5Mark,
                ]
                results = ''
                for value in csvFile:
                    try:
                        # Empty values and values made only of '|' become a
                        # blank cell; commas and line breaks are stripped so
                        # the value cannot break the row layout.
                        if value and value != '|' * len(value):
                            results += value.replace(',', ' ').replace(
                                '\n', '').replace('\r', '').replace(
                                    r'\xa0', '').replace('\xa0', '') + ','
                        else:
                            results += ','
                    except Exception as e:
                        # A value that is not a string still gets a blank
                        # cell so the column count stays fixed.
                        results += ','
                        self.log(
                            f'{getVariableName(value).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                            level=logging.ERROR)
                with open(self.pathDetail, 'a+') as fp:
                    fp.write(results)
                    fp.write('\n')
                self.log('数据获取成功', level=logging.INFO)
                yield
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
Ejemplo n.º 7
0
    def parse_detail(self, response):
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            BT_47 = ''
            LY_55 = ''
            LYSJ_48 = ''
            XH_49 = ''
            PZWH_50 = ''
            YDDW_51 = ''
            GDFS_52 = ''
            PZSJ_53 = ''
            WZ_54 = ''
            YT_55 = ''
            MJ_56 = ''
            RJL_57 = ''
            GYWAFA_58 = ''

            # TODO 共有字段
            # 标题
            BT_47 = response.meta.get('title')
            LY = data.xpath(
                '//div[@class="content-small-title"]/text()').extract_first()
            # 来源
            LY_55 = reFunction(f'来源:\s*([{self.reStr}]*)\s', LY)
            # 时间
            LYSJ_48 = reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # 解析 table 若出错 使用正则
            htmlTable = htmlTableTransformer()
            if '宗地编号' not in items:
                try:
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find_all('table')[0]

                    if not table.tbody.find_all('tr')[0].find_all(
                            text=re.compile("序号|受让人")):
                        table.tbody.find_all('tr')[0].extract()
                    tdsData = htmlTable.tableTrTdRegulationToList(table)

                    for _ in range(len(list(tdsData.values())[0])):
                        # if response.url == 'http://zzland.zhengzhou.gov.cn/hbgd/1715241.jhtml':
                        #     print()
                        # 序号
                        XH_49 = tdsData.get('序号')[_] if tdsData.get(
                            '序号') else ''
                        # 批准文号
                        PZWH_50 = tdsData.get('批准文号')[_] if tdsData.get(
                            '批准文号') else ''
                        # 用地单位
                        YDDW_51_ = tdsData.get('用地单位(受让人)')[_] if tdsData.get(
                            '用地单位(受让人)') else tdsData.get(
                                '受让人')[_] if tdsData.get('受让人') else ''
                        YDDW_51 = YDDW_51_ if YDDW_51_ else tdsData.get(
                            '单位')[_]
                        # 供地方式
                        GDFS_52 = tdsData.get('供地方式')[_] if tdsData.get(
                            '供地方式') else tdsData.get('供应方式')[_] if tdsData.get(
                                '供应方式') else ''
                        # 批准时间
                        PZSJ_53 = tdsData.get('批准时间')[_] if tdsData.get(
                            '批准时间') else tdsData.get('签订日期')[_] if tdsData.get(
                                '签订日期') else ''
                        # 位置
                        WZ_54_0 = tdsData.get('土地位置')
                        WZ_54_1 = tdsData.get('土地座落')
                        WZ_54_2 = tdsData.get('宗地位置')
                        WZ_54_3 = tdsData.get('位置')
                        WZ_54_ = list(
                            filter(None, [WZ_54_0, WZ_54_1, WZ_54_2, WZ_54_3]))
                        WZ_54 = WZ_54_[0][_] if WZ_54_ else ''
                        # 用途
                        YT_55_0 = tdsData.get('用途')
                        YT_55_1 = tdsData.get('土地用途')
                        YT_55_2 = tdsData.get('用途明细')
                        YT_55_ = list(filter(None,
                                             [YT_55_0, YT_55_1, YT_55_2]))
                        YT_55 = YT_55_[0][_] if YT_55_ else ''
                        # 面积
                        MJ_56_0 = tdsData.get('面积(平方米)')
                        MJ_56_1 = tdsData.get('划拨面积')
                        MJ_56_2 = tdsData.get('出让/划拨面积')
                        MJ_56_3 = tdsData.get('面积(公顷)')
                        MJ_56_ = list(
                            filter(None, [MJ_56_0, MJ_56_1, MJ_56_2, MJ_56_3]))
                        MJ_56 = MJ_56_[0][_] if MJ_56_ else ''
                        # 容积率
                        RJL_57 = tdsData.get('容积率')[_] if tdsData.get(
                            '容积率') else ''
                        # 供应方案文号
                        GYWAFA_58 = tdsData.get('供应方案文号')[_] if tdsData.get(
                            '供应方案文号') else ''
                        # 爬取时间
                        crawlingTime = time.strftime("%Y-%m-%d",
                                                     time.localtime())
                        # 爬取地址url
                        url = response.url
                        # 唯一标识
                        md5Mark = encrypt_md5(url + BT_47 + LYSJ_48)

                        # 是否需要判断重复 请求
                        if DUPLICATE_SWITCH:
                            if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                self.duplicateUrl += 1

                        if self.duplicateUrl < 50:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                BT_47,
                                LY_55,
                                LYSJ_48,
                                XH_49,
                                PZWH_50,
                                YDDW_51,
                                GDFS_52,
                                PZSJ_53,
                                WZ_54,
                                YT_55,
                                MJ_56,
                                RJL_57,
                                GYWAFA_58,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                        else:
                            self.crawler.engine.close_spider(
                                self, 'response msg info %s, job duplicated!' %
                                response.url)
                except Exception as e:
                    pass
            else:
                # 进行正则匹配
                # 序号
                XH_49 = reFunction(f'宗地编号([{self.reStr}]*)地块位置', items)
                # 用地单位
                YDDW_51 = reFunction(f'受让单位([{self.reStr}]*)备注:', items)
                # 位置
                WZ_54 = reFunction(f'地块位置([{self.reStr}]*)土地用途', items)
                # 用途
                YT_55 = reFunction(f'土地用途([{self.reStr}]*)土地面积', items)
                # 面积
                MJ_56 = reFunction(f'土地面积\(公顷\)([{self.reStr}]*)项目名称', items)
                # 爬取时间
                crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
                # 爬取地址url
                url = response.url
                # 唯一标识
                md5Mark = encrypt_md5(url + BT_47 + LYSJ_48)

                # 是否需要判断重复 请求
                if DUPLICATE_SWITCH:
                    if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                        self.duplicateUrl += 1

                if self.duplicateUrl < 50:
                    # 重复效验通过, 存储数据
                    csvFile = [
                        BT_47,
                        LY_55,
                        LYSJ_48,
                        XH_49,
                        PZWH_50,
                        YDDW_51,
                        GDFS_52,
                        PZSJ_53,
                        WZ_54,
                        YT_55,
                        MJ_56,
                        RJL_57,
                        GYWAFA_58,
                        crawlingTime,
                        url,
                        md5Mark,
                    ]
                    results = ''
                    for _ in csvFile:
                        try:
                            if _ and _ != '|' * len(_):
                                results += _.replace(',', ' ').replace(
                                    '\n', '').replace('\r', '').replace(
                                        r'\xa0', '').replace('\xa0', '') + ','
                            else:
                                results += ','
                        except Exception as e:
                            results += ','
                            self.log(
                                f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                level=logging.ERROR)
                    with open(self.pathDetail, 'a+') as fp:
                        fp.write(results)
                        fp.write('\n')
                    self.log(f'数据获取成功', level=logging.INFO)
                    yield
                else:
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)

        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
Ejemplo n.º 8
0
    def parse_detail(self, response):
        # TODO 主动关闭爬虫问题
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            GGLX_1 = ''
            WJBT_2 = ''
            SJ_3 = ''
            LY_4 = ''
            ZWBT_5 = ''
            ZDBH_6 = ''
            TDWZ_7 = ''
            CRMJ_8 = ''
            LHYD_9 = ''
            DLYD_10 = ''
            TDYT_11 = ''
            CRNX_12 = ''
            RJL_13 = ''
            JZMD_14 = ''
            LDL_15 = ''
            JZKJ_16 = ''
            QSJ_17 = ''
            BZJ_18 = ''
            JJFD_19 = ''
            BMRQ_20 = ''
            GPRQ_21 = ''
            GPJZSJ_22 = ''
            BZJDZSJ_23 = ''
            LXDZ_24 = ''
            LXR_25 = ''
            LXDH_26 = ''
            # TODO 共有字段  reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # 公告类型
            GGLX_1 = '出让公告'
            # 文件标题
            WJBT_2 = response.meta.get('title')
            # 时间
            SJ_3 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()'
            ).extract_first()
            # 来源
            LY_4 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()'
            ).extract_first()
            # 正文标题
            ZWBT_5 = data.xpath(
                '//div[@class="ztzx_frame_content"]/div[1]/text()'
            ).extract_first()
            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # 爬取地址url
            url = response.url
            # 唯一标识
            md5Mark = encrypt_md5(url + WJBT_2 + SJ_3)
            # 报名时间起止日期
            BMRQ_20 = reFunction(f'报名申请时间:\s*([\w]*);', items) if reFunction(
                f'报名申请时间:\s*([\w]*);', items
            ) else reFunction(f'申请人可于(\w*),向我局提交书面申请', items) if reFunction(
                f'申请人可于(\w*),向我局提交书面申请', items
            ) else reFunction(f'申请时间为:(\w*)', items) if reFunction(
                f'申请时间为:(\w*)', items) else reFunction(f'申请人可于(\w*)到', items)
            GPTime = reFunction(f'网上挂牌(报价)时间:\s*([\w]*)', items) if reFunction(
                f'网上挂牌(报价)时间:\s*([\w]*)', items) else reFunction(
                    f'挂牌时间为:\s*([\w]*)', items)
            try:
                if GPTime:
                    # 挂牌开始时间
                    GPRQ_21 = GPTime.split('至')[0]
                    # 挂牌截止时间
                    GPJZSJ_22 = GPTime.split('至')[1]
                else:
                    GPRQ_21 = reFunction(f'挂牌时间为:\s*([\s\S]*)',
                                         reFunction('六、([\s\S]*)七、', items))
                    GPJZSJ_22 = reFunction(f'挂牌时间为:\s*([\s\S]*)',
                                           reFunction('六、([\s\S]*)七、', items))
            except Exception as e:
                self.log(f'详情页数据挂牌时间解析失败, 请求:{response.url}, 信息: {e}',
                         level=logging.DEBUG)
                GPRQ_21 = ''
                GPJZSJ_22 = ''
            # 保证金到账截止时间
            BZJDZSJ_23 = reFunction(
                f'保证金到账截止时间为:\s*([\w]*)', items) if reFunction(
                    f'保证金到账截止时间为:\s*([\w]*)', items) else reFunction(
                        f'保证金交纳截止时间:\s*([\w]*)', items) if reFunction(
                            f'保证金交纳截止时间:\s*([\w]*)', items) else reFunction(
                                f'保证金的截止时间为\s*([\w]*)', items)
            # 联系地址
            LXDZ_24 = reFunction(
                '联系地址:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                items) if reFunction(
                    f'联系地址:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                    items) else reFunction(
                        '单位地址:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)', items)
            # 联系人
            LXR_25 = reFunction(
                f'联\s系\s人:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', items)
            # 联系电话
            LXDH_26 = reFunction(
                f'联系电话:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', items)
            if '挂牌出让宗地的基本情况和规划指标等要求' not in items and '宗地编号' not in items:
                # 处理 table 情况
                soup = BeautifulSoup(response.body.decode('utf-8'))
                table = soup.find('table')
                try:
                    tdReplace = table.tbody.find_all('tr')[0].find(
                        'td',
                        colspan='4') if table.tbody.find_all('tr')[0].find(
                            'td', colspan='4') else table.tbody.find_all(
                                'tr')[0].find('td', colspan="2")
                    number = table.tbody.find_all('tr')[0].index(tdReplace)
                    tdList = table.tbody.find_all('tr')[1].find_all('td')
                    for _ in range(1, len(tdList) + 1):
                        table.tbody.find_all('tr')[0].insert(
                            number + _, tdList[_ - 1])
                    tdReplace.extract()
                    table.tbody.find_all('tr')[1].extract()
                except:
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find('table')
                    tdReplace = table.thead.find_all('tr')[0].find(
                        'td',
                        colspan='4') if table.thead.find_all('tr')[0].find(
                            'td', colspan='4') else table.thead.find_all(
                                'tr')[0].find('td', colspan="2")
                    number = table.thead.find_all('tr')[0].index(tdReplace)
                    tdList = table.thead.find_all('tr')[1].find_all('td')
                    for _ in range(1, len(tdList) + 1):
                        table.thead.find_all('tr')[0].insert(
                            number + _, tdList[_ - 1])
                    tdReplace.extract()
                    table.thead.find_all('tr')[1].extract()
                    table.tbody.insert(
                        0,
                        table.thead.find_all('tr')[0])  # 插入 thead 的内容
                    table.thead.extract()
                htmlTable = htmlTableTransformer()
                try:
                    tdData = htmlTable.tableTrTdRegulationToList(table)
                    if not tdData and 'thead' in items:  # 如果没有拿到 则可能存在 thead
                        soup = BeautifulSoup(response.body.decode('utf-8'))
                        table = soup.find('table')
                        tdReplace = table.thead.find_all('tr')[0].find(
                            'td',
                            colspan='4') if table.thead.find_all('tr')[0].find(
                                'td', colspan='4') else table.thead.find_all(
                                    'tr')[0].find('td', colspan="2")
                        number = table.thead.find_all('tr')[0].index(tdReplace)
                        tdList = table.thead.find_all('tr')[1].find_all('td')
                        for _ in range(1, len(tdList) + 1):
                            table.thead.find_all('tr')[0].insert(
                                number + _, tdList[_ - 1])
                        tdReplace.extract()
                        table.thead.find_all('tr')[1].extract()
                        table.tbody.insert(
                            0,
                            table.thead.find_all('tr')[0])  # 插入 thead 的内容
                        table.thead.extract()
                        htmlTable = htmlTableTransformer()
                except:
                    tdData = {}
                for _ in range(len(list(tdData.values())[0])):
                    # 宗地编号
                    ZDBH_6 = tdData.get('编号')[_] if tdData.get('编号') else ''
                    # 土地位置
                    TDWZ_7 = tdData.get('土地位置')[_] if tdData.get(
                        '土地位置') else ''
                    # 出让面积(m2)
                    CRMJ_8_0 = tdData.get('土地面积')
                    CRMJ_8_1 = tdData.get('土地面积(平方米)')
                    CRMJ_8_ = list(filter(None, [CRMJ_8_0, CRMJ_8_1]))
                    CRMJ_8 = CRMJ_8_[0][_] if CRMJ_8_ else ''
                    # 土地用途
                    TDYT_11 = tdData.get('土地用途')[_] if tdData.get(
                        '土地用途') else ''
                    # 岀让年限
                    CRNX_12 = tdData.get('出让年限(年)')[_] if tdData.get(
                        '出让年限(年)') else ''
                    # 容积率
                    RJL_13 = tdData.get('容积率')[_] if tdData.get(
                        '容积率') else tdData.get('容 积 率')[_] if tdData.get(
                            '容 积 率') else ''
                    # 建筑密度
                    # JZMD_14
                    # 绿地率
                    LDL_15 = tdData.get('绿化率')[_] if tdData.get('绿化率') else ''
                    # 建筑空间
                    JZKJ_16 = tdData.get('控制高度(m)')[_] if tdData.get(
                        '控制高度(m)') else tdData.get('建筑限高(m)')[_] if tdData.get(
                            '建筑限高(m)') else ''
                    # 起始价(万元)
                    QSJ_17 = tdData.get('挂牌起始价(万元)')[_] if tdData.get(
                        '挂牌起始价(万元)') else ''
                    # 保证金(万元)
                    BZJ_18 = tdData.get('竞买保证金(万元)')[_] if tdData.get(
                        '竞买保证金(万元)') else tdData.get(
                            '竞买保证金(万元)')[_] if tdData.get('竞买保证金(万元)') else ''
                    # 竞价幅度(万元)
                    JJFD_19 = tdData.get('増价幅度(万元/次)')[_] if tdData.get(
                        '増价幅度(万元/次)') else ''
                    # 是否需要判断重复 请求
                    if DUPLICATE_SWITCH:
                        if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        if ZDBH_6 and TDYT_11:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                GGLX_1,
                                WJBT_2,
                                SJ_3,
                                LY_4,
                                ZWBT_5,
                                ZDBH_6,
                                TDWZ_7,
                                CRMJ_8,
                                LHYD_9,
                                DLYD_10,
                                TDYT_11,
                                CRNX_12,
                                RJL_13,
                                JZMD_14,
                                LDL_15,
                                JZKJ_16,
                                QSJ_17,
                                BZJ_18,
                                JJFD_19,
                                BMRQ_20,
                                GPRQ_21,
                                GPJZSJ_22,
                                BZJDZSJ_23,
                                LXDZ_24,
                                LXR_25,
                                LXDH_26,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' %
                            response.url)
            # TODO 判断
            elif '挂牌出让宗地的基本情况和规划指标等要求' in items:
                for item in re.split(
                        '\d、',
                        reFunction('一、挂牌出让宗地的基本情况和规划指标等要求:([\s\S]*)二、',
                                   items)):
                    # TODO
                    if not item.strip():
                        continue
                    # 宗地编号
                    ZDBH_6 = reFunction(
                        f'^([()\w\.:: %\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)宗地位于', item)
                    # 土地位置
                    TDWZ_7 = reFunction(
                        f'宗地位于([()\w\.:: \(\)〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 出让面积(m2)
                    CRMJ_8 = reFunction(
                        f'土地出让面积([()\w\.:: %\(\)〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 土地用途
                    TDYT_11 = reFunction(
                        f'宗地规划用途为([()\w\.:: %\(\)〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 岀让年限
                    CRNX_12 = reFunction(
                        f'宗地土地出让年期([()\w\.:: —\(\),〔〕%㎡≤≥《》\-\/\%;、\.﹪]*)。',
                        item)
                    # 容积率
                    RJL_13 = reFunction(
                        f'容积率([()\w\.:: \(\)%〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 建筑密度
                    JZMD_14 = reFunction(
                        f'建筑密度([()\w\.:: \(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 绿地率
                    LDL_15 = reFunction(
                        f'绿地率([()\w\.:: \(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 建筑空间
                    JZKJ_16 = reFunction(
                        f'建筑空间([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 起始价(万元)
                    QSJ_17 = reFunction(
                        f'本宗地起始价([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 保证金(万元)
                    BZJ_18 = reFunction(
                        f'竞买保证金([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*)', item)
                    # 竞价幅度(万元)
                    JJFD_19 = reFunction(
                        f'增价幅度([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*)', item)
                    # 是否需要判断重复 请求
                    if DUPLICATE_SWITCH:
                        if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        if ZDBH_6 and TDYT_11:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                GGLX_1,
                                WJBT_2,
                                SJ_3,
                                LY_4,
                                ZWBT_5,
                                ZDBH_6,
                                TDWZ_7,
                                CRMJ_8,
                                LHYD_9,
                                DLYD_10,
                                TDYT_11,
                                CRNX_12,
                                RJL_13,
                                JZMD_14,
                                LDL_15,
                                JZKJ_16,
                                QSJ_17,
                                BZJ_18,
                                JJFD_19,
                                BMRQ_20,
                                GPRQ_21,
                                GPJZSJ_22,
                                BZJDZSJ_23,
                                LXDZ_24,
                                LXR_25,
                                LXDH_26,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' %
                            response.url)
            elif '挂牌出让地块基本情况' in items and '宗地编号' in items:
                for item in [
                        '宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items)
                    [0].split('宗地编号')[1:]
                ]:
                    # 宗地编号
                    ZDBH_6 = reFunction(
                        f'宗地编号为([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 土地位置
                    TDWZ_7 = reFunction(
                        f'该地块([()\w\.:: —\(\)〔〕%㎡≤≥《》,\-\/\%;、\.﹪]*)。出让面积',
                        item)
                    # 出让面积(m2)
                    CRMJ_8 = reFunction(
                        f'出让面积:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 绿化用地
                    LHYD_9 = reFunction(
                        f'绿化用地:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 道路用地
                    DLYD_10 = reFunction(
                        f'道路用地:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 土地用途
                    TDYT_11 = reFunction(
                        f'用途:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 岀让年限
                    CRNX_12 = reFunction(
                        f'出让年限:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 容积率
                    RJL_13 = reFunction(
                        f'容积率:*([()\w\.:: ,—\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 建筑密度
                    JZMD_14 = reFunction(
                        f'建筑密度:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 绿地率
                    LDL_15 = reFunction(
                        f'绿地率:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item
                    ) if reFunction(
                        f'绿地率:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);',
                        item) else reFunction(
                            f'绿地率(%)([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);',
                            item)
                    # 起始价(万元)
                    QSJ_17 = reFunction(
                        f'起始价为:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*),', item)
                    # 保证金(万元)
                    BZJ_18 = reFunction(
                        f'竞买保证金为:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*),',
                        item)
                    # 竞价幅度(万元)
                    JJFD_19 = reFunction(
                        f'竞价幅度为:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*)。', item)
                    # 是否需要判断重复 请求
                    if DUPLICATE_SWITCH:
                        if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        if ZDBH_6 and TDYT_11:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                GGLX_1,
                                WJBT_2,
                                SJ_3,
                                LY_4,
                                ZWBT_5,
                                ZDBH_6,
                                TDWZ_7,
                                CRMJ_8,
                                LHYD_9,
                                DLYD_10,
                                TDYT_11,
                                CRNX_12,
                                RJL_13,
                                JZMD_14,
                                LDL_15,
                                JZKJ_16,
                                QSJ_17,
                                BZJ_18,
                                JJFD_19,
                                BMRQ_20,
                                GPRQ_21,
                                GPJZSJ_22,
                                BZJDZSJ_23,
                                LXDZ_24,
                                LXR_25,
                                LXDH_26,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' %
                            response.url)
            else:
                if '宗地编号' in items and '地块基本情况' not in items:
                    for item in [
                            '宗地编号' + _ for _ in re.findall(
                                '一([\s\S]*)二、', items)[0].split('宗地编号')[1:]
                    ]:
                        # 宗地编号
                        ZDBH_6 = reFunction(
                            f'宗地编号:*\s*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 土地位置
                        TDWZ_7 = reFunction(
                            f'宗地坐落:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 出让面积(m2)
                        CRMJ_8 = reFunction(
                            f'宗地\s*总*面积:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 土地用途
                        TDYT_11 = reFunction(
                            f'土地用途[明细]*:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 岀让年限
                        CRNX_12 = reFunction(
                            f'出让年限:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 容积率
                        RJL_13 = reFunction(
                            f'容积率:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 建筑密度
                        JZMD_14 = reFunction(
                            f'建筑密度\(%\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 绿地率
                        LDL_15 = reFunction(
                            f'绿[地化]率\(%\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item
                        ) if reFunction(
                            f'绿[地化]率\(%\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item
                        ) else reFunction(
                            f'绿地率(%)\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 建筑空间
                        JZKJ_16 = reFunction(
                            f'建筑限高\(米\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 起始价(万元)
                        QSJ_17 = reFunction(
                            f'起始价:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 保证金(万元)
                        BZJ_18 = reFunction(
                            f'保证金:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 竞价幅度(万元)
                        JJFD_19 = reFunction(
                            f'加价幅度:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 挂牌开始时间
                        GPRQ_21 = reFunction(
                            f'挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 挂牌截止时间
                        GPJZSJ_22 = reFunction(
                            f'挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 联系地址
                        LXDZ_24 = reFunction(
                            f'联系地址:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                            items).split('联')[0] if reFunction(
                                f'联系地址:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                                items) else ''
                        # 联系人
                        LXR_25 = reFunction(
                            f'联\s系\s人:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                            items
                        ).split('联')[0] if reFunction(
                            f'联\s系\s人:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                            items) else ''
                        # 联系电话
                        LXDH_26 = reFunction(
                            f'联系电话:\s*([()\d\.:: \(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                            items)
                        # 是否需要判断重复 请求
                        if DUPLICATE_SWITCH:
                            if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                self.duplicateUrl += 1

                        if self.duplicateUrl < 50:
                            if ZDBH_6 and TDYT_11:
                                # 重复效验通过, 存储数据
                                csvFile = [
                                    GGLX_1,
                                    WJBT_2,
                                    SJ_3,
                                    LY_4,
                                    ZWBT_5,
                                    ZDBH_6,
                                    TDWZ_7,
                                    CRMJ_8,
                                    LHYD_9,
                                    DLYD_10,
                                    TDYT_11,
                                    CRNX_12,
                                    RJL_13,
                                    JZMD_14,
                                    LDL_15,
                                    JZKJ_16,
                                    QSJ_17,
                                    BZJ_18,
                                    JJFD_19,
                                    BMRQ_20,
                                    GPRQ_21,
                                    GPJZSJ_22,
                                    BZJDZSJ_23,
                                    LXDZ_24,
                                    LXR_25,
                                    LXDH_26,
                                    crawlingTime,
                                    url,
                                    md5Mark,
                                ]
                                results = ''
                                for _ in csvFile:
                                    try:
                                        if _ and _ != '|' * len(_):
                                            results += _.replace(
                                                ',',
                                                ' ').replace('\n', '').replace(
                                                    '\r', '').replace(
                                                        r'\xa0', '').replace(
                                                            '\xa0', '') + ','
                                        else:
                                            results += ','
                                    except Exception as e:
                                        results += ','
                                        self.log(
                                            f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                            level=logging.ERROR)
                                with open(self.pathDetail, 'a+') as fp:
                                    fp.write(results)
                                    fp.write('\n')
                                self.log(f'数据获取成功', level=logging.INFO)
                                yield
                        else:
                            self.crawler.engine.close_spider(
                                self, 'response msg info %s, job duplicated!' %
                                response.url)
                elif '地块基本情况' in items:
                    # todo
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find('table')
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.tableTrTdRegulationToList(table)
                    for _ in range(len(list(tdData.values())[0])):
                        # 宗地编号
                        ZDBH_6 = tdData.get('编号')[_] if tdData.get(
                            '编号') else ''
                        # 土地位置
                        TDWZ_7 = tdData.get('地块位置')[_] if tdData.get(
                            '地块位置') else ''
                        # 出让面积(m2)
                        CRMJ_8 = tdData.get('土地面积(亩)')[_] if tdData.get(
                            '土地面积(亩)') else ''
                        # 土地用途
                        TDYT_11 = tdData.get('土地用途')[_] if tdData.get(
                            '土地用途') else ''
                        # 是否需要判断重复 请求
                        if DUPLICATE_SWITCH:
                            if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                self.duplicateUrl += 1

                        if self.duplicateUrl < 50:
                            if ZDBH_6 and TDYT_11:
                                # 重复效验通过, 存储数据
                                csvFile = [
                                    GGLX_1,
                                    WJBT_2,
                                    SJ_3,
                                    LY_4,
                                    ZWBT_5,
                                    ZDBH_6,
                                    TDWZ_7,
                                    CRMJ_8,
                                    LHYD_9,
                                    DLYD_10,
                                    TDYT_11,
                                    CRNX_12,
                                    RJL_13,
                                    JZMD_14,
                                    LDL_15,
                                    JZKJ_16,
                                    QSJ_17,
                                    BZJ_18,
                                    JJFD_19,
                                    BMRQ_20,
                                    GPRQ_21,
                                    GPJZSJ_22,
                                    BZJDZSJ_23,
                                    LXDZ_24,
                                    LXR_25,
                                    LXDH_26,
                                    crawlingTime,
                                    url,
                                    md5Mark,
                                ]
                                results = ''
                                for _ in csvFile:
                                    try:
                                        if _ and _ != '|' * len(_):
                                            results += _.replace(
                                                ',',
                                                ' ').replace('\n', '').replace(
                                                    '\r', '').replace(
                                                        r'\xa0', '').replace(
                                                            '\xa0', '') + ','
                                        else:
                                            results += ','
                                    except Exception as e:
                                        results += ','
                                        self.log(
                                            f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                            level=logging.ERROR)
                                with open(self.pathDetail, 'a+') as fp:
                                    fp.write(results)
                                    fp.write('\n')
                                self.log(f'数据获取成功', level=logging.INFO)
                                yield
                        else:
                            self.crawler.engine.close_spider(
                                self, 'response msg info %s, job duplicated!' %
                                response.url)

        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
Ejemplo n.º 9
0
    def parse_detail(self, response):
        """Parse one sz68.com detail page and append one CSV row for it.

        The row combines four sources: values forwarded through
        ``response.meta``, the announcement document embedded in the page via
        an iframe, the detail page itself, and the JSON returned by the
        ``resultdetailbytargetId`` endpoint. Files linked under
        ``accessory_link`` are downloaded into ``self.dirName``.

        Args:
            response: Response of the detail page. ``response.meta`` is read
                for ``JYSJ``, ``JYZT``, ``ZDH``, ``TDWZ``, ``QSJ``, ``TDYT``,
                ``TDMJ`` and ``id``.

        Returns:
            None. The row is written to ``self.fileDetail``. Any exception is
            logged at ERROR level and no row is written for this page.
        """
        try:
            # Capture the page URL up front; it is exported in the row and is
            # part of the md5 mark, so it must not be affected by any later
            # request made in this method.
            pageUrl = response.url
            data = Selector(text=response.body.decode('utf-8'))

            # The announcement lives in an iframe whose id is either
            # externalframe1 or externalframe0.
            frameSrc = data.xpath(
                '//iframe[@id="externalframe1"]/@src').extract_first()
            if not frameSrc:
                frameSrc = data.xpath(
                    '//iframe[@id="externalframe0"]/@src').extract_first()
            if not frameSrc:
                raise IntegrationException(
                    f'未找到公告详情iframe, url: {pageUrl}')
            noticeDetail = 'https://www.sz68.com' + frameSrc

            # These two are only filled by the "no table" fallback below.
            TDFZXZ = ''
            RJL = ''

            # Values forwarded by the caller through the request meta.
            # NOTE(review): meta['JYFS'] used to be read here as well but was
            # never exported; the row uses the detail-page value JYFS_A.
            meta = response.meta
            JYSJ = meta.get('JYSJ')
            JYZT = meta.get('JYZT')
            ZDH = meta.get('ZDH')
            TDWZ = meta.get('TDWZ')
            QSJ = meta.get('QSJ')
            TDYT = meta.get('TDYT')
            TDMJ = meta.get('TDMJ')
            targetId = meta.get('id')

            # ---- Announcement detail (iframe document) ----
            detailData = requests.get(noticeDetail,
                                      headers=self.header,
                                      allow_redirects=False,
                                      timeout=60,
                                      verify=False)
            if detailData.status_code != 200:
                raise IntegrationException(f'获取公告详情失败, url: {noticeDetail}')

            detail = Selector(text=detailData.content.decode('utf-8'))
            # Flattened page text with all whitespace variants removed, used
            # as the haystack for the regular expressions below.
            items = str(detail.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000',
                                    '').replace('\n', '').replace(' ', '')
            # Body title
            ZWBT = ''.join(
                detail.xpath(
                    '/html/body/div/p[2]/span//text() | /html/body/p[2]/span//text()|/html/body/p[1]/span//text()'
                ).extract())
            # Announcement period
            GGQ = reFunction(r'公告期自([\w \-\s]*)[止]?,', items)
            # Listing start time
            GPKSSJ = reFunction(
                r'挂牌期自(\d{4}年\d{1,2}月\d{1,2}日)[起]?至(?:\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时)止',
                items)
            # Listing end time
            GPJSSJ = reFunction(
                r'挂牌期自(?:\d{4}年\d{1,2}月\d{1,2}日)[起]?至(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时)止',
                items)

            # Parse the announcement table: prefer the table inside the first
            # <div> of <body>, otherwise take the first table of the document.
            soup = BeautifulSoup(detailData.text)
            body = soup.find('body')
            wrapper = body.find('div') if body else None
            table = (wrapper.find('table')
                     if wrapper else None) or soup.find('table')

            htmlTable = htmlTableTransformer()
            tdData = htmlTable.table_tr_td(table)
            # Parcel code / parcel number of the plot
            ZDDM_DKZDBH = tdData.get('宗地编号') or tdData.get('地块宗地编号')
            # Parcel number
            ZDH_A = tdData.get('宗地号')
            # Land location
            DKWZ = tdData.get('土地位置')
            # Land use
            DKYT = tdData.get('土地用途')
            # Admitted industry category
            ZRHYLB = tdData.get('准入行业类别')
            # Land area (square metres) / land area
            TDMJ_A = tdData.get('土地面积(平方米)') or tdData.get('土地面积')
            # Building area (square metres) / total building area
            JZMJ = tdData.get('建筑面积(平方米)') or tdData.get('总建筑面积')
            # Listing starting price
            GPQSJ = tdData.get('挂牌起始价(人民币、万元)')
            # Bid (tender) deposit
            JMBZJ = tdData.get('竞买(投标)保证金(人民币、万元)')
            # Land-use term
            TDSYNX = tdData.get('土地使用年期')

            if not detail.xpath('//table').extract():
                # No table in the announcement: fall back to regular
                # expressions over the flattened text.
                # Parcel code / parcel number of the plot
                ZDDM_DKZDBH = reFunction(r'宗地编号([\w \-\s]*),', items)
                # Land-use term
                TDSYNX = reFunction(r'土地使用年[\s期限]*[为]?(\d*年)', items)
                # Current development status of the land
                TDFZXZ = reFunction(r'土地的发展建设现状:([\S\s]*。)', items)
                # Plot ratio, e.g. "容积率不大于1.518。"
                RJL = reFunction(r'容积率[\D]*([\.\d]*)。', items)
                # Land location, e.g. "宗地位于龙岗 中心城14号地,"
                DKWZ = reFunction(r'宗地位于([\w \s]*),', items)
                # Land use
                DKYT = reFunction(r'土地用途为([\w \s]*),', items)
                # TODO: decide whether one more page layout needs parsing.
            # Deposit arrival deadline
            ZBJJZSJ = reFunction(
                r'保证金的到账截止时间为(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时\d{1,2}分)',
                items)
            # Address (every match, joined with '|')
            DZ = '|'.join(
                re.findall(r'地址:([\w \.\-\s\/\%,\(\)。 \? \!  、:]*);咨询电话',
                           items))
            # Enquiry phone (every match, joined with '|')
            DH = '|'.join(
                re.findall(r'咨询电话:([\w \.\-\s\/\%,\(\)。 \? \!  、]*)[;。]',
                           items))

            # ---- Basic information (detail page itself) ----
            # Transaction method
            JYFS_A = data.xpath(
                '//div[@class="content_case1"]/div[1]/ul/li[2]/span/text()'
            ).extract_first()
            # Transaction type
            JYLX = data.xpath(
                '//div[@class="content_case1"]/div[1]/ul/li[1]/span/text()'
            ).extract_first()
            # Parcel
            ZD = data.xpath('//div[@class="content_case1"]/div[1]/div/text()'
                            ).extract_first()
            # Publish time
            FBSJ = data.xpath(
                '//div[@class="content_case1"]/div[2]/span[2]/text()'
            ).extract_first()
            # Transaction status
            JYZT_A = data.xpath(
                '//div[@class="content_case1"]/div[2]/span[3]/text()'
            ).extract_first()
            # Winning bidder
            ZBR_24 = data.xpath(
                '//div[@class="right_first"]/div[1]/div[2]/text()'
            ).extract_first()
            # Deal price
            CJJ_25 = data.xpath(
                '//div[@class="right_first"]/div[2]/div[2]/text()'
            ).extract_first()
            # Deposit
            BZJ_26 = data.xpath(
                '//div[@class="right_first twin"][1]/div[1]/div[2]/text()'
            ).extract_first()
            # Starting price
            QSJ_A = data.xpath(
                '//div[@class="right_first twin"][1]/div[2]/div[2]/text()'
            ).extract_first()
            # Bid increment
            JJJT_28 = data.xpath(
                '//div[@class="right_first twin"][2]/div[1]/div[2]/text()'
            ).extract_first()
            # Ceiling price
            FDJ_29 = data.xpath(
                '//div[@class="right_first twin"][2]/div[2]/div[2]/text()'
            ).extract_first()
            # Bid application deadline
            JMSQJZSJ_30 = data.xpath(
                '//div[@class="right_first twin"][3]/div[1]/div[2]/text()'
            ).extract_first()
            # Number of bidders
            JMRS_31 = data.xpath(
                '//div[@class="right_first twin"][3]/div[2]/div[2]/text()'
            ).extract_first()

            # ---- Subject details ----
            # The first twelve spans are, in order: parcel number, land area,
            # building area, plot ratio, building coverage, building height,
            # use, term of use, district, location, green-space ratio and
            # number of floors. Fewer than twelve spans raises and the page
            # is reported by the outer handler, as before.
            BDdetail = data.xpath(
                '//li[@class="weather_info_ul_item"]/div[2]/span')
            (ZDH_B, TDMJ_B, JZMJ_A, RJL_A, JZFGL, JZGD, YT, SYNX, QY, WZ, LDL,
             JZLC) = [
                 span.xpath('text()').extract_first()
                 for span in BDdetail[:12]
             ]

            # ---- Bidding record (first data row of the table) ----
            # Bidder
            JMR = data.xpath(
                '//div[@class="conomy"][1]/table/tr[2]/td[2]/text()'
            ).extract_first()
            # Bid amount
            JMSJ = data.xpath(
                '//div[@class="conomy"][1]/table/tr[2]/td[3]/text()'
            ).extract_first()
            # Bid time
            CJSJ = data.xpath(
                '//div[@class="conomy"][1]/table/tr[2]/td[4]/text()'
            ).extract_first()
            # Status
            ZT = data.xpath(
                '//div[@class="conomy"][1]/table/tr[2]/td[5]/text()'
            ).extract_first()

            # ---- Result notice ----
            # Defaults keep the row writable when the endpoint does not answer
            # with HTTP 200 (previously that case ended in a NameError).
            ZWBT_A = FBRQ = ZDH_C = JDR = ZBR_A = WZ_A = ''
            TDYT_A = TDMJ_C = JZMJ_B = QSJ_D = CJJ_A = YJL = ZHLMDJ = ''
            results = requests.post(
                'https://www.sz68.com/tiaim/web/resultdetailbytargetId',
                headers=self.header,
                data={'targetId': targetId},
                allow_redirects=False,
                timeout=60,
                verify=False)
            resultsOk = results.status_code == 200
            if resultsOk:
                resultsData = results.json()
                notice = resultsData.get('notice') or {}
                fileExtName = resultsData.get('fileExtName')
                # Body title
                ZWBT_A = notice.get('NAME')
                # Publish date
                FBRQ = notice.get('PUBLISH_TIME')
                # Parcel number
                ZDH_C = notice.get('DTL_REF_NO')
                # Winner (竞得人)
                JDR = reFunction(r'竞得人:([\w \.\-\s\/\%,]*)<', fileExtName)
                # Successful tenderer (中标人)
                ZBR_A = reFunction(r'中标人:([\w \.\-\s\/\%,]*)<', fileExtName)
                # Location
                WZ_A = reFunction(r'位置:([\w \.\-\s\/\%,、]*)<', fileExtName)
                # Land use
                TDYT_A = reFunction(r'土地用途:([\w \.\-\s\/\%,、]*)<',
                                    fileExtName)
                # Land area
                TDMJ_C = reFunction(r'土地面积:([\w \.\-\s\/\%,、]*)<',
                                    fileExtName)
                # Building area
                JZMJ_B = reFunction(r'建筑面积:([\w \.\-\s\/\%,、]*)<',
                                    fileExtName)
                # Starting price
                QSJ_D = reFunction(r'起始价:([\w \.\-\s\/\%,、]*)<',
                                   fileExtName)
                # Deal price
                CJJ_A = reFunction(r'成交价:([\w \.\-\s\/\%,、]*)<',
                                   fileExtName)
                # Premium rate
                YJL = reFunction(r'溢价率:([\w \.\-\s\/\%,、]*)<', fileExtName)
                # Comprehensive floor unit price
                ZHLMDJ = reFunction(r'综合楼面单价:([\w \.\-\s\/\%,、]*)<',
                                    fileExtName)

            # ---- Attachments ----
            accessory = '土地模块|'
            for link in data.xpath('//div[@class="accessory_link"]/a'):
                rawName = link.xpath(
                    'text()[position()=((position() mod 2)=0)]'
                ).extract_first()
                fileName = rawName.strip() if rawName else '未知名称'
                try:
                    href = link.xpath('@href').extract_first()
                    linkPath = self.dirName + f'土地模块_{ZDH}' + fileName
                    # A separate name is used on purpose: rebinding `response`
                    # here made the exported URL point at the attachment.
                    fileResponse = requests.get(href,
                                                headers=self.header,
                                                timeout=200)
                    with open(linkPath, 'wb') as fp:
                        fp.write(fileResponse.content)
                except Exception as e:
                    # Downloads are best effort; a failed attachment is simply
                    # not listed in `accessory`, but the failure is now logged.
                    self.log(f'附件下载失败: {fileName}, 错误: {e}',
                             level=logging.WARNING)
                else:
                    accessory += fileName + '|'

            # Crawl time
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # Crawled URL (the detail page)
            url = pageUrl
            # The mark keeps using the result-notice location whenever that
            # notice was fetched, which is the value the hash was built from
            # before; None parts are skipped instead of raising TypeError.
            markLocation = WZ_A if resultsOk else WZ
            md5Mark = encrypt_md5(''.join(
                str(part) for part in (ZDH, markLocation, ZWBT, url)
                if part is not None))
            csvFile = [
                JYSJ,
                JYZT,
                ZDH,
                TDWZ,
                QSJ,
                TDYT,
                TDMJ,
                JYFS_A,
                JYLX,
                ZD,
                FBSJ,
                JYZT_A,
                ZBR_24,
                CJJ_25,
                BZJ_26,
                QSJ_A,
                JJJT_28,
                FDJ_29,
                JMSQJZSJ_30,
                JMRS_31,
                ZWBT,
                GGQ,
                GPKSSJ,
                GPJSSJ,
                ZDDM_DKZDBH,
                ZDH_A,
                DKWZ,
                DKYT,
                ZRHYLB,
                TDMJ_A,
                JZMJ,
                TDSYNX,
                TDFZXZ,
                RJL,
                GPQSJ,
                JMBZJ,
                TDSYNX,  # emitted twice on purpose: keeps the column layout
                ZBJJZSJ,
                DZ,
                DH,
                ZDH_B,
                TDMJ_B,
                JZMJ_A,  # subject-detail building area (was JZMJ_B by mistake)
                RJL_A,
                JZFGL,
                JZGD,
                YT,
                SYNX,
                QY,
                WZ,  # subject-detail location
                LDL,
                JZLC,
                JMR,
                JMSJ,
                CJSJ,
                ZT,
                ZWBT_A,
                FBRQ,
                ZDH_C,
                JDR,
                ZBR_A,
                WZ_A,  # result-notice location
                TDYT_A,
                TDMJ_C,
                JZMJ_B,
                QSJ_D,
                CJJ_A,
                YJL,
                ZHLMDJ,
                crawlingTime,
                url,
                md5Mark,
                accessory,
            ]
            fileData = []
            for field in csvFile:
                try:
                    # Commas and line breaks would corrupt the hand-built CSV.
                    fileData.append(
                        field.replace(',', ' ').replace('\n',
                                                        '').replace('\r', ''))
                except (AttributeError, TypeError):
                    # Non-string values (e.g. None) are written as str(value).
                    fileData.append(str(field))
            self.fileDetail.write(','.join(fileData))
            self.fileDetail.write('\n')
        except Exception as e:
            self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
Ejemplo n.º 10
0
    def parse_detail(self, response):
        try:
            # 数据获取不全
            # 数据获取不全
            categorynum = response.meta.get('categorynum')
            infoid = response.meta.get('infoid')
            targetUrl = "https://www.cqggzy.com/tiaozhuan.html?infoid=" + infoid + "&categorynum=" + categorynum
            results = ''
            for _ in range(5):
                try:
                    self.session.get(targetUrl,
                                     headers=self.header,
                                     allow_redirects=False,
                                     timeout=60)
                    redirectUrl = 'https://www.cqggzy.com/EpointWebBuilderService/getInfoListAndCategoryList.action?cmd=pageRedirect'
                    data = {'categorynum': categorynum, 'infoid': infoid}
                    response_ = self.session.post(redirectUrl,
                                                  headers=self.header,
                                                  data=data,
                                                  allow_redirects=False,
                                                  timeout=60)
                    url = 'https://www.cqggzy.com' + response_.json().get(
                        'custom') if 'http' not in response_.json().get(
                            'custom') else response_.json().get('custom')
                    results = self.session.get(url,
                                               headers=self.header,
                                               allow_redirects=False,
                                               timeout=60)
                    break
                except Exception as e:
                    pass
            data = Selector(text=results.content.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            GGXH_31 = ''
            ZDBH_32 = ''
            DKWZ_33 = ''
            TDYT_34 = ''
            TDMJ_35 = ''
            RJL_36 = ''
            JZMD_37 = ''
            LDL_38 = ''
            BZJ_39 = ''
            DJ_40 = ''
            JRZJMJ_41 = ''
            CRFS_42 = ''
            CRNX_43 = ''
            CJJ_44 = ''
            SRDW_45 = ''
            TDSYTJ_46 = ''
            JYSJ_47 = ''
            CJR_48 = ''
            BZ_49 = ''
            LXDW_50 = ''
            LXDZ_51 = ''
            LXDH_52 = ''
            GSQ_53 = ''

            # 共有字段
            # 文件标题
            WJBT_27 = data.xpath(
                '//*[@class="article-title"]/text()').extract_first()
            # 信息时间
            XXSJ_28 = reFunction(
                '(\d{4}-\d{1,2}-\d{1,2})',
                data.xpath(
                    '//*[@class="info-source"]/text()[1]').extract_first())
            # TODO
            # 正文标题
            ZWBT_29 = WJBT_27
            soup = BeautifulSoup(results.content.decode('utf-8'))
            table = soup.find('table')

            if '土地使用条件' in items or '宗地编号' in items or '公告序号' in items:
                # TODO 正则匹配的页面
                # 公告序号
                GGXH_31 = '|'.join(
                    re.findall('公告序号(?:[\s]*)([()\w\.:\-\/\%,、]*)(?:\n)',
                               items))
                # 宗地编号 / 编号
                ZDBH_32_ = '|'.join(
                    re.findall(
                        f'[宗地](?:[\s]*)编号(?:[\s]*)([{self.reStr}]*)(?:\s)',
                        items))
                ZDBH_32 = ZDBH_32_.replace(':', '') if ZDBH_32_ else ZDBH_32_
                # 地块位置
                DKWZ_33_ = '|'.join(
                    re.findall(f'地块位置(?:[\s]*)({self.reStr})(?:\n)', items))
                DKWZ_33 = DKWZ_33_ if DKWZ_33_ else '|'.join(
                    re.findall(
                        '土地(?:[\s]*)坐落(?:[\s]*)([()\w\.:: ≤;≥\-\/\%,、\.]*)(?:\n)',
                        items))
                # 土地用途 / 用途
                TDYT_34 = '|'.join(
                    re.findall('[土地]?用途(?:[\s]*)([()\w\.:\-\/\%,、]*)(?:\n)',
                               items))
                # 土地面积(平方米) / 土地面积(m2) / 出让面积(m)
                TDMJ_35_ = '|'.join(
                    re.findall(
                        '土地面积\(m2\)(?:[\s]*)([()\w\.:\-\/\%,、\.]*)(?:\n)',
                        items))
                TDMJ_35 = TDMJ_35_ if TDMJ_35_ else '|'.join(
                    re.findall(
                        f'土地面积(?:\s*)[\((]*平方米[\))]*(?:[\s]*)({self.reStr})(?:\n)',
                        items))
                # 容积率
                RJL_36 = '|'.join(
                    re.findall('容积率(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                               items))
                # 计容建筑面积(m2)
                JRZJMJ_41 = '|'.join(
                    re.findall(f'计容建筑面积\(m2\)(?:[\s]*)({self.reStr})(?:\n)',
                               items))
                # 出让方式
                CRFS_42 = '|'.join(
                    re.findall(
                        '出让方式[:]*(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                        items))
                # 出让年限
                CRNX_43 = '|'.join(
                    re.findall(f'出让年限(?:[\s]*)({self.reStr})(?:\n)', items))
                # 成交价(万元) / 成交价
                CJJ_44_ = '|'.join(
                    re.findall(
                        f'成交价(?:[\s]*)[\((]*万元[)\)]*(?:[\s]*)({self.reStr})(?:\n)',
                        items))
                CJJ_44 = CJJ_44_ if CJJ_44_ else '|'.join(
                    re.findall(f'成交价:(?:[\s]*)({self.reStr})(?:三)', items))
                # 受让单位
                SRDW_45 = '|'.join(
                    re.findall('受让单位(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                               items))
                # 土地使用条件
                TDSYTJ_46 = '|'.join(
                    re.findall(
                        '土地(?:[\s]*)使用(?:[\s]*)条件(?:[\s]*)([()\w\.:: ≤;≥\-\/\%,、\.]*)(?:\n)',
                        items))
                # 交易时间
                JYSJ_47 = '|'.join(
                    re.findall('交易时间(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                               items))
                # 成交人
                CJR_48 = '|'.join(
                    re.findall(f'成交人:(?:[\s]*)({self.reStr})(?:二)', items))
                # 备注
                BZ_49 = '|'.join(
                    re.findall(f'备注:(?:[\s]*)({self.reStr})(?:\n)', items))
                # 联系地址
                LXDZ_51 = '|'.join(
                    re.findall(f'联系地址:(?:[\s]*)([{self.reStr}]*)(?:\s)',
                               items))
                # 联系电话
                LXDH_52 = '|'.join(
                    re.findall('联系电话:(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                               items))
                # 公示期
                GSQ_53 = '|'.join(
                    re.findall(f'公示时间:(?:[\s]*)([{self.reStr}]*)(?:\n)',
                               items))
            else:
                if not table:
                    # TODO 正则匹配的页面
                    # 公告序号
                    GGXH_31 = '|'.join(
                        re.findall('公告序号(?:[\s]*)([()\w\.:\-\/\%,、]*)(?:\n)',
                                   items))
                    # 宗地编号 / 编号
                    ZDBH_32_ = '|'.join(
                        re.findall(
                            f'[宗地](?:[\s]*)编号(?:[\s]*)([{self.reStr}]*)(?:\s)',
                            items))
                    ZDBH_32 = ZDBH_32_.replace(':',
                                               '') if ZDBH_32_ else ZDBH_32_
                    # 地块位置
                    DKWZ_33_ = '|'.join(
                        re.findall(f'地块位置(?:[\s]*)({self.reStr})(?:\n)',
                                   items))
                    DKWZ_33 = DKWZ_33_ if DKWZ_33_ else '|'.join(
                        re.findall(
                            '土地(?:[\s]*)坐落(?:[\s]*)([()\w\.:: ≤;≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 土地用途 / 用途
                    TDYT_34 = '|'.join(
                        re.findall(
                            '[土地]?用途(?:[\s]*)([()\w\.:\-\/\%,、]*)(?:\n)',
                            items))
                    # 土地面积(平方米) / 土地面积(m2) / 出让面积(m)
                    TDMJ_35_ = '|'.join(
                        re.findall(
                            '土地面积\(m2\)(?:[\s]*)([()\w\.:\-\/\%,、\.]*)(?:\n)',
                            items))
                    TDMJ_35 = TDMJ_35_ if TDMJ_35_ else '|'.join(
                        re.findall(
                            f'土地面积(?:\s*)[\((]*平方米[\))]*(?:[\s]*)({self.reStr})(?:\n)',
                            items))
                    # 容积率
                    RJL_36 = '|'.join(
                        re.findall(
                            '容积率(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 计容建筑面积(m2)
                    JRZJMJ_41 = '|'.join(
                        re.findall(
                            f'计容建筑面积\(m2\)(?:[\s]*)({self.reStr})(?:\n)',
                            items))
                    # 出让方式
                    CRFS_42 = '|'.join(
                        re.findall(
                            '出让方式[:]*(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 出让年限
                    CRNX_43 = '|'.join(
                        re.findall(f'出让年限(?:[\s]*)({self.reStr})(?:\n)',
                                   items))
                    # 成交价(万元) / 成交价
                    CJJ_44_ = '|'.join(
                        re.findall(
                            f'成交价(?:[\s]*)[\((]*万元[)\)]*(?:[\s]*)({self.reStr})(?:\n)',
                            items))
                    CJJ_44 = CJJ_44_ if CJJ_44_ else '|'.join(
                        re.findall(f'成交价:(?:[\s]*)({self.reStr})(?:三)', items))
                    # 受让单位
                    SRDW_45 = '|'.join(
                        re.findall(
                            '受让单位(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 土地使用条件
                    TDSYTJ_46 = '|'.join(
                        re.findall(
                            '土地(?:[\s]*)使用(?:[\s]*)条件(?:[\s]*)([()\w\.:: ≤;≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 交易时间
                    JYSJ_47 = '|'.join(
                        re.findall(
                            '交易时间(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 成交人
                    CJR_48 = '|'.join(
                        re.findall(f'成交人:(?:[\s]*)({self.reStr})(?:二)', items))
                    # 备注
                    BZ_49 = '|'.join(
                        re.findall(f'备注:(?:[\s]*)({self.reStr})(?:\n)', items))
                    # 联系地址
                    LXDZ_51 = '|'.join(
                        re.findall(f'联系地址:(?:[\s]*)([{self.reStr}]*)(?:\s)',
                                   items))
                    # 联系电话
                    LXDH_52 = '|'.join(
                        re.findall(
                            '联系电话:(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 公示期
                    GSQ_53 = '|'.join(
                        re.findall(f'公示时间:(?:[\s]*)([{self.reStr}]*)(?:\n)',
                                   items))
                else:
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.tableTrTdRegulation(table)
                    # 宗地编号 / 编号
                    ZDBH_32 = tdData.get('编号') if tdData.get(
                        '编号') else tdData.get('地块编号')
                    # 地块位置
                    DKWZ_33 = tdData.get('地块位置')
                    # 土地用途 / 用途
                    TDYT_34 = tdData.get('用途') if tdData.get(
                        '用途') else tdData.get('土地用途')
                    # 土地面积(平方米) / 土地面积(m2) / 出让面积(m)
                    TDMJ_35_ = tdData.get('地块面积(平方米)') if tdData.get(
                        '地块面积(平方米)') else tdData.get('地块面积(㎡)')
                    TDMJ_35 = TDMJ_35_ if TDMJ_35_ else tdData.get('宗地面积(平方米)')
                    # 出让方式
                    CRFS_42 = tdData.get('出让方式')
                    # 容积率
                    RJL_36 = tdData.get('容积率')
                    # 建筑密度( %)
                    JZMD_37 = tdData.get('建筑密度(%)')
                    # 绿地率( %)
                    LDL_38 = tdData.get('绿地率(%)')
                    # 底价(万元)
                    DJ_40 = tdData.get('底价(万元)')
                    # 保证金(万元)
                    BZJ_39 = tdData.get('保证金(万元)')
                    # 出让年限
                    CRNX_43 = tdData.get('出让年限')
                    # 成交价(万元) / 成交价
                    CJJ_44 = tdData.get('成交价(万元)') if tdData.get(
                        '成交价(万元)') else tdData.get('成交价格(万元)')
                    # 成交人
                    CJR_48 = tdData.get('成交人')
                    # 备注
                    BZ_49 = tdData.get('备注')
                    # 公示期
                    GSQ_53 = reFunction(
                        f'公示期:(?:[\s]*)([{self.reStr}]*)(?:\s)', items)
                    # 联系单位
                    LXDW_50 = reFunction(
                        f'联 系 人:(?:[\s]*)([{self.reStr}]*)(?:\s)', items)
                    # 联系地址
                    LXDZ_51 = reFunction(
                        f'联系地址:(?:[\s]*)([{self.reStr}]*)(?:\s)', items)
                    # 联系电话
                    LXDH_52 = reFunction(
                        f'联系电话:(?:[\s]*)([{self.reStr}]*)(?:\s)', items)

            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # 爬取地址url
            url = url if url else response.url
            # 唯一标识
            md5Mark = encrypt_md5(url + ZDBH_32 + DKWZ_33)

            # 存储数据
            csvFile = [
                WJBT_27,
                XXSJ_28,
                ZWBT_29,
                GGXH_31,
                ZDBH_32,
                DKWZ_33,
                TDYT_34,
                TDMJ_35,
                RJL_36,
                JZMD_37,
                LDL_38,
                BZJ_39,
                DJ_40,
                JRZJMJ_41,
                CRFS_42,
                CRNX_43,
                CJJ_44,
                SRDW_45,
                TDSYTJ_46,
                JYSJ_47,
                CJR_48,
                BZ_49,
                LXDW_50,
                LXDZ_51,
                LXDH_52,
                GSQ_53,
                crawlingTime,
                url,
                md5Mark,
            ]
            self.fileDetail.write(','.join([
                _.replace(',', ' ').replace('\n', '').replace('\r', '')
                if _ else _ for _ in csvFile
            ]))
            self.fileDetail.write('\n')
            self.log(f'数据获取成功', level=logging.INFO)
            yield
            #TODO
        except Exception as e:
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
Ejemplo n.º 11
0
# Fetch the detail page at `url`, flatten it to text, and -- for pages that
# match the table layout checked below -- read the parcel fields out of the
# first <table> element.
#
# NOTE(review): verify=False turns off TLS certificate verification, and no
# timeout is passed, so a stalled server can block this call indefinitely.
# Both are left exactly as in the original; confirm they are intentional.
response = requests.get(url, headers=header, verify=False)

# Decode the payload once and reuse it for both parsers below.
page_html = response.content.decode('utf-8')

data = Selector(text=page_html)
# string(.) yields the concatenated text of the document; non-breaking
# (U+00A0) and ideographic (U+3000) spaces are then removed.
items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace(
    '\u3000', '')

# Raw string: the regex escapes (\w, \., \(, ...) are not valid Python string
# escapes, so a plain literal triggers an invalid-escape warning on modern
# interpreters. The runtime value is unchanged.
# Presumably interpolated into a regex character class by later code -- not
# used in the visible part of this script.
reStr = r'()\w\.:: 。 \(\)〔〕㎡≤;,≥《》\-\/\%,、\.﹪㎡'
# TODO shared fields

# TODO //table[@border="1"]   //table[@border="0"]
# Table parsing
# Only pages whose text contains neither of the two marker phrases below are
# handled through the table path.
if '宗地编号' not in items and '配套建筑规划用地' not in items:
    # Requires a border="0" table and the absence of a third marker phrase.
    if data.xpath('//table[@border="0"]') and '主要规划指标' not in items:
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # one is installed; results may differ between environments.
        soup = BeautifulSoup(page_html)
        table = soup.find('table')
        htmlTable = htmlTableTransformer()
        # tdData is read with .get() below, i.e. used as a header -> value
        # mapping (exact shape is defined by htmlTableTransformer).
        tdData = htmlTable.tableTrTdRegulation(table)
        # Parcel number
        ZDBH_4 = tdData.get('地块编号')
        # Parcel location
        ZDZL_5 = tdData.get('土地位置')
        # Area
        MJ_6 = tdData.get('土地面积(平方米)')
        # Land use
        TDYT_7 = tdData.get('土地用途')
        # Transfer term: prefer the "(years)" header, fall back to the bare one
        CRNX_8 = tdData.get('出让年限(年)') or tdData.get('出让年限')
        # Floor area ratio
        RJL_9 = tdData.get('容积率')
        # Green space ratio
Ejemplo n.º 12
0
    def parse_detail(self, response):
        # TODO 主动关闭爬虫问题
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')
            htmlTable = htmlTableTransformer()
            WJBT_1 = ''
            XXSJ_2 = ''
            WBT_3 = ''
            GGBH_4 = ''
            CRSJ_5 = ''
            GGNX_6 = ''
            ZDBH_7 = ''
            DKWZ_8 = ''
            ZDWZ_9 = ''
            ZDZL_10 = ''
            TDYT_11 = ''
            GHTDYT_12 = ''
            CRNX_13 = ''
            SYNX_14 = ''
            PZJGJWH_15 = ''
            GHYDMJ_16 = ''
            GHMJ_17 = ''
            CRMJ_18 = ''
            CRYDMJ_19 = ''
            ZDCRMJ_20 = ''
            JZMD_21 = ''
            RJL_22 = ''
            LDL_23 = ''
            LDL_24 = ''
            JZKZGD_25 = ''
            JZKZZGD_26 = ''
            JZXS_27 = ''
            TZQD_28 = ''
            TDGJBAH_29 = ''
            SFSZD_30 = ''
            TDXZTJ_31 = ''
            JMBZJ_32 = ''
            JMBZJ_72 = ''
            QJJ_33 = ''
            CRQSJ_34 = ''
            JJFD_35 = ''
            SFSZBLJ_36 = ''
            GPKSSJ_37 = ''
            GPJZSJ_38 = ''
            HQCRWJSJ_39 = ''
            TJJMSQSJ_40 = ''
            BZJJZSJ_41 = ''
            QRJMZGSJ_42 = ''
            LXDZ_43 = ''
            LXDH_44 = ''
            LXR_45 = ''
            BZJZH_86 = ''
            BZJZH_87 = ''
            BZJZH_88 = ''
            CRJZH_97 = ''
            CRJZH_98 = ''
            CRJZH_99 = ''

            # TODO 共有字段  reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # 文件标题
            WJBT_1 = response.meta.get('title').strip()
            # 信息时间
            XXSJ_2 = reFunction('[\d\-]*', data.xpath('//p[@class="sub-cp"]/text()').extract_first())
            # 正文标题
            WBT_3 = WJBT_1
            # 公告编号
            GGBH_4 = ''.join(data.xpath('//div[@class="substance"]/p[position() <5]/.//*[contains(text(),"号")]/ancestor::p/.//text()').extract())
            # 出让时间
            CRSJ_5 = reFunction('定于\s*([()【】\w\.—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*)[,;,、在]', items)
            # 公告类型
            GGNX_6 = '出让公告'
            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # 爬取地址url
            url = response.url
            # 唯一标识
            md5Mark = encrypt_md5(url + WJBT_1 + XXSJ_2)

            GPSJ_0 = reFunction('挂牌交易期限:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)[\s。]', items)
            GPSJ_1 = reFunction('申请人可于:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)到', items)
            GPSJ = GPSJ_0 if GPSJ_0 else GPSJ_1
            # 挂牌开始时间、
            GPKSSJ_37 = reFunction('挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 挂牌截止时间、
            GPJZSJ_38 = reFunction('挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            if GPSJ:
                try:
                    GPKSSJ_37 = GPSJ.split('至')[0]
                    GPJZSJ_38 = GPSJ.split('至')[1]
                except:
                    pass
            # 获取出让文件时间、
            HQCRWJSJ_39 = GPSJ_1
            # 提交竞买申请时间、
            TJJMSQSJ_40 = reFunction('(\d{4}年\d{1,3}月\d{1,3}日(?:[()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)至 \d{4}年\d{1,3}月\d{1,3}日)', items)
            # 保证金截止时间、
            BZJJZSJ_41 = reFunction('(\d{4}年\d{1,3}月\d{1,3}日(?:[()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)至 \d{4}年\d{1,3}月\d{1,3}日)', items)
            # 确认竞买资格时间
            QRJMZGSJ_42 = reFunction('(\d{4}年\d{1,3}月\d{1,3}日(?:[()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)至 \d{4}年\d{1,3}月\d{1,3}日)', items)
            # 联系地址、
            LXDZ_43 = reFunction('联系地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 联系电话、
            LXDH_44 = reFunction('联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 联系人、
            LXR_45 = reFunction('联\s*系\s*人:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)

            ZH_0 = reFunction('以下账户:*\s*([\w\.:: —\(\)\s〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪\s]*)[一二三四五六七八九123456789]*', items)
            ZH_1 = reFunction('保证金帐户:*\s*([\w\.:: —\(\)\s〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s*', items)
            try:
                if ZH_0:
                    if ZH_0[:2] == '户名':
                        result = re.split('[①②③④]*', ZH_0)
                        # 保证金账户开户单位 / 户名
                        BZJZH_86 = result[0].replace('户名:','') if result[0] else ''
                        # 保证金账户账号
                        BZJZH_87 = '|'.join([re.split(',|,', _)[0] for _ in result[1:]])
                        # 保证金账户开户行
                        BZJZH_88 = '|'.join([re.split(',|,', _)[-1] for _ in result[1:]])
                    else:
                        result = re.split('[①②③④]*', ZH_0)
                        # 保证金账户开户单位 / 户名
                        BZJZH_86 = '|'.join([re.findall('开\s*户\s*行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _)[0] if re.findall('开 户 行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _) else '' for _ in result])
                        # 保证金账户账号
                        BZJZH_87 = '|'.join([re.findall('户\s*名:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _)[0] if re.findall('开 户 行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _) else '' for _ in result])
                        # 保证金账户开户行
                        BZJZH_88 = '|'.join([re.findall('账\s*号:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _)[0] if re.findall('账\s*号:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _) else '' for _ in result])
                elif ZH_1:
                    # 保证金账户开户单位 / 户名
                    BZJZH_86 = '|'.join(re.findall('开户[单位名称]*:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', ZH_1)).replace(';')
                    # 保证金账户账号
                    BZJZH_87 = '|'.join(re.findall('开\s*户\s*行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', ZH_1)).replace(';')
                    # 保证金账户开户行
                    BZJZH_88 = '|'.join(re.findall('帐\s*号:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', ZH_1)).replace(';')
            except:
                pass
            CR = reFunction('出让金帐户:*\s*([\w\.:: —\(\)\s〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s*', items)
            try:
                # 出让金账户开户单位 / 户名
                CRJZH_97 = '|'.join(re.findall('开户[单位名称]*:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', CR)).replace(';')
                # 出让金账户开户行
                CRJZH_98 = '|'.join(re.findall('开\s*户\s*行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', CR)).replace(';')
                # 出让金账户账号
                CRJZH_99 = '|'.join(re.findall('帐\s*号:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', CR)).replace(';')
            except:
                pass

            if '拍卖出让地块的基本情况和规划指标要求' not in items and '备注' not in items and '挂牌出让地块的基本情况和规划指标要求' not in items:
                try:
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    tables = soup.find_all('table')
                    if '规划用途及主要指标' in items:  # 处理费标准的表格
                        soup = BeautifulSoup(response.body.decode('utf-8'))
                        table = soup.find('table')
                        tdReplace = table.tbody.find_all('tr')[0].find('td', colspan='4')
                        number = table.tbody.find_all('tr')[0].index(tdReplace)
                        tdList = table.tbody.find_all('tr')[1].find_all('td')
                        for _ in range(1, len(tdList) + 1):
                            table.tbody.find_all('tr')[0].insert(number + _, tdList[_ - 1])
                        tdReplace.extract()
                        [_.extract() for _ in table.tbody.find_all('tr')[1].find_all('td')]
                        table.tbody.find_all('tr')[1].extract()

                        tdData = htmlTable.tableTrTdChangeToList(table)
                        for _ in range(len(list(tdData.values())[0])):
                            # 宗地编号
                            ZDBH_7 = tdData.get('宗地编号')[_] if tdData.get('宗地编号') else ''
                            # 出让面积(m2)
                            CRMJ_18 = tdData.get('土地面积(㎡)')[_] if tdData.get('土地面积(㎡)') else ''
                            # 容积率
                            RJL_22 = tdData.get('容积率')[_] if tdData.get('容积率') else ''
                            # 绿地率( %)
                            LDL_24 = tdData.get('绿地率(%)')[_] if tdData.get('绿地率(%)') else ''
                            # 建筑系数( %)
                            JZXS_27 = tdData.get('建筑系数(%)')[_] if tdData.get('建筑系数(%)') else ''
                            # 竟买保证金(万元)
                            JMBZJ_72 = tdData.get('竞买保证金(万元)')[_] if tdData.get('竞买保证金(万元)') else ''
                            # 出让起始价(万元)
                            CRQSJ_34 = tdData.get('挂牌出让起始价(万元)')[_] if tdData.get('挂牌出让起始价(万元)') else ''
                            # 加价幅度、
                            JJFD_35 = tdData.get('加价幅度')[_] if tdData.get('加价幅度') else ''

                            # 写入数据
                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                    self.duplicateUrl += 1

                            if self.duplicateUrl < 50:
                                if ZDBH_7:
                                    # 重复效验通过, 存储数据
                                    csvFile = [
                                        WJBT_1,
                                        XXSJ_2,
                                        WBT_3,
                                        GGBH_4,
                                        CRSJ_5,
                                        GGNX_6,
                                        ZDBH_7,
                                        DKWZ_8,
                                        ZDWZ_9,
                                        ZDZL_10,
                                        TDYT_11,
                                        GHTDYT_12,
                                        CRNX_13,
                                        SYNX_14,
                                        PZJGJWH_15,
                                        GHYDMJ_16,
                                        GHMJ_17,
                                        CRMJ_18,
                                        CRYDMJ_19,
                                        ZDCRMJ_20,
                                        JZMD_21,
                                        RJL_22,
                                        LDL_23,
                                        LDL_24,
                                        JZKZGD_25,
                                        JZKZZGD_26,
                                        JZXS_27,
                                        TZQD_28,
                                        TDGJBAH_29,
                                        SFSZD_30,
                                        TDXZTJ_31,
                                        JMBZJ_32,
                                        JMBZJ_72,
                                        QJJ_33,
                                        CRQSJ_34,
                                        JJFD_35,
                                        SFSZBLJ_36,
                                        GPKSSJ_37,
                                        GPJZSJ_38,
                                        HQCRWJSJ_39,
                                        TJJMSQSJ_40,
                                        BZJJZSJ_41,
                                        QRJMZGSJ_42,
                                        LXDZ_43,
                                        LXDH_44,
                                        LXR_45,
                                        BZJZH_86,
                                        BZJZH_87,
                                        BZJZH_88,
                                        CRJZH_97,
                                        CRJZH_98,
                                        CRJZH_99,
                                        crawlingTime,
                                        url,
                                        md5Mark,
                                    ]
                                    results = ''
                                    for _ in csvFile:
                                        try:
                                            if _ and _ != '|' * len(_):
                                                results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace(
                                                    '\r', '').replace(r'\xa0', '').replace('\xa0', '') + ','
                                            else:
                                                results += ','
                                        except Exception as e:
                                            results += ','
                                            self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                     level=logging.ERROR)
                                    with open(self.pathDetail, 'a+') as fp:
                                        fp.write(results)
                                        fp.write('\n')
                                    self.log(f'数据获取成功', level=logging.INFO)
                                    yield
                            else:
                                self.crawler.engine.close_spider(self,
                                                                 'response msg info %s, job duplicated!' % response.url)
                    elif len(tables) <= 3:
                        tdsList = {}
                        for table in tables:
                            td = htmlTable.tableTrTdRegulationToList(table)
                            tdsList.update(td)
                        for _ in range(len(list(tdsList.values())[0])):
                            # 宗地编号
                            ZDBH_7 = tdsList.get('宗地编号')[_] if tdsList.get('宗地编号') else ''
                            # 地块编号  地块名称
                            DKWZ_8 = tdsList.get('地块编号')[_] if tdsList.get('地块编号') else tdsList.get('地块编号')[_] if tdsList.get('地块编号') else ''
                            # 宗地位置
                            ZDWZ_9 = tdsList.get('宗地位置')[_] if tdsList.get('宗地位置') else ''
                            # 宗地坐落
                            ZDZL_10 = tdsList.get('宗地坐落')[_] if tdsList.get('宗地坐落') else ''
                            # 土地用途
                            TDYT_11 = tdsList.get('土地用途')[_] if tdsList.get('土地用途') else ''
                            # 规划土地用途
                            GHTDYT_12 = tdsList.get('规划土地用途')[_] if tdsList.get('规划土地用途') else ''
                            # 出让年限
                            CRNX_13 = tdsList.get('出让年限')[_] if tdsList.get('出让年限') else ''
                            # 使用年限
                            SYNX_14 = tdsList.get('使用年限')[_] if tdsList.get('使用年限') else ''
                            # 批准机关及文号
                            PZJGJWH_15 = tdsList.get('批准机关及文号')[_] if tdsList.get('批准机关及文号') else tdsList.get('批准文号')[_] if tdsList.get('批准文号') else ''
                            # 规划用地面积〔m2)
                            GHYDMJ_16 = tdsList.get('规划用地面积(m2)')[_] if tdsList.get('规划用地面积(m2)') else tdsList.get('用地面积(㎡)')[_] if tdsList.get('用地面积(㎡)') else tdsList.get('规划用地面积(㎡)')[_] if tdsList.get('规划用地面积(㎡)') else ''
                            # 出让面积(m2)
                            CRMJ_18 = tdsList.get('出让面积(㎡)')[_] if tdsList.get('出让面积(㎡)') else ''
                            # 规划面积(m2)
                            GHMJ_17 = tdsList.get('规划面积(㎡)')[_] if tdsList.get('规划面积(㎡)') else ''
                            # 出让用地面积(m2)
                            CRYDMJ_19 = tdsList.get('出让用地面积(m2)')[_] if tdsList.get('出让用地面积(m2)') else ''
                            # 宗地出让面积
                            ZDCRMJ_20 = tdsList.get('宗地出让面积')[_] if tdsList.get('宗地出让面积') else ''
                            # 建筑密度
                            JZMD_21 = tdsList.get('建筑密度(%)')[_] if tdsList.get('建筑密度(%)') else tdsList.get('建筑密度(%)')[_] if tdsList.get('建筑密度(%)') else ''
                            # 容积率
                            RJL_22 = tdsList.get('容积率')[_] if tdsList.get('容积率') else ''
                            # 绿地率
                            LDL_23 = tdsList.get('宗地坐落')[_] if tdsList.get('宗地坐落') else ''
                            # 绿地率( %)
                            LDL_24 = tdsList.get('绿地率')[_] if tdsList.get('绿地率') else tdsList.get('绿地率(%)')[_] if tdsList.get('绿地率(%)') else tdsList.get('绿地率(%)')[_] if tdsList.get('绿地率(%)') else ''
                            # 建筑控制高度(m)
                            JZKZGD_25 = tdsList.get('建筑控制高度(m)')[_] if tdsList.get('建筑控制高度(m)') else ''
                            # 建筑控制高度(米)
                            JZKZZGD_26 = tdsList.get('建筑控制高度(米)')[_] if tdsList.get('建筑控制高度(米)') else ''
                            # 投资强度(万元 / 公顷)
                            TZQD_28 = tdsList.get('投资强度(万元/公顷)')[_] if tdsList.get('投资强度(万元/公顷)') else ''
                            # 竞买保证金
                            JMBZJ_32 = tdsList.get('竞买保证金')[_] if tdsList.get('竞买保证金') else ''
                            # 出让起始价(万元)
                            CRQSJ_34 = tdsList.get('出让起始价')[_] if tdsList.get('出让起始价') else ''
                            # 竟买保证金(万元)
                            JMBZJ_72 = tdsList.get('竞买保证金(万元)')[_] if tdsList.get('竞买保证金(万元)') else ''
                            # 起叫价
                            QJJ_33 = tdsList.get('起始价')[_] if tdsList.get('起始价') else tdsList.get('出让起始价')[_] if tdsList.get('出让起始价') else ''
                            # 加价幅度
                            JJFD_35 = tdsList.get('加价幅度')[_] if tdsList.get('加价幅度') else ''

                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                    self.duplicateUrl += 1

                            if self.duplicateUrl < 50:
                                if ZDBH_7:
                                    # 重复效验通过, 存储数据
                                    csvFile = [
                                        WJBT_1,
                                        XXSJ_2,
                                        WBT_3,
                                        GGBH_4,
                                        CRSJ_5,
                                        GGNX_6,
                                        ZDBH_7,
                                        DKWZ_8,
                                        ZDWZ_9,
                                        ZDZL_10,
                                        TDYT_11,
                                        GHTDYT_12,
                                        CRNX_13,
                                        SYNX_14,
                                        PZJGJWH_15,
                                        GHYDMJ_16,
                                        GHMJ_17,
                                        CRMJ_18,
                                        CRYDMJ_19,
                                        ZDCRMJ_20,
                                        JZMD_21,
                                        RJL_22,
                                        LDL_23,
                                        LDL_24,
                                        JZKZGD_25,
                                        JZKZZGD_26,
                                        JZXS_27,
                                        TZQD_28,
                                        TDGJBAH_29,
                                        SFSZD_30,
                                        TDXZTJ_31,
                                        JMBZJ_32,
                                        JMBZJ_72,
                                        QJJ_33,
                                        CRQSJ_34,
                                        JJFD_35,
                                        SFSZBLJ_36,
                                        GPKSSJ_37,
                                        GPJZSJ_38,
                                        HQCRWJSJ_39,
                                        TJJMSQSJ_40,
                                        BZJJZSJ_41,
                                        QRJMZGSJ_42,
                                        LXDZ_43,
                                        LXDH_44,
                                        LXR_45,
                                        BZJZH_86,
                                        BZJZH_87,
                                        BZJZH_88,
                                        CRJZH_97,
                                        CRJZH_98,
                                        CRJZH_99,
                                        crawlingTime,
                                        url,
                                        md5Mark,
                                    ]
                                    results = ''
                                    for _ in csvFile:
                                        try:
                                            if _ and _ != '|' * len(_):
                                                results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace(
                                                    '\r', '').replace(r'\xa0', '').replace('\xa0', '') + ','
                                            else:
                                                results += ','
                                        except Exception as e:
                                            results += ','
                                            self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                     level=logging.ERROR)
                                    with open(self.pathDetail, 'a+') as fp:
                                        fp.write(results)
                                        fp.write('\n')
                                    self.log(f'数据获取成功', level=logging.INFO)
                                    yield
                            else:
                                self.crawler.engine.close_spider(self,
                                                                 'response msg info %s, job duplicated!' % response.url)

                    elif len(tables) == 6:
                        # TODO
                        pass
                except:
                    for item in ['宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items)[0].split('宗地编号')[1:]]:
                        # 宗地编号
                        ZDBH_7 = reFunction('宗地编号:*\s*([\w\-]*)\s', item).replace('宗地位置', '').replace('地块名称', '')
                        # 宗地坐落
                        ZDZL_10 = reFunction('宗地坐落:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 土地用途
                        TDYT_11 = reFunction('土地用途:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 出让年限
                        CRNX_13 = reFunction('出让年限:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 宗地出让面积
                        ZDCRMJ_20 = reFunction('宗地\s*面积:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 建筑密度
                        JZMD_21 = reFunction('建筑密度\(%\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 容积率
                        RJL_22 = reFunction('容积率:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 绿地率( %)
                        LDL_24 = reFunction('绿化率\(%\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 建筑控制高度(米)
                        JZKZZGD_26 = reFunction('建筑限高\(米\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 投资强度(万元 / 公顷)
                        TZQD_28 = reFunction('投资强度:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 土地估价备案号
                        TDGJBAH_29 = reFunction('土地估价备案号:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 现状土地条件
                        TDXZTJ_31 = reFunction('土地现状:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 竞买保证金
                        JMBZJ_32 = reFunction('保证金:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 起叫价
                        QJJ_33 = reFunction('起始价:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 加价幅度
                        JJFD_35 = reFunction('加价幅度:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 挂牌开始时间、
                        GPKSSJ_37 = reFunction('挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                        # 挂牌截止时间、
                        GPJZSJ_38 = reFunction('挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)

                        if self.name in DUPLICATE_SWITCH_LIST:
                            if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                self.duplicateUrl += 1

                        if self.duplicateUrl < 50:
                            if ZDBH_7:
                                # 重复效验通过, 存储数据
                                csvFile = [
                                    WJBT_1,
                                    XXSJ_2,
                                    WBT_3,
                                    GGBH_4,
                                    CRSJ_5,
                                    GGNX_6,
                                    ZDBH_7,
                                    DKWZ_8,
                                    ZDWZ_9,
                                    ZDZL_10,
                                    TDYT_11,
                                    GHTDYT_12,
                                    CRNX_13,
                                    SYNX_14,
                                    PZJGJWH_15,
                                    GHYDMJ_16,
                                    GHMJ_17,
                                    CRMJ_18,
                                    CRYDMJ_19,
                                    ZDCRMJ_20,
                                    JZMD_21,
                                    RJL_22,
                                    LDL_23,
                                    LDL_24,
                                    JZKZGD_25,
                                    JZKZZGD_26,
                                    JZXS_27,
                                    TZQD_28,
                                    TDGJBAH_29,
                                    SFSZD_30,
                                    TDXZTJ_31,
                                    JMBZJ_32,
                                    JMBZJ_72,
                                    QJJ_33,
                                    CRQSJ_34,
                                    JJFD_35,
                                    SFSZBLJ_36,
                                    GPKSSJ_37,
                                    GPJZSJ_38,
                                    HQCRWJSJ_39,
                                    TJJMSQSJ_40,
                                    BZJJZSJ_41,
                                    QRJMZGSJ_42,
                                    LXDZ_43,
                                    LXDH_44,
                                    LXR_45,
                                    BZJZH_86,
                                    BZJZH_87,
                                    BZJZH_88,
                                    CRJZH_97,
                                    CRJZH_98,
                                    CRJZH_99,
                                    crawlingTime,
                                    url,
                                    md5Mark,
                                ]
                                results = ''
                                for _ in csvFile:
                                    try:
                                        if _ and _ != '|' * len(_):
                                            results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace(
                                                '\r',
                                                '').replace(
                                                r'\xa0', '').replace('\xa0', '') + ','
                                        else:
                                            results += ','
                                    except Exception as e:
                                        results += ','
                                        self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                 level=logging.ERROR)
                                with open(self.pathDetail, 'a+') as fp:
                                    fp.write(results)
                                    fp.write('\n')
                                self.log(f'数据获取成功', level=logging.INFO)
                                yield
                        else:
                            self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url)
            else:
                for item in ['宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items)[0].split('宗地编号')[1:]]:
                    # 宗地编号
                    ZDBH_7 = reFunction('宗地编号:*\s*([\w\-]*)\s', item).replace('宗地位置', '').replace('地块名称', '')
                    # 宗地坐落
                    ZDZL_10 = reFunction('宗地坐落:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 土地用途
                    TDYT_11 = reFunction('土地用途:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 出让年限
                    CRNX_13 = reFunction('出让年限:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 宗地出让面积
                    ZDCRMJ_20 = reFunction('宗地\s*面积:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 建筑密度
                    JZMD_21 = reFunction('建筑密度\(%\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 容积率
                    RJL_22 = reFunction('容积率:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 绿地率( %)
                    LDL_24 = reFunction('绿化率\(%\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 建筑控制高度(米)
                    JZKZZGD_26 = reFunction('建筑限高\(米\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 投资强度(万元 / 公顷)
                    TZQD_28 = reFunction('投资强度:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 土地估价备案号
                    TDGJBAH_29 = reFunction('土地估价备案号:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 现状土地条件
                    TDXZTJ_31 = reFunction('土地现状:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 竞买保证金
                    JMBZJ_32 = reFunction('保证金:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 起叫价
                    QJJ_33 = reFunction('起始价:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 加价幅度
                    JJFD_35 = reFunction('加价幅度:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 挂牌开始时间、
                    GPKSSJ_37 = reFunction('挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    # 挂牌截止时间、
                    GPJZSJ_38 = reFunction('挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)

                    if self.name in DUPLICATE_SWITCH_LIST:
                        if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        if ZDBH_7:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                WJBT_1,
                                XXSJ_2,
                                WBT_3,
                                GGBH_4,
                                CRSJ_5,
                                GGNX_6,
                                ZDBH_7,
                                DKWZ_8,
                                ZDWZ_9,
                                ZDZL_10,
                                TDYT_11,
                                GHTDYT_12,
                                CRNX_13,
                                SYNX_14,
                                PZJGJWH_15,
                                GHYDMJ_16,
                                GHMJ_17,
                                CRMJ_18,
                                CRYDMJ_19,
                                ZDCRMJ_20,
                                JZMD_21,
                                RJL_22,
                                LDL_23,
                                LDL_24,
                                JZKZGD_25,
                                JZKZZGD_26,
                                JZXS_27,
                                TZQD_28,
                                TDGJBAH_29,
                                SFSZD_30,
                                TDXZTJ_31,
                                JMBZJ_32,
                                JMBZJ_72,
                                QJJ_33,
                                CRQSJ_34,
                                JJFD_35,
                                SFSZBLJ_36,
                                GPKSSJ_37,
                                GPJZSJ_38,
                                HQCRWJSJ_39,
                                TJJMSQSJ_40,
                                BZJJZSJ_41,
                                QRJMZGSJ_42,
                                LXDZ_43,
                                LXDH_44,
                                LXR_45,
                                BZJZH_86,
                                BZJZH_87,
                                BZJZH_88,
                                CRJZH_97,
                                CRJZH_98,
                                CRJZH_99,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace('\r',
                                                                                                                   '').replace(
                                            r'\xa0', '').replace('\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                             level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url)

        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
Ejemplo n.º 13
0
    def parse_detail(self, response):
        """Parse one detail page and append one CSV row per land parcel found.

        The page body is decoded as UTF-8 and flattened to plain text
        (``items``).  Page-level fields (title, info time, announcement
        number, contact details, ...) are extracted once; parcel-level fields
        are then extracted by one of three strategies chosen from the text:

        1. the text contains '宗地编号' or '土地位置': read the first
           ``<table>`` through ``htmlTableTransformer`` and emit one row per
           table row;
        2. otherwise the text contains '地块编号': split the section between
           '一' and '二、' on '地块编号' and emit one row per chunk;
        3. otherwise: treat the whole page as a single record.

        For every candidate row the duplicate counter ``self.duplicateUrl``
        is bumped when ``self.name`` is in ``DUPLICATE_SWITCH_LIST`` and
        ``md5Mark`` is already known to ``self.redisClient``.  While the
        counter is below 50 and the parcel location (``DKWZ_55``) is
        non-empty, the row is appended to ``self.pathDetail`` and the
        generator yields ``None`` (bare ``yield``).  Once the counter reaches
        50 the engine is asked to close the spider.

        Any exception raised while parsing is caught at the bottom of the
        method: the URL is printed and the error is logged; nothing is
        re-raised.

        NOTE(review): ``reFunction``, ``htmlTableTransformer``,
        ``encrypt_md5`` and ``getVariableName`` are defined outside this
        block; comments about them below are assumptions to confirm.
        """
        # TODO: problem with actively closing the spider
        try:
            data = Selector(text=response.body.decode('utf-8'))
            # Plain text of the whole document with NBSP (\xa0) and
            # ideographic space (\u3000) removed.
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            htmlTable = htmlTableTransformer()
            # Every output column starts as '' so that fields a given branch
            # never fills still produce an (empty) CSV cell.
            WJBT_48 = ''
            XXSJ_49 = ''
            ZWBT_50 = ''
            GGBH_51 = ''
            CRSJ_52 = ''
            GGNX_53 = ''
            DKBH_54 = ''
            DKWZ_55 = ''
            TDYT_56 = ''
            TDMJ_57 = ''
            CRNX_58 = ''
            CJJ_59 = ''
            SRDW_60 = ''
            TDXZTJ_61 = ''
            TDSYTJ_62 = ''
            BZ_63 = ''
            GSQ_64 = ''
            LXFS_65 = ''
            DWDZ_66 = ''
            YZBM_67 = ''
            LXDH_68 = ''
            LXR_69 = ''
            LXDW_77 = ''
            DZYJ_70 = ''
            # TODO: shared fields  reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # File title.  If 'title' is missing from response.meta, .get()
            # returns None and .strip() raises AttributeError, which the outer
            # except below catches (the page is then skipped).
            WJBT_48 = response.meta.get('title').strip()
            # Information time.
            # NOTE(review): this pattern has no capture group and can match
            # the empty string; the result depends on how reFunction picks
            # its return value - confirm.
            XXSJ_49 = reFunction(
                '[\d\-]*',
                data.xpath('//p[@class="sub-cp"]/text()').extract_first())
            # Body title (same value as the file title)
            ZWBT_50 = WJBT_48
            # Announcement number: all text of the <p> elements (among the
            # first four) that have a descendant whose text contains "号".
            GGBH_51 = ''.join(
                data.xpath(
                    '//div[@class="substance"]/p[position() <5]/.//*[contains(text(),"号")]/ancestor::p/.//text()'
                ).extract())
            # Transfer time
            CRSJ_52 = reFunction(
                '定于\s*([()【】\w\.—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*)[,;,、在]', items)
            # Announcement type (fixed value)
            GGNX_53 = '出让结果'

            # Crawl date (local time, YYYY-MM-DD)
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # Crawled URL
            url = response.url
            # Unique marker for this page.  It is computed once from
            # url + title + info time, so every row emitted from this page
            # carries the same value.
            md5Mark = encrypt_md5(url + WJBT_48 + XXSJ_49)

            # Public notice period
            GSQ_64 = reFunction(
                '公示期:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)[\s。]', items)
            # Contact information
            # LXFS_65 (never populated; stays '')
            # Contact organisation
            LXDW_77 = reFunction(
                '联系单位:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Organisation address
            DWDZ_66 = reFunction(
                '单位地址:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Postal code
            YZBM_67 = reFunction(
                '邮政编码:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Contact telephone
            LXDH_68 = reFunction(
                '联系电话:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # Contact person
            LXR_69 = reFunction(
                '联\s*系\s*人:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # E-mail
            DZYJ_70 = reFunction(
                '电子邮件:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》@\-\/\%,;,、\.﹪]*)\s', items)

            # --- Branch 1: tabular layout -----------------------------------
            if '宗地编号' in items or '土地位置' in items:
                # NOTE(review): no parser is passed to BeautifulSoup, so the
                # parser is whatever bs4 picks by default - confirm this is
                # intended.
                soup = BeautifulSoup(response.body.decode('utf-8'))
                # Only the first <table> is used; if there is none, `table`
                # is None (handling then depends on the transformer).
                table = soup.find('table')
                # presumably a dict mapping header text -> list of cell
                # values, one entry per table row - verify against
                # htmlTableTransformer.
                tdData = htmlTable.tableTrTdRegulationToList(table)
                # Row count is taken from the first column; `_` is the row
                # index here.  It is reused as the inner loop variable when
                # the row is serialised, which is harmless because all
                # per-row lookups happen before that inner loop.
                for _ in range(len(list(tdData.values())[0])):
                    # Parcel number
                    DKBH_54 = tdData.get('宗地编号')[_] if tdData.get(
                        '宗地编号') else ''
                    # Parcel location (two possible header names)
                    DKWZ_55 = tdData.get('宗地位置')[_] if tdData.get(
                        '宗地位置') else tdData.get('土地位置')[_] if tdData.get(
                            '土地位置') else ''
                    # Land use (two possible header names)
                    TDYT_56 = tdData.get('土地用途')[_] if tdData.get(
                        '土地用途') else tdData.get('规划土地用途')[_] if tdData.get(
                            '规划土地用途') else ''
                    # Land area.  The original comment said hectares, but the
                    # headers looked up here are labelled m2 / ㎡.
                    TDMJ_57 = tdData.get('土地面积(m2)')[_] if tdData.get(
                        '土地面积(m2)') else tdData.get(
                            '出让土地面积(㎡)')[_] if tdData.get('出让土地面积(㎡)') else ''
                    # Transfer term in years (two possible header names)
                    CRNX_58 = tdData.get('使用年限')[_] if tdData.get(
                        '使用年限') else tdData.get('出让年限')[_] if tdData.get(
                            '出让年限') else ''
                    # Transaction price (two possible header names)
                    CJJ_59 = tdData.get('成交价(万元)')[_] if tdData.get(
                        '成交价(万元)') else tdData.get(
                            '成交价(人民币)')[_] if tdData.get('成交价(人民币)') else ''
                    # Transferee (two possible header names)
                    SRDW_60 = tdData.get('受让单位')[_] if tdData.get(
                        '受让单位') else tdData.get('竞买人(单位)')[_] if tdData.get(
                            '竞买人(单位)') else ''
                    # Land-use conditions
                    TDSYTJ_62 = tdData.get('土地使用条件')[_] if tdData.get(
                        '土地使用条件') else ''

                    # Write the row.
                    # NOTE(review): md5Mark is page-level, so this check runs
                    # with the same key for every row and can bump the
                    # counter several times for a single page - confirm.
                    if self.name in DUPLICATE_SWITCH_LIST:
                        if self.redisClient.isExist(md5Mark):  # already present: count it as a duplicate
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        # Rows without a parcel location are silently dropped.
                        if DKWZ_55:
                            # Duplicate check passed: store the data
                            csvFile = [
                                WJBT_48,
                                XXSJ_49,
                                ZWBT_50,
                                GGBH_51,
                                CRSJ_52,
                                GGNX_53,
                                DKBH_54,
                                DKWZ_55,
                                TDYT_56,
                                TDMJ_57,
                                CRNX_58,
                                CJJ_59,
                                SRDW_60,
                                TDXZTJ_61,
                                TDSYTJ_62,
                                BZ_63,
                                GSQ_64,
                                LXFS_65,
                                DWDZ_66,
                                YZBM_67,
                                LXDH_68,
                                LXR_69,
                                LXDW_77,
                                DZYJ_70,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            # Build a comma-terminated line by hand: commas
                            # inside a value become spaces; newlines, tabs,
                            # carriage returns, the literal text "\xa0" and
                            # real NBSP characters are removed.  Values that
                            # are empty or consist only of '|' characters
                            # become empty cells.
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\t', '').replace(
                                                '\r', '').replace(
                                                    r'\xa0', '').replace(
                                                        '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    # A value that cannot be processed (e.g.
                                    # not a string) still gets an empty cell
                                    # so the column count stays stable.
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            # Append mode; no explicit encoding is given, so
                            # the platform default encoding is used.
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            # Bare yield: hands None back to the caller after
                            # each stored row.
                            yield
                    else:
                        # Too many duplicates: ask the engine to stop this
                        # spider.  The loop is not left here, so this call is
                        # repeated for every remaining row.
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' %
                            response.url)
            # --- Branch 2: free-text layout, one chunk per '地块编号' ---------
            elif '地块编号' in items:
                # Take the text between '一' and '二、', split it on
                # '地块编号' and re-prefix each chunk with that label.  If
                # the section is not found, [0] raises IndexError, which the
                # outer except catches.
                for item in [
                        '地块编号' + _ for _ in re.findall('一([\s\S]*)二、', items)
                    [0].split('地块编号')[1:]
                ]:
                    # Parcel number
                    DKBH_54 = reFunction('地块编号:*\s*([\w\-]*)\s', item)
                    # Parcel location
                    DKWZ_55 = reFunction(
                        '地块位置:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Land use
                    TDYT_56 = reFunction(
                        '土地用途:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Land area (hectares)
                    TDMJ_57 = reFunction(
                        '土地面积\(公顷\):*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Transfer term in years
                    CRNX_58 = reFunction(
                        '出让年限:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Transaction price (10,000 yuan)
                    CJJ_59 = reFunction(
                        '成交价\(万元\):*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Transferee
                    SRDW_60 = reFunction(
                        '受让单位:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Current land condition
                    TDXZTJ_61 = reFunction(
                        '土地现状:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Land-use conditions
                    TDSYTJ_62 = reFunction(
                        '土地使用条件:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)
                    # Remarks
                    BZ_63 = reFunction(
                        '备注:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        item)

                    # Write the row (same duplicate gate as the table branch;
                    # md5Mark is the page-level marker).
                    if self.name in DUPLICATE_SWITCH_LIST:
                        if self.redisClient.isExist(md5Mark):  # already present: count it as a duplicate
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        # Chunks without a parcel location are silently
                        # dropped.
                        if DKWZ_55:
                            # Duplicate check passed: store the data
                            csvFile = [
                                WJBT_48,
                                XXSJ_49,
                                ZWBT_50,
                                GGBH_51,
                                CRSJ_52,
                                GGNX_53,
                                DKBH_54,
                                DKWZ_55,
                                TDYT_56,
                                TDMJ_57,
                                CRNX_58,
                                CJJ_59,
                                SRDW_60,
                                TDXZTJ_61,
                                TDSYTJ_62,
                                BZ_63,
                                GSQ_64,
                                LXFS_65,
                                DWDZ_66,
                                YZBM_67,
                                LXDH_68,
                                LXR_69,
                                LXDW_77,
                                DZYJ_70,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            # Same hand-rolled serialisation as the table
                            # branch: sanitise each value and terminate it
                            # with a comma.
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\t', '').replace(
                                                '\r', '').replace(
                                                    r'\xa0', '').replace(
                                                        '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    # Keep the column count stable on failure.
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        # Too many duplicates: request spider shutdown
                        # (repeated for every remaining chunk).
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' %
                            response.url)
            # --- Branch 3: fallback, the page describes a single parcel -----
            else:
                # Parcel location
                DKWZ_55 = reFunction(
                    '地理位置:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # Transfer term in years
                CRNX_58 = reFunction(
                    '出让年限:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # Transaction price.
                # NOTE(review): the parentheses around 人民币 are NOT escaped
                # here (compare '土地面积\(公顷\)' above), so they form an
                # extra capture group and match the bare text
                # '成交价格人民币' rather than literal parentheses - confirm
                # against reFunction and the real page text.
                CJJ_59 = reFunction(
                    '成交价格(人民币):*\s*([()\w\.::—\¥ (\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # Transferee
                SRDW_60 = reFunction(
                    '竞得人名称:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # Current land condition
                TDXZTJ_61 = reFunction(
                    '土地现状:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)

                # Write the row (same duplicate gate as above)
                if self.name in DUPLICATE_SWITCH_LIST:
                    if self.redisClient.isExist(md5Mark):  # already present: count it as a duplicate
                        self.duplicateUrl += 1

                if self.duplicateUrl < 50:
                    # Nothing is written when no parcel location was found.
                    if DKWZ_55:
                        # Duplicate check passed: store the data
                        csvFile = [
                            WJBT_48,
                            XXSJ_49,
                            ZWBT_50,
                            GGBH_51,
                            CRSJ_52,
                            GGNX_53,
                            DKBH_54,
                            DKWZ_55,
                            TDYT_56,
                            TDMJ_57,
                            CRNX_58,
                            CJJ_59,
                            SRDW_60,
                            TDXZTJ_61,
                            TDSYTJ_62,
                            BZ_63,
                            GSQ_64,
                            LXFS_65,
                            DWDZ_66,
                            YZBM_67,
                            LXDH_68,
                            LXR_69,
                            LXDW_77,
                            DZYJ_70,
                            crawlingTime,
                            url,
                            md5Mark,
                        ]
                        # Same hand-rolled serialisation as the other
                        # branches.
                        results = ''
                        for _ in csvFile:
                            try:
                                if _ and _ != '|' * len(_):
                                    results += _.replace(',', ' ').replace(
                                        '\n', '').replace('\t', '').replace(
                                            '\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                else:
                                    results += ','
                            except Exception as e:
                                # Keep the column count stable on failure.
                                results += ','
                                self.log(
                                    f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                    level=logging.ERROR)
                        with open(self.pathDetail, 'a+') as fp:
                            fp.write(results)
                            fp.write('\n')
                        self.log(f'数据获取成功', level=logging.INFO)
                        yield
                else:
                    # Too many duplicates: request spider shutdown
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)

        except Exception as e:
            # Catch-all boundary for the whole page: report the URL on stdout
            # and log the traceback; the exception is not re-raised.
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
Ejemplo n.º 14
0
    def parse_detail(self, response):
        # TODO 主动关闭爬虫问题
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            WJBT_45 = ''
            SJ_46 = ''
            LY_47 = ''
            ZWBT_48 = ''
            DKBH_49 = ''
            ZDBH_50 = ''
            PMJG_51 = ''
            GGZRFS_52 = ''
            GPSJ_53 = ''
            ZRR_54 = ''
            ZRF_55 = ''
            SRR_56 = ''
            SRF_57 = ''
            SRDW_58 = ''
            WZ_59 = ''
            DKWZ_60 = ''
            CRMJ_61 = ''
            YT_62 = ''
            CJJ_63 = ''
            BDCQDJH_64 = ''
            CRHTBH_65 = ''
            CRHT_66 = ''
            BGXYBH_67 = ''
            TDYT_68 = ''
            SYNX_69 = ''
            MJ_70 = ''
            TDMJ_71 = ''
            ZRJG_72 = ''
            CRNX_73 = ''
            TDSYNX_74 = ''
            BZ_75 = ''
            GSQ_76 = ''
            LXDW_77 = ''
            DWDZ_78 = ''
            YZBM_79 = ''
            LXDH_80 = ''
            LXR_81 = ''
            DZYJ_82 = ''

            # TODO 共有字段  reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # 文件标题
            WJBT_45 = response.meta.get('title')
            # 时间
            SJ_46 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()'
            ).extract_first()
            # 来源
            LY_47 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()'
            ).extract_first()
            # 正文标题
            ZWBT_48 = data.xpath(
                '//div[@class="ztzx_frame_content"]/div[1]/text()'
            ).extract_first()
            # 公示期
            GSQ_76 = reFunction(
                f'公示期:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)[。\s]', items)
            # 联系单位
            LXDW_77 = reFunction(
                '联系单位:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 单位地址
            DWDZ_78 = reFunction(
                '单位地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 邮政编码
            YZBM_79 = reFunction(
                '邮政编码:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 联系电话
            LXDH_80 = reFunction(
                '联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 联系人
            LXR_81 = reFunction(
                '联\s*系\s*人:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 电子邮件
            DZYJ_82 = reFunction(
                '电子邮件:([()\w\.:: —\(\)@〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)

            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # 爬取地址url
            url = response.url
            # 唯一标识
            md5Mark = encrypt_md5(url + WJBT_45 + SJ_46)

            soup = BeautifulSoup(
                response.body.decode('utf-8').replace('thead', 'tbody'))
            table = soup.find('table')
            htmlTable = htmlTableTransformer()
            if '国有划拨土地使用权结果公示' in items:
                table.find_all('tr')[1].extract()
                tdData = htmlTable.tableTrTdRegulationToList(table)
                for _ in range(len(list(tdData.values())[0])):
                    # 地块编号
                    DKBH_49 = tdData.get('地块编号')[_] if tdData.get(
                        '地块编号') else ''
                    # 公开转让方式
                    GGZRFS_52 = tdData.get('公开转让方式')[_] if tdData.get(
                        '公开转让方式') else ''
                    # 挂牌时间
                    GPSJ_53 = tdData.get('挂牌')[_] if tdData.get('挂牌') else ''
                    # 受让人
                    SRR_56 = tdData.get('受让人')[_] if tdData.get('受让人') else ''
                    # 位置
                    WZ_59 = tdData.get('位置')[_] if tdData.get('位置') else ''
                    # 出让面积(平方米)
                    CRMJ_61 = tdData.get('出让面积')[_] if tdData.get(
                        '出让面积') else ''
                    # 用途
                    YT_62 = tdData.get('用途')[_] if tdData.get('用途') else ''
                    # 成交价(万元)
                    CJJ_63 = tdData.get('成交价')[_] if tdData.get('成交价') else ''
                    # 写入数据
                    if self.name in DUPLICATE_SWITCH_LIST:
                        if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        if True:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                WJBT_45,
                                SJ_46,
                                LY_47,
                                ZWBT_48,
                                DKBH_49,
                                ZDBH_50,
                                PMJG_51,
                                GGZRFS_52,
                                GPSJ_53,
                                ZRR_54,
                                ZRF_55,
                                SRR_56,
                                SRF_57,
                                SRDW_58,
                                WZ_59,
                                DKWZ_60,
                                CRMJ_61,
                                YT_62,
                                CJJ_63,
                                BDCQDJH_64,
                                CRHTBH_65,
                                CRHT_66,
                                BGXYBH_67,
                                TDYT_68,
                                SYNX_69,
                                MJ_70,
                                TDMJ_71,
                                ZRJG_72,
                                CRNX_73,
                                TDSYNX_74,
                                BZ_75,
                                GSQ_76,
                                LXDW_77,
                                DWDZ_78,
                                YZBM_79,
                                LXDH_80,
                                LXR_81,
                                DZYJ_82,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\t', '').replace(
                                                '\r', '').replace(
                                                    r'\xa0', '').replace(
                                                        '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' %
                            response.url)
            elif '不动产权登记证号' in items:
                # 转让方
                ZRF_55 = reFunction(
                    '转让方:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # 受让方
                SRF_57 = reFunction(
                    '受让方:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # 位置
                WZ_59 = reFunction(
                    '宗地位置:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 不动产权登记证号
                BDCQDJH_64 = reFunction(
                    '不动产权登记证号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 出让合同编号
                CRHTBH_65 = reFunction(
                    '出让合同编号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 变更协议编号
                BGXYBH_67 = reFunction(
                    '出让合同变更协议编号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 土地用途
                TDYT_68 = reFunction(
                    '土地用途:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 使用年限
                SYNX_69 = reFunction(
                    '使用年限:\s*([()【】\w\.::—\(\)〔〕\s㎡≤≥《》\-\/\%,;,、\.﹪]*)面\s*积',
                    items)
                # 面积
                MJ_70 = reFunction(
                    '面\s*积:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 转让价格(单价总价)
                ZRJG_72 = reFunction(
                    '转让价格:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、。\.﹪]*)\s',
                    items)

                # 写入数据
                if self.name in DUPLICATE_SWITCH_LIST:
                    if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                        self.duplicateUrl += 1

                if self.duplicateUrl < 50:
                    if True:
                        # 重复效验通过, 存储数据
                        csvFile = [
                            WJBT_45,
                            SJ_46,
                            LY_47,
                            ZWBT_48,
                            DKBH_49,
                            ZDBH_50,
                            PMJG_51,
                            GGZRFS_52,
                            GPSJ_53,
                            ZRR_54,
                            ZRF_55,
                            SRR_56,
                            SRF_57,
                            SRDW_58,
                            WZ_59,
                            DKWZ_60,
                            CRMJ_61,
                            YT_62,
                            CJJ_63,
                            BDCQDJH_64,
                            CRHTBH_65,
                            CRHT_66,
                            BGXYBH_67,
                            TDYT_68,
                            SYNX_69,
                            MJ_70,
                            TDMJ_71,
                            ZRJG_72,
                            CRNX_73,
                            TDSYNX_74,
                            BZ_75,
                            GSQ_76,
                            LXDW_77,
                            DWDZ_78,
                            YZBM_79,
                            LXDH_80,
                            LXR_81,
                            DZYJ_82,
                            crawlingTime,
                            url,
                            md5Mark,
                        ]
                        results = ''
                        for _ in csvFile:
                            try:
                                if _ and _ != '|' * len(_):
                                    results += _.replace(',', ' ').replace(
                                        '\n', '').replace('\t', '').replace(
                                            '\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                else:
                                    results += ','
                            except Exception as e:
                                results += ','
                                self.log(
                                    f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                    level=logging.ERROR)
                        with open(self.pathDetail, 'a+') as fp:
                            fp.write(results)
                            fp.write('\n')
                        self.log(f'数据获取成功', level=logging.INFO)
                        yield
                else:
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)
            elif '挂牌出让地块的基本情况和规划指标要求' in items:
                # 宗地编号
                ZDBH_50 = reFunction(
                    '宗地编号:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 挂牌时间
                GPSJ_53 = reFunction(
                    '挂牌时间为:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;。,、\.﹪]*)\s',
                    items).replace('。', '')
                # 转让人
                ZRR_54 = reFunction(
                    '转让人为:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*),', items)
                # 位置
                WZ_59 = reFunction(
                    '宗地坐落:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 土地用途
                TDYT_68 = reFunction(
                    '土地用途:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 面积
                MJ_70 = reFunction(
                    '宗地面积:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 出让年限
                CRNX_73 = reFunction(
                    '出让年限:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 备注
                BZ_75 = reFunction(
                    '备注:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;。,、\.﹪]*)\s*二',
                    items)

                # 写入数据
                if self.name in DUPLICATE_SWITCH_LIST:
                    if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                        self.duplicateUrl += 1

                if self.duplicateUrl < 50:
                    if True:
                        # 重复效验通过, 存储数据
                        csvFile = [
                            WJBT_45,
                            SJ_46,
                            LY_47,
                            ZWBT_48,
                            DKBH_49,
                            ZDBH_50,
                            PMJG_51,
                            GGZRFS_52,
                            GPSJ_53,
                            ZRR_54,
                            ZRF_55,
                            SRR_56,
                            SRF_57,
                            SRDW_58,
                            WZ_59,
                            DKWZ_60,
                            CRMJ_61,
                            YT_62,
                            CJJ_63,
                            BDCQDJH_64,
                            CRHTBH_65,
                            CRHT_66,
                            BGXYBH_67,
                            TDYT_68,
                            SYNX_69,
                            MJ_70,
                            TDMJ_71,
                            ZRJG_72,
                            CRNX_73,
                            TDSYNX_74,
                            BZ_75,
                            GSQ_76,
                            LXDW_77,
                            DWDZ_78,
                            YZBM_79,
                            LXDH_80,
                            LXR_81,
                            DZYJ_82,
                            crawlingTime,
                            url,
                            md5Mark,
                        ]
                        results = ''
                        for _ in csvFile:
                            try:
                                if _ and _ != '|' * len(_):
                                    results += _.replace(',', ' ').replace(
                                        '\n', '').replace('\t', '').replace(
                                            '\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                else:
                                    results += ','
                            except Exception as e:
                                results += ','
                                self.log(
                                    f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                    level=logging.ERROR)
                        with open(self.pathDetail, 'a+') as fp:
                            fp.write(results)
                            fp.write('\n')
                        self.log(f'数据获取成功', level=logging.INFO)
                        yield
                else:
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)
            elif '地块基本情况' in items:
                try:
                    if '备注' not in items:
                        tdData = htmlTable.tableTrTdRegulationToList(table)
                        for _ in range(len(list(tdData.values())[0])):
                            # 宗地编号
                            ZDBH_50 = tdData.get('宗地编号')[_] if tdData.get(
                                '宗地编号') else ''
                            # 受让单位
                            SRDW_58 = tdData.get('受让单位')[_] if tdData.get(
                                '受让单位') else ''
                            # 受让人
                            SRR_56 = tdData.get('竞得人')[_] if tdData.get(
                                '竞得人') else ''
                            # 地块位置
                            DKWZ_60 = tdData.get('地块位置')[_] if tdData.get(
                                '地块位置') else ''
                            # 土地用途
                            TDYT_68 = tdData.get('土地用途')[_] if tdData.get(
                                '土地用途') else ''
                            # 成交价(万元)
                            CJJ_63 = tdData.get('成交价(万元)')[_] if tdData.get(
                                '成交价(万元)') else ''
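                            # Land area: read from the table column headed '土地面积(亩)' (mu), not hectares.
                            # NOTE(review): the regex branches fill this same field from a '(公顷)' (hectare) label — confirm the intended unit.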
                            TDMJ_71 = tdData.get('土地面积(亩)')[_] if tdData.get(
                                '土地面积(亩)') else ''
                            # 出让年限
                            CRNX_73 = tdData.get('出让年限')[_] if tdData.get(
                                '出让年限') else ''

                            # 写入数据
                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(
                                        md5Mark):  # 存在, 去重计数
                                    self.duplicateUrl += 1

                            if self.duplicateUrl < 50:
                                if True:
                                    # 重复效验通过, 存储数据
                                    csvFile = [
                                        WJBT_45,
                                        SJ_46,
                                        LY_47,
                                        ZWBT_48,
                                        DKBH_49,
                                        ZDBH_50,
                                        PMJG_51,
                                        GGZRFS_52,
                                        GPSJ_53,
                                        ZRR_54,
                                        ZRF_55,
                                        SRR_56,
                                        SRF_57,
                                        SRDW_58,
                                        WZ_59,
                                        DKWZ_60,
                                        CRMJ_61,
                                        YT_62,
                                        CJJ_63,
                                        BDCQDJH_64,
                                        CRHTBH_65,
                                        CRHT_66,
                                        BGXYBH_67,
                                        TDYT_68,
                                        SYNX_69,
                                        MJ_70,
                                        TDMJ_71,
                                        ZRJG_72,
                                        CRNX_73,
                                        TDSYNX_74,
                                        BZ_75,
                                        GSQ_76,
                                        LXDW_77,
                                        DWDZ_78,
                                        YZBM_79,
                                        LXDH_80,
                                        LXR_81,
                                        DZYJ_82,
                                        crawlingTime,
                                        url,
                                        md5Mark,
                                    ]
                                    results = ''
                                    for _ in csvFile:
                                        try:
                                            if _ and _ != '|' * len(_):
                                                results += _.replace(
                                                    ',', ' '
                                                ).replace('\n', '').replace(
                                                    '\t', ''
                                                ).replace('\r', '').replace(
                                                    r'\xa0', '').replace(
                                                        '\xa0', '') + ','
                                            else:
                                                results += ','
                                        except Exception as e:
                                            results += ','
                                            self.log(
                                                f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                level=logging.ERROR)
                                    with open(self.pathDetail, 'a+') as fp:
                                        fp.write(results)
                                        fp.write('\n')
                                    self.log(f'数据获取成功', level=logging.INFO)
                                    yield
                            else:
                                self.crawler.engine.close_spider(
                                    self,
                                    'response msg info %s, job duplicated!' %
                                    response.url)
                    else:
                        if '竞得人' not in items:
                            for item in [
                                    '宗地编号' + _
                                    for _ in re.findall('一([\s\S]*)二、', items)
                                [0].split('宗地编号')[1:]
                            ]:
                                # 宗地编号
                                ZDBH_50 = reFunction('编号\s*([\w\-]*)\s', item)
                                # 受让单位
                                SRDW_58 = reFunction(
                                    '受让单位\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item)
                                # 地块位置
                                DKWZ_60 = reFunction(
                                    '地块位置\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item)
                                # 成交价(万元)
                                CJJ_63 = reFunction(
                                    '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item
                                ) if reFunction(
                                    '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item
                                ) else reFunction(
                                    '成交价(万元)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item)
                                # 土地用途
                                TDYT_68 = reFunction(
                                    '土地用途\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item)
                                # 土地面积(公顷)
                                TDMJ_71 = reFunction(
                                    '土地\s*面积\s*\(公顷\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item)
                                # 出让年限
                                CRNX_73 = reFunction(
                                    '出让年限\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item)
                                # 备注
                                BZ_75 = reFunction(
                                    '备注:\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                                    item)
                                if '二' in BZ_75:
                                    BZ_75 = ''
                                # 写入数据
                                if self.name in DUPLICATE_SWITCH_LIST:
                                    if self.redisClient.isExist(
                                            md5Mark):  # 存在, 去重计数
                                        self.duplicateUrl += 1

                                if self.duplicateUrl < 50:
                                    if True:
                                        # 重复效验通过, 存储数据
                                        csvFile = [
                                            WJBT_45,
                                            SJ_46,
                                            LY_47,
                                            ZWBT_48,
                                            DKBH_49,
                                            ZDBH_50,
                                            PMJG_51,
                                            GGZRFS_52,
                                            GPSJ_53,
                                            ZRR_54,
                                            ZRF_55,
                                            SRR_56,
                                            SRF_57,
                                            SRDW_58,
                                            WZ_59,
                                            DKWZ_60,
                                            CRMJ_61,
                                            YT_62,
                                            CJJ_63,
                                            BDCQDJH_64,
                                            CRHTBH_65,
                                            CRHT_66,
                                            BGXYBH_67,
                                            TDYT_68,
                                            SYNX_69,
                                            MJ_70,
                                            TDMJ_71,
                                            ZRJG_72,
                                            CRNX_73,
                                            TDSYNX_74,
                                            BZ_75,
                                            GSQ_76,
                                            LXDW_77,
                                            DWDZ_78,
                                            YZBM_79,
                                            LXDH_80,
                                            LXR_81,
                                            DZYJ_82,
                                            crawlingTime,
                                            url,
                                            md5Mark,
                                        ]
                                        results = ''
                                        for _ in csvFile:
                                            try:
                                                if _ and _ != '|' * len(_):
                                                    results += _.replace(
                                                        ',', ' '
                                                    ).replace('\n', '').replace(
                                                        '\t', '').replace(
                                                            '\r', '').replace(
                                                                r'\xa0',
                                                                '').replace(
                                                                    '\xa0',
                                                                    '') + ','
                                                else:
                                                    results += ','
                                            except Exception as e:
                                                results += ','
                                                self.log(
                                                    f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                    level=logging.ERROR)
                                        with open(self.pathDetail, 'a+') as fp:
                                            fp.write(results)
                                            fp.write('\n')
                                        self.log(f'数据获取成功', level=logging.INFO)
                                        yield
                                else:
                                    self.crawler.engine.close_spider(
                                        self,
                                        'response msg info %s, job duplicated!'
                                        % response.url)
                except Exception as e:
                    if '竞得人' not in items:
                        for item in [
                                '宗地编号' + _ for _ in re.findall(
                                    '一([\s\S]*)二、', items)[0].split('宗地编号')[1:]
                        ]:
                            # 宗地编号
                            ZDBH_50 = reFunction('编号\s*([\w\-]*)\s', item)
                            # 受让单位
                            SRDW_58 = reFunction(
                                '受让单位\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item)
                            # 地块位置
                            DKWZ_60 = reFunction(
                                '地块位置\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item)
                            # 成交价(万元)
                            CJJ_63 = reFunction(
                                '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item
                            ) if reFunction(
                                '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item
                            ) else reFunction(
                                '成交价(万元)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item)
                            # 土地用途
                            TDYT_68 = reFunction(
                                '土地用途\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item)
                            # 土地面积(公顷)
                            TDMJ_71 = reFunction(
                                '土地\s*面积\s*\(公顷\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item)
                            # 出让年限
                            CRNX_73 = reFunction(
                                '出让年限\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item)
                            # 备注
                            BZ_75 = reFunction(
                                '备注:\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                                item)
                            if '二' in BZ_75:
                                BZ_75 = ''
                            # 写入数据
                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(
                                        md5Mark):  # 存在, 去重计数
                                    self.duplicateUrl += 1

                            if self.duplicateUrl < 50:
                                if True:
                                    # 重复效验通过, 存储数据
                                    csvFile = [
                                        WJBT_45,
                                        SJ_46,
                                        LY_47,
                                        ZWBT_48,
                                        DKBH_49,
                                        ZDBH_50,
                                        PMJG_51,
                                        GGZRFS_52,
                                        GPSJ_53,
                                        ZRR_54,
                                        ZRF_55,
                                        SRR_56,
                                        SRF_57,
                                        SRDW_58,
                                        WZ_59,
                                        DKWZ_60,
                                        CRMJ_61,
                                        YT_62,
                                        CJJ_63,
                                        BDCQDJH_64,
                                        CRHTBH_65,
                                        CRHT_66,
                                        BGXYBH_67,
                                        TDYT_68,
                                        SYNX_69,
                                        MJ_70,
                                        TDMJ_71,
                                        ZRJG_72,
                                        CRNX_73,
                                        TDSYNX_74,
                                        BZ_75,
                                        GSQ_76,
                                        LXDW_77,
                                        DWDZ_78,
                                        YZBM_79,
                                        LXDH_80,
                                        LXR_81,
                                        DZYJ_82,
                                        crawlingTime,
                                        url,
                                        md5Mark,
                                    ]
                                    results = ''
                                    for _ in csvFile:
                                        try:
                                            if _ and _ != '|' * len(_):
                                                results += _.replace(
                                                    ',', ' '
                                                ).replace('\n', '').replace(
                                                    '\t', ''
                                                ).replace('\r', '').replace(
                                                    r'\xa0', '').replace(
                                                        '\xa0', '') + ','
                                            else:
                                                results += ','
                                        except Exception as e:
                                            results += ','
                                            self.log(
                                                f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                level=logging.ERROR)
                                    with open(self.pathDetail, 'a+') as fp:
                                        fp.write(results)
                                        fp.write('\n')
                                    self.log(f'数据获取成功', level=logging.INFO)
                                    yield
                            else:
                                self.crawler.engine.close_spider(
                                    self,
                                    'response msg info %s, job duplicated!' %
                                    response.url)

        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
Ejemplo n.º 15
0
    def parse_detail(self, response):
        """Parse one detail page and append its record(s) to the CSV at ``self.pathDetail``.

        Two page layouts are handled:

        * If the flattened page text contains neither '宗地编号' nor '行政区',
          the first ``<table>`` is converted to a header -> column mapping and
          one CSV line is written per table row.
        * Otherwise every field is extracted from the flattened text with
          regular expressions (plus a few fixed-position XPath lookups) and a
          single CSV line is written.

        When ``DUPLICATE_SWITCH`` is truthy, every record whose md5 mark is
        reported as existing by ``self.redisClient`` increments
        ``self.duplicateUrl``; once that counter reaches 50 the spider is
        asked to close instead of storing further records.

        This is a generator: it yields ``None`` once for every line written.
        Parsing failures are logged, never raised.

        :param response: the downloaded detail page; ``response.meta['title']``
            supplies the title column.
        """
        try:
            data = Selector(text=response.body.decode('utf-8'))
            # Flattened page text with non-breaking and ideographic spaces
            # removed; all regex extraction below runs against this string.
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')

            # Every output column defaults to '' so that a layout which does
            # not provide a field still emits an empty CSV cell for it.
            BT_18 = ''
            LY_19 = ''
            SJ_20 = ''
            XZQ_21 = ''
            DZJGH_22 = ''
            XMMC_23 = ''
            XMWZ_24 = ''
            MJ_25 = ''
            TDLY_26 = ''
            TSYT_27 = ''
            GDFS_28 = ''
            TDSYNX_29 = ''
            HYFL_30 = ''
            TDJB_31 = ''
            CJJG_32 = ''
            ZFQH_33 = ''
            YDZFRQ_34 = ''
            YDZFJE_35 = ''
            BZ_36 = ''
            TDSTQR_37 = ''
            SX_38 = ''
            XX_39 = ''
            YDJDSJ_40 = ''
            YDKGSJ_41 = ''
            YDJGSJ_42 = ''
            SJKGSJ_43 = ''
            SJJGSJ_44 = ''
            PZDW_45 = ''
            HTQDRQ_46 = ''

            # Fields shared by both page layouts.
            # Title
            BT_18 = response.meta.get('title')
            LY = data.xpath(
                '//div[@class="content-small-title"]/text()').extract_first()
            # Source
            LY_19 = reFunction(rf'来源:\s*([{self.reStr}]*)\s', LY)
            # Time
            SJ_20 = reFunction(rf'时间:\s*([{self.reStr}]*)\s', LY)

            htmlTable = htmlTableTransformer()
            if '宗地编号' not in items and '行政区' not in items:
                # Table layout: one CSV line per table row.
                try:
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find_all('table')[0]
                    first_row = table.tbody.find_all('tr')[0]
                    # Drop the first row when it does not hold the column
                    # headers (it contains neither '用地单位' nor '受让人').
                    if not first_row.find_all(
                            text=re.compile("用地单位|受让人")):
                        first_row.extract()
                    tdsData = htmlTable.tableTrTdRegulationToList(table)

                    def first_cell(row, *headers):
                        """Return cell ``row`` of the first non-empty column among ``headers``, else ''."""
                        for header in headers:
                            column = tdsData.get(header)
                            if column:
                                return column[row]
                        return ''

                    # The length of the first column gives the row count.
                    for row in range(len(list(tdsData.values())[0])):
                        # Project location
                        XMWZ_24 = first_cell(row, '土地座落', '宗地位置')
                        # Area
                        MJ_25 = first_cell(row, '出让面积(公顷)', '出让面积',
                                           '出让/划拨面积')
                        # Land use
                        TSYT_27 = first_cell(row, '土地用途', '用途明细')
                        # Land supply method
                        GDFS_28 = first_cell(row, '供应方式')
                        # Land grade
                        TDJB_31 = first_cell(row, '土地级别')
                        # Transaction price
                        CJJG_32 = first_cell(row, '出让价款', '出让价款(万元)',
                                             '出让/划拨价歀')
                        # Land use right holder
                        TDSTQR_37 = first_cell(row, '用地单位', '受让人')
                        # Contract signing date
                        HTQDRQ_46 = first_cell(row, '签订日期')

                        # Crawl date
                        crawlingTime = time.strftime("%Y-%m-%d",
                                                     time.localtime())
                        # Crawled URL
                        url = response.url
                        # Unique mark; a missing source/time must not abort
                        # the record, so fall back to '' for either part.
                        md5Mark = encrypt_md5(url + (LY_19 or '') +
                                              (SJ_20 or ''))

                        # Count the record as a duplicate when its mark is
                        # already known.
                        if DUPLICATE_SWITCH:
                            if self.redisClient.isExist(md5Mark):
                                self.duplicateUrl += 1

                        if self.duplicateUrl < 50:
                            # Below the duplicate threshold: store the record.
                            csvFile = [
                                BT_18,
                                LY_19,
                                SJ_20,
                                XZQ_21,
                                DZJGH_22,
                                XMMC_23,
                                XMWZ_24,
                                MJ_25,
                                TDLY_26,
                                TSYT_27,
                                GDFS_28,
                                TDSYNX_29,
                                HYFL_30,
                                TDJB_31,
                                CJJG_32,
                                ZFQH_33,
                                YDZFRQ_34,
                                YDZFJE_35,
                                BZ_36,
                                TDSTQR_37,
                                SX_38,
                                XX_39,
                                YDJDSJ_40,
                                YDKGSJ_41,
                                YDJGSJ_42,
                                SJKGSJ_43,
                                SJJGSJ_44,
                                PZDW_45,
                                HTQDRQ_46,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    # Empty values and values made only of
                                    # '|' characters become an empty cell.
                                    if _ and _ != '|' * len(_):
                                        # Strip characters that would break
                                        # the one-line comma-separated format.
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    # A value that cannot be cleaned still
                                    # occupies its column.
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log('数据获取成功', level=logging.INFO)
                            yield
                        else:
                            self.crawler.engine.close_spider(
                                self, 'response msg info %s, job duplicated!' %
                                response.url)
                except Exception as e:
                    # Table parsing is best-effort, but a failure must leave
                    # a trace instead of silently dropping the page.
                    self.log(
                        f'表格解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            else:
                # Regex layout: each value is the text between its label and
                # the label of the field that follows it on the page.
                # Administrative district
                XZQ_21 = reFunction(f'行政区:([{self.reStr}]*)电子监管号', items)
                # Electronic supervision number
                DZJGH_22 = reFunction(f'电子监管号:([{self.reStr}]*)项目名称', items)
                # Project name
                XMMC_23 = reFunction(
                    f'项目名称:([{self.reStr}]*)项目位置', items) or reFunction(
                        f'宗地编号([{self.reStr}]*)地块位置', items)
                # Project location. The parentheses of the trailing label are
                # escaped so they match literally rather than forming a group.
                XMWZ_24 = reFunction(
                    rf'项目位置:([{self.reStr}]*)面积\(公顷\):',
                    items) or reFunction(f'地块位置([{self.reStr}]*)土地用途', items)
                # Area
                MJ_25 = reFunction(
                    rf'面积\(公顷\):([{self.reStr}]*)土地来源',
                    items) or reFunction(
                        rf'土地面积\(公顷\)([{self.reStr}]*)出让年限', items)
                # Land source
                TDLY_26 = reFunction(f'土地来源:([{self.reStr}]*)土地用途', items)
                # Land use
                TSYT_27 = reFunction(
                    f'土地用途:([{self.reStr}]*)供地方式', items) or data.xpath(
                        'string(//table/tbody/tr[5]/td[1])').extract_first()
                # Land supply method
                GDFS_28 = reFunction(f'供地方式:([{self.reStr}]*)土地使用年限', items)
                # Land use term
                TDSYNX_29 = reFunction(
                    f'土地使用年限:([{self.reStr}]*)行业分类', items) or reFunction(
                        rf'出让年限([{self.reStr}]*)成交价\(万元\)', items)
                # Industry classification
                HYFL_30 = reFunction(f'行业分类:([{self.reStr}]*)土地级别', items)
                # Land grade
                TDJB_31 = reFunction(
                    rf'土地级别:([{self.reStr}]*)成交价格\(万元\)', items)
                # Transaction price
                CJJG_32 = reFunction(
                    rf'成交价格\(万元\):([{self.reStr}]*)分期支付约定',
                    items) or reFunction(
                        rf'成交价格\(万元\)([{self.reStr}]*)明细用途', items)
                # Instalment agreement - payment period number
                ZFQH_33 = data.xpath(
                    '//table/tbody/tr[10]/td[1]/text()').extract_first()
                # Instalment agreement - agreed payment date
                YDZFRQ_34 = data.xpath(
                    '//table/tbody/tr[10]/td[2]/text()').extract_first()
                # Instalment agreement - agreed payment amount
                YDZFJE_35 = data.xpath(
                    '//table/tbody/tr[10]/td[3]/text()').extract_first()
                # Instalment agreement - remarks
                BZ_36 = data.xpath(
                    'string(//table/tbody/tr[10]/td[4])').extract_first()
                # Land use right holder
                TDSTQR_37 = reFunction(
                    f'土地使用权人:([{self.reStr}]*)约定容积率', items) or reFunction(
                        f'受让单位([{self.reStr}]*)备注', items)
                # Agreed plot ratio - lower limit
                # NOTE(review): the SX_/XX_ prefixes look swapped relative to
                # the labels they capture; confirm against the CSV header.
                SX_38 = reFunction(f'下限:([{self.reStr}]*)上限', items)
                # Agreed plot ratio - upper limit
                XX_39 = reFunction(f'上限:([{self.reStr}]*)约定交地时间', items)
                # Agreed land delivery time
                YDJDSJ_40 = reFunction(f'约定交地时间:([{self.reStr}]*)约定开工时间',
                                       items)
                # Agreed construction start time
                YDKGSJ_41 = reFunction(f'约定开工时间:([{self.reStr}]*)约定竣工时间',
                                       items)
                # Agreed completion time
                YDJGSJ_42 = reFunction(f'约定竣工时间:([{self.reStr}]*)实际开工时间',
                                       items)
                # Actual construction start time
                SJKGSJ_43 = reFunction(f'实际开工时间:([{self.reStr}]*)实际竣工时间',
                                       items)
                # Actual completion time
                SJJGSJ_44 = reFunction(f'实际竣工时间:([{self.reStr}]*)批准单位', items)
                # Approving authority
                PZDW_45 = reFunction(f'批准单位:([{self.reStr}]*)合同签订日期', items)
                # Contract signing date
                HTQDRQ_46 = reFunction(rf'合同签订日期:([{self.reStr}]*)\s', items)

                # Crawl date
                crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
                # Crawled URL
                url = response.url
                # Unique mark; a missing source/time must not abort the
                # record, so fall back to '' for either part.
                md5Mark = encrypt_md5(url + (LY_19 or '') + (SJ_20 or ''))

                # Count the record as a duplicate when its mark is already
                # known.
                if DUPLICATE_SWITCH:
                    if self.redisClient.isExist(md5Mark):
                        self.duplicateUrl += 1

                if self.duplicateUrl < 50:
                    # Below the duplicate threshold: store the record.
                    csvFile = [
                        BT_18,
                        LY_19,
                        SJ_20,
                        XZQ_21,
                        DZJGH_22,
                        XMMC_23,
                        XMWZ_24,
                        MJ_25,
                        TDLY_26,
                        TSYT_27,
                        GDFS_28,
                        TDSYNX_29,
                        HYFL_30,
                        TDJB_31,
                        CJJG_32,
                        ZFQH_33,
                        YDZFRQ_34,
                        YDZFJE_35,
                        BZ_36,
                        TDSTQR_37,
                        SX_38,
                        XX_39,
                        YDJDSJ_40,
                        YDKGSJ_41,
                        YDJGSJ_42,
                        SJKGSJ_43,
                        SJJGSJ_44,
                        PZDW_45,
                        HTQDRQ_46,
                        crawlingTime,
                        url,
                        md5Mark,
                    ]
                    results = ''
                    for _ in csvFile:
                        try:
                            # Empty values and values made only of '|'
                            # characters become an empty cell.
                            if _ and _ != '|' * len(_):
                                # Strip characters that would break the
                                # one-line comma-separated format.
                                results += _.replace(',', ' ').replace(
                                    '\n', '').replace('\r', '').replace(
                                        r'\xa0', '').replace('\xa0', '') + ','
                            else:
                                results += ','
                        except Exception as e:
                            # A value that cannot be cleaned still occupies
                            # its column.
                            results += ','
                            self.log(
                                f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                level=logging.ERROR)
                    with open(self.pathDetail, 'a+') as fp:
                        fp.write(results)
                        fp.write('\n')
                    self.log('数据获取成功', level=logging.INFO)
                    yield
                else:
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)

        except Exception as e:
            # Top-level boundary of the callback: report the failing URL and
            # keep the crawl running.
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)