コード例 #1
0
    def parse_detail(self, response):
        """Parse a land-transfer announcement page and append one CSV row.

        The title comes from ``response.meta['title']``; source and update
        time come from the ``div.news_time`` spans. The remaining fields are
        extracted either from the first HTML table (via
        ``htmlTableTransformer``) or by regex over the flattened page text,
        depending on which keywords the page text contains. Later branches
        overwrite values set by earlier ones, so branch order matters.

        The row is appended to ``self.pathDetail`` as comma-separated text.
        Any exception raised while parsing is caught, the URL is printed and
        the error is logged; nothing is written in that case.

        :param response: the downloaded detail page; ``response.meta`` is
            expected to carry ``title``.
        :return: generator that yields a single ``None`` on success.
        """
        try:
            data = Selector(text=response.body.decode('utf-8'))
            # Whole-page text with NBSP and ideographic spaces stripped; used
            # for keyword checks and regex extraction below.
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            WJBT_1 = ''
            WZLY_2 = ''
            GXSJ_3 = ''
            ZDBH_4 = ''
            ZDZL_5 = ''
            MJ_6 = ''
            TDYT_7 = ''
            CRNX_8 = ''
            RJL_9 = ''
            LDL_10 = ''
            JZMD_11 = ''
            JZXG_12 = ''
            JMBZJ_13 = ''
            QSJ_14 = ''
            ZJFD_15 = ''
            CRR_16 = ''
            QTSM_17 = ''

            # TODO: common fields
            # Document title
            WJBT_1 = response.meta.get('title')
            # Article source. extract_first() returns None when the span is
            # missing, so .replace() raises and the outer handler logs it.
            WZLY_2 = data.xpath('//div[@class="news_time"]/span[1]/text()'
                                ).extract_first().replace('文章来自:', '')
            # Update time
            GXSJ_3 = data.xpath('//div[@class="news_time"]/span[2]/text()'
                                ).extract_first().replace('更新时间:', '')
            # Remarks
            QTSM_17 = reFunction(rf'备注(?:[\s]*)([{self.reStr}]*)\s', items)

            # TODO //table[@border="1"]   //table[@border="0"]
            # Table parsing
            if '宗地编号' not in items and '配套建筑规划用地' not in items:
                if data.xpath(
                        '//table[@border="0"]') and '主要规划指标' not in items:
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find('table')
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.tableTrTdRegulation(table)
                    # Parcel number
                    ZDBH_4 = tdData.get('地块编号')
                    # Parcel location
                    ZDZL_5 = tdData.get('土地位置')
                    # Area
                    MJ_6 = tdData.get('土地面积(平方米)')
                    # Land use
                    TDYT_7 = tdData.get('土地用途')
                    # Transfer term
                    CRNX_8 = tdData.get('出让年限(年)') if tdData.get(
                        '出让年限(年)') else tdData.get('出让年限')
                    # Plot ratio
                    RJL_9 = tdData.get('容积率') if tdData.get(
                        '容积率') else tdData.get('容积率(不大于)')
                    # Green-space ratio
                    LDL_10 = tdData.get('绿地率') if tdData.get(
                        '绿地率') else tdData.get('绿地率(不小于)')
                    # Building density
                    JZMD_11 = tdData.get('建筑密度')
                    # Building height limit
                    JZXG_12 = tdData.get('建筑高度')
                    # Bid deposit
                    JMBZJ_13 = tdData.get('竞买保证金(万元)') if tdData.get(
                        '竞买保证金(万元)') else tdData.get('竞买保证金(元)')
                    # Starting price
                    QSJ_14 = tdData.get('起始价(万元)')
                    # Bid increment
                    ZJFD_15 = tdData.get('增价幅度(万元)') if tdData.get(
                        '增价幅度(万元)') else tdData.get('加价幅度')
                if '规划指标要求' in items:
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find('table')
                    # Header cell spanning 4 (or else 3) columns in row 0.
                    tdReplace_ = table.tbody.find_all('tr')[0].find(
                        'td', colspan='4')
                    tdReplace = tdReplace_ if tdReplace_ else table.tbody.find_all(
                        'tr')[0].find('td', colspan='3')
                    try:
                        # Best effort: splice the cells of row 1 into row 0
                        # at the spanning cell's position, then drop the
                        # spanning cell and row 1.
                        number = table.tbody.find_all('tr')[0].index(tdReplace)
                        tdList = table.tbody.find_all('tr')[1].find_all('td')
                        for _ in range(1, len(tdList) + 1):
                            table.tbody.find_all('tr')[0].insert(
                                number + _, tdList[_ - 1])
                        tdReplace.extract()
                        table.tbody.find_all('tr')[1].extract()
                    except Exception as e:
                        # Still best effort (the table is parsed as-is), but
                        # no longer swallows KeyboardInterrupt/SystemExit and
                        # leaves a trace of why the merge was skipped.
                        self.log(f'表头合并跳过, 错误: {e}',
                                 level=logging.DEBUG)
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.tableTrTdRegulation(table)
                    # Parcel number
                    ZDBH_4 = tdData.get('地块编号')
                    # Parcel location, with the remarks text removed from it.
                    ZDZL_5_ = tdData.get('土地位置') if tdData.get(
                        '土地位置') else tdData.get('地块位置/名称')
                    ZDZL_5 = ZDZL_5_.replace(
                        reFunction(rf'备注(?:[\s]*)([{self.reStr}]*)\s',
                                   reFunction(r'一([\s\S]*)二', items)), '')
                    # Area
                    MJ_6 = tdData.get('土地面积(m2)') if tdData.get(
                        '土地面积(m2)') else tdData.get('土地面积(平方米)')
                    # Land use
                    TDYT_7 = tdData.get('土地用途') if tdData.get(
                        '土地用途') else tdData.get('规划地性质')
                    # Transfer term
                    # NOTE(review): this key is a raw string, i.e. a literal
                    # backslash-u3000, unlike the '建筑\u3000密度' keys below
                    # which contain the real U+3000 character — confirm which
                    # form htmlTableTransformer actually produces.
                    CRNX_8_ = tdData.get(r'出让\u3000年限') if tdData.get(
                        r'出让\u3000年限') else tdData.get('出让年限')
                    CRNX_8 = CRNX_8_ if CRNX_8_ else tdData.get('出让年限(年)')
                    # Plot ratio
                    RJL_9 = tdData.get('容积率') if tdData.get(
                        '容积率') else tdData.get('容积率(不大于)')
                    # Green-space ratio
                    LDL_10_ = tdData.get('绿地率') if tdData.get(
                        '绿地率') else tdData.get('绿地率(%)')
                    LDL_10 = LDL_10_ if LDL_10_ else tdData.get('绿地率(不小于)')
                    # Building density
                    JZMD_11_ = tdData.get('建筑\u3000密度') if tdData.get(
                        '建筑\u3000密度') else tdData.get('建筑密度')
                    JZMD_11__ = JZMD_11_ if JZMD_11_ else tdData.get('建筑密度(%)')
                    JZMD_11 = JZMD_11__ if JZMD_11__ else tdData.get(
                        '建筑\u3000密度(不大于)')
                    # Building height limit
                    JZXG_12_ = tdData.get('建筑限高') if tdData.get(
                        '建筑限高') else tdData.get('建筑高度(m)')
                    JZXG_12__ = JZXG_12_ if JZXG_12_ else tdData.get('建筑高度')
                    JZXG_12 = JZXG_12__ if JZXG_12__ else tdData.get(
                        '建筑限高(不高于)')
                    # Bid deposit
                    JMBZJ_13 = tdData.get('竞买保证金(元)') if tdData.get(
                        '竞买保证金(元)') else tdData.get('竞买保证金(万元)')
                    # Starting price
                    QSJ_14_ = tdData.get('起始价(元)') if tdData.get(
                        '起始价(元)') else tdData.get('挂牌出让起始价(元)')
                    QSJ_14 = QSJ_14_ if QSJ_14_ else tdData.get('起始价(万元)')
                    # Bid increment
                    ZJFD_15 = tdData.get('增价幅度(万元)') if tdData.get(
                        '增价幅度(万元)') else tdData.get('加价幅度')
                    if ZJFD_15 == '' and QSJ_14 == '' and JMBZJ_13 == '':
                        # Price columns came back empty: re-parse the page and
                        # append the cells of rows 2 and 3 after the last cell
                        # of rows 0 and 1 respectively, then remove row 2.
                        soup = BeautifulSoup(response.body.decode('utf-8'))
                        table = soup.find('table')
                        tdReplace0 = table.tbody.find_all('tr')[0].find_all(
                            'td')[-1]  # last cell of row 0
                        tdReplace1 = table.tbody.find_all('tr')[1].find_all(
                            'td')[-1]  # last cell of row 1
                        number0 = table.tbody.find_all('tr')[0].index(
                            tdReplace0)  # its index within row 0
                        number1 = table.tbody.find_all('tr')[1].index(
                            tdReplace1)  # its index within row 1
                        tdList2 = table.tbody.find_all('tr')[2].find_all(
                            'td')  # cells of row 2
                        tdList3 = table.tbody.find_all('tr')[3].find_all(
                            'td')  # cells of row 3
                        for _ in range(1, len(tdList2) + 1):
                            table.tbody.find_all('tr')[0].insert(
                                number0 + _, tdList2[_ - 1])
                        for _ in range(1, len(tdList3) + 1):
                            table.tbody.find_all('tr')[1].insert(
                                number1 + _, tdList3[_ - 1])
                        table.tbody.find_all('tr')[2].extract()

                        htmlTable = htmlTableTransformer()
                        tdDataCopy = htmlTable.tableTrTdRegulation(table)
                        # Bid deposit
                        JMBZJ_13 = tdDataCopy.get(
                            '竞买保证金(元)') if tdDataCopy.get(
                                '竞买保证金(元)') else tdDataCopy.get('竞买保证金(万元)')
                        # Starting price
                        QSJ_14_ = tdDataCopy.get('起始价(元)') if tdDataCopy.get(
                            '起始价(元)') else tdDataCopy.get('挂牌出让起始价(元)')
                        QSJ_14 = QSJ_14_ if QSJ_14_ else tdDataCopy.get(
                            '起始价(万元)')
                        # Bid increment
                        ZJFD_15 = tdDataCopy.get('增价幅度(万元)') if tdDataCopy.get(
                            '增价幅度(万元)') else tdDataCopy.get('加价幅度')
                    # Transferor (CRR_16) is not extracted in this branch.
                if '标的序号' in items:
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find('table', border='0')
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.table_tr_td(table)
                    # Parcel location
                    ZDZL_5 = tdData.get('标的位置')
                    # Area
                    MJ_6 = tdData.get('土地面积') if tdData.get(
                        '土地面积') else tdData.get('土地面积(平方米)')
                    # Starting price
                    QSJ_14_ = tdData.get('起始价(元)') if tdData.get(
                        '起始价(元)') else tdData.get('拍卖参考价(万元)')
                    QSJ_14 = QSJ_14_ if QSJ_14_ else tdData.get('起始价(万元)')
                    # Transfer term
                    CRNX_8 = tdData.get('土地性质(年限)') if tdData.get(
                        '土地性质(年限)') else tdData.get('出让年限(年)')
            else:
                if '宗地编号' in items:
                    # Free-text layout: split the section between "一" and
                    # "二" into one chunk per parcel and accumulate each
                    # field as a '|'-prefixed list.
                    for item in [
                            '宗地编号' + _ for _ in re.findall(
                                r'一([\s\S]*)二', items)[0].split('宗地编号')[1:]
                    ]:
                        # Parcel number
                        ZDBH_4 += '|' + reFunction(
                            rf'宗地编号:(?:[\s]*)([{self.reStr}]*)\s', item)
                        # Parcel location
                        ZDZL_5 += '|' + reFunction(
                            rf'宗地坐落:(?:[\s]*)([{self.reStr}]*)\s', item)
                        # Area
                        MJ_6 += '|' + reFunction(
                            rf'宗地面积:(?:[\s]*)([{self.reStr}]*)\s', item)

                        # Transfer term
                        CRNX_8 += '|' + reFunction(
                            rf'出让年限:(?:[\s]*)([{self.reStr}]*)\s', item)
                        # Plot ratio
                        RJL_9 += '|' + reFunction(
                            rf'容积率:(?:[\s]*)([{self.reStr}]*)\s', item)
                        # Green-space ratio
                        LDL_10 += '|' + reFunction(
                            rf'绿地率\(%\):(?:[\s]*)([{self.reStr}]*)\s', item)
                        # Building density
                        JZMD_11 += '|' + reFunction(
                            rf'建筑密度\(%\):(?:[\s]*)([{self.reStr}]*)\s', item)
                        # Building height limit
                        JZXG_12 += '|' + reFunction(
                            rf'建筑限高\(米\):(?:[\s]*)([{self.reStr}]*)\s', item)
                        # Bid deposit
                        JMBZJ_13 += '|' + reFunction(
                            rf'保证金:(?:[\s]*)([{self.reStr}]*)\s', item)
                        # Starting price
                        QSJ_14 += '|' + reFunction(
                            rf'起始价:(?:[\s]*)([{self.reStr}]*)\s', item)
                        # Bid increment
                        ZJFD_15 += '|' + reFunction(
                            rf'加价幅度:(?:[\s]*)([{self.reStr}]*)\s', item)
                        # Transferor (CRR_16) is not extracted here.
                        # Other remarks
                        QTSM_17 += '|' + reFunction(
                            rf'备注:(?:[\s]*)([{self.reStr}]*)\s', item)
                if '配套建筑规划用地' in items:
                    # Append the cells of rows 2 and 3 after the last cell of
                    # rows 0 and 1 respectively, then remove row 2, before
                    # handing the table to the transformer.
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find('table')
                    tdReplace0 = table.tbody.find_all('tr')[0].find_all('td')[
                        -1]  # last cell of row 0
                    tdReplace1 = table.tbody.find_all('tr')[1].find_all('td')[
                        -1]  # last cell of row 1
                    number0 = table.tbody.find_all('tr')[0].index(
                        tdReplace0)  # its index within row 0
                    number1 = table.tbody.find_all('tr')[1].index(
                        tdReplace1)  # its index within row 1
                    tdList2 = table.tbody.find_all('tr')[2].find_all(
                        'td')  # cells of row 2
                    tdList3 = table.tbody.find_all('tr')[3].find_all(
                        'td')  # cells of row 3
                    for _ in range(1, len(tdList2) + 1):
                        table.tbody.find_all('tr')[0].insert(
                            number0 + _, tdList2[_ - 1])
                    for _ in range(1, len(tdList3) + 1):
                        table.tbody.find_all('tr')[1].insert(
                            number1 + _, tdList3[_ - 1])
                    table.tbody.find_all('tr')[2].extract()
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.tableTrTdRegulation(table)
                    # Parcel number
                    ZDBH_4 = tdData.get('地块编号')
                    # Parcel location
                    ZDZL_5 = tdData.get('地块位置/名称')
                    # Area
                    MJ_6 = tdData.get('配套设施出让面积(m2)') if tdData.get(
                        '配套设施出让面积(m2)') else tdData.get('土地面积(平方米)')
                    # Land use
                    TDYT_7 = tdData.get('配套建筑规划用地性质')
                    # Transfer term
                    CRNX_8 = tdData.get('出让年限') if tdData.get(
                        '出让年限') else tdData.get('出让年限(年)')
                    # Plot ratio
                    RJL_9 = tdData.get('容积率') if tdData.get(
                        '容积率') else tdData.get('容积率(不大于)')
                    # Green-space ratio
                    LDL_10 = tdData.get('公园整体绿地率(%)') if tdData.get(
                        '公园整体绿地率(%)') else tdData.get('绿地率(不小于)')
                    # Building density
                    JZMD_11 = tdData.get('公园整体建筑密度(%)')
                    # Building height limit
                    JZXG_12_ = tdData.get('建筑限高') if tdData.get(
                        '建筑限高') else tdData.get('建筑高度(m)')
                    JZXG_12 = JZXG_12_ if JZXG_12_ else tdData.get('建筑高度')
                    # Bid deposit
                    JMBZJ_13 = tdData.get('竞买保证金(元)') if tdData.get(
                        '竞买保证金(元)') else tdData.get('竞买保证金(万元)')
                    # Starting price
                    QSJ_14_ = tdData.get('起始价(元)') if tdData.get(
                        '起始价(元)') else tdData.get('配套设施用地挂牌出让起始价(元)')
                    QSJ_14 = QSJ_14_ if QSJ_14_ else tdData.get('起始价(万元)')
                    # Bid increment
                    ZJFD_15 = tdData.get('增价幅度(万元)') if tdData.get(
                        '增价幅度(万元)') else tdData.get('加价幅度')

            # Crawl timestamp
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # Crawled URL
            url = response.url
            # Unique identifier
            md5Mark = encrypt_md5(url)

            # Row to store (CRR_16 is intentionally not part of the row)
            csvFile = [
                WJBT_1,
                WZLY_2,
                GXSJ_3,
                ZDBH_4,
                ZDZL_5,
                MJ_6,
                TDYT_7,
                CRNX_8,
                RJL_9,
                LDL_10,
                JZMD_11,
                JZXG_12,
                JMBZJ_13,
                QSJ_14,
                ZJFD_15,
                QTSM_17,
                crawlingTime,
                url,
                md5Mark,
            ]
            results = ''
            # The loop variable stays named `_` because getVariableName(_)
            # is an external helper whose lookup may depend on local names.
            for _ in csvFile:
                try:
                    # Skip empty values and values made only of '|' (the
                    # accumulator prefix with nothing matched).
                    if _ and _ != '|' * len(_):
                        # Commas and line breaks would break the CSV row; both
                        # the literal text '\xa0' and the real NBSP are removed.
                        results += _.replace(',', ' ').replace(
                            '\n', '').replace('\r', '').replace(
                                r'\xa0', '').replace('\xa0', '') + ','
                    else:
                        results += ','
                except Exception as e:
                    results += ','
                    self.log(
                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            # NOTE(review): no explicit encoding, so the platform default is
            # used for the CSV — confirm this matches the existing files.
            with open(self.pathDetail, 'a+') as fp:
                fp.write(results)
                fp.write('\n')
            self.log('数据获取成功', level=logging.INFO)
            yield
        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
コード例 #2
0
    def parse_detail(self, response):
        """Parse a land-listing detail page and append one CSV row.

        The title comes from ``response.meta['title']``; source and time are
        regex-extracted from the ``content-small-title`` div; number,
        location, area, planned land use and transfer term are the text of
        cells 1-5 in the second row of the first table.

        When ``DUPLICATE_SWITCH`` is truthy and ``self.redisClient`` already
        knows the row's MD5 mark, ``self.duplicateUrl`` is incremented. While
        that counter is below 50 the row is appended to ``self.pathDetail``;
        once it reaches 50 the spider is closed instead.

        Any exception raised while parsing is caught, the URL is printed and
        the error is logged.

        :param response: the downloaded detail page; ``response.meta`` is
            expected to carry ``title``.
        :return: generator that yields a single ``None`` when a row is
            written.
        """
        try:
            data = Selector(text=response.body.decode('utf-8'))
            # TODO: common fields
            # Title
            BT_10 = response.meta.get('title')
            LY = data.xpath(
                '//div[@class="content-small-title"]/text()').extract_first()
            # Source
            LY_11 = reFunction(rf'来源:\s*([{self.reStr}]*)\s', LY)
            # Time
            SJ_12 = reFunction(rf'时间:\s*([{self.reStr}]*)\s', LY)
            # data.xpath("string(path)") returns the concatenated text of the
            # node selected by `path`; here each path points at a table cell.
            # Number
            BH_13 = ''.join(
                data.xpath("string(//table[1]/tbody/tr[2]/td[1])").extract())
            # Land location
            TDWZ_14 = ''.join(
                data.xpath("string(//table[1]/tbody/tr[2]/td[2])").extract())
            # Area of the use right
            SYQMJ_15 = ''.join(
                data.xpath("string(//table[1]/tbody/tr[2]/td[3])").extract())
            # TODO: planned land-use type
            GHYDXZ_16 = ''.join(
                data.xpath("string(//table[1]/tbody/tr[2]/td[4])").extract())
            # Transfer term
            CRNX_17 = ''.join(
                data.xpath("string(//table[1]/tbody/tr[2]/td[5])").extract())
            # Crawl date
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # Crawled URL
            url = response.url
            # Unique identifier
            md5Mark = encrypt_md5(url + BT_10 + SJ_12)

            # Count already-seen rows when duplicate checking is enabled.
            if DUPLICATE_SWITCH and self.redisClient.isExist(md5Mark):
                self.duplicateUrl += 1

            if self.duplicateUrl < 50:
                # Duplicate threshold not reached: store the row.
                csvFile = [
                    BT_10,
                    LY_11,
                    SJ_12,
                    BH_13,
                    TDWZ_14,
                    SYQMJ_15,
                    GHYDXZ_16,
                    CRNX_17,
                    crawlingTime,
                    url,
                    md5Mark,
                ]
                results = ''
                # The loop variable stays named `_` because getVariableName(_)
                # is an external helper whose lookup may depend on local names.
                for _ in csvFile:
                    try:
                        # Skip empty values and values made only of '|'.
                        if _ and _ != '|' * len(_):
                            # Commas and line breaks would break the CSV row;
                            # both the literal text '\xa0' and the real NBSP
                            # are removed.
                            results += _.replace(',', ' ').replace(
                                '\n', '').replace('\r', '').replace(
                                    r'\xa0', '').replace('\xa0', '') + ','
                        else:
                            results += ','
                    except Exception as e:
                        results += ','
                        self.log(
                            f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                            level=logging.ERROR)
                # NOTE(review): no explicit encoding, so the platform default
                # is used for the CSV — confirm this matches existing files.
                with open(self.pathDetail, 'a+') as fp:
                    fp.write(results)
                    fp.write('\n')
                self.log('数据获取成功', level=logging.INFO)
                yield
            else:
                # Too many duplicates in this run: stop the spider.
                self.crawler.engine.close_spider(
                    self,
                    'response msg info %s, job duplicated!' % response.url)
        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
コード例 #3
0
    def parse_detail(self, response):
        try:
            # 数据获取不全
            categorynum = response.meta.get('categorynum')
            infoid = response.meta.get('infoid')
            targetUrl = "https://www.cqggzy.com/tiaozhuan.html?infoid=" + infoid + "&categorynum=" + categorynum
            results = ''
            for _ in range(5):
                try:
                    self.session.get(targetUrl,
                                     headers=self.header,
                                     allow_redirects=False,
                                     timeout=60)
                    redirectUrl = 'https://www.cqggzy.com/EpointWebBuilderService/getInfoListAndCategoryList.action?cmd=pageRedirect'
                    data = {'categorynum': categorynum, 'infoid': infoid}
                    response_ = self.session.post(redirectUrl,
                                                  headers=self.header,
                                                  data=data,
                                                  allow_redirects=False,
                                                  timeout=60)
                    url = 'https://www.cqggzy.com' + response_.json().get(
                        'custom') if 'http' not in response_.json().get(
                            'custom') else response_.json().get('custom')
                    results = self.session.get(url,
                                               headers=self.header,
                                               allow_redirects=False,
                                               timeout=60)
                    break
                except Exception as e:
                    pass

            data = Selector(text=results.content.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            WJBT_1 = ''
            XXSJ_2 = ''
            TDWZ_3 = ''
            YT_4 = ''
            TDMJ_5 = ''
            ZJRJZMJ_6 = ''
            ZDJZMD_7 = ''
            LDL_9 = ''
            CRJKQSJ_8 = ''
            JMBZJ_11 = ''
            BH_12 = ''
            CYNB_13 = ''
            KJMJ_14 = ''
            TZQD_15 = ''
            CCYQ_16 = ''
            BZ_17 = ''
            HQCRWJSJ_18 = ''
            HQCRWJDD_19 = ''
            BMSJ_20 = ''
            BMDD_21 = ''
            BZJJZSJ_22 = ''
            QRJMZGSJ_23 = ''
            LXDZ_24 = ''
            LXDH_25 = ''
            LXR_26 = ''

            # 共有字段
            # 文件标题
            WJBT_1 = data.xpath(
                '//*[@class="article-title"]/text()').extract_first()
            # 信息时间
            XXSJ_2 = reFunction(
                '(\d{4}-\d{1,2}-\d{1,2})',
                data.xpath(
                    '//*[@class="info-source"]/text()[1]').extract_first())
            if (('总计容建筑面' in items and '序号' in items)
                    or data.xpath('//table')) and '宗地编号' not in items:
                # TODO
                soup = BeautifulSoup(results.content.decode('utf-8'))
                tableMso = soup.find('table', 'MsoTableGrid')
                table = soup.find('table')
                htmlTable = htmlTableTransformer()
                try:
                    if tableMso:
                        tdData = htmlTable.table_tr_td(table)
                    else:
                        tdData = htmlTable.tableTrTdRegulation(table)
                    sourceTdData = tdData
                    for key, value in tdData.items():
                        tdData[key] = value.replace(str(key),
                                                    '') if value else value
                    # 土地位置   //table[@class="MsoNormalTable"]
                    TDWZ_3 = tdData.get('土地位置')
                    # 用途
                    YT_4 = tdData.get('土地用途') if tdData.get(
                        '土地用途') else tdData.get('用途')
                    # 土地面积(m)
                    TDMJ_5 = tdData.get('土地面积(m)') if tdData.get(
                        '土地面积 (m)') else tdData.get('土地面积 (㎡)')
                    # 总计容建筑面积(m2)
                    ZJRJZMJ_6 = tdData.get('总计容建筑面积(㎡)')
                    # 最大建筑密度
                    ZDJZMD_7 = tdData.get('最大建筑密度')
                    # 绿地率
                    LDL_9 = tdData.get('绿地率')
                    # TODO 正则匹配
                    if not ZDJZMD_7 and not LDL_9:
                        # sourceTdData
                        for value in sourceTdData.values():
                            if '最大建筑密度' in value:
                                ZDJZMD_7 = value.replace('最大建筑密度', '')
                            if '绿地率' in value:
                                LDL_9_ = value.replace('绿地率', '')
                                LDL_9 = LDL_9_ if len(
                                    LDL_9_
                                ) < 10 else reFunction(
                                    f'绿地率[:]*\s*([()\w\.:: \(\)〔〕≤≥\-\/\%,、\.﹪]*)[;。,]?',
                                    value)
                            if '总计容建筑面积' in value:
                                LDL_9 = value.replace('总计容建筑面积(㎡)', '')
                    # 出让价款起始价(万元)
                    CRJKQSJ_8 = tdData.get('出让价款起始价(万元)')
                    # 投标竞买保证金(万元)  保证金(万元)
                    JMBZJ_11 = tdData.get('保证金(万元)') if tdData.get(
                        '保证金(万元)') else tdData.get('投标、竞买保证金(万元)')
                    # 编号
                    BH_12 = tdData.get('编号')
                    # 产业类别
                    CYNB_13 = tdData.get('产业类别')
                    # 可建面积(m2)或容积率
                    KJMJ_14 = tdData.get('可建面积(㎡)或容积率')
                    # 投资强度(万元 / 公顷)
                    TZQD_15 = tdData.get('投资强度(万元/公顷)')
                    # 产出要求(万元 / 公顷)
                    CCYQ_16 = tdData.get('产出要求(万元/公顷)')
                    # 备注  其他需要说明的宗地情况:
                    BZ_17_ = tdData.get('序号').split(
                        '备注:')[-1] if '备注' in tdData.get('序号') else tdData.get(
                            '备注:')
                    other = tdData.get('序号').split(
                        '其他需要说明的宗地情况:')[-1] if '其他需要说明的宗地情况:' in tdData.get(
                            '序号') else tdData.get('其他需要说明的宗地情况:')
                    BZ_17 = other if not BZ_17_ else BZ_17_
                    # 获取出让文件时间
                    HQCRWJSJ_18 = reFunction(
                        '竞买申请人可在([\w :\.\-\s\/\%,、]*)。',
                        reFunction('二、([\s\S]*)三、', items))
                    # 获取出让文件地点
                    HQCRWJDD_19 = reFunction(
                        '网址:([\w :\.\-\s\/\%,、]*)(?:[\)\s]*)',
                        reFunction('二、([\s\S]*)三、', items))
                    # 报名时间
                    BMSJ_20 = reFunction(
                        '竞买申请人可在([\w \.:\-\s\/\%,、]*)\(报名时间\)',
                        reFunction('三、([\s\S]*)四、', items))
                    # 保证金截止时间
                    BZJJZSJ_22 = reFunction(
                        '竞买保证金到账截止时间为([\w \.:\-\s\/\%,、]*)。',
                        reFunction('三、([\s\S]*)四、', items))
                    # 确认竞买资格时间
                    QRJMZGSJ_23 = BZJJZSJ_22
                    # 联系地址
                    LXDZ_24 = '|'.join(
                        re.findall('联系地址:([\w 、\.:\-\/\%,、()]*)(?:[,\n])',
                                   reFunction('七、([\s\S]*)', items)))
                    # 联系电话
                    LXDH_25 = '|'.join(
                        re.findall(
                            '[联系]*电话[::]([\w 、\.:\-\/\%,、()]*)(?:[\n。])',
                            reFunction('七、([\s\S]*)', items)))
                    # 联系人
                    LXR_26 = '|'.join(
                        re.findall('联系人[::]([\w 、\.:\-\/\%,、()]*)(?:[ ,]*)',
                                   reFunction('七、([\s\S]*)', items)))
                except:
                    for item in [
                            '宗地编号' + _ for _ in re.findall(
                                '一、([\s\S]*)二、', items)[0].split('宗地编号')[1:]
                    ]:
                        # 土地位置
                        TDWZ_3 += '|' + reFunction(
                            '宗地坐落:([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                        # 用途
                        YT_4_1 = reFunction(
                            '主要用途:(?:[\s]*)([\w :\.\- \/\%,、]*)(?:\s)', item)
                        YT_4_2 = reFunction(
                            '土地用途[:](?:[\s]*)([\w ::\.\- \/\%,、]*)(?:\s)',
                            item)
                        YT_4 += '|' + YT_4_1 + YT_4_2
                        # 土地面积(m)
                        TDMJ_5 += '|' + reFunction(
                            '宗地总面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)', item
                        ) if reFunction(
                            '宗地总面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)', item
                        ) else '|' + reFunction(
                            '宗地面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)', item)
                        # 最大建筑密度
                        ZDJZMD_7 += '|' + reFunction(
                            '建筑密度\(%\):([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                        # 绿地率
                        LDL_9 += '|' + reFunction(
                            '绿地率\(%\)[:]([\w :\.\-\s\/\%,、≤;≥]*)(?:\s)', item)
                        # 编号
                        BH_12 += '|' + reFunction(
                            '宗地编号[:]([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                        # 投资强度(万元 / 公顷)
                        TZQD_15 += '|' + reFunction(
                            '投资强度[:]([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                        # 备注
                        BZ_17 += '|' + reFunction('备注:([\s\S]*)', item)
                    # TODO 获取出让文件时间
                    HQCRWJSJ_18 = reFunction(
                        '申请人可于([\w :\.\-\s\/\%,、]*)到',
                        reFunction('四、([\s\S]*)五、', items))
                    # 获取出让文件地点
                    HQCRWJDD_19 = reFunction(
                        '申请人可于(?:[\w :\.\-\s\/\%,、]*)到([\w :\.\-\s\/\%,、]*)获取',
                        reFunction('四、([\s\S]*)五、', items))
                    # 报名时间
                    BMSJ_20 = reFunction('申请人可于([\w \.:\-\s\/\%,、]*)到',
                                         reFunction('五、([\s\S]*)六、', items))
                    # 保证金截止时间
                    BZJJZSJ_22 = reFunction(
                        '竞买保证金的截止时间为([\w \d\.:\-\s\/\%,、 ]*)。',
                        reFunction('五、([\s\S]*)六、', items))
                    # 确认竞买资格时间
                    QRJMZGSJ_23 = BZJJZSJ_22
                    # 联系地址
                    LXDZ_24 = '|'.join(
                        re.findall('联系地址:([\w 、\.:\-\/\%,、()]*)(?:[,\n])',
                                   reFunction('八|七、([\s\S]*)', items)))
                    # 联系电话
                    LXDH_25 = '|'.join(
                        re.findall(
                            '[联系]*电话[::]([\w 、\.:\-\/\%,、()]*)(?:[\n。])',
                            reFunction('八|七、([\s\S]*)', items)))
                    # 联系人
                    LXR_26 = '|'.join(
                        re.findall('联 系 人[::]([ \w]*)(?:[\n]*)',
                                   reFunction('八|七、([\s\S]*)', items)))
            else:
                for item in [
                        '宗地编号' + _ for _ in re.findall('一、([\s\S]*)二、', items)
                    [0].split('宗地编号')[1:]
                ]:
                    # 土地位置
                    TDWZ_3 += '|' + reFunction(
                        '宗地坐落:([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                    # 用途
                    YT_4_1 = reFunction(
                        '主要用途:(?:[\s]*)([\w :\.\- \/\%,、]*)(?:\s)', item)
                    YT_4_2 = reFunction(
                        '土地用途[:](?:[\s]*)([\w ::\.\- \/\%,、]*)(?:\s)', item)
                    YT_4 += '|' + YT_4_1 + YT_4_2
                    # Land area (unit presumably ㎡ -- the capture groups below allow '㎡'; TODO confirm)
                    TDMJ_5 += '|' + reFunction(
                        '宗地总面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)',
                        item) if reFunction(
                            '宗地总面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)', item
                        ) else '|' + reFunction(
                            '宗地面积:(?:[\s]*)([\w :\.\- \/\%,、㎡]*)(?:\s)', item)
                    # 最大建筑密度
                    ZDJZMD_7 += '|' + reFunction(
                        '建筑密度:([\w :\.\-\s\/\%,、≦;≥]*)(?:\s)', item)
                    # 绿地率
                    LDL_9 += '|' + reFunction(
                        '绿地率\(%\)[:]([\w :\.\-\s\/\%,、≤;≥]*)(?:\s)', item)
                    # 编号
                    BH_12 += '|' + reFunction(
                        '宗地编号[:]([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                    # 投资强度(万元 / 公顷)
                    TZQD_15 += '|' + reFunction(
                        '投资强度[:]([\w :\.\-\s\/\%,、]*)(?:\s)', item)
                    # 备注
                    BZ_17 += '|' + reFunction('备注:([\s\S]*)', item)
                # TODO 获取出让文件时间
                HQCRWJSJ_18 = reFunction('申请人可于([\w :\.\-\s\/\%,、]*)到',
                                         reFunction('四、([\s\S]*)五、', items))
                # 获取出让文件地点
                HQCRWJDD_19 = reFunction(
                    '申请人可于(?:[\w :\.\-\s\/\%,、]*)到([\w :\.\-\s\/\%,、]*)获取',
                    reFunction('四、([\s\S]*)五、', items))
                # 报名时间
                BMSJ_20 = reFunction('申请人可于([\w \.:\-\s\/\%,、]*)到',
                                     reFunction('五、([\s\S]*)六、', items))
                # 保证金截止时间
                BZJJZSJ_22 = reFunction('竞买保证金的截止时间为([\w \d\.:\-\s\/\%,、 ]*)。',
                                        reFunction('五、([\s\S]*)六、', items))
                # 确认竞买资格时间
                QRJMZGSJ_23 = BZJJZSJ_22
                # 联系地址
                LXDZ_24 = '|'.join(
                    re.findall('联系地址:([\w 、\.:\-\/\%,、()]*)(?:[,\n])',
                               reFunction('八|七、([\s\S]*)', items)))
                # 联系电话
                LXDH_25 = '|'.join(
                    re.findall('[联系]*电话[::]([\w 、\.:\-\/\%,、()]*)(?:[\n。])',
                               reFunction('八|七、([\s\S]*)', items)))
                # 联系人
                LXR_26 = '|'.join(
                    re.findall('联 系 人[::]([ \w]*)(?:[\n]*)',
                               reFunction('八|七、([\s\S]*)', items)))
            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # 爬取地址url
            url = url if url else response.url
            # 唯一标识
            md5Mark = encrypt_md5(url)

            # 存储数据
            csvFile = [
                WJBT_1,
                XXSJ_2,
                TDWZ_3,
                YT_4,
                TDMJ_5,
                ZJRJZMJ_6,
                ZDJZMD_7,
                LDL_9,
                CRJKQSJ_8,
                JMBZJ_11,
                BH_12,
                CYNB_13,
                KJMJ_14,
                TZQD_15,
                CCYQ_16,
                BZ_17,
                HQCRWJSJ_18,
                HQCRWJDD_19,
                BMSJ_20,
                BMDD_21,
                BZJJZSJ_22,
                QRJMZGSJ_23,
                LXDZ_24,
                LXDH_25,
                LXR_26,
                crawlingTime,
                url,
                md5Mark,
            ]
            results = ''
            for _ in csvFile:
                try:
                    if _ and _ != '|' * len(_):
                        results += _.replace(',', ' ').replace(
                            '\n', '').replace('\r', '') + ','
                    else:
                        results += ','
                except Exception as e:
                    results += ','
                    self.log(
                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            with open(self.pathDetail, 'a+') as fp:
                fp.write(results)
                fp.write('\n')
            yield
        except Exception as e:
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
コード例 #4
0
    def parse_detail(self, response):
        # TODO 主动关闭爬虫问题
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            WJBT_45 = ''
            SJ_46 = ''
            LY_47 = ''
            ZWBT_48 = ''
            DKBH_49 = ''
            ZDBH_50 = ''
            PMJG_51 = ''
            GGZRFS_52 = ''
            GPSJ_53 = ''
            ZRR_54 = ''
            ZRF_55 = ''
            SRR_56 = ''
            SRF_57 = ''
            SRDW_58 = ''
            WZ_59 = ''
            DKWZ_60 = ''
            CRMJ_61 = ''
            YT_62 = ''
            CJJ_63 = ''
            BDCQDJH_64 = ''
            CRHTBH_65 = ''
            CRHT_66 = ''
            BGXYBH_67 = ''
            TDYT_68 = ''
            SYNX_69 = ''
            MJ_70 = ''
            TDMJ_71 = ''
            ZRJG_72 = ''
            CRNX_73 = ''
            TDSYNX_74 = ''
            BZ_75 = ''
            GSQ_76 = ''
            LXDW_77 = ''
            DWDZ_78 = ''
            YZBM_79 = ''
            LXDH_80 = ''
            LXR_81 = ''
            DZYJ_82 = ''

            # TODO 共有字段  reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # 文件标题
            WJBT_45 = response.meta.get('title')
            # 时间
            SJ_46 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()'
            ).extract_first()
            # 来源
            LY_47 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()'
            ).extract_first()
            # 正文标题
            ZWBT_48 = data.xpath(
                '//div[@class="ztzx_frame_content"]/div[1]/text()'
            ).extract_first()
            # 公示期
            GSQ_76 = reFunction(
                f'公示期:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)[。\s]', items)
            # 联系单位
            LXDW_77 = reFunction(
                '联系单位:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 单位地址
            DWDZ_78 = reFunction(
                '单位地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 邮政编码
            YZBM_79 = reFunction(
                '邮政编码:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 联系电话
            LXDH_80 = reFunction(
                '联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 联系人
            LXR_81 = reFunction(
                '联\s*系\s*人:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 电子邮件
            DZYJ_82 = reFunction(
                '电子邮件:([()\w\.:: —\(\)@〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)

            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # 爬取地址url
            url = response.url
            # 唯一标识
            md5Mark = encrypt_md5(url + WJBT_45 + SJ_46)

            soup = BeautifulSoup(
                response.body.decode('utf-8').replace('thead', 'tbody'))
            table = soup.find('table')
            htmlTable = htmlTableTransformer()
            if '国有划拨土地使用权结果公示' in items:
                table.find_all('tr')[1].extract()
                tdData = htmlTable.tableTrTdRegulationToList(table)
                for _ in range(len(list(tdData.values())[0])):
                    # 地块编号
                    DKBH_49 = tdData.get('地块编号')[_] if tdData.get(
                        '地块编号') else ''
                    # 公开转让方式
                    GGZRFS_52 = tdData.get('公开转让方式')[_] if tdData.get(
                        '公开转让方式') else ''
                    # 挂牌时间
                    GPSJ_53 = tdData.get('挂牌')[_] if tdData.get('挂牌') else ''
                    # 受让人
                    SRR_56 = tdData.get('受让人')[_] if tdData.get('受让人') else ''
                    # 位置
                    WZ_59 = tdData.get('位置')[_] if tdData.get('位置') else ''
                    # 出让面积(平方米)
                    CRMJ_61 = tdData.get('出让面积')[_] if tdData.get(
                        '出让面积') else ''
                    # 用途
                    YT_62 = tdData.get('用途')[_] if tdData.get('用途') else ''
                    # 成交价(万元)
                    CJJ_63 = tdData.get('成交价')[_] if tdData.get('成交价') else ''
                    # 写入数据
                    if self.name in DUPLICATE_SWITCH_LIST:
                        if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        if True:
                            # Duplicate counter is below the threshold: store the record
                            csvFile = [
                                WJBT_45,
                                SJ_46,
                                LY_47,
                                ZWBT_48,
                                DKBH_49,
                                ZDBH_50,
                                PMJG_51,
                                GGZRFS_52,
                                GPSJ_53,
                                ZRR_54,
                                ZRF_55,
                                SRR_56,
                                SRF_57,
                                SRDW_58,
                                WZ_59,
                                DKWZ_60,
                                CRMJ_61,
                                YT_62,
                                CJJ_63,
                                BDCQDJH_64,
                                CRHTBH_65,
                                CRHT_66,
                                BGXYBH_67,
                                TDYT_68,
                                SYNX_69,
                                MJ_70,
                                TDMJ_71,
                                ZRJG_72,
                                CRNX_73,
                                TDSYNX_74,
                                BZ_75,
                                GSQ_76,
                                LXDW_77,
                                DWDZ_78,
                                YZBM_79,
                                LXDH_80,
                                LXR_81,
                                DZYJ_82,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\t', '').replace(
                                                '\r', '').replace(
                                                    r'\xa0', '').replace(
                                                        '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' %
                            response.url)
            elif '不动产权登记证号' in items:
                # 转让方
                ZRF_55 = reFunction(
                    '转让方:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # 受让方
                SRF_57 = reFunction(
                    '受让方:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # 位置
                WZ_59 = reFunction(
                    '宗地位置:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 不动产权登记证号
                BDCQDJH_64 = reFunction(
                    '不动产权登记证号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 出让合同编号
                CRHTBH_65 = reFunction(
                    '出让合同编号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 变更协议编号
                BGXYBH_67 = reFunction(
                    '出让合同变更协议编号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 土地用途
                TDYT_68 = reFunction(
                    '土地用途:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 使用年限
                SYNX_69 = reFunction(
                    '使用年限:\s*([()【】\w\.::—\(\)〔〕\s㎡≤≥《》\-\/\%,;,、\.﹪]*)面\s*积',
                    items)
                # 面积
                MJ_70 = reFunction(
                    '面\s*积:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 转让价格(单价总价)
                ZRJG_72 = reFunction(
                    '转让价格:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、。\.﹪]*)\s',
                    items)

                # 写入数据
                if self.name in DUPLICATE_SWITCH_LIST:
                    if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                        self.duplicateUrl += 1

                if self.duplicateUrl < 50:
                    if True:
                        # Duplicate counter is below the threshold: store the record
                        csvFile = [
                            WJBT_45,
                            SJ_46,
                            LY_47,
                            ZWBT_48,
                            DKBH_49,
                            ZDBH_50,
                            PMJG_51,
                            GGZRFS_52,
                            GPSJ_53,
                            ZRR_54,
                            ZRF_55,
                            SRR_56,
                            SRF_57,
                            SRDW_58,
                            WZ_59,
                            DKWZ_60,
                            CRMJ_61,
                            YT_62,
                            CJJ_63,
                            BDCQDJH_64,
                            CRHTBH_65,
                            CRHT_66,
                            BGXYBH_67,
                            TDYT_68,
                            SYNX_69,
                            MJ_70,
                            TDMJ_71,
                            ZRJG_72,
                            CRNX_73,
                            TDSYNX_74,
                            BZ_75,
                            GSQ_76,
                            LXDW_77,
                            DWDZ_78,
                            YZBM_79,
                            LXDH_80,
                            LXR_81,
                            DZYJ_82,
                            crawlingTime,
                            url,
                            md5Mark,
                        ]
                        results = ''
                        for _ in csvFile:
                            try:
                                if _ and _ != '|' * len(_):
                                    results += _.replace(',', ' ').replace(
                                        '\n', '').replace('\t', '').replace(
                                            '\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                else:
                                    results += ','
                            except Exception as e:
                                results += ','
                                self.log(
                                    f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                    level=logging.ERROR)
                        with open(self.pathDetail, 'a+') as fp:
                            fp.write(results)
                            fp.write('\n')
                        self.log(f'数据获取成功', level=logging.INFO)
                        yield
                else:
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)
            elif '挂牌出让地块的基本情况和规划指标要求' in items:
                # 宗地编号
                ZDBH_50 = reFunction(
                    '宗地编号:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 挂牌时间
                GPSJ_53 = reFunction(
                    '挂牌时间为:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;。,、\.﹪]*)\s',
                    items).replace('。', '')
                # 转让人
                ZRR_54 = reFunction(
                    '转让人为:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*),', items)
                # 位置
                WZ_59 = reFunction(
                    '宗地坐落:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 土地用途
                TDYT_68 = reFunction(
                    '土地用途:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 面积
                MJ_70 = reFunction(
                    '宗地面积:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 出让年限
                CRNX_73 = reFunction(
                    '出让年限:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 备注
                BZ_75 = reFunction(
                    '备注:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;。,、\.﹪]*)\s*二',
                    items)

                # 写入数据
                if self.name in DUPLICATE_SWITCH_LIST:
                    if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                        self.duplicateUrl += 1

                if self.duplicateUrl < 50:
                    if True:
                        # Duplicate counter is below the threshold: store the record
                        csvFile = [
                            WJBT_45,
                            SJ_46,
                            LY_47,
                            ZWBT_48,
                            DKBH_49,
                            ZDBH_50,
                            PMJG_51,
                            GGZRFS_52,
                            GPSJ_53,
                            ZRR_54,
                            ZRF_55,
                            SRR_56,
                            SRF_57,
                            SRDW_58,
                            WZ_59,
                            DKWZ_60,
                            CRMJ_61,
                            YT_62,
                            CJJ_63,
                            BDCQDJH_64,
                            CRHTBH_65,
                            CRHT_66,
                            BGXYBH_67,
                            TDYT_68,
                            SYNX_69,
                            MJ_70,
                            TDMJ_71,
                            ZRJG_72,
                            CRNX_73,
                            TDSYNX_74,
                            BZ_75,
                            GSQ_76,
                            LXDW_77,
                            DWDZ_78,
                            YZBM_79,
                            LXDH_80,
                            LXR_81,
                            DZYJ_82,
                            crawlingTime,
                            url,
                            md5Mark,
                        ]
                        results = ''
                        for _ in csvFile:
                            try:
                                if _ and _ != '|' * len(_):
                                    results += _.replace(',', ' ').replace(
                                        '\n', '').replace('\t', '').replace(
                                            '\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                else:
                                    results += ','
                            except Exception as e:
                                results += ','
                                self.log(
                                    f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                    level=logging.ERROR)
                        with open(self.pathDetail, 'a+') as fp:
                            fp.write(results)
                            fp.write('\n')
                        self.log(f'数据获取成功', level=logging.INFO)
                        yield
                else:
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)
            elif '地块基本情况' in items:
                try:
                    if '备注' not in items:
                        tdData = htmlTable.tableTrTdRegulationToList(table)
                        for _ in range(len(list(tdData.values())[0])):
                            # 宗地编号
                            ZDBH_50 = tdData.get('宗地编号')[_] if tdData.get(
                                '宗地编号') else ''
                            # 受让单位
                            SRDW_58 = tdData.get('受让单位')[_] if tdData.get(
                                '受让单位') else ''
                            # 受让人
                            SRR_56 = tdData.get('竞得人')[_] if tdData.get(
                                '竞得人') else ''
                            # 地块位置
                            DKWZ_60 = tdData.get('地块位置')[_] if tdData.get(
                                '地块位置') else ''
                            # 土地用途
                            TDYT_68 = tdData.get('土地用途')[_] if tdData.get(
                                '土地用途') else ''
                            # 成交价(万元)
                            CJJ_63 = tdData.get('成交价(万元)')[_] if tdData.get(
                                '成交价(万元)') else ''
                            # Land area in mu: the column read here is '土地面积(亩)', not hectares
                            TDMJ_71 = tdData.get('土地面积(亩)')[_] if tdData.get(
                                '土地面积(亩)') else ''
                            # 出让年限
                            CRNX_73 = tdData.get('出让年限')[_] if tdData.get(
                                '出让年限') else ''

                            # 写入数据
                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(
                                        md5Mark):  # 存在, 去重计数
                                    self.duplicateUrl += 1

                            if self.duplicateUrl < 50:
                                if True:
                                    # Duplicate counter is below the threshold: store the record
                                    csvFile = [
                                        WJBT_45,
                                        SJ_46,
                                        LY_47,
                                        ZWBT_48,
                                        DKBH_49,
                                        ZDBH_50,
                                        PMJG_51,
                                        GGZRFS_52,
                                        GPSJ_53,
                                        ZRR_54,
                                        ZRF_55,
                                        SRR_56,
                                        SRF_57,
                                        SRDW_58,
                                        WZ_59,
                                        DKWZ_60,
                                        CRMJ_61,
                                        YT_62,
                                        CJJ_63,
                                        BDCQDJH_64,
                                        CRHTBH_65,
                                        CRHT_66,
                                        BGXYBH_67,
                                        TDYT_68,
                                        SYNX_69,
                                        MJ_70,
                                        TDMJ_71,
                                        ZRJG_72,
                                        CRNX_73,
                                        TDSYNX_74,
                                        BZ_75,
                                        GSQ_76,
                                        LXDW_77,
                                        DWDZ_78,
                                        YZBM_79,
                                        LXDH_80,
                                        LXR_81,
                                        DZYJ_82,
                                        crawlingTime,
                                        url,
                                        md5Mark,
                                    ]
                                    results = ''
                                    for _ in csvFile:
                                        try:
                                            if _ and _ != '|' * len(_):
                                                results += _.replace(
                                                    ',', ' '
                                                ).replace('\n', '').replace(
                                                    '\t', ''
                                                ).replace('\r', '').replace(
                                                    r'\xa0', '').replace(
                                                        '\xa0', '') + ','
                                            else:
                                                results += ','
                                        except Exception as e:
                                            results += ','
                                            self.log(
                                                f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                level=logging.ERROR)
                                    with open(self.pathDetail, 'a+') as fp:
                                        fp.write(results)
                                        fp.write('\n')
                                    self.log(f'数据获取成功', level=logging.INFO)
                                    yield
                            else:
                                self.crawler.engine.close_spider(
                                    self,
                                    'response msg info %s, job duplicated!' %
                                    response.url)
                    else:
                        if '竞得人' not in items:
                            for item in [
                                    '宗地编号' + _
                                    for _ in re.findall('一([\s\S]*)二、', items)
                                [0].split('宗地编号')[1:]
                            ]:
                                # 宗地编号
                                ZDBH_50 = reFunction('编号\s*([\w\-]*)\s', item)
                                # 受让单位
                                SRDW_58 = reFunction(
                                    '受让单位\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item)
                                # 地块位置
                                DKWZ_60 = reFunction(
                                    '地块位置\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item)
                                # 成交价(万元)
                                CJJ_63 = reFunction(
                                    '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item
                                ) if reFunction(
                                    '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item
                                ) else reFunction(
                                    '成交价(万元)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item)
                                # 土地用途
                                TDYT_68 = reFunction(
                                    '土地用途\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item)
                                # 土地面积(公顷)
                                TDMJ_71 = reFunction(
                                    '土地\s*面积\s*\(公顷\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item)
                                # 出让年限
                                CRNX_73 = reFunction(
                                    '出让年限\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                    item)
                                # 备注
                                BZ_75 = reFunction(
                                    '备注:\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                                    item)
                                if '二' in BZ_75:
                                    BZ_75 = ''
                                # 写入数据
                                if self.name in DUPLICATE_SWITCH_LIST:
                                    if self.redisClient.isExist(
                                            md5Mark):  # 存在, 去重计数
                                        self.duplicateUrl += 1

                                if self.duplicateUrl < 50:
                                    if True:
                                        # Duplicate counter is below the threshold: store the record
                                        csvFile = [
                                            WJBT_45,
                                            SJ_46,
                                            LY_47,
                                            ZWBT_48,
                                            DKBH_49,
                                            ZDBH_50,
                                            PMJG_51,
                                            GGZRFS_52,
                                            GPSJ_53,
                                            ZRR_54,
                                            ZRF_55,
                                            SRR_56,
                                            SRF_57,
                                            SRDW_58,
                                            WZ_59,
                                            DKWZ_60,
                                            CRMJ_61,
                                            YT_62,
                                            CJJ_63,
                                            BDCQDJH_64,
                                            CRHTBH_65,
                                            CRHT_66,
                                            BGXYBH_67,
                                            TDYT_68,
                                            SYNX_69,
                                            MJ_70,
                                            TDMJ_71,
                                            ZRJG_72,
                                            CRNX_73,
                                            TDSYNX_74,
                                            BZ_75,
                                            GSQ_76,
                                            LXDW_77,
                                            DWDZ_78,
                                            YZBM_79,
                                            LXDH_80,
                                            LXR_81,
                                            DZYJ_82,
                                            crawlingTime,
                                            url,
                                            md5Mark,
                                        ]
                                        results = ''
                                        for _ in csvFile:
                                            try:
                                                if _ and _ != '|' * len(_):
                                                    results += _.replace(
                                                        ',', ' '
                                                    ).replace('\n', '').replace(
                                                        '\t', '').replace(
                                                            '\r', '').replace(
                                                                r'\xa0',
                                                                '').replace(
                                                                    '\xa0',
                                                                    '') + ','
                                                else:
                                                    results += ','
                                            except Exception as e:
                                                results += ','
                                                self.log(
                                                    f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                    level=logging.ERROR)
                                        with open(self.pathDetail, 'a+') as fp:
                                            fp.write(results)
                                            fp.write('\n')
                                        self.log(f'数据获取成功', level=logging.INFO)
                                        yield
                                else:
                                    self.crawler.engine.close_spider(
                                        self,
                                        'response msg info %s, job duplicated!'
                                        % response.url)
                except Exception as e:
                    if '竞得人' not in items:
                        for item in [
                                '宗地编号' + _ for _ in re.findall(
                                    '一([\s\S]*)二、', items)[0].split('宗地编号')[1:]
                        ]:
                            # 宗地编号
                            ZDBH_50 = reFunction('编号\s*([\w\-]*)\s', item)
                            # 受让单位
                            SRDW_58 = reFunction(
                                '受让单位\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item)
                            # 地块位置
                            DKWZ_60 = reFunction(
                                '地块位置\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item)
                            # 成交价(万元)
                            CJJ_63 = reFunction(
                                '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item
                            ) if reFunction(
                                '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item
                            ) else reFunction(
                                '成交价(万元)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item)
                            # 土地用途
                            TDYT_68 = reFunction(
                                '土地用途\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item)
                            # 土地面积(公顷)
                            TDMJ_71 = reFunction(
                                '土地\s*面积\s*\(公顷\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item)
                            # 出让年限
                            CRNX_73 = reFunction(
                                '出让年限\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                                item)
                            # 备注
                            BZ_75 = reFunction(
                                '备注:\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                                item)
                            if '二' in BZ_75:
                                BZ_75 = ''
                            # 写入数据
                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(
                                        md5Mark):  # 存在, 去重计数
                                    self.duplicateUrl += 1

                            if self.duplicateUrl < 50:
                                if True:
                                    # 重复效验通过, 存储数据
                                    csvFile = [
                                        WJBT_45,
                                        SJ_46,
                                        LY_47,
                                        ZWBT_48,
                                        DKBH_49,
                                        ZDBH_50,
                                        PMJG_51,
                                        GGZRFS_52,
                                        GPSJ_53,
                                        ZRR_54,
                                        ZRF_55,
                                        SRR_56,
                                        SRF_57,
                                        SRDW_58,
                                        WZ_59,
                                        DKWZ_60,
                                        CRMJ_61,
                                        YT_62,
                                        CJJ_63,
                                        BDCQDJH_64,
                                        CRHTBH_65,
                                        CRHT_66,
                                        BGXYBH_67,
                                        TDYT_68,
                                        SYNX_69,
                                        MJ_70,
                                        TDMJ_71,
                                        ZRJG_72,
                                        CRNX_73,
                                        TDSYNX_74,
                                        BZ_75,
                                        GSQ_76,
                                        LXDW_77,
                                        DWDZ_78,
                                        YZBM_79,
                                        LXDH_80,
                                        LXR_81,
                                        DZYJ_82,
                                        crawlingTime,
                                        url,
                                        md5Mark,
                                    ]
                                    results = ''
                                    for _ in csvFile:
                                        try:
                                            if _ and _ != '|' * len(_):
                                                results += _.replace(
                                                    ',', ' '
                                                ).replace('\n', '').replace(
                                                    '\t', ''
                                                ).replace('\r', '').replace(
                                                    r'\xa0', '').replace(
                                                        '\xa0', '') + ','
                                            else:
                                                results += ','
                                        except Exception as e:
                                            results += ','
                                            self.log(
                                                f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                level=logging.ERROR)
                                    with open(self.pathDetail, 'a+') as fp:
                                        fp.write(results)
                                        fp.write('\n')
                                    self.log(f'数据获取成功', level=logging.INFO)
                                    yield
                            else:
                                self.crawler.engine.close_spider(
                                    self,
                                    'response msg info %s, job duplicated!' %
                                    response.url)

        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
コード例 #5
0
    def parse_detail(self, response):
        """Parse a detail page and append one CSV row per parcel.

        The page body is decoded as GBK and flattened to plain text. Fields
        shared by every parcel (titles, contact block) are extracted once;
        the text before the first '二、' is then split on the parcel marker
        ('宗地编号' if present, otherwise '地块编号') and each piece is parsed
        with regular expressions and written to ``self.fileDetail``.

        Args:
            response: the response for the detail page. ``response.meta``
                must carry 'administration', 'supplyNoticeTitle' and
                'publishTime' (each is ``.strip()``-ed, so a missing key
                aborts the page and is logged).

        Yields:
            None after each row is written; the bare ``yield`` only makes
            this callback a generator, no items or requests are produced.

        Any exception raised while parsing is caught and logged at ERROR
        level; nothing is re-raised.
        """
        try:
            # Known limitation noted by the original author: not every
            # field on the page is captured.
            data = Selector(text=response.body.decode('gbk'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')

            # ---- fields shared by every parcel on the page ----
            # File title (extract_first() gives None when the node is absent).
            fileTitle = data.xpath(
                '//td[@class="fh tac bw fwb f18-0 pl2 b0"]/text()'
            ).extract_first()
            # Body title (may be None as well).
            textTitle = data.xpath(
                '//td[@class="fh vat bw f8-0 b1"]/table[1]//tr[1]/td[@align="center"]/text()'
            ).extract_first()
            administration = response.meta.get('administration').strip()
            supplyNoticeTitle = response.meta.get('supplyNoticeTitle').strip()
            publishTime = response.meta.get('publishTime').strip()

            # Everything from '四、' onwards; the contact details are searched
            # in this tail only. Computed once instead of once per field.
            # NOTE(review): assumes reFunction(pattern, text) is a pure
            # helper returning the matched text as a str — confirm.
            contactSection = reFunction(r'四、[\s\S]*', items)
            # Publicity period
            publicityPeriod = reFunction(r'公示期:([\s\S]*)三、',
                                         contactSection).strip()
            # Contact unit
            contactUnit = reFunction(r'联系单位:([\s\S]*)单位地址',
                                     contactSection).strip()
            # Unit address
            unitAddr = reFunction(r'单位地址:([\s\S]*)邮政编码',
                                  contactSection).strip()
            # Postal code
            postalCode = reFunction(r'邮政编码:([\s\S]*)联系电话',
                                    contactSection).strip()
            # Contact telephone
            contactTel = reFunction(r'联系电话:([\s\S]*)联 系 人',
                                    contactSection).strip()
            # Contact person
            contacter = reFunction(r'联 系 人:([\s\S]*)电子邮件',
                                   contactSection).strip()
            # E-mail
            email = reFunction(r'电子邮件:([\w\.\@]*)(?:[\S]*)',
                               contactSection).strip()

            # ---- choose the label that introduces each parcel ----
            if '宗地编号' in items:
                marker = '宗地编号'
            elif '地块编号' in items:
                marker = '地块编号'
            else:
                # Neither label present: nothing to extract from this page.
                return

            # Text before the last '二、' holds the parcel list. A page
            # without '二、' raises IndexError here, which is logged below.
            listing = re.findall(r'([\s\S]*)二、', items)[0]
            # Pattern prefixes shared by the two area-detail attempts.
            areaDetailPattern = r'(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位'
            areaFallbackPattern = r'(?:\s*)([\d\.]*)(?:[\s]*)受让单位'

            for chunk in listing.split(marker)[1:]:
                # split() removed the marker; put it back so the
                # marker-anchored pattern below can match.
                item = marker + chunk

                # Parcel number
                parcelNumber = reFunction(
                    marker + r':(?:\s*)([\s\S]*)地块位置', item).strip()
                # Parcel location
                parcelLocation = reFunction(
                    r'地块位置:(?:\s*)([\s\S]*)土地用途:', item).strip()
                # Land purpose
                landPurpose = reFunction(
                    r'土地用途:(?:\s*)([\s\S]*)土地面积\(公顷\)', item).strip()
                # Land area (hectares)
                landArea = reFunction(
                    r'土地面积\(公顷\):(?:\s*)([\w}/\.{]*)(?:\s*)',
                    item).strip()
                # Project name
                projectName = reFunction(
                    r'项目名称:(?:\s*)([\s\S]*)土地用途明细', item).strip()
                # Transfer term
                transferTimeLimit = reFunction(
                    r'出让年限:(?:\s*)([\s\S]*)成交价\(万元\)', item).strip()
                # Transaction price (10k yuan)
                transferPrice = reFunction(
                    r'成交价\(万元\):(?:\s*)([\s\S]*)土地用途明细',
                    item).strip()
                # Land purpose detail: try the labelled area first, then
                # fall back to a bare number in front of '受让单位'.
                landPurposeDetail = reFunction(areaDetailPattern,
                                               item).strip()
                if not landPurposeDetail:
                    landPurposeDetail = reFunction(areaFallbackPattern,
                                                   item).strip()
                # Transferee
                transferUnit = reFunction(
                    r'受让单位:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                # Land use conditions
                lanServiceCondition = reFunction(
                    r'土地使用条件:(?:\s*)([\s\S]*)备注', item).strip()
                # Remark
                remark = reFunction(
                    r'备注:(?:\s*)([\s\S]*)(?:\s*)[二、]?', item).strip()
                # Crawl timestamp (local time)
                crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                # Source URL
                url = response.url
                # Row identifier built from number, date, location and URL.
                md5Mark = encrypt_md5(parcelNumber + publishTime +
                                      parcelLocation + url)

                csvFile = [
                    administration, supplyNoticeTitle, publishTime,
                    fileTitle, textTitle, projectName, parcelNumber,
                    parcelLocation, landPurpose, landArea,
                    transferTimeLimit, transferPrice, landPurposeDetail,
                    transferUnit, remark, publicityPeriod, contactUnit,
                    unitAddr, postalCode, contactTel, contacter, email,
                    lanServiceCondition, crawlingTime, url, md5Mark
                ]
                # Commas and line breaks would corrupt the CSV row. Empty
                # or None cells become '' so that join() never sees None
                # (fileTitle/textTitle are None when their node is missing).
                cells = [
                    cell.replace(',', ' ').replace('\n', '').replace(
                        '\r', '') if cell else '' for cell in csvFile
                ]
                self.fileDetail.write(','.join(cells))
                self.fileDetail.write('\n')
                yield
        except Exception as e:
            self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
コード例 #6
0
    def parse_detail(self, response):
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            WJBT_1 = ''
            FBSJ_2 = ''
            WZLY_3 = ''
            SYH_4 = ''
            XXFL_5 = ''
            FBJG_6 = ''
            FBRQ_7 = ''
            WH_8 = ''
            SFYX_9 = ''
            XXMC_10 = ''
            ZWBT_11 = ''
            ZDBH_12 = ''
            ZDZMJ_13 = ''
            ZDZL_14 = ''
            SYNX_15 = ''
            CRNX_16 = ''
            RJL_17 = ''
            JZMD_18 = ''
            LDL_19 = ''
            JZXG_20 = ''
            TDYT_21 = ''
            TZQD_22 = ''
            BZJ_23 = ''
            GJBGBAH_24 = ''
            QSJ_25 = ''
            JJFD_26 = ''
            GPKSSJ_27 = ''
            GPJZSJ_28 = ''
            HQCRWJSJ_29 = ''
            HQCRWJDD_30 = ''
            BMSJ_31 = ''
            BMDD_32 = ''
            BZJJZSJ_33 = ''
            QRJMZGSJ_34 = ''
            LXDZ_35 = ''
            LXR_36 = ''
            LXDH_37 = ''
            KHDW_38 = ''
            KHYH_39 = ''
            YHZH_40 = ''

            # TODO 共有字段
            # 文件标题
            WJBT_1 = data.xpath(
                '//div[@class="title"]/h1/text()').extract_first()
            # 发布时间  reFunction('', items)
            FBSJ_2 = reFunction(
                '(\d{4}年\d{2}月\d{2}日 \d{2}:\d{2})\';',
                data.xpath('//div[@class="toolbar"]/script[1]/text()').
                extract_first())
            # 文章来源
            WZLY_3 = reFunction(
                f'document.write\(\'文章来源:([{self.reStr}]*)\'\);',
                data.xpath('//div[@class="toolbar"]/script[2]/text()').
                extract_first())
            # 索引号
            SYH_4 = data.xpath(
                '//div[@class="xxgk_xl_top"]/ul/li[1]/span/text()'
            ).extract_first()
            # 信息分类
            XXFL_5 = data.xpath(
                '//div[@class="xxgk_xl_top"]/ul/li[2]/span/text()'
            ).extract_first()
            # 发布机构
            FBJG_6 = reFunction(
                f'str_1 = "([{self.reStr}]*)";',
                data.xpath(
                    '//div[@class="xxgk_xl_top"]/ul/li[3]/span/script/text()').
                extract_first())
            # 发文日期
            FBRQ_7 = reFunction(
                f'str_1 = "([{self.reStr}]*)";',
                data.xpath(
                    '//div[@class="xxgk_xl_top"]/ul/li[4]/span/script/text()').
                extract_first())
            # 文号
            WH_8 = data.xpath(
                '//div[@class="xxgk_xl_top"]/ul/li[5]/span/text()'
            ).extract_first()
            # 是否有效
            SFYX_9 = reFunction(
                f'var  isok=\'([{self.reStr}]*)\';',
                data.xpath('//div[@class="xxgk_xl_top"]/ul/li[6]/script/text()'
                           ).extract_first())
            # 信息名称
            XXMC_10 = data.xpath(
                '//div[@class="xxgk_xl_top"]/ul/li[7]/span/text()'
            ).extract_first()
            # 正文标题
            ZWBT_11 = data.xpath(
                '//tr[@class="firstRow"]/td/text()').extract_first()

            if '主要规划指标' not in items:
                # item_ = reFunction('一、[\s\S]*二、', items)
                for item in [
                        '宗地编号' + _ for _ in re.findall('一([\s\S]*)二', items)
                    [0].split('宗地编号')[1:]
                ]:
                    # 联系电话
                    LXDH_37 = reFunction(f'联系电话:\s*([{self.reStr}]*)\s*开户单位',
                                         reFunction('八、[\s\S]*', items))
                    # 宗地编号 / 地块编号
                    ZDBH_12_ = '|'.join(
                        re.findall(
                            f'[宗地块]*(?:[\s]*)编号:(?:[\s]*)([{self.reStr}]*)宗地总面积',
                            item))
                    ZDBH_12 += '|' + ZDBH_12_ if ZDBH_12_ else '|' + '|'.join(
                        re.findall(
                            f'[宗地块]*(?:[\s]*)编号:(?:[\s]*)([{self.reStr}]*)\s*',
                            item))
                    # 宗地总面积 / 挂牌面积(m2)
                    ZDZMJ_13_ = '|'.join(
                        re.findall(f'宗地总面积:(?:[\s]*)([{self.reStr}]*)宗地坐落',
                                   item))
                    ZDZMJ_13 += '|' + ZDZMJ_13_ if ZDZMJ_13_ else '|' + '|'.join(
                        re.findall(f'宗地总面积:(?:[\s]*)([{self.reStr}]*)\s*',
                                   item))
                    # 土地坐落 / 宗地坐落
                    ZDZL_14 += '|' + '|'.join(
                        re.findall(f'宗地坐落:(?:[\s]*)([{self.reStr}]*)\s*出让年限',
                                   item))
                    # ZDZL_14 += '|' + ZDZL_14_ if ZDZL_14_ else '|'.join(re.findall(f'宗地坐落:(?:[\s]*)([{self.reStr}]*)\s*', item))

                    # 岀让年限
                    CRNX_16_ = '|'.join(
                        re.findall(f'出让年限:(?:[\s]*)([{self.reStr}]*)\s*容积率',
                                   item))
                    CRNX_16 += '|' + reFunction('^[|]*\d{1,3}年', CRNX_16_)
                    # CRNX_16 += '|' + CRNX_16_ if CRNX_16_ else '|'.join(re.findall(f'出让年限:(?:[\s]*)([{self.reStr}]*)\s*', item))
                    # 容积率
                    RJL_17 += '|' + '|'.join(
                        re.findall(
                            f'容积率:(?:[\s]*)([{self.reStr}]*)\s*建筑密度\(%\)',
                            item))
                    # RJL_17 += '|' + RJL_17_ if RJL_17_ else '|'.join(re.findall(f'容积率:(?:[\s]*)([{self.reStr}]*)\s*', item))
                    # 建筑密度( %) / 建筑密度
                    JZMD_18 += '|' + '|'.join(
                        re.findall(
                            f'建筑密度\(%\):(?:[\s]*)([{self.reStr}]*)\s*绿化率',
                            item))
                    # JZMD_18 += '|' + JZMD_18_ if JZMD_18_ else '|'.join(re.findall(f'建筑密度\(%\):(?:[\s]*)([{self.reStr}]*)\s*', item))
                    # 绿地率 / | 绿化率( %)
                    LDL_19 += '|' + '|'.join(
                        re.findall(
                            f'绿化率\(%\):(?:[\s]*)([{self.reStr}]*)\s*建筑限高',
                            item))
                    # LDL_19 += '|' + LDL_19_ if LDL_19_ else '|'.join(re.findall(f'绿化率\(%\):(?:[\s]*)([{self.reStr}]*)\s*', item))
                    # 建筑限高 / 建筑限高(米)
                    JZXG_20 += '|' + '|'.join(
                        re.findall(
                            f'建筑限高\(米\):(?:[\s]*)([{self.reStr}]*)\s*土地用途明细',
                            item))
                    # JZXG_20 += '|' + JZXG_20_ if JZXG_20_ else '|'.join(re.findall(f'建筑限高\(米\):(?:[\s]*)([{self.reStr}]*)\s*', item))
                    # 土地用途明细 / 土地用途
                    TDYT_21 += '|' + '|'.join(
                        re.findall(f'土地用途明细:(?:[\s]*)([{self.reStr}]*)\s*投资强度',
                                   item))
                    # TDYT_21 += '|' + TDYT_21_ if TDYT_21_ else '|'.join(re.findall(f'土地用途明细:(?:[\s]*)([{self.reStr}]*)\s*', item))
                    # 投资强度
                    TZQD_22 += '|' + '|'.join(
                        re.findall(f'投资强度:(?:[\s]*)([{self.reStr}]*)\s*保证金',
                                   item))
                    # TZQD_22 += '|' + TZQD_22_ if TZQD_22_ else '|'.join(re.findall(f'投资强度:(?:[\s]*)([{self.reStr}]*)\s*', item))
                    # 保证金(万元) / 保证金
                    BZJ_23 += '|' + '|'.join(
                        re.findall(f'保证金:(?:[\s]*)([{self.reStr}]*)\s*估价报告备案号',
                                   item))
                    # BZJ_23 += '|' + BZJ_23_ if BZJ_23_ else '|'.join(re.findall(f'保证金:(?:[\s]*)([{self.reStr}]*)\s*', item))
                    # 估价报告备案号
                    GJBGBAH_24_ = '|'.join(
                        re.findall(
                            f'估价报告备案号(?:[\s]*)([{self.reStr}]*)\s*现状土地条件',
                            item))
                    GJBGBAH_24__ = '|' + GJBGBAH_24_ if GJBGBAH_24_ else '|'.join(
                        re.findall(f'估价报告备案号(?:[\s]*)([{self.reStr}]*)\s*起始价',
                                   item))
                    GJBGBAH_24 += '|' + reFunction('^\w{10, 16}', GJBGBAH_24__)

                    # 起始价 / 起始价(万元)
                    QSJ_25 += '|' + '|'.join(
                        re.findall(f'起始价:(?:[\s]*)([{self.reStr}]*)\s*加价幅度',
                                   item))
                    # QSJ_25 += '|' + QSJ_25_ if QSJ_25_ else '|'.join(re.findall(f'起始价:(?:[\s]*)([{self.reStr}]*)\s*', item))
                    # 加价幅度
                    JJFD_26 += '|' + '|'.join(
                        re.findall(f'加价幅度:(?:[\s]*)([{self.reStr}]*)\s*挂牌开始时间',
                                   item))
                    # JJFD_26 += '|' + JJFD_26_ if JJFD_26_ else '|'.join(re.findall(f'加价幅度:(?:[\s]*)([{self.reStr}]*)\s', item))
                    # 挂牌开始时间
                    GPKSSJ_27 += '|' + '|'.join(
                        re.findall(
                            f'挂牌开始时间:(?:[\s]*)([{self.reStr}]*)\s*挂牌截止时间',
                            item))
                    # GPKSSJ_27 += '|' + GPKSSJ_27_ if GPKSSJ_27_ else '|'.join(re.findall(f'挂牌开始时间:(?:[\s]*)([{self.reStr}]*)\s*', item))
                    # 挂牌截止时间
                    GPJZSJ_28 += '|' + '|'.join(
                        re.findall(
                            f'挂牌截止时间:(?:[\s]*)([{self.reStr}]*)\s*(?:宗地编号|二)',
                            item))
                    # GPJZSJ_28 += '|' + GPJZSJ_28_ if GPJZSJ_28_ else '|'.join(re.findall(f'挂牌截止时间:(?:[\s]*)([{reStr}]*)(?:宗地编号|二|\s*)', item))
            else:
                soup = BeautifulSoup(response.body.decode('utf-8'))
                table = soup.find('table')
                if not table:
                    for item in [
                            '宗地编号' + _ for _ in re.findall(
                                '一([\s\S]*)二', items)[0].split('宗地编号')[1:]
                    ]:
                        # 联系电话
                        LXDH_37 = reFunction(
                            f'联系电话:\s*([{self.reStr}]*)\s*开户单位',
                            reFunction('八、[\s\S]*', items))
                        # 宗地编号 / 地块编号
                        ZDBH_12_ = '|'.join(
                            re.findall(
                                f'[宗地块]*(?:[\s]*)编号:(?:[\s]*)([{self.reStr}]*)宗地总面积',
                                item))
                        ZDBH_12__ = ZDBH_12_ if ZDBH_12_ else '|' + '|'.join(
                            re.findall(
                                f'[宗地块]*(?:[\s]*)编号:(?:[\s]*)([{self.reStr}]*)\s*',
                                item))
                        ZDBH_12 += ZDBH_12__
                        # 宗地总面积 / 挂牌面积(m2)
                        ZDZMJ_13_ = '|'.join(
                            re.findall(f'宗地总面积:(?:[\s]*)([{self.reStr}]*)宗地坐落',
                                       item))
                        ZDZMJ_13__ = ZDZMJ_13_ if ZDZMJ_13_ else '|' + '|'.join(
                            re.findall(f'宗地总面积:(?:[\s]*)([{self.reStr}]*)\s*',
                                       item))
                        ZDZMJ_13 += ZDZMJ_13__
                        # 土地坐落 / 宗地坐落
                        ZDZL_14 += '|' + '|'.join(
                            re.findall(
                                f'宗地坐落:(?:[\s]*)([{self.reStr}]*)\s*出让年限',
                                item))
                        # ZDZL_14 += '|' + ZDZL_14_ if ZDZL_14_ else '|'.join(re.findall(f'宗地坐落:(?:[\s]*)([{self.reStr}]*)\s*', item))
                        # 岀让年限
                        CRNX_16_ = '|'.join(
                            re.findall(
                                f'出让年限:(?:[\s]*)([{self.reStr}]*)\s*容积率',
                                item))
                        CRNX_16 += '|' + reFunction('^[|]*\d{1,3}年', CRNX_16_)
                        # 容积率
                        RJL_17 += '|' + '|'.join(
                            re.findall(
                                f'容积率:(?:[\s]*)([{self.reStr}]*)\s*建筑密度\(%\)',
                                item))
                        # RJL_17 += '|' + RJL_17_ if RJL_17_ else '|'.join(re.findall(f'容积率:(?:[\s]*)([{self.reStr}]*)\s*', item))
                        # 建筑密度( %) / 建筑密度
                        JZMD_18 += '|' + '|'.join(
                            re.findall(
                                f'建筑密度\(%\):(?:[\s]*)([{self.reStr}]*)\s*绿化率',
                                item))
                        # JZMD_18 += '|' + JZMD_18_ if JZMD_18_ else '|'.join(re.findall(f'建筑密度\(%\):(?:[\s]*)([{self.reStr}]*)\s*', item))
                        # 绿地率 / | 绿化率( %)
                        LDL_19 += '|' + '|'.join(
                            re.findall(
                                f'绿化率\(%\):(?:[\s]*)([{self.reStr}]*)\s*建筑限高',
                                item))
                        # LDL_19 += '|' + LDL_19_ if LDL_19_ else '|'.join(re.findall(f'绿化率\(%\):(?:[\s]*)([{self.reStr}]*)\s*', item))
                        # 建筑限高 / 建筑限高(米)
                        JZXG_20 += '|' + '|'.join(
                            re.findall(
                                f'建筑限高\(米\):(?:[\s]*)([{self.reStr}]*)\s*土地用途明细',
                                item))
                        # JZXG_20 += '|' + JZXG_20_ if JZXG_20_ else '|'.join(re.findall(f'建筑限高\(米\):(?:[\s]*)([{self.reStr}]*)\s*', item))
                        # 土地用途明细 / 土地用途
                        TDYT_21 += '|' + '|'.join(
                            re.findall(
                                f'土地用途明细:(?:[\s]*)([{self.reStr}]*)\s*投资强度',
                                item))
                        # TDYT_21 += '|' + TDYT_21_ if TDYT_21_ else '|'.join(re.findall(f'土地用途明细:(?:[\s]*)([{self.reStr}]*)\s*', item))
                        # 投资强度
                        TZQD_22 += '|' + '|'.join(
                            re.findall(
                                f'投资强度:(?:[\s]*)([{self.reStr}]*)\s*保证金',
                                item))
                        # TZQD_22 += '|' + TZQD_22_ if TZQD_22_ else '|'.join(re.findall(f'投资强度:(?:[\s]*)([{self.reStr}]*)\s*', item))
                        # 保证金(万元) / 保证金
                        BZJ_23 += '|' + '|'.join(
                            re.findall(
                                f'保证金:(?:[\s]*)([{self.reStr}]*)\s*估价报告备案号',
                                item))
                        # BZJ_23 += '|' + BZJ_23_ if BZJ_23_ else '|'.join(re.findall(f'保证金:(?:[\s]*)([{self.reStr}]*)\s*', item))
                        # 估价报告备案号  现状土地条件
                        GJBGBAH_24_ = '|'.join(
                            re.findall(
                                f'估价报告备案号(?:[\s]*)([{self.reStr}]*)\s*现状土地条件',
                                item))
                        GJBGBAH_24__ = '|' + GJBGBAH_24_ if GJBGBAH_24_ else '|'.join(
                            re.findall(
                                f'估价报告备案号(?:[\s]*)([{self.reStr}]*)\s*起始价',
                                item))
                        GJBGBAH_24 += '|' + reFunction('^\w{10, 16}',
                                                       GJBGBAH_24__)

                        # 起始价 / 起始价(万元)
                        QSJ_25 += '|' + '|'.join(
                            re.findall(
                                f'起始价:(?:[\s]*)([{self.reStr}]*)\s*加价幅度',
                                item))
                        # QSJ_25 += '|' + QSJ_25_ if QSJ_25_ else '|'.join(re.findall(f'起始价:(?:[\s]*)([{self.reStr}]*)\s*', item))
                        # 加价幅度
                        JJFD_26 += '|' + '|'.join(
                            re.findall(
                                f'加价幅度:(?:[\s]*)([{self.reStr}]*)\s*挂牌开始时间',
                                item))
                        # JJFD_26 += '|' + JJFD_26_ if JJFD_26_ else '|'.join(re.findall(f'加价幅度:(?:[\s]*)([{self.reStr}]*)\s', item))
                        # 挂牌开始时间
                        GPKSSJ_27 += '|' + '|'.join(
                            re.findall(
                                f'挂牌开始时间:(?:[\s]*)([{self.reStr}]*)\s*挂牌截止时间',
                                item))
                        # GPKSSJ_27 += '|' + GPKSSJ_27_ if GPKSSJ_27_ else '|'.join(re.findall(f'挂牌开始时间:(?:[\s]*)([{self.reStr}]*)\s*', item))
                        # 挂牌截止时间
                        GPJZSJ_28 += '|' + '|'.join(
                            re.findall(
                                f'挂牌截止时间:(?:[\s]*)([{self.reStr}]*)\s*(?:宗地编号|二)',
                                item))
                        # GPJZSJ_28 += '|' + GPJZSJ_28_ if GPJZSJ_28_ else '|'.join(re.findall(f'挂牌截止时间:(?:[\s]*)([{reStr}]*)(?:宗地编号|二|\s*)', item))
                else:
                    # 联系电话
                    LXDH_37 = reFunction(f'联系电话:\s*([{self.reStr}]*)\s',
                                         reFunction('八|七、[\s\S]*', items))
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.tableTrTdRegulation(table)
                    # 宗地编号 / 地块编号
                    ZDBH_12 = tdData.get('地块编号')
                    # 宗地总面积 / 挂牌面积(m2)
                    ZDZMJ_13 = tdData.get(r'挂牌面积(m2)')
                    # 土地坐落 / 宗地坐落
                    ZDZL_14 = tdData.get('土地坐落')
                    # 使用年限
                    SYNX_15 = tdData.get('使用年限')
                    # 起始价 / 起始价(万元)
                    QSJ_25 = tdData.get('起始价(万元)')
                    # 土地用途明细 / 土地用途
                    TDYT_21 = tdData.get('土地用途')
                    # 保证金(万元) / 保证金
                    BZJ_23 = tdData.get('保证金(万元)')
                    ZYGHZB = tdData.get('主要规划指标')
                    # 容积率
                    RJL_17 = reFunction(
                        '容积率[:]*\s*([()\w\.:: \(\)〔〕≤≥\-\/\%,、\.﹪]*)[;。,]?',
                        ZYGHZB)
                    # 建筑密度( %) / 建筑密度
                    JZMD_18 = reFunction(
                        '建筑密度[:]*\s*([()\w\.:: \(\)〔〕≤≥\-\/\%,、\.﹪]*)容积率',
                        ZYGHZB)
                    # 绿地率 / | 绿化率( %)
                    LDL_19 = reFunction(
                        '绿地率[:]*\s*([()\w\.:: \(\)〔〕≤≥\-\/\%,、\.﹪]*)[;。,]?',
                        ZYGHZB)
                    # 建筑限高 / 建筑限高(米)
                    JZXG_20 = reFunction(
                        '建筑限高[:]*\s*([()\w\.:: \(\)〔〕≤≥\-\/\%,、\.﹪]*)[;。,]?',
                        ZYGHZB)

            # TODO
            # 获取出让文件时间
            HQCRWJSJ_29 = reFunction(f'申请人可于(?:[\s]*)([{self.reStr}]*)到',
                                     reFunction('四、[\s\S]*五、', items))
            # 获取出让文件地点
            HQCRWJDD_30 = reFunction(
                f'申请人可于(?:[\s]*)(?:[{self.reStr}]*)到\s*([{self.reStr}]*)获取 挂牌',
                reFunction('四、[\s\S]*五、', items))
            # 报名时间
            BMSJ_31 = reFunction(f'申请人可于(?:[\s]*)([{self.reStr}]*)到',
                                 reFunction('五、[\s\S]*六、', items))
            # 报名地点
            BMDD_32 = reFunction(
                f'申请人可于(?:[\s]*)(?:[{self.reStr}]*)到\s*([{self.reStr}]*)向我局提交书面申请',
                reFunction('五、[\s\S]*六、', items))
            # 保证金截止时间
            BZJJZSJ_33 = reFunction(f'截止时间为(?:[\s]*)([{self.reStr}]*)\s*。经审',
                                    reFunction('五、[\s\S]*六、', items))
            # 确认竞买资格时间
            QRJMZGSJ_34 = reFunction(f'我局将在\s*([{self.reStr}]*)\s*前确认其竞买资格',
                                     reFunction('五、[\s\S]*六、', items))

            # TODO 联系地址
            LXDZ_35 = reFunction(f'联系地址:\s*([{self.reStr}]*)\s*联 系',
                                 reFunction('八、[\s\S]*', items))
            # 联系人
            LXR_36 = reFunction(f'联 系\s*人:\s*([{self.reStr}]*)\s*联系电话',
                                reFunction('八、[\s\S]*', items))
            # 开户单位
            KHDW_38 = reFunction(f'开户单位:\s*([{self.reStr}]*)\s*开户银行',
                                 reFunction('八、[\s\S]*', items))
            # 开户银行
            KHYH_39 = reFunction(f'开户银行:\s*([{self.reStr}]*)\s*银行帐号',
                                 reFunction('八、[\s\S]*', items))
            # 银行帐号
            YHZH_40 = reFunction(
                '^\d{17}',
                reFunction(f'银行帐号:\s*([{self.reStr}]*)\s*',
                           reFunction('八、[\s\S]*', items)))

            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # 爬取地址url
            url = response.url
            # 唯一标识
            md5Mark = encrypt_md5(url)

            # 存储数据
            csvFile = [
                WJBT_1,
                FBSJ_2,
                WZLY_3,
                SYH_4,
                XXFL_5,
                FBJG_6,
                FBRQ_7,
                WH_8,
                SFYX_9,
                XXMC_10,
                ZWBT_11,
                ZDBH_12,
                ZDZMJ_13,
                ZDZL_14,
                SYNX_15,
                CRNX_16,
                RJL_17,
                JZMD_18,
                LDL_19,
                JZXG_20,
                TDYT_21,
                TZQD_22,
                BZJ_23,
                GJBGBAH_24,
                QSJ_25,
                JJFD_26,
                GPKSSJ_27,
                GPJZSJ_28,
                HQCRWJSJ_29,
                HQCRWJDD_30,
                BMSJ_31,
                BMDD_32,
                BZJJZSJ_33,
                QRJMZGSJ_34,
                LXDZ_35,
                LXR_36,
                LXDH_37,
                KHDW_38,
                KHYH_39,
                YHZH_40,
                crawlingTime,
                url,
                md5Mark,
            ]
            results = ''
            for _ in csvFile:
                try:
                    if _ and _ != '|' * len(_):
                        results += _.replace(',', ' ').replace(
                            '\n', '').replace('\r', '') + ','
                    else:
                        results += ','
                except Exception as e:
                    results += ','
                    self.log(
                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            with open(self.pathDetail, 'a+') as fp:
                fp.write(results)
                fp.write('\n')
            self.log(f'数据获取成功', level=logging.INFO)
            yield
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
コード例 #7
0
    def parse_detail(self, response):
        """Parse one land-supply detail page and append it to the detail CSV.

        The response body is decoded as GBK and flattened to plain text; each
        field is then cut out of that text using the label that precedes it
        and the label that follows it.  Instalment rows are read by XPath from
        elements whose ids look like ``r-<n>-0``.  The record is written to
        ``self.fileDetail`` as one comma-separated line.

        Args:
            response: Detail-page response.  ``response.meta`` is read for
                ``signTime``, ``administration``, ``parcelLocation`` and
                ``totalArea``.

        Yields:
            None, once, after the line has been written.  Any exception is
            logged at ERROR level and nothing is yielded.
        """
        try:
            data = Selector(text=response.body.decode('gbk'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')

            def between(label, stop):
                # Apply the shared "<label>:<value><stop>" pattern to the page
                # text and strip the result.
                # NOTE(review): labels such as '面积(公顷)' contain unescaped
                # parentheses, which the regex engine treats as a group and
                # not as literal brackets - confirm against the page text.
                return reFunction(
                    label + r':(?:\s*)([\s\S]*)' + stop, items).strip()

            def first_text(xpath):
                # First text node selected by ``xpath`` (None when absent).
                return data.xpath(xpath).extract_first()

            # Fields handed over from the list page.
            signTime = response.meta.get('signTime')
            administration = response.meta.get('administration')
            parcelLocation = response.meta.get('parcelLocation')
            totalArea = response.meta.get('totalArea')

            # Fields of the detail page, in the order the labels are chained.
            projectName = between('项目名称', '项目位置')
            projectLocation = between('项目位置', '面积(公顷)')
            area = between('面积(公顷)', '土地来源')
            landSource = between('土地来源', '土地用途')
            landPurpose = between('土地用途', '供地方式')
            supplyType = between('供地方式', '土地使用年限')
            landUsegeTerm = between('土地使用年限', '行业分类')
            classification = between('行业分类', '土地级别')
            # Land level.  The start label used to be '行业分类' (copied from
            # the field above), so the capture began at the classification
            # label instead of at the land-level label.
            landLevel = between('土地级别', '成交价格')
            transferPrice = between('成交价格(万元)', '分期支付约定')

            # Instalment agreement: raw text first, then its individual cells.
            stagesData = between('分期支付约定', '土地使用权人')
            paymentDate = '|'.join(
                strfTime(day)
                for day in re.findall(r'\d{4}年\d{2}月\d{2}日', stagesData))
            issue = ''
            paymentAmount = ''
            remark = ''
            # One table row is probed per '年' found in the instalment text.
            # Original author's note: the row ids always start at 9, and
            # multiple rows are matched one by one via XPath.
            for offset in range(len(re.findall('年', stagesData))):
                row = f'//*[@id="r-{offset + 9}-0"]'
                period = first_text(f'{row}/td[1]/text()')
                amount = first_text(f'{row}/td[3]/text()')
                note = first_text(f'{row}/td[4]/text()')
                issue += (period + '|') if period else ' '
                paymentAmount += (amount + '|') if amount else ' '
                remark += (note + '|') if note else ' '

            landHolder = between('土地使用权人', '约定容积率')
            # The page lists the lower limit ('下限') before the upper limit
            # ('上限').  The locals are named after the label they read; the
            # CSV keeps its historical column order (lower, then upper).
            plotRatioCeiling = between('上限', '约定交地时间')
            plotRatioFloor = between('下限', '上限')
            agreedDeliveryTime = strfTime(between('约定交地时间', '约定开工时间'))
            agreedStartTime = strfTime(between('约定开工时间', '约定竣工时间'))
            agreedCompletionTime = strfTime(
                between('约定竣工时间', '实际开工时间'))
            actualStartTime = strfTime(between('实际开工时间', '实际竣工时间'))
            actualCompletionTime = strfTime(between('实际竣工时间', '批准单位'))
            approvedUnit = between('批准单位', '合同签订日期')
            contractTime = strfTime(between('合同签订日期', ''))

            # Crawl timestamp, source url and the record's unique mark.
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            url = response.url
            md5Mark = encrypt_md5(landSource + projectLocation + projectName +
                                  url)

            csvFile = [
                administration, parcelLocation, totalArea, landPurpose,
                signTime, projectName, projectLocation, area, landSource,
                supplyType, landUsegeTerm, classification, landLevel,
                transferPrice, issue, paymentDate, paymentAmount, remark,
                landHolder, plotRatioFloor, plotRatioCeiling,
                agreedDeliveryTime, agreedStartTime, agreedCompletionTime,
                actualStartTime, actualCompletionTime, approvedUnit,
                contractTime, crawlingTime, url, md5Mark,
                '',  # keeps the trailing comma every existing line ends with
            ]
            # Missing values (None or '') become empty columns; previously a
            # None reached str.join and the whole record was lost.
            self.fileDetail.write(','.join(
                value.replace(',', ' ').replace('\n', '').replace('\r', '')
                if value else '' for value in csvFile))
            self.fileDetail.write('\n')
            yield
        except Exception as e:
            self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
コード例 #8
0
    def parse_detail(self, response):
        """Parse a detail page and append its records to the detail CSV.

        Pages whose flattened text does not contain ``宗地编号`` are read from
        the first HTML table, one record per table row.  All other pages give
        a single record extracted with regular expressions.  Every record then
        goes through the same duplicate check and is appended to
        ``self.pathDetail`` as one comma-separated line.

        A failure while reading the table is logged (it used to be swallowed
        silently); records collected before the failure are still written.

        Args:
            response: Detail-page response; ``response.meta['title']``
                supplies the title column.

        Yields:
            None, once per record written.
        """
        try:
            page = response.body.decode('utf-8')
            data = Selector(text=page)
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')

            # Fields shared by every record of the page.
            BT_47 = response.meta.get('title')
            LY = data.xpath(
                '//div[@class="content-small-title"]/text()').extract_first()
            # Source and publication time, both cut out of the sub-title.
            LY_55 = reFunction(rf'来源:\s*([{self.reStr}]*)\s', LY)
            LYSJ_48 = reFunction(rf'时间:\s*([{self.reStr}]*)\s', LY)

            htmlTable = htmlTableTransformer()
            # Each entry: (serial no., approval no., land user, supply mode,
            # approval time, location, usage, area, plot ratio, plan no.).
            records = []
            if '宗地编号' not in items:
                try:
                    soup = BeautifulSoup(page)
                    table = soup.find_all('table')[0]
                    first_tr = table.tbody.find_all('tr')[0]
                    # Drop a leading row that is not the header row.
                    if not first_tr.find_all(text=re.compile("序号|受让人")):
                        first_tr.extract()
                    tdsData = htmlTable.tableTrTdRegulationToList(table)

                    def cell(index, *headers):
                        # Value at ``index`` in the first non-empty column
                        # among ``headers``; '' when no such column exists.
                        for header in headers:
                            column = tdsData.get(header)
                            if column:
                                return column[index]
                        return ''

                    for index in range(len(list(tdsData.values())[0])):
                        # An empty recipient cell falls back to the '单位'
                        # column; a missing '单位' column now gives '' where
                        # it previously raised and aborted all later rows.
                        recipient = (cell(index, '用地单位(受让人)', '受让人')
                                     or cell(index, '单位'))
                        records.append((
                            cell(index, '序号'),
                            cell(index, '批准文号'),
                            recipient,
                            cell(index, '供地方式', '供应方式'),
                            cell(index, '批准时间', '签订日期'),
                            cell(index, '土地位置', '土地座落', '宗地位置',
                                 '位置'),
                            cell(index, '用途', '土地用途', '用途明细'),
                            cell(index, '面积(平方米)', '划拨面积',
                                 '出让/划拨面积', '面积(公顷)'),
                            cell(index, '容积率'),
                            cell(index, '供应方案文号'),
                        ))
                except Exception as e:
                    self.log(
                        f'详情页表格解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            else:
                # No usable table: take the single record from the text.
                records.append((
                    reFunction(f'宗地编号([{self.reStr}]*)地块位置', items),
                    '',
                    reFunction(f'受让单位([{self.reStr}]*)备注:', items),
                    '',
                    '',
                    reFunction(f'地块位置([{self.reStr}]*)土地用途', items),
                    reFunction(f'土地用途([{self.reStr}]*)土地面积', items),
                    reFunction(rf'土地面积\(公顷\)([{self.reStr}]*)项目名称',
                               items),
                    '',
                    '',
                ))

            for (XH_49, PZWH_50, YDDW_51, GDFS_52, PZSJ_53, WZ_54, YT_55,
                 MJ_56, RJL_57, GYWAFA_58) in records:
                crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
                url = response.url
                md5Mark = encrypt_md5(url + BT_47 + LYSJ_48)

                # Count already-seen records when de-duplication is enabled.
                if DUPLICATE_SWITCH and self.redisClient.isExist(md5Mark):
                    self.duplicateUrl += 1

                if self.duplicateUrl >= 50:
                    # Too many duplicates: ask the engine to stop the spider.
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)
                    continue

                csvFile = [
                    BT_47,
                    LY_55,
                    LYSJ_48,
                    XH_49,
                    PZWH_50,
                    YDDW_51,
                    GDFS_52,
                    PZSJ_53,
                    WZ_54,
                    YT_55,
                    MJ_56,
                    RJL_57,
                    GYWAFA_58,
                    crawlingTime,
                    url,
                    md5Mark,
                ]
                results = ''
                # The loop variable stays ``_`` and the loop stays in this
                # frame because getVariableName receives it; how that helper
                # resolves names is not visible here.
                for _ in csvFile:
                    try:
                        # Empty values and values made only of '|' separators
                        # become empty columns.
                        if _ and _ != '|' * len(_):
                            results += _.replace(',', ' ').replace(
                                '\n', '').replace('\r', '').replace(
                                    r'\xa0', '').replace('\xa0', '') + ','
                        else:
                            results += ','
                    except Exception as e:
                        results += ','
                        self.log(
                            f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                            level=logging.ERROR)
                with open(self.pathDetail, 'a+') as fp:
                    fp.write(results)
                    fp.write('\n')
                self.log('数据获取成功', level=logging.INFO)
                yield

        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
コード例 #9
0
    def parse_detail(self, response):
        # TODO 主动关闭爬虫问题
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            GGLX_1 = ''
            WJBT_2 = ''
            SJ_3 = ''
            LY_4 = ''
            ZWBT_5 = ''
            ZDBH_6 = ''
            TDWZ_7 = ''
            CRMJ_8 = ''
            LHYD_9 = ''
            DLYD_10 = ''
            TDYT_11 = ''
            CRNX_12 = ''
            RJL_13 = ''
            JZMD_14 = ''
            LDL_15 = ''
            JZKJ_16 = ''
            QSJ_17 = ''
            BZJ_18 = ''
            JJFD_19 = ''
            BMRQ_20 = ''
            GPRQ_21 = ''
            GPJZSJ_22 = ''
            BZJDZSJ_23 = ''
            LXDZ_24 = ''
            LXR_25 = ''
            LXDH_26 = ''
            # TODO 共有字段  reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # 公告类型
            GGLX_1 = '出让公告'
            # 文件标题
            WJBT_2 = response.meta.get('title')
            # 时间
            SJ_3 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()'
            ).extract_first()
            # 来源
            LY_4 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()'
            ).extract_first()
            # 正文标题
            ZWBT_5 = data.xpath(
                '//div[@class="ztzx_frame_content"]/div[1]/text()'
            ).extract_first()
            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # 爬取地址url
            url = response.url
            # 唯一标识
            md5Mark = encrypt_md5(url + WJBT_2 + SJ_3)
            # 报名时间起止日期
            BMRQ_20 = reFunction(f'报名申请时间:\s*([\w]*);', items) if reFunction(
                f'报名申请时间:\s*([\w]*);', items
            ) else reFunction(f'申请人可于(\w*),向我局提交书面申请', items) if reFunction(
                f'申请人可于(\w*),向我局提交书面申请', items
            ) else reFunction(f'申请时间为:(\w*)', items) if reFunction(
                f'申请时间为:(\w*)', items) else reFunction(f'申请人可于(\w*)到', items)
            GPTime = reFunction(f'网上挂牌(报价)时间:\s*([\w]*)', items) if reFunction(
                f'网上挂牌(报价)时间:\s*([\w]*)', items) else reFunction(
                    f'挂牌时间为:\s*([\w]*)', items)
            try:
                if GPTime:
                    # 挂牌开始时间
                    GPRQ_21 = GPTime.split('至')[0]
                    # 挂牌截止时间
                    GPJZSJ_22 = GPTime.split('至')[1]
                else:
                    GPRQ_21 = reFunction(f'挂牌时间为:\s*([\s\S]*)',
                                         reFunction('六、([\s\S]*)七、', items))
                    GPJZSJ_22 = reFunction(f'挂牌时间为:\s*([\s\S]*)',
                                           reFunction('六、([\s\S]*)七、', items))
            except Exception as e:
                self.log(f'详情页数据挂牌时间解析失败, 请求:{response.url}, 信息: {e}',
                         level=logging.DEBUG)
                GPRQ_21 = ''
                GPJZSJ_22 = ''
            # 保证金到账截止时间
            BZJDZSJ_23 = reFunction(
                f'保证金到账截止时间为:\s*([\w]*)', items) if reFunction(
                    f'保证金到账截止时间为:\s*([\w]*)', items) else reFunction(
                        f'保证金交纳截止时间:\s*([\w]*)', items) if reFunction(
                            f'保证金交纳截止时间:\s*([\w]*)', items) else reFunction(
                                f'保证金的截止时间为\s*([\w]*)', items)
            # 联系地址
            LXDZ_24 = reFunction(
                '联系地址:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                items) if reFunction(
                    f'联系地址:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                    items) else reFunction(
                        '单位地址:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)', items)
            # 联系人
            LXR_25 = reFunction(
                f'联\s系\s人:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', items)
            # 联系电话
            LXDH_26 = reFunction(
                f'联系电话:\s*([()\w\.:: \(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', items)
            if '挂牌出让宗地的基本情况和规划指标等要求' not in items and '宗地编号' not in items:
                # 处理 table 情况
                soup = BeautifulSoup(response.body.decode('utf-8'))
                table = soup.find('table')
                try:
                    tdReplace = table.tbody.find_all('tr')[0].find(
                        'td',
                        colspan='4') if table.tbody.find_all('tr')[0].find(
                            'td', colspan='4') else table.tbody.find_all(
                                'tr')[0].find('td', colspan="2")
                    number = table.tbody.find_all('tr')[0].index(tdReplace)
                    tdList = table.tbody.find_all('tr')[1].find_all('td')
                    for _ in range(1, len(tdList) + 1):
                        table.tbody.find_all('tr')[0].insert(
                            number + _, tdList[_ - 1])
                    tdReplace.extract()
                    table.tbody.find_all('tr')[1].extract()
                except:
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find('table')
                    tdReplace = table.thead.find_all('tr')[0].find(
                        'td',
                        colspan='4') if table.thead.find_all('tr')[0].find(
                            'td', colspan='4') else table.thead.find_all(
                                'tr')[0].find('td', colspan="2")
                    number = table.thead.find_all('tr')[0].index(tdReplace)
                    tdList = table.thead.find_all('tr')[1].find_all('td')
                    for _ in range(1, len(tdList) + 1):
                        table.thead.find_all('tr')[0].insert(
                            number + _, tdList[_ - 1])
                    tdReplace.extract()
                    table.thead.find_all('tr')[1].extract()
                    table.tbody.insert(
                        0,
                        table.thead.find_all('tr')[0])  # 插入 thead 的内容
                    table.thead.extract()
                htmlTable = htmlTableTransformer()
                try:
                    tdData = htmlTable.tableTrTdRegulationToList(table)
                    if not tdData and 'thead' in items:  # 如果没有拿到 则可能存在 thead
                        soup = BeautifulSoup(response.body.decode('utf-8'))
                        table = soup.find('table')
                        tdReplace = table.thead.find_all('tr')[0].find(
                            'td',
                            colspan='4') if table.thead.find_all('tr')[0].find(
                                'td', colspan='4') else table.thead.find_all(
                                    'tr')[0].find('td', colspan="2")
                        number = table.thead.find_all('tr')[0].index(tdReplace)
                        tdList = table.thead.find_all('tr')[1].find_all('td')
                        for _ in range(1, len(tdList) + 1):
                            table.thead.find_all('tr')[0].insert(
                                number + _, tdList[_ - 1])
                        tdReplace.extract()
                        table.thead.find_all('tr')[1].extract()
                        table.tbody.insert(
                            0,
                            table.thead.find_all('tr')[0])  # 插入 thead 的内容
                        table.thead.extract()
                        htmlTable = htmlTableTransformer()
                except:
                    tdData = {}
                for _ in range(len(list(tdData.values())[0])):
                    # 宗地编号
                    ZDBH_6 = tdData.get('编号')[_] if tdData.get('编号') else ''
                    # 土地位置
                    TDWZ_7 = tdData.get('土地位置')[_] if tdData.get(
                        '土地位置') else ''
                    # 出让面积(m2)
                    CRMJ_8_0 = tdData.get('土地面积')
                    CRMJ_8_1 = tdData.get('土地面积(平方米)')
                    CRMJ_8_ = list(filter(None, [CRMJ_8_0, CRMJ_8_1]))
                    CRMJ_8 = CRMJ_8_[0][_] if CRMJ_8_ else ''
                    # 土地用途
                    TDYT_11 = tdData.get('土地用途')[_] if tdData.get(
                        '土地用途') else ''
                    # 岀让年限
                    CRNX_12 = tdData.get('出让年限(年)')[_] if tdData.get(
                        '出让年限(年)') else ''
                    # 容积率
                    RJL_13 = tdData.get('容积率')[_] if tdData.get(
                        '容积率') else tdData.get('容 积 率')[_] if tdData.get(
                            '容 积 率') else ''
                    # 建筑密度
                    # JZMD_14
                    # 绿地率
                    LDL_15 = tdData.get('绿化率')[_] if tdData.get('绿化率') else ''
                    # 建筑空间
                    JZKJ_16 = tdData.get('控制高度(m)')[_] if tdData.get(
                        '控制高度(m)') else tdData.get('建筑限高(m)')[_] if tdData.get(
                            '建筑限高(m)') else ''
                    # 起始价(万元)
                    QSJ_17 = tdData.get('挂牌起始价(万元)')[_] if tdData.get(
                        '挂牌起始价(万元)') else ''
                    # 保证金(万元)
                    BZJ_18 = tdData.get('竞买保证金(万元)')[_] if tdData.get(
                        '竞买保证金(万元)') else tdData.get(
                            '竞买保证金(万元)')[_] if tdData.get('竞买保证金(万元)') else ''
                    # 竞价幅度(万元)
                    JJFD_19 = tdData.get('増价幅度(万元/次)')[_] if tdData.get(
                        '増价幅度(万元/次)') else ''
                    # 是否需要判断重复 请求
                    if DUPLICATE_SWITCH:
                        if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        if ZDBH_6 and TDYT_11:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                GGLX_1,
                                WJBT_2,
                                SJ_3,
                                LY_4,
                                ZWBT_5,
                                ZDBH_6,
                                TDWZ_7,
                                CRMJ_8,
                                LHYD_9,
                                DLYD_10,
                                TDYT_11,
                                CRNX_12,
                                RJL_13,
                                JZMD_14,
                                LDL_15,
                                JZKJ_16,
                                QSJ_17,
                                BZJ_18,
                                JJFD_19,
                                BMRQ_20,
                                GPRQ_21,
                                GPJZSJ_22,
                                BZJDZSJ_23,
                                LXDZ_24,
                                LXR_25,
                                LXDH_26,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' %
                            response.url)
            # TODO 判断
            elif '挂牌出让宗地的基本情况和规划指标等要求' in items:
                for item in re.split(
                        '\d、',
                        reFunction('一、挂牌出让宗地的基本情况和规划指标等要求:([\s\S]*)二、',
                                   items)):
                    # TODO
                    if not item.strip():
                        continue
                    # 宗地编号
                    ZDBH_6 = reFunction(
                        f'^([()\w\.:: %\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)宗地位于', item)
                    # 土地位置
                    TDWZ_7 = reFunction(
                        f'宗地位于([()\w\.:: \(\)〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 出让面积(m2)
                    CRMJ_8 = reFunction(
                        f'土地出让面积([()\w\.:: %\(\)〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 土地用途
                    TDYT_11 = reFunction(
                        f'宗地规划用途为([()\w\.:: %\(\)〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 岀让年限
                    CRNX_12 = reFunction(
                        f'宗地土地出让年期([()\w\.:: —\(\),〔〕%㎡≤≥《》\-\/\%;、\.﹪]*)。',
                        item)
                    # 容积率
                    RJL_13 = reFunction(
                        f'容积率([()\w\.:: \(\)%〔〕㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 建筑密度
                    JZMD_14 = reFunction(
                        f'建筑密度([()\w\.:: \(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 绿地率
                    LDL_15 = reFunction(
                        f'绿地率([()\w\.:: \(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 建筑空间
                    JZKJ_16 = reFunction(
                        f'建筑空间([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 起始价(万元)
                    QSJ_17 = reFunction(
                        f'本宗地起始价([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 保证金(万元)
                    BZJ_18 = reFunction(
                        f'竞买保证金([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*)', item)
                    # 竞价幅度(万元)
                    JJFD_19 = reFunction(
                        f'增价幅度([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*)', item)
                    # 是否需要判断重复 请求
                    if DUPLICATE_SWITCH:
                        if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        if ZDBH_6 and TDYT_11:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                GGLX_1,
                                WJBT_2,
                                SJ_3,
                                LY_4,
                                ZWBT_5,
                                ZDBH_6,
                                TDWZ_7,
                                CRMJ_8,
                                LHYD_9,
                                DLYD_10,
                                TDYT_11,
                                CRNX_12,
                                RJL_13,
                                JZMD_14,
                                LDL_15,
                                JZKJ_16,
                                QSJ_17,
                                BZJ_18,
                                JJFD_19,
                                BMRQ_20,
                                GPRQ_21,
                                GPJZSJ_22,
                                BZJDZSJ_23,
                                LXDZ_24,
                                LXR_25,
                                LXDH_26,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' %
                            response.url)
            elif '挂牌出让地块基本情况' in items and '宗地编号' in items:
                for item in [
                        '宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items)
                    [0].split('宗地编号')[1:]
                ]:
                    # 宗地编号
                    ZDBH_6 = reFunction(
                        f'宗地编号为([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%;、\.﹪]*),', item)
                    # 土地位置
                    TDWZ_7 = reFunction(
                        f'该地块([()\w\.:: —\(\)〔〕%㎡≤≥《》,\-\/\%;、\.﹪]*)。出让面积',
                        item)
                    # 出让面积(m2)
                    CRMJ_8 = reFunction(
                        f'出让面积:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 绿化用地
                    LHYD_9 = reFunction(
                        f'绿化用地:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 道路用地
                    DLYD_10 = reFunction(
                        f'道路用地:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 土地用途
                    TDYT_11 = reFunction(
                        f'用途:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 岀让年限
                    CRNX_12 = reFunction(
                        f'出让年限:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 容积率
                    RJL_13 = reFunction(
                        f'容积率:*([()\w\.:: ,—\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 建筑密度
                    JZMD_14 = reFunction(
                        f'建筑密度:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item)
                    # 绿地率
                    LDL_15 = reFunction(
                        f'绿地率:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);', item
                    ) if reFunction(
                        f'绿地率:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);',
                        item) else reFunction(
                            f'绿地率(%)([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*);',
                            item)
                    # 起始价(万元)
                    QSJ_17 = reFunction(
                        f'起始价为:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*),', item)
                    # 保证金(万元)
                    BZJ_18 = reFunction(
                        f'竞买保证金为:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*),',
                        item)
                    # 竞价幅度(万元)
                    JJFD_19 = reFunction(
                        f'竞价幅度为:*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*)。', item)
                    # 是否需要判断重复 请求
                    if DUPLICATE_SWITCH:
                        if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        if ZDBH_6 and TDYT_11:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                GGLX_1,
                                WJBT_2,
                                SJ_3,
                                LY_4,
                                ZWBT_5,
                                ZDBH_6,
                                TDWZ_7,
                                CRMJ_8,
                                LHYD_9,
                                DLYD_10,
                                TDYT_11,
                                CRNX_12,
                                RJL_13,
                                JZMD_14,
                                LDL_15,
                                JZKJ_16,
                                QSJ_17,
                                BZJ_18,
                                JJFD_19,
                                BMRQ_20,
                                GPRQ_21,
                                GPJZSJ_22,
                                BZJDZSJ_23,
                                LXDZ_24,
                                LXR_25,
                                LXDH_26,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' %
                            response.url)
            else:
                if '宗地编号' in items and '地块基本情况' not in items:
                    for item in [
                            '宗地编号' + _ for _ in re.findall(
                                '一([\s\S]*)二、', items)[0].split('宗地编号')[1:]
                    ]:
                        # 宗地编号
                        ZDBH_6 = reFunction(
                            f'宗地编号:*\s*([()\w\.:: —\(\)〔〕%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 土地位置
                        TDWZ_7 = reFunction(
                            f'宗地坐落:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 出让面积(m2)
                        CRMJ_8 = reFunction(
                            f'宗地\s*总*面积:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 土地用途
                        TDYT_11 = reFunction(
                            f'土地用途[明细]*:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 岀让年限
                        CRNX_12 = reFunction(
                            f'出让年限:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 容积率
                        RJL_13 = reFunction(
                            f'容积率:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 建筑密度
                        JZMD_14 = reFunction(
                            f'建筑密度\(%\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 绿地率
                        LDL_15 = reFunction(
                            f'绿[地化]率\(%\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item
                        ) if reFunction(
                            f'绿[地化]率\(%\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item
                        ) else reFunction(
                            f'绿地率(%)\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 建筑空间
                        JZKJ_16 = reFunction(
                            f'建筑限高\(米\):*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 起始价(万元)
                        QSJ_17 = reFunction(
                            f'起始价:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 保证金(万元)
                        BZJ_18 = reFunction(
                            f'保证金:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 竞价幅度(万元)
                        JJFD_19 = reFunction(
                            f'加价幅度:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 挂牌开始时间
                        GPRQ_21 = reFunction(
                            f'挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 挂牌截止时间
                        GPJZSJ_22 = reFunction(
                            f'挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕、,,;,、%㎡≤≥《》\-\/\%、\.﹪]*)\s',
                            item)
                        # 联系地址
                        LXDZ_24 = reFunction(
                            f'联系地址:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                            items).split('联')[0] if reFunction(
                                f'联系地址:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                                items) else ''
                        # 联系人
                        LXR_25 = reFunction(
                            f'联\s系\s人:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                            items
                        ).split('联')[0] if reFunction(
                            f'联\s系\s人:\s*([()\w\.\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                            items) else ''
                        # 联系电话
                        LXDH_26 = reFunction(
                            f'联系电话:\s*([()\d\.:: \(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)',
                            items)
                        # 是否需要判断重复 请求
                        if DUPLICATE_SWITCH:
                            if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                self.duplicateUrl += 1

                        if self.duplicateUrl < 50:
                            if ZDBH_6 and TDYT_11:
                                # 重复效验通过, 存储数据
                                csvFile = [
                                    GGLX_1,
                                    WJBT_2,
                                    SJ_3,
                                    LY_4,
                                    ZWBT_5,
                                    ZDBH_6,
                                    TDWZ_7,
                                    CRMJ_8,
                                    LHYD_9,
                                    DLYD_10,
                                    TDYT_11,
                                    CRNX_12,
                                    RJL_13,
                                    JZMD_14,
                                    LDL_15,
                                    JZKJ_16,
                                    QSJ_17,
                                    BZJ_18,
                                    JJFD_19,
                                    BMRQ_20,
                                    GPRQ_21,
                                    GPJZSJ_22,
                                    BZJDZSJ_23,
                                    LXDZ_24,
                                    LXR_25,
                                    LXDH_26,
                                    crawlingTime,
                                    url,
                                    md5Mark,
                                ]
                                results = ''
                                for _ in csvFile:
                                    try:
                                        if _ and _ != '|' * len(_):
                                            results += _.replace(
                                                ',',
                                                ' ').replace('\n', '').replace(
                                                    '\r', '').replace(
                                                        r'\xa0', '').replace(
                                                            '\xa0', '') + ','
                                        else:
                                            results += ','
                                    except Exception as e:
                                        results += ','
                                        self.log(
                                            f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                            level=logging.ERROR)
                                with open(self.pathDetail, 'a+') as fp:
                                    fp.write(results)
                                    fp.write('\n')
                                self.log(f'数据获取成功', level=logging.INFO)
                                yield
                        else:
                            self.crawler.engine.close_spider(
                                self, 'response msg info %s, job duplicated!' %
                                response.url)
                elif '地块基本情况' in items:
                    # todo
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find('table')
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.tableTrTdRegulationToList(table)
                    for _ in range(len(list(tdData.values())[0])):
                        # 宗地编号
                        ZDBH_6 = tdData.get('编号')[_] if tdData.get(
                            '编号') else ''
                        # 土地位置
                        TDWZ_7 = tdData.get('地块位置')[_] if tdData.get(
                            '地块位置') else ''
                        # 出让面积(m2)
                        CRMJ_8 = tdData.get('土地面积(亩)')[_] if tdData.get(
                            '土地面积(亩)') else ''
                        # 土地用途
                        TDYT_11 = tdData.get('土地用途')[_] if tdData.get(
                            '土地用途') else ''
                        # 是否需要判断重复 请求
                        if DUPLICATE_SWITCH:
                            if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                self.duplicateUrl += 1

                        if self.duplicateUrl < 50:
                            if ZDBH_6 and TDYT_11:
                                # 重复效验通过, 存储数据
                                csvFile = [
                                    GGLX_1,
                                    WJBT_2,
                                    SJ_3,
                                    LY_4,
                                    ZWBT_5,
                                    ZDBH_6,
                                    TDWZ_7,
                                    CRMJ_8,
                                    LHYD_9,
                                    DLYD_10,
                                    TDYT_11,
                                    CRNX_12,
                                    RJL_13,
                                    JZMD_14,
                                    LDL_15,
                                    JZKJ_16,
                                    QSJ_17,
                                    BZJ_18,
                                    JJFD_19,
                                    BMRQ_20,
                                    GPRQ_21,
                                    GPJZSJ_22,
                                    BZJDZSJ_23,
                                    LXDZ_24,
                                    LXR_25,
                                    LXDH_26,
                                    crawlingTime,
                                    url,
                                    md5Mark,
                                ]
                                results = ''
                                for _ in csvFile:
                                    try:
                                        if _ and _ != '|' * len(_):
                                            results += _.replace(
                                                ',',
                                                ' ').replace('\n', '').replace(
                                                    '\r', '').replace(
                                                        r'\xa0', '').replace(
                                                            '\xa0', '') + ','
                                        else:
                                            results += ','
                                    except Exception as e:
                                        results += ','
                                        self.log(
                                            f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                            level=logging.ERROR)
                                with open(self.pathDetail, 'a+') as fp:
                                    fp.write(results)
                                    fp.write('\n')
                                self.log(f'数据获取成功', level=logging.INFO)
                                yield
                        else:
                            self.crawler.engine.close_spider(
                                self, 'response msg info %s, job duplicated!' %
                                response.url)

        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
コード例 #10
0
    def parse_detail(self, response):
        try:
            # 数据获取不全
            # 数据获取不全
            categorynum = response.meta.get('categorynum')
            infoid = response.meta.get('infoid')
            targetUrl = "https://www.cqggzy.com/tiaozhuan.html?infoid=" + infoid + "&categorynum=" + categorynum
            results = ''
            for _ in range(5):
                try:
                    self.session.get(targetUrl,
                                     headers=self.header,
                                     allow_redirects=False,
                                     timeout=60)
                    redirectUrl = 'https://www.cqggzy.com/EpointWebBuilderService/getInfoListAndCategoryList.action?cmd=pageRedirect'
                    data = {'categorynum': categorynum, 'infoid': infoid}
                    response_ = self.session.post(redirectUrl,
                                                  headers=self.header,
                                                  data=data,
                                                  allow_redirects=False,
                                                  timeout=60)
                    url = 'https://www.cqggzy.com' + response_.json().get(
                        'custom') if 'http' not in response_.json().get(
                            'custom') else response_.json().get('custom')
                    results = self.session.get(url,
                                               headers=self.header,
                                               allow_redirects=False,
                                               timeout=60)
                    break
                except Exception as e:
                    pass
            data = Selector(text=results.content.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            GGXH_31 = ''
            ZDBH_32 = ''
            DKWZ_33 = ''
            TDYT_34 = ''
            TDMJ_35 = ''
            RJL_36 = ''
            JZMD_37 = ''
            LDL_38 = ''
            BZJ_39 = ''
            DJ_40 = ''
            JRZJMJ_41 = ''
            CRFS_42 = ''
            CRNX_43 = ''
            CJJ_44 = ''
            SRDW_45 = ''
            TDSYTJ_46 = ''
            JYSJ_47 = ''
            CJR_48 = ''
            BZ_49 = ''
            LXDW_50 = ''
            LXDZ_51 = ''
            LXDH_52 = ''
            GSQ_53 = ''

            # 共有字段
            # 文件标题
            WJBT_27 = data.xpath(
                '//*[@class="article-title"]/text()').extract_first()
            # 信息时间
            XXSJ_28 = reFunction(
                '(\d{4}-\d{1,2}-\d{1,2})',
                data.xpath(
                    '//*[@class="info-source"]/text()[1]').extract_first())
            # TODO
            # 正文标题
            ZWBT_29 = WJBT_27
            soup = BeautifulSoup(results.content.decode('utf-8'))
            table = soup.find('table')

            if '土地使用条件' in items or '宗地编号' in items or '公告序号' in items:
                # TODO 正则匹配的页面
                # 公告序号
                GGXH_31 = '|'.join(
                    re.findall('公告序号(?:[\s]*)([()\w\.:\-\/\%,、]*)(?:\n)',
                               items))
                # 宗地编号 / 编号
                ZDBH_32_ = '|'.join(
                    re.findall(
                        f'[宗地](?:[\s]*)编号(?:[\s]*)([{self.reStr}]*)(?:\s)',
                        items))
                ZDBH_32 = ZDBH_32_.replace(':', '') if ZDBH_32_ else ZDBH_32_
                # 地块位置
                DKWZ_33_ = '|'.join(
                    re.findall(f'地块位置(?:[\s]*)({self.reStr})(?:\n)', items))
                DKWZ_33 = DKWZ_33_ if DKWZ_33_ else '|'.join(
                    re.findall(
                        '土地(?:[\s]*)坐落(?:[\s]*)([()\w\.:: ≤;≥\-\/\%,、\.]*)(?:\n)',
                        items))
                # 土地用途 / 用途
                TDYT_34 = '|'.join(
                    re.findall('[土地]?用途(?:[\s]*)([()\w\.:\-\/\%,、]*)(?:\n)',
                               items))
                # 土地面积(平方米) / 土地面积(m2) / 出让面积(m)
                TDMJ_35_ = '|'.join(
                    re.findall(
                        '土地面积\(m2\)(?:[\s]*)([()\w\.:\-\/\%,、\.]*)(?:\n)',
                        items))
                TDMJ_35 = TDMJ_35_ if TDMJ_35_ else '|'.join(
                    re.findall(
                        f'土地面积(?:\s*)[\((]*平方米[\))]*(?:[\s]*)({self.reStr})(?:\n)',
                        items))
                # 容积率
                RJL_36 = '|'.join(
                    re.findall('容积率(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                               items))
                # 计容建筑面积(m2)
                JRZJMJ_41 = '|'.join(
                    re.findall(f'计容建筑面积\(m2\)(?:[\s]*)({self.reStr})(?:\n)',
                               items))
                # 出让方式
                CRFS_42 = '|'.join(
                    re.findall(
                        '出让方式[:]*(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                        items))
                # 出让年限
                CRNX_43 = '|'.join(
                    re.findall(f'出让年限(?:[\s]*)({self.reStr})(?:\n)', items))
                # 成交价(万元) / 成交价
                CJJ_44_ = '|'.join(
                    re.findall(
                        f'成交价(?:[\s]*)[\((]*万元[)\)]*(?:[\s]*)({self.reStr})(?:\n)',
                        items))
                CJJ_44 = CJJ_44_ if CJJ_44_ else '|'.join(
                    re.findall(f'成交价:(?:[\s]*)({self.reStr})(?:三)', items))
                # 受让单位
                SRDW_45 = '|'.join(
                    re.findall('受让单位(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                               items))
                # 土地使用条件
                TDSYTJ_46 = '|'.join(
                    re.findall(
                        '土地(?:[\s]*)使用(?:[\s]*)条件(?:[\s]*)([()\w\.:: ≤;≥\-\/\%,、\.]*)(?:\n)',
                        items))
                # 交易时间
                JYSJ_47 = '|'.join(
                    re.findall('交易时间(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                               items))
                # 成交人
                CJR_48 = '|'.join(
                    re.findall(f'成交人:(?:[\s]*)({self.reStr})(?:二)', items))
                # 备注
                BZ_49 = '|'.join(
                    re.findall(f'备注:(?:[\s]*)({self.reStr})(?:\n)', items))
                # 联系地址
                LXDZ_51 = '|'.join(
                    re.findall(f'联系地址:(?:[\s]*)([{self.reStr}]*)(?:\s)',
                               items))
                # 联系电话
                LXDH_52 = '|'.join(
                    re.findall('联系电话:(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                               items))
                # 公示期
                GSQ_53 = '|'.join(
                    re.findall(f'公示时间:(?:[\s]*)([{self.reStr}]*)(?:\n)',
                               items))
            else:
                if not table:
                    # TODO 正则匹配的页面
                    # 公告序号
                    GGXH_31 = '|'.join(
                        re.findall('公告序号(?:[\s]*)([()\w\.:\-\/\%,、]*)(?:\n)',
                                   items))
                    # 宗地编号 / 编号
                    ZDBH_32_ = '|'.join(
                        re.findall(
                            f'[宗地](?:[\s]*)编号(?:[\s]*)([{self.reStr}]*)(?:\s)',
                            items))
                    ZDBH_32 = ZDBH_32_.replace(':',
                                               '') if ZDBH_32_ else ZDBH_32_
                    # 地块位置
                    DKWZ_33_ = '|'.join(
                        re.findall(f'地块位置(?:[\s]*)({self.reStr})(?:\n)',
                                   items))
                    DKWZ_33 = DKWZ_33_ if DKWZ_33_ else '|'.join(
                        re.findall(
                            '土地(?:[\s]*)坐落(?:[\s]*)([()\w\.:: ≤;≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 土地用途 / 用途
                    TDYT_34 = '|'.join(
                        re.findall(
                            '[土地]?用途(?:[\s]*)([()\w\.:\-\/\%,、]*)(?:\n)',
                            items))
                    # 土地面积(平方米) / 土地面积(m2) / 出让面积(m)
                    TDMJ_35_ = '|'.join(
                        re.findall(
                            '土地面积\(m2\)(?:[\s]*)([()\w\.:\-\/\%,、\.]*)(?:\n)',
                            items))
                    TDMJ_35 = TDMJ_35_ if TDMJ_35_ else '|'.join(
                        re.findall(
                            f'土地面积(?:\s*)[\((]*平方米[\))]*(?:[\s]*)({self.reStr})(?:\n)',
                            items))
                    # 容积率
                    RJL_36 = '|'.join(
                        re.findall(
                            '容积率(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 计容建筑面积(m2)
                    JRZJMJ_41 = '|'.join(
                        re.findall(
                            f'计容建筑面积\(m2\)(?:[\s]*)({self.reStr})(?:\n)',
                            items))
                    # 出让方式
                    CRFS_42 = '|'.join(
                        re.findall(
                            '出让方式[:]*(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 出让年限
                    CRNX_43 = '|'.join(
                        re.findall(f'出让年限(?:[\s]*)({self.reStr})(?:\n)',
                                   items))
                    # 成交价(万元) / 成交价
                    CJJ_44_ = '|'.join(
                        re.findall(
                            f'成交价(?:[\s]*)[\((]*万元[)\)]*(?:[\s]*)({self.reStr})(?:\n)',
                            items))
                    CJJ_44 = CJJ_44_ if CJJ_44_ else '|'.join(
                        re.findall(f'成交价:(?:[\s]*)({self.reStr})(?:三)', items))
                    # 受让单位
                    SRDW_45 = '|'.join(
                        re.findall(
                            '受让单位(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 土地使用条件
                    TDSYTJ_46 = '|'.join(
                        re.findall(
                            '土地(?:[\s]*)使用(?:[\s]*)条件(?:[\s]*)([()\w\.:: ≤;≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 交易时间
                    JYSJ_47 = '|'.join(
                        re.findall(
                            '交易时间(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 成交人
                    CJR_48 = '|'.join(
                        re.findall(f'成交人:(?:[\s]*)({self.reStr})(?:二)', items))
                    # 备注
                    BZ_49 = '|'.join(
                        re.findall(f'备注:(?:[\s]*)({self.reStr})(?:\n)', items))
                    # 联系地址
                    LXDZ_51 = '|'.join(
                        re.findall(f'联系地址:(?:[\s]*)([{self.reStr}]*)(?:\s)',
                                   items))
                    # 联系电话
                    LXDH_52 = '|'.join(
                        re.findall(
                            '联系电话:(?:[\s]*)([()\w\.:≤≥\-\/\%,、\.]*)(?:\n)',
                            items))
                    # 公示期
                    GSQ_53 = '|'.join(
                        re.findall(f'公示时间:(?:[\s]*)([{self.reStr}]*)(?:\n)',
                                   items))
                else:
                    htmlTable = htmlTableTransformer()
                    tdData = htmlTable.tableTrTdRegulation(table)
                    # 宗地编号 / 编号
                    ZDBH_32 = tdData.get('编号') if tdData.get(
                        '编号') else tdData.get('地块编号')
                    # 地块位置
                    DKWZ_33 = tdData.get('地块位置')
                    # 土地用途 / 用途
                    TDYT_34 = tdData.get('用途') if tdData.get(
                        '用途') else tdData.get('土地用途')
                    # 土地面积(平方米) / 土地面积(m2) / 出让面积(m)
                    TDMJ_35_ = tdData.get('地块面积(平方米)') if tdData.get(
                        '地块面积(平方米)') else tdData.get('地块面积(㎡)')
                    TDMJ_35 = TDMJ_35_ if TDMJ_35_ else tdData.get('宗地面积(平方米)')
                    # 出让方式
                    CRFS_42 = tdData.get('出让方式')
                    # 容积率
                    RJL_36 = tdData.get('容积率')
                    # 建筑密度( %)
                    JZMD_37 = tdData.get('建筑密度(%)')
                    # 绿地率( %)
                    LDL_38 = tdData.get('绿地率(%)')
                    # 底价(万元)
                    DJ_40 = tdData.get('底价(万元)')
                    # 保证金(万元)
                    BZJ_39 = tdData.get('保证金(万元)')
                    # 出让年限
                    CRNX_43 = tdData.get('出让年限')
                    # 成交价(万元) / 成交价
                    CJJ_44 = tdData.get('成交价(万元)') if tdData.get(
                        '成交价(万元)') else tdData.get('成交价格(万元)')
                    # 成交人
                    CJR_48 = tdData.get('成交人')
                    # 备注
                    BZ_49 = tdData.get('备注')
                    # 公示期
                    GSQ_53 = reFunction(
                        f'公示期:(?:[\s]*)([{self.reStr}]*)(?:\s)', items)
                    # 联系单位
                    LXDW_50 = reFunction(
                        f'联 系 人:(?:[\s]*)([{self.reStr}]*)(?:\s)', items)
                    # 联系地址
                    LXDZ_51 = reFunction(
                        f'联系地址:(?:[\s]*)([{self.reStr}]*)(?:\s)', items)
                    # 联系电话
                    LXDH_52 = reFunction(
                        f'联系电话:(?:[\s]*)([{self.reStr}]*)(?:\s)', items)

            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # 爬取地址url
            url = url if url else response.url
            # 唯一标识
            md5Mark = encrypt_md5(url + ZDBH_32 + DKWZ_33)

            # 存储数据
            csvFile = [
                WJBT_27,
                XXSJ_28,
                ZWBT_29,
                GGXH_31,
                ZDBH_32,
                DKWZ_33,
                TDYT_34,
                TDMJ_35,
                RJL_36,
                JZMD_37,
                LDL_38,
                BZJ_39,
                DJ_40,
                JRZJMJ_41,
                CRFS_42,
                CRNX_43,
                CJJ_44,
                SRDW_45,
                TDSYTJ_46,
                JYSJ_47,
                CJR_48,
                BZ_49,
                LXDW_50,
                LXDZ_51,
                LXDH_52,
                GSQ_53,
                crawlingTime,
                url,
                md5Mark,
            ]
            self.fileDetail.write(','.join([
                _.replace(',', ' ').replace('\n', '').replace('\r', '')
                if _ else _ for _ in csvFile
            ]))
            self.fileDetail.write('\n')
            self.log(f'数据获取成功', level=logging.INFO)
            yield
            #TODO
        except Exception as e:
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
コード例 #11
0
    def parse_detail(self, response):
        """Parse one land-listing detail page and append it to the detail CSV.

        The page text is flattened to a single string and every
        ``<label>:<value>`` field is pulled out with ``reFunction``; the first
        data row of the bid table (``tr[2]``) is read with XPath.  The row is
        appended to the file at ``self.pathDetail``.

        Duplicate handling: when ``DUPLICATE_SWITCH`` is truthy and
        ``self.redisClient`` already knows the row fingerprint,
        ``self.duplicateUrl`` is incremented.  Rows keep being written while
        that counter is below 50; once it reaches 50 the spider is closed.

        Args:
            response: the detail-page response (``body`` and ``url`` are read).

        Yields:
            A single bare ``yield`` (``None``) after a row has been written.
        """
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')

            # Title
            BT_1 = ''.join(data.xpath('//*[@id="ctl00_ContentPlaceHolder1_UpdatePanel2"]/div/span/text()').extract())

            # Labels of the "<label>:<value>" fields, in CSV column order.
            labels = (
                '公告编号',  # announcement number
                '地块编号',  # parcel number
                '地块名称',  # parcel name
                '容积率',  # plot ratio
                '用地性质',  # land-use type
                '规划面积',  # planned area
                '实际出让面积',  # actual transferred area
                '公告发布时间',  # announcement publication time
                '保证金金额',  # deposit amount
                '挂牌起始价',  # listing starting price
                '竞争保障房建设资金起始价',  # starting price, affordable-housing fund bidding
                '最高限价',  # price ceiling
                '加价幅度',  # bid increment
                '报名开始时间',  # registration start time
                '报名截至时间',  # registration deadline
                '报价截至时间',  # bidding deadline
                '保证金截至时间',  # deposit deadline
                '限时竞价开始时间',  # timed-bidding start time
                '最新报价',  # latest bid
                '最新报价时间',  # latest bid time
                '竞得者',  # winning bidder
                '竞得价',  # winning price
            )
            # Raw f-string: the pattern text is unchanged, but "\s" is no
            # longer an invalid string escape.
            labelled = [
                reFunction(rf'{label}:\s*([{self.reStr}]*)\s', items)
                for label in labels
            ]
            # The announcement number is also part of the row fingerprint.
            GGBH_2 = labelled[0]

            # First data row of the bid table: bid round, bidder, bid amount,
            # unit land price, bid time.
            bidRow = '//*[@id="ctl00_ContentPlaceHolder1_GVLandPrice"]/tr[2]'
            bidCellPaths = (
                'td[1]/text()',
                'td[2]/span/text()',
                'td[3]/span/text()',
                'td[4]/span/text()',
                'td[5]/text()',
            )
            bidCells = [
                data.xpath(f'{bidRow}/{cellPath}').extract_first()
                for cellPath in bidCellPaths
            ]

            # Crawl date
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # Crawled URL
            url = response.url
            # Row fingerprint
            md5Mark = encrypt_md5(url + BT_1 + GGBH_2)

            # Count rows that were already seen when de-duplication is enabled.
            # NOTE(review): an already-seen row is still written below until
            # the counter reaches 50 -- confirm this is intended.
            if DUPLICATE_SWITCH and self.redisClient.isExist(md5Mark):
                self.duplicateUrl += 1

            if self.duplicateUrl < 50:
                csvFile = [BT_1, *labelled, *bidCells, crawlingTime, url, md5Mark]
                cells = []
                for value in csvFile:
                    try:
                        # Empty values and values made only of '|' become
                        # empty cells.
                        if value and value != '|' * len(value):
                            cells.append(
                                value.replace(',', ' ').replace('\n', '').replace('\r', '').replace(r'\xa0', '').replace('\xa0', ''))
                        else:
                            cells.append('')
                    except Exception as e:
                        cells.append('')
                        self.log(f'{getVariableName(value).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                 level=logging.ERROR)
                # Every cell, including the last one, is followed by a comma.
                results = ','.join(cells) + ','
                # NOTE(review): the file is opened with the platform default
                # encoding -- confirm it matches how the file was created.
                with open(self.pathDetail, 'a+') as fp:
                    fp.write(results)
                    fp.write('\n')
                self.log('数据获取成功', level=logging.INFO)
                yield
            else:
                self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url)
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
コード例 #12
0
    def parse_detail(self, response):
        """Parse a sz68.com land-listing detail page and append one CSV row.

        The row combines five sources: listing values forwarded through
        ``response.meta``, the announcement document embedded in an iframe,
        the panels of the detail page itself, the result-notice JSON endpoint,
        and the attachment links (downloaded under ``self.dirName``).

        Args:
            response: the detail-page response.  ``response.meta`` is read for
                the keys ``JYSJ``, ``JYZT``, ``ZDH``, ``TDWZ``, ``QSJ``,
                ``TDYT``, ``TDMJ`` and ``id``.

        Returns:
            None.  The row is written to ``self.fileDetail``.  Any exception,
            including ``IntegrationException`` when the announcement request
            does not return 200, is logged at ERROR level and the row is
            skipped.
        """
        try:
            data = Selector(text=response.body.decode('utf-8'))
            # The announcement lives in iframe #externalframe1, falling back
            # to #externalframe0.
            frameSrc = data.xpath(
                '//iframe[@id="externalframe1"]/@src').extract_first()
            if not frameSrc:
                frameSrc = data.xpath(
                    '//iframe[@id="externalframe0"]/@src').extract_first()
            noticeDetail = 'https://www.sz68.com' + frameSrc

            # Only filled by the regex fallback for announcements without a
            # table, so they need a default.
            TDFZXZ = ''
            RJL = ''

            # Listing values forwarded by the caller.
            # NOTE(review): the original also read meta['JYFS'] but never
            # wrote it to the row; the unused read was dropped.
            meta = response.meta
            JYSJ = meta.get('JYSJ')
            JYZT = meta.get('JYZT')
            ZDH = meta.get('ZDH')
            TDWZ = meta.get('TDWZ')
            QSJ = meta.get('QSJ')
            TDYT = meta.get('TDYT')
            TDMJ = meta.get('TDMJ')
            targetId = meta.get('id')

            # ---- Announcement document ----
            detailData = requests.get(noticeDetail,
                                      headers=self.header,
                                      allow_redirects=False,
                                      timeout=60,
                                      verify=False)
            if detailData.status_code != 200:
                raise IntegrationException(f'获取公告详情失败, url: {noticeDetail}')

            detail = Selector(text=detailData.content.decode('utf-8'))
            # Flattened text with non-breaking/ideographic spaces, newlines
            # and plain spaces removed; the regexes below run on this string.
            items = str(detail.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '').replace('\n', '').replace(' ', '')
            # Body title
            ZWBT = ''.join(
                detail.xpath(
                    '/html/body/div/p[2]/span//text() | /html/body/p[2]/span//text()|/html/body/p[1]/span//text()'
                ).extract())
            # Announcement period
            GGQ = reFunction(r'公告期自([\w \-\s]*)[止]?,', items)
            # Listing start time
            GPKSSJ = reFunction(
                r'挂牌期自(\d{4}年\d{1,2}月\d{1,2}日)[起]?至(?:\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时)止',
                items)
            # Listing end time
            GPJSSJ = reFunction(
                r'挂牌期自(?:\d{4}年\d{1,2}月\d{1,2}日)[起]?至(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时)止',
                items)

            # Announcement table: prefer the table inside the first <div> of
            # <body>, otherwise the first table anywhere.  Each step is
            # guarded so a page without <body>/<div> reaches the fallback
            # instead of raising AttributeError.
            soup = BeautifulSoup(detailData.text)
            body = soup.find('body')
            firstDiv = body.find('div') if body is not None else None
            table = firstDiv.find('table') if firstDiv is not None else None
            if table is None:
                table = soup.find('table')

            htmlTable = htmlTableTransformer()
            tdData = htmlTable.table_tr_td(table)
            # Parcel code / parcel number
            ZDDM_DKZDBH = tdData.get('宗地编号') or tdData.get('地块宗地编号')
            # Parcel no.
            ZDH_A = tdData.get('宗地号')
            # Land location
            DKWZ = tdData.get('土地位置')
            # Land use
            DKYT = tdData.get('土地用途')
            # Permitted industry category
            ZRHYLB = tdData.get('准入行业类别')
            # Land area (square metres)
            TDMJ_A = tdData.get('土地面积(平方米)') or tdData.get('土地面积')
            # Building area (square metres) / total building area
            JZMJ = tdData.get('建筑面积(平方米)') or tdData.get('总建筑面积')
            # Listing starting price
            GPQSJ = tdData.get('挂牌起始价(人民币、万元)')
            # Bid deposit
            JMBZJ = tdData.get('竞买(投标)保证金(人民币、万元)')
            # Land-use term
            TDSYNX = tdData.get('土地使用年期')

            if not detail.xpath('//table').extract():
                # Announcement without a table: fall back to regexes on the
                # flattened text.
                # Parcel code / parcel number
                ZDDM_DKZDBH = reFunction(r'宗地编号([\w \-\s]*),', items)
                # Land-use term
                TDSYNX = reFunction(r'土地使用年[\s期限]*[为]?(\d*年)', items)
                # Current development status of the land
                TDFZXZ = reFunction(r'土地的发展建设现状:([\S\s]*。)', items)
                # Plot ratio
                RJL = reFunction(r'容积率[\D]*([\.\d]*)。', items)
                # Land location
                DKWZ = reFunction(r'宗地位于([\w \s]*),', items)
                # Land use
                DKYT = reFunction(r'土地用途为([\w \s]*),', items)
            # Deposit deadline
            ZBJJZSJ = reFunction(
                r'保证金的到账截止时间为(\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时\d{1,2}分)',
                items)
            # Address
            DZ = '|'.join(
                re.findall(r'地址:([\w \.\-\s\/\%,\(\)。 \? \!  、:]*);咨询电话',
                           items))
            # Enquiry telephone
            DH = '|'.join(
                re.findall(r'咨询电话:([\w \.\-\s\/\%,\(\)。 \? \!  、]*)[;。]',
                           items))

            # ---- Basic information on the detail page ----
            # Transaction method
            JYFS_A = data.xpath(
                '//div[@class="content_case1"]/div[1]/ul/li[2]/span/text()'
            ).extract_first()
            # Transaction type
            JYLX = data.xpath(
                '//div[@class="content_case1"]/div[1]/ul/li[1]/span/text()'
            ).extract_first()
            # Parcel
            ZD = data.xpath('//div[@class="content_case1"]/div[1]/div/text()'
                            ).extract_first()
            # Publication time
            FBSJ = data.xpath(
                '//div[@class="content_case1"]/div[2]/span[2]/text()'
            ).extract_first()
            # Transaction status
            JYZT_A = data.xpath(
                '//div[@class="content_case1"]/div[2]/span[3]/text()'
            ).extract_first()
            # Winning bidder
            ZBR_24 = data.xpath(
                '//div[@class="right_first"]/div[1]/div[2]/text()'
            ).extract_first()
            # Transaction price
            CJJ_25 = data.xpath(
                '//div[@class="right_first"]/div[2]/div[2]/text()'
            ).extract_first()
            # Deposit
            BZJ_26 = data.xpath(
                '//div[@class="right_first twin"][1]/div[1]/div[2]/text()'
            ).extract_first()
            # Starting price
            QSJ_A = data.xpath(
                '//div[@class="right_first twin"][1]/div[2]/div[2]/text()'
            ).extract_first()
            # Bid increment
            JJJT_28 = data.xpath(
                '//div[@class="right_first twin"][2]/div[1]/div[2]/text()'
            ).extract_first()
            # Price cap
            FDJ_29 = data.xpath(
                '//div[@class="right_first twin"][2]/div[2]/div[2]/text()'
            ).extract_first()
            # Bid-application deadline
            JMSQJZSJ_30 = data.xpath(
                '//div[@class="right_first twin"][3]/div[1]/div[2]/text()'
            ).extract_first()
            # Number of bidders
            JMRS_31 = data.xpath(
                '//div[@class="right_first twin"][3]/div[2]/div[2]/text()'
            ).extract_first()

            # ---- Subject-matter details ----
            # Twelve spans are read by position; fewer spans raise IndexError
            # and the row is skipped, as before.
            BDdetail = data.xpath(
                '//li[@class="weather_info_ul_item"]/div[2]/span')
            (ZDH_B,    # parcel no.
             TDMJ_B,   # land area
             JZMJ_A,   # building area
             RJL_A,    # plot ratio
             JZFGL,    # building coverage
             JZGD,     # building height
             YT,       # use
             SYNX,     # term of use
             QY,       # district
             WZ,       # location
             LDL,      # green-space ratio
             JZLC,     # building storeys
             ) = [
                 BDdetail[index].xpath('text()').extract_first()
                 for index in range(12)
             ]

            # ---- Bid record (first data row) ----
            # Bidder
            JMR = data.xpath(
                '//div[@class="conomy"][1]/table/tr[2]/td[2]/text()'
            ).extract_first()
            # Bid amount
            JMSJ = data.xpath(
                '//div[@class="conomy"][1]/table/tr[2]/td[3]/text()'
            ).extract_first()
            # Bid time
            CJSJ = data.xpath(
                '//div[@class="conomy"][1]/table/tr[2]/td[4]/text()'
            ).extract_first()
            # Status
            ZT = data.xpath(
                '//div[@class="conomy"][1]/table/tr[2]/td[5]/text()'
            ).extract_first()

            # ---- Result notice ----
            # Defaults keep the row writable when the endpoint does not return
            # 200; previously these names were undefined in that case and the
            # row was dropped on a NameError.
            ZWBT_A = FBRQ = ZDH_C = JDR = ZBR_A = WZ_A = ''
            TDYT_A = TDMJ_C = JZMJ_B = QSJ_D = CJJ_A = YJL = ZHLMDJ = ''
            results = requests.post(
                'https://www.sz68.com/tiaim/web/resultdetailbytargetId',
                headers=self.header,
                data={'targetId': targetId},
                allow_redirects=False,
                timeout=60,
                verify=False)
            if results.status_code == 200:
                resultsData = results.json()
                notice = resultsData.get('notice')
                fileExtName = resultsData.get('fileExtName')
                # Body title
                ZWBT_A = notice.get('NAME')
                # Publication date
                FBRQ = notice.get('PUBLISH_TIME')
                # Parcel no.
                ZDH_C = notice.get('DTL_REF_NO')
                # Winning bidder (auction)
                JDR = reFunction(r'竞得人:([\w \.\-\s\/\%,]*)<', fileExtName)
                # Winning bidder (tender)
                ZBR_A = reFunction(r'中标人:([\w \.\-\s\/\%,]*)<', fileExtName)
                # Location.  Kept in its own name so it no longer overwrites
                # the location read from the subject-matter details.
                WZ_A = reFunction(r'位置:([\w \.\-\s\/\%,、]*)<', fileExtName)
                # Land use
                TDYT_A = reFunction(r'土地用途:([\w \.\-\s\/\%,、]*)<',
                                    fileExtName)
                # Land area
                TDMJ_C = reFunction(r'土地面积:([\w \.\-\s\/\%,、]*)<',
                                    fileExtName)
                # Building area
                JZMJ_B = reFunction(r'建筑面积:([\w \.\-\s\/\%,、]*)<',
                                    fileExtName)
                # Starting price
                QSJ_D = reFunction(r'起始价:([\w \.\-\s\/\%,、]*)<', fileExtName)
                # Transaction price
                CJJ_A = reFunction(r'成交价:([\w \.\-\s\/\%,、]*)<', fileExtName)
                # Premium rate
                YJL = reFunction(r'溢价率:([\w \.\-\s\/\%,、]*)<', fileExtName)
                # Composite floor-area unit price
                ZHLMDJ = reFunction(r'综合楼面单价:([\w \.\-\s\/\%,、]*)<',
                                    fileExtName)

            # ---- Attachments ----
            accessory = '土地模块|'
            for link in data.xpath('//div[@class="accessory_link"]/a'):
                rawName = link.xpath(
                    'text()[position()=((position() mod 2)=0)]'
                ).extract_first()
                fileName = rawName.strip() if rawName else '未知名称'
                try:
                    href = link.xpath('@href').extract_first()
                    linkPath = self.dirName + f'土地模块_{ZDH}' + fileName
                    # Bound to its own name: rebinding ``response`` here made
                    # the recorded URL below point at the last attachment.
                    attachment = requests.get(href,
                                              headers=self.header,
                                              timeout=200)
                    with open(linkPath, 'wb') as fp:
                        fp.write(attachment.content)
                except Exception as e:
                    # Best effort: a failed download only leaves the file out
                    # of the attachment list.
                    self.log(f'附件下载失败: {fileName}, 错误: {e}',
                             level=logging.WARNING)
                else:
                    accessory += fileName + '|'

            # Crawl time
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # Crawled URL (the detail page itself)
            url = response.url
            # Row fingerprint; uses the result-notice location, which is the
            # value the original used whenever it managed to write a row.
            md5Mark = encrypt_md5(ZDH + WZ_A + ZWBT + url)
            csvFile = [
                JYSJ,
                JYZT,
                ZDH,
                TDWZ,
                QSJ,
                TDYT,
                TDMJ,
                JYFS_A,
                JYLX,
                ZD,
                FBSJ,
                JYZT_A,
                ZBR_24,
                CJJ_25,
                BZJ_26,
                QSJ_A,
                JJJT_28,
                FDJ_29,
                JMSQJZSJ_30,
                JMRS_31,
                ZWBT,
                GGQ,
                GPKSSJ,
                GPJSSJ,
                ZDDM_DKZDBH,
                ZDH_A,
                DKWZ,
                DKYT,
                ZRHYLB,
                TDMJ_A,
                JZMJ,
                TDSYNX,
                TDFZXZ,
                RJL,
                GPQSJ,
                JMBZJ,
                TDSYNX,
                ZBJJZSJ,
                DZ,
                DH,
                ZDH_B,
                TDMJ_B,
                JZMJ_A,  # was JZMJ_B, which duplicated the result-notice value
                RJL_A,
                JZFGL,
                JZGD,
                YT,
                SYNX,
                QY,
                WZ,
                LDL,
                JZLC,
                JMR,
                JMSJ,
                CJSJ,
                ZT,
                ZWBT_A,
                FBRQ,
                ZDH_C,
                JDR,
                ZBR_A,
                WZ_A,
                TDYT_A,
                TDMJ_C,
                JZMJ_B,
                QSJ_D,
                CJJ_A,
                YJL,
                ZHLMDJ,
                crawlingTime,
                url,
                md5Mark,
                accessory,
            ]
            # Strings lose commas and line breaks; anything else (e.g. None)
            # is written through str(), as before.
            fileData = [
                value.replace(',', ' ').replace('\n', '').replace('\r', '')
                if isinstance(value, str) else str(value)
                for value in csvFile
            ]
            self.fileDetail.write(','.join(fileData))
            self.fileDetail.write('\n')
        except Exception as e:
            self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
コード例 #13
0
    def parse_detail(self, response):
        # TODO 主动关闭爬虫问题
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')
            htmlTable = htmlTableTransformer()
            WJBT_1 = ''
            XXSJ_2 = ''
            WBT_3 = ''
            GGBH_4 = ''
            CRSJ_5 = ''
            GGNX_6 = ''
            ZDBH_7 = ''
            DKWZ_8 = ''
            ZDWZ_9 = ''
            ZDZL_10 = ''
            TDYT_11 = ''
            GHTDYT_12 = ''
            CRNX_13 = ''
            SYNX_14 = ''
            PZJGJWH_15 = ''
            GHYDMJ_16 = ''
            GHMJ_17 = ''
            CRMJ_18 = ''
            CRYDMJ_19 = ''
            ZDCRMJ_20 = ''
            JZMD_21 = ''
            RJL_22 = ''
            LDL_23 = ''
            LDL_24 = ''
            JZKZGD_25 = ''
            JZKZZGD_26 = ''
            JZXS_27 = ''
            TZQD_28 = ''
            TDGJBAH_29 = ''
            SFSZD_30 = ''
            TDXZTJ_31 = ''
            JMBZJ_32 = ''
            JMBZJ_72 = ''
            QJJ_33 = ''
            CRQSJ_34 = ''
            JJFD_35 = ''
            SFSZBLJ_36 = ''
            GPKSSJ_37 = ''
            GPJZSJ_38 = ''
            HQCRWJSJ_39 = ''
            TJJMSQSJ_40 = ''
            BZJJZSJ_41 = ''
            QRJMZGSJ_42 = ''
            LXDZ_43 = ''
            LXDH_44 = ''
            LXR_45 = ''
            BZJZH_86 = ''
            BZJZH_87 = ''
            BZJZH_88 = ''
            CRJZH_97 = ''
            CRJZH_98 = ''
            CRJZH_99 = ''

            # TODO 共有字段  reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # 文件标题
            WJBT_1 = response.meta.get('title').strip()
            # 信息时间
            XXSJ_2 = reFunction('[\d\-]*', data.xpath('//p[@class="sub-cp"]/text()').extract_first())
            # 正文标题
            WBT_3 = WJBT_1
            # 公告编号
            GGBH_4 = ''.join(data.xpath('//div[@class="substance"]/p[position() <5]/.//*[contains(text(),"号")]/ancestor::p/.//text()').extract())
            # 出让时间
            CRSJ_5 = reFunction('定于\s*([()【】\w\.—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*)[,;,、在]', items)
            # 公告类型
            GGNX_6 = '出让公告'
            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # 爬取地址url
            url = response.url
            # 唯一标识
            md5Mark = encrypt_md5(url + WJBT_1 + XXSJ_2)

            GPSJ_0 = reFunction('挂牌交易期限:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)[\s。]', items)
            GPSJ_1 = reFunction('申请人可于:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)到', items)
            GPSJ = GPSJ_0 if GPSJ_0 else GPSJ_1
            # 挂牌开始时间、
            GPKSSJ_37 = reFunction('挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 挂牌截止时间、
            GPJZSJ_38 = reFunction('挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            if GPSJ:
                try:
                    GPKSSJ_37 = GPSJ.split('至')[0]
                    GPJZSJ_38 = GPSJ.split('至')[1]
                except:
                    pass
            # 获取出让文件时间、
            HQCRWJSJ_39 = GPSJ_1
            # 提交竞买申请时间、
            TJJMSQSJ_40 = reFunction('(\d{4}年\d{1,3}月\d{1,3}日(?:[()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)至 \d{4}年\d{1,3}月\d{1,3}日)', items)
            # 保证金截止时间、
            BZJJZSJ_41 = reFunction('(\d{4}年\d{1,3}月\d{1,3}日(?:[()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)至 \d{4}年\d{1,3}月\d{1,3}日)', items)
            # 确认竞买资格时间
            QRJMZGSJ_42 = reFunction('(\d{4}年\d{1,3}月\d{1,3}日(?:[()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)至 \d{4}年\d{1,3}月\d{1,3}日)', items)
            # 联系地址、
            LXDZ_43 = reFunction('联系地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 联系电话、
            LXDH_44 = reFunction('联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 联系人、
            LXR_45 = reFunction('联\s*系\s*人:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)

            ZH_0 = reFunction('以下账户:*\s*([\w\.:: —\(\)\s〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪\s]*)[一二三四五六七八九123456789]*', items)
            ZH_1 = reFunction('保证金帐户:*\s*([\w\.:: —\(\)\s〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s*', items)
            try:
                if ZH_0:
                    if ZH_0[:2] == '户名':
                        result = re.split('[①②③④]*', ZH_0)
                        # 保证金账户开户单位 / 户名
                        BZJZH_86 = result[0].replace('户名:','') if result[0] else ''
                        # 保证金账户账号
                        BZJZH_87 = '|'.join([re.split(',|,', _)[0] for _ in result[1:]])
                        # 保证金账户开户行
                        BZJZH_88 = '|'.join([re.split(',|,', _)[-1] for _ in result[1:]])
                    else:
                        result = re.split('[①②③④]*', ZH_0)
                        # 保证金账户开户单位 / 户名
                        BZJZH_86 = '|'.join([re.findall('开\s*户\s*行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _)[0] if re.findall('开 户 行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _) else '' for _ in result])
                        # 保证金账户账号
                        BZJZH_87 = '|'.join([re.findall('户\s*名:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _)[0] if re.findall('开 户 行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _) else '' for _ in result])
                        # 保证金账户开户行
                        BZJZH_88 = '|'.join([re.findall('账\s*号:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _)[0] if re.findall('账\s*号:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s', _) else '' for _ in result])
                elif ZH_1:
                    # 保证金账户开户单位 / 户名
                    BZJZH_86 = '|'.join(re.findall('开户[单位名称]*:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', ZH_1)).replace(';')
                    # 保证金账户账号
                    BZJZH_87 = '|'.join(re.findall('开\s*户\s*行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', ZH_1)).replace(';')
                    # 保证金账户开户行
                    BZJZH_88 = '|'.join(re.findall('帐\s*号:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', ZH_1)).replace(';')
            except:
                pass
            CR = reFunction('出让金帐户:*\s*([\w\.:: —\(\)\s〔〕㎡㎡≤≥《》\-\/\%,;;,、\.﹪]*)\s*', items)
            try:
                # 出让金账户开户单位 / 户名
                CRJZH_97 = '|'.join(re.findall('开户[单位名称]*:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', CR)).replace(';')
                # 出让金账户开户行
                CRJZH_98 = '|'.join(re.findall('开\s*户\s*行:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', CR)).replace(';')
                # 出让金账户账号
                CRJZH_99 = '|'.join(re.findall('帐\s*号:*\s*([\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,,;;、\.﹪]*)\s', CR)).replace(';')
            except:
                pass

            if '拍卖出让地块的基本情况和规划指标要求' not in items and '备注' not in items and '挂牌出让地块的基本情况和规划指标要求' not in items:
                try:
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    tables = soup.find_all('table')
                    if '规划用途及主要指标' in items:  # 处理费标准的表格
                        soup = BeautifulSoup(response.body.decode('utf-8'))
                        table = soup.find('table')
                        tdReplace = table.tbody.find_all('tr')[0].find('td', colspan='4')
                        number = table.tbody.find_all('tr')[0].index(tdReplace)
                        tdList = table.tbody.find_all('tr')[1].find_all('td')
                        for _ in range(1, len(tdList) + 1):
                            table.tbody.find_all('tr')[0].insert(number + _, tdList[_ - 1])
                        tdReplace.extract()
                        [_.extract() for _ in table.tbody.find_all('tr')[1].find_all('td')]
                        table.tbody.find_all('tr')[1].extract()

                        tdData = htmlTable.tableTrTdChangeToList(table)
                        for _ in range(len(list(tdData.values())[0])):
                            # 宗地编号
                            ZDBH_7 = tdData.get('宗地编号')[_] if tdData.get('宗地编号') else ''
                            # 出让面积(m2)
                            CRMJ_18 = tdData.get('土地面积(㎡)')[_] if tdData.get('土地面积(㎡)') else ''
                            # 容积率
                            RJL_22 = tdData.get('容积率')[_] if tdData.get('容积率') else ''
                            # 绿地率( %)
                            LDL_24 = tdData.get('绿地率(%)')[_] if tdData.get('绿地率(%)') else ''
                            # 建筑系数( %)
                            JZXS_27 = tdData.get('建筑系数(%)')[_] if tdData.get('建筑系数(%)') else ''
                            # 竟买保证金(万元)
                            JMBZJ_72 = tdData.get('竞买保证金(万元)')[_] if tdData.get('竞买保证金(万元)') else ''
                            # 出让起始价(万元)
                            CRQSJ_34 = tdData.get('挂牌出让起始价(万元)')[_] if tdData.get('挂牌出让起始价(万元)') else ''
                            # 加价幅度、
                            JJFD_35 = tdData.get('加价幅度')[_] if tdData.get('加价幅度') else ''

                            # 写入数据
                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                    self.duplicateUrl += 1

                            if self.duplicateUrl < 50:
                                if ZDBH_7:
                                    # 重复效验通过, 存储数据
                                    csvFile = [
                                        WJBT_1,
                                        XXSJ_2,
                                        WBT_3,
                                        GGBH_4,
                                        CRSJ_5,
                                        GGNX_6,
                                        ZDBH_7,
                                        DKWZ_8,
                                        ZDWZ_9,
                                        ZDZL_10,
                                        TDYT_11,
                                        GHTDYT_12,
                                        CRNX_13,
                                        SYNX_14,
                                        PZJGJWH_15,
                                        GHYDMJ_16,
                                        GHMJ_17,
                                        CRMJ_18,
                                        CRYDMJ_19,
                                        ZDCRMJ_20,
                                        JZMD_21,
                                        RJL_22,
                                        LDL_23,
                                        LDL_24,
                                        JZKZGD_25,
                                        JZKZZGD_26,
                                        JZXS_27,
                                        TZQD_28,
                                        TDGJBAH_29,
                                        SFSZD_30,
                                        TDXZTJ_31,
                                        JMBZJ_32,
                                        JMBZJ_72,
                                        QJJ_33,
                                        CRQSJ_34,
                                        JJFD_35,
                                        SFSZBLJ_36,
                                        GPKSSJ_37,
                                        GPJZSJ_38,
                                        HQCRWJSJ_39,
                                        TJJMSQSJ_40,
                                        BZJJZSJ_41,
                                        QRJMZGSJ_42,
                                        LXDZ_43,
                                        LXDH_44,
                                        LXR_45,
                                        BZJZH_86,
                                        BZJZH_87,
                                        BZJZH_88,
                                        CRJZH_97,
                                        CRJZH_98,
                                        CRJZH_99,
                                        crawlingTime,
                                        url,
                                        md5Mark,
                                    ]
                                    results = ''
                                    for _ in csvFile:
                                        try:
                                            if _ and _ != '|' * len(_):
                                                results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace(
                                                    '\r', '').replace(r'\xa0', '').replace('\xa0', '') + ','
                                            else:
                                                results += ','
                                        except Exception as e:
                                            results += ','
                                            self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                     level=logging.ERROR)
                                    with open(self.pathDetail, 'a+') as fp:
                                        fp.write(results)
                                        fp.write('\n')
                                    self.log(f'数据获取成功', level=logging.INFO)
                                    yield
                            else:
                                self.crawler.engine.close_spider(self,
                                                                 'response msg info %s, job duplicated!' % response.url)
                    elif len(tables) <= 3:
                        tdsList = {}
                        for table in tables:
                            td = htmlTable.tableTrTdRegulationToList(table)
                            tdsList.update(td)
                        for _ in range(len(list(tdsList.values())[0])):
                            # 宗地编号
                            ZDBH_7 = tdsList.get('宗地编号')[_] if tdsList.get('宗地编号') else ''
                            # 地块编号  地块名称
                            DKWZ_8 = tdsList.get('地块编号')[_] if tdsList.get('地块编号') else tdsList.get('地块编号')[_] if tdsList.get('地块编号') else ''
                            # 宗地位置
                            ZDWZ_9 = tdsList.get('宗地位置')[_] if tdsList.get('宗地位置') else ''
                            # 宗地坐落
                            ZDZL_10 = tdsList.get('宗地坐落')[_] if tdsList.get('宗地坐落') else ''
                            # 土地用途
                            TDYT_11 = tdsList.get('土地用途')[_] if tdsList.get('土地用途') else ''
                            # 规划土地用途
                            GHTDYT_12 = tdsList.get('规划土地用途')[_] if tdsList.get('规划土地用途') else ''
                            # 出让年限
                            CRNX_13 = tdsList.get('出让年限')[_] if tdsList.get('出让年限') else ''
                            # 使用年限
                            SYNX_14 = tdsList.get('使用年限')[_] if tdsList.get('使用年限') else ''
                            # 批准机关及文号
                            PZJGJWH_15 = tdsList.get('批准机关及文号')[_] if tdsList.get('批准机关及文号') else tdsList.get('批准文号')[_] if tdsList.get('批准文号') else ''
                            # 规划用地面积〔m2)
                            GHYDMJ_16 = tdsList.get('规划用地面积(m2)')[_] if tdsList.get('规划用地面积(m2)') else tdsList.get('用地面积(㎡)')[_] if tdsList.get('用地面积(㎡)') else tdsList.get('规划用地面积(㎡)')[_] if tdsList.get('规划用地面积(㎡)') else ''
                            # 出让面积(m2)
                            CRMJ_18 = tdsList.get('出让面积(㎡)')[_] if tdsList.get('出让面积(㎡)') else ''
                            # 规划面积(m2)
                            GHMJ_17 = tdsList.get('规划面积(㎡)')[_] if tdsList.get('规划面积(㎡)') else ''
                            # 出让用地面积(m2)
                            CRYDMJ_19 = tdsList.get('出让用地面积(m2)')[_] if tdsList.get('出让用地面积(m2)') else ''
                            # 宗地出让面积
                            ZDCRMJ_20 = tdsList.get('宗地出让面积')[_] if tdsList.get('宗地出让面积') else ''
                            # 建筑密度
                            JZMD_21 = tdsList.get('建筑密度(%)')[_] if tdsList.get('建筑密度(%)') else tdsList.get('建筑密度(%)')[_] if tdsList.get('建筑密度(%)') else ''
                            # 容积率
                            RJL_22 = tdsList.get('容积率')[_] if tdsList.get('容积率') else ''
                            # 绿地率
                            LDL_23 = tdsList.get('宗地坐落')[_] if tdsList.get('宗地坐落') else ''
                            # 绿地率( %)
                            LDL_24 = tdsList.get('绿地率')[_] if tdsList.get('绿地率') else tdsList.get('绿地率(%)')[_] if tdsList.get('绿地率(%)') else tdsList.get('绿地率(%)')[_] if tdsList.get('绿地率(%)') else ''
                            # 建筑控制高度(m)
                            JZKZGD_25 = tdsList.get('建筑控制高度(m)')[_] if tdsList.get('建筑控制高度(m)') else ''
                            # 建筑控制高度(米)
                            JZKZZGD_26 = tdsList.get('建筑控制高度(米)')[_] if tdsList.get('建筑控制高度(米)') else ''
                            # 投资强度(万元 / 公顷)
                            TZQD_28 = tdsList.get('投资强度(万元/公顷)')[_] if tdsList.get('投资强度(万元/公顷)') else ''
                            # 竞买保证金
                            JMBZJ_32 = tdsList.get('竞买保证金')[_] if tdsList.get('竞买保证金') else ''
                            # 出让起始价(万元)
                            CRQSJ_34 = tdsList.get('出让起始价')[_] if tdsList.get('出让起始价') else ''
                            # 竟买保证金(万元)
                            JMBZJ_72 = tdsList.get('竞买保证金(万元)')[_] if tdsList.get('竞买保证金(万元)') else ''
                            # 起叫价
                            QJJ_33 = tdsList.get('起始价')[_] if tdsList.get('起始价') else tdsList.get('出让起始价')[_] if tdsList.get('出让起始价') else ''
                            # 加价幅度
                            JJFD_35 = tdsList.get('加价幅度')[_] if tdsList.get('加价幅度') else ''

                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                    self.duplicateUrl += 1

                            if self.duplicateUrl < 50:
                                if ZDBH_7:
                                    # 重复效验通过, 存储数据
                                    csvFile = [
                                        WJBT_1,
                                        XXSJ_2,
                                        WBT_3,
                                        GGBH_4,
                                        CRSJ_5,
                                        GGNX_6,
                                        ZDBH_7,
                                        DKWZ_8,
                                        ZDWZ_9,
                                        ZDZL_10,
                                        TDYT_11,
                                        GHTDYT_12,
                                        CRNX_13,
                                        SYNX_14,
                                        PZJGJWH_15,
                                        GHYDMJ_16,
                                        GHMJ_17,
                                        CRMJ_18,
                                        CRYDMJ_19,
                                        ZDCRMJ_20,
                                        JZMD_21,
                                        RJL_22,
                                        LDL_23,
                                        LDL_24,
                                        JZKZGD_25,
                                        JZKZZGD_26,
                                        JZXS_27,
                                        TZQD_28,
                                        TDGJBAH_29,
                                        SFSZD_30,
                                        TDXZTJ_31,
                                        JMBZJ_32,
                                        JMBZJ_72,
                                        QJJ_33,
                                        CRQSJ_34,
                                        JJFD_35,
                                        SFSZBLJ_36,
                                        GPKSSJ_37,
                                        GPJZSJ_38,
                                        HQCRWJSJ_39,
                                        TJJMSQSJ_40,
                                        BZJJZSJ_41,
                                        QRJMZGSJ_42,
                                        LXDZ_43,
                                        LXDH_44,
                                        LXR_45,
                                        BZJZH_86,
                                        BZJZH_87,
                                        BZJZH_88,
                                        CRJZH_97,
                                        CRJZH_98,
                                        CRJZH_99,
                                        crawlingTime,
                                        url,
                                        md5Mark,
                                    ]
                                    results = ''
                                    for _ in csvFile:
                                        try:
                                            if _ and _ != '|' * len(_):
                                                results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace(
                                                    '\r', '').replace(r'\xa0', '').replace('\xa0', '') + ','
                                            else:
                                                results += ','
                                        except Exception as e:
                                            results += ','
                                            self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                     level=logging.ERROR)
                                    with open(self.pathDetail, 'a+') as fp:
                                        fp.write(results)
                                        fp.write('\n')
                                    self.log(f'数据获取成功', level=logging.INFO)
                                    yield
                            else:
                                self.crawler.engine.close_spider(self,
                                                                 'response msg info %s, job duplicated!' % response.url)

                    elif len(tables) == 6:
                        # TODO
                        pass
                except:
                    for item in ['宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items)[0].split('宗地编号')[1:]]:
                        # 宗地编号
                        ZDBH_7 = reFunction('宗地编号:*\s*([\w\-]*)\s', item).replace('宗地位置', '').replace('地块名称', '')
                        # 宗地坐落
                        ZDZL_10 = reFunction('宗地坐落:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 土地用途
                        TDYT_11 = reFunction('土地用途:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 出让年限
                        CRNX_13 = reFunction('出让年限:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 宗地出让面积
                        ZDCRMJ_20 = reFunction('宗地\s*面积:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 建筑密度
                        JZMD_21 = reFunction('建筑密度\(%\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 容积率
                        RJL_22 = reFunction('容积率:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 绿地率( %)
                        LDL_24 = reFunction('绿化率\(%\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 建筑控制高度(米)
                        JZKZZGD_26 = reFunction('建筑限高\(米\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 投资强度(万元 / 公顷)
                        TZQD_28 = reFunction('投资强度:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 土地估价备案号
                        TDGJBAH_29 = reFunction('土地估价备案号:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 现状土地条件
                        TDXZTJ_31 = reFunction('土地现状:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 竞买保证金
                        JMBZJ_32 = reFunction('保证金:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 起叫价
                        QJJ_33 = reFunction('起始价:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 加价幅度
                        JJFD_35 = reFunction('加价幅度:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                        # 挂牌开始时间、
                        GPKSSJ_37 = reFunction('挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                        # 挂牌截止时间、
                        GPJZSJ_38 = reFunction('挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)

                        if self.name in DUPLICATE_SWITCH_LIST:
                            if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                self.duplicateUrl += 1

                        if self.duplicateUrl < 50:
                            if ZDBH_7:
                                # 重复效验通过, 存储数据
                                csvFile = [
                                    WJBT_1,
                                    XXSJ_2,
                                    WBT_3,
                                    GGBH_4,
                                    CRSJ_5,
                                    GGNX_6,
                                    ZDBH_7,
                                    DKWZ_8,
                                    ZDWZ_9,
                                    ZDZL_10,
                                    TDYT_11,
                                    GHTDYT_12,
                                    CRNX_13,
                                    SYNX_14,
                                    PZJGJWH_15,
                                    GHYDMJ_16,
                                    GHMJ_17,
                                    CRMJ_18,
                                    CRYDMJ_19,
                                    ZDCRMJ_20,
                                    JZMD_21,
                                    RJL_22,
                                    LDL_23,
                                    LDL_24,
                                    JZKZGD_25,
                                    JZKZZGD_26,
                                    JZXS_27,
                                    TZQD_28,
                                    TDGJBAH_29,
                                    SFSZD_30,
                                    TDXZTJ_31,
                                    JMBZJ_32,
                                    JMBZJ_72,
                                    QJJ_33,
                                    CRQSJ_34,
                                    JJFD_35,
                                    SFSZBLJ_36,
                                    GPKSSJ_37,
                                    GPJZSJ_38,
                                    HQCRWJSJ_39,
                                    TJJMSQSJ_40,
                                    BZJJZSJ_41,
                                    QRJMZGSJ_42,
                                    LXDZ_43,
                                    LXDH_44,
                                    LXR_45,
                                    BZJZH_86,
                                    BZJZH_87,
                                    BZJZH_88,
                                    CRJZH_97,
                                    CRJZH_98,
                                    CRJZH_99,
                                    crawlingTime,
                                    url,
                                    md5Mark,
                                ]
                                results = ''
                                for _ in csvFile:
                                    try:
                                        if _ and _ != '|' * len(_):
                                            results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace(
                                                '\r',
                                                '').replace(
                                                r'\xa0', '').replace('\xa0', '') + ','
                                        else:
                                            results += ','
                                    except Exception as e:
                                        results += ','
                                        self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                 level=logging.ERROR)
                                with open(self.pathDetail, 'a+') as fp:
                                    fp.write(results)
                                    fp.write('\n')
                                self.log(f'数据获取成功', level=logging.INFO)
                                yield
                        else:
                            self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url)
            else:
                for item in ['宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items)[0].split('宗地编号')[1:]]:
                    # 宗地编号
                    ZDBH_7 = reFunction('宗地编号:*\s*([\w\-]*)\s', item).replace('宗地位置', '').replace('地块名称', '')
                    # 宗地坐落
                    ZDZL_10 = reFunction('宗地坐落:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 土地用途
                    TDYT_11 = reFunction('土地用途:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 出让年限
                    CRNX_13 = reFunction('出让年限:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 宗地出让面积
                    ZDCRMJ_20 = reFunction('宗地\s*面积:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 建筑密度
                    JZMD_21 = reFunction('建筑密度\(%\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 容积率
                    RJL_22 = reFunction('容积率:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 绿地率( %)
                    LDL_24 = reFunction('绿化率\(%\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 建筑控制高度(米)
                    JZKZZGD_26 = reFunction('建筑限高\(米\):*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 投资强度(万元 / 公顷)
                    TZQD_28 = reFunction('投资强度:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 土地估价备案号
                    TDGJBAH_29 = reFunction('土地估价备案号:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 现状土地条件
                    TDXZTJ_31 = reFunction('土地现状:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 竞买保证金
                    JMBZJ_32 = reFunction('保证金:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 起叫价
                    QJJ_33 = reFunction('起始价:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 加价幅度
                    JJFD_35 = reFunction('加价幅度:*\s*([()\w\.::—\(\)〔〕㎡㎡≤≥<《》\-\/\%,;,、\.﹪]*)\s', item)
                    # 挂牌开始时间、
                    GPKSSJ_37 = reFunction('挂牌开始时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                    # 挂牌截止时间、
                    GPJZSJ_38 = reFunction('挂牌截止时间:*\s*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)

                    if self.name in DUPLICATE_SWITCH_LIST:
                        if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        if ZDBH_7:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                WJBT_1,
                                XXSJ_2,
                                WBT_3,
                                GGBH_4,
                                CRSJ_5,
                                GGNX_6,
                                ZDBH_7,
                                DKWZ_8,
                                ZDWZ_9,
                                ZDZL_10,
                                TDYT_11,
                                GHTDYT_12,
                                CRNX_13,
                                SYNX_14,
                                PZJGJWH_15,
                                GHYDMJ_16,
                                GHMJ_17,
                                CRMJ_18,
                                CRYDMJ_19,
                                ZDCRMJ_20,
                                JZMD_21,
                                RJL_22,
                                LDL_23,
                                LDL_24,
                                JZKZGD_25,
                                JZKZZGD_26,
                                JZXS_27,
                                TZQD_28,
                                TDGJBAH_29,
                                SFSZD_30,
                                TDXZTJ_31,
                                JMBZJ_32,
                                JMBZJ_72,
                                QJJ_33,
                                CRQSJ_34,
                                JJFD_35,
                                SFSZBLJ_36,
                                GPKSSJ_37,
                                GPJZSJ_38,
                                HQCRWJSJ_39,
                                TJJMSQSJ_40,
                                BZJJZSJ_41,
                                QRJMZGSJ_42,
                                LXDZ_43,
                                LXDH_44,
                                LXR_45,
                                BZJZH_86,
                                BZJZH_87,
                                BZJZH_88,
                                CRJZH_97,
                                CRJZH_98,
                                CRJZH_99,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace('\r',
                                                                                                                   '').replace(
                                            r'\xa0', '').replace('\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                             level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url)

        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
コード例 #14
0
    def parse_detail(self, response):
        """Parse a land-transfer result notice and append one CSV row per lot.

        Notice-level fields (title, dates, contact details) are extracted
        once from the page text.  The per-lot fields come from one of three
        extractors, chosen by which keywords occur in the text.  Every lot
        with a non-empty location is appended to ``self.pathDetail`` as a
        comma-terminated line.

        When ``self.name`` is listed in ``DUPLICATE_SWITCH_LIST`` each lot
        whose ``md5Mark`` is already known to ``self.redisClient`` bumps
        ``self.duplicateUrl``; once that counter reaches 50 the spider is
        asked to close instead of writing further rows.

        Any exception raised while parsing is logged together with the
        request URL and swallowed, so a malformed page never propagates.

        Args:
            response: Response of the detail page.  ``response.meta`` is
                expected to carry the listing ``title``.

        Yields:
            None, once for every row written.
        """
        # TODO: open issue around actively closing the spider from here
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            htmlTable = htmlTableTransformer()

            # Contact details column: never extracted, always written empty.
            LXFS_65 = ''

            # Character class shared by the notice-level value captures.
            contactValue = r'[()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]'

            # --- fields shared by every lot of the notice ---
            # File title
            WJBT_48 = response.meta.get('title').strip()
            # Publication date
            XXSJ_49 = reFunction(
                r'[\d\-]*',
                data.xpath('//p[@class="sub-cp"]/text()').extract_first())
            # Body title (same as the file title)
            ZWBT_50 = WJBT_48
            # Notice number
            GGBH_51 = ''.join(
                data.xpath(
                    '//div[@class="substance"]/p[position() <5]/.//*[contains(text(),"号")]/ancestor::p/.//text()'
                ).extract())
            # Transfer date
            CRSJ_52 = reFunction(
                r'定于\s*([()【】\w\.—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*)[,;,、在]', items)
            # Notice type
            GGNX_53 = '出让结果'

            # Crawl date
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # Crawled URL
            url = response.url
            # Unique fingerprint of the notice
            md5Mark = encrypt_md5(url + WJBT_48 + XXSJ_49)

            # Publicity period
            GSQ_64 = reFunction(rf'公示期:*\s*({contactValue}*)[\s。]', items)
            # Contact organisation
            LXDW_77 = reFunction(rf'联系单位:*({contactValue}*)\s', items)
            # Organisation address
            DWDZ_66 = reFunction(rf'单位地址:*({contactValue}*)\s', items)
            # Postal code
            YZBM_67 = reFunction(rf'邮政编码:*({contactValue}*)\s', items)
            # Contact phone
            LXDH_68 = reFunction(rf'联系电话:*({contactValue}*)\s', items)
            # Contact person
            LXR_69 = reFunction(rf'联\s*系\s*人:*({contactValue}*)\s', items)
            # E-mail (same class as above plus "@")
            DZYJ_70 = reFunction(
                r'电子邮件:*([()\w\.:: —\(\)〔〕㎡㎡≤≥《》@\-\/\%,;,、\.﹪]*)\s', items)

            # --- pick the per-lot extractor; all three are lazy generators,
            # so extraction of lot N+1 happens only after lot N was handled ---
            if '宗地编号' in items or '土地位置' in items:
                lots = self._lotsFromTable(response, htmlTable)
            elif '地块编号' in items:
                lots = self._lotsFromNumberedText(items)
            else:
                lots = self._lotsFromFreeText(items)

            for (DKBH_54, DKWZ_55, TDYT_56, TDMJ_57, CRNX_58, CJJ_59,
                 SRDW_60, TDXZTJ_61, TDSYTJ_62, BZ_63) in lots:
                # Already seen: count the duplicate
                if (self.name in DUPLICATE_SWITCH_LIST
                        and self.redisClient.isExist(md5Mark)):
                    self.duplicateUrl += 1

                if self.duplicateUrl >= 50:
                    self.crawler.engine.close_spider(
                        self, 'response msg info %s, job duplicated!' %
                        response.url)
                    continue
                if not DKWZ_55:
                    # A lot without a location is not stored.
                    continue

                # Duplicate check passed: store the row
                csvFile = [
                    WJBT_48,
                    XXSJ_49,
                    ZWBT_50,
                    GGBH_51,
                    CRSJ_52,
                    GGNX_53,
                    DKBH_54,
                    DKWZ_55,
                    TDYT_56,
                    TDMJ_57,
                    CRNX_58,
                    CJJ_59,
                    SRDW_60,
                    TDXZTJ_61,
                    TDSYTJ_62,
                    BZ_63,
                    GSQ_64,
                    LXFS_65,
                    DWDZ_66,
                    YZBM_67,
                    LXDH_68,
                    LXR_69,
                    LXDW_77,
                    DZYJ_70,
                    crawlingTime,
                    url,
                    md5Mark,
                ]
                cells = []
                for field in csvFile:
                    try:
                        # Empty values and values made only of "|" become
                        # empty cells; every cell ends with a comma.
                        if field and field != '|' * len(field):
                            cells.append(
                                field.replace(',', ' ').replace('\n', '')
                                .replace('\t', '').replace('\r', '')
                                .replace(r'\xa0', '').replace('\xa0', '')
                                + ',')
                        else:
                            cells.append(',')
                    except Exception as e:
                        cells.append(',')
                        self.log(
                            f'{getVariableName(field).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                            level=logging.ERROR)
                results = ''.join(cells)
                with open(self.pathDetail, 'a+') as fp:
                    fp.write(results)
                    fp.write('\n')
                self.log('数据获取成功', level=logging.INFO)
                yield

        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)

    def _lotField(self, label, text):
        """Return the value captured after the regex fragment ``label`` in ``text``."""
        return reFunction(
            rf'{label}:*\s*([()\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', text)

    def _lotsFromTable(self, response, htmlTable):
        """Yield one 10-field lot tuple per row of the first HTML table."""
        soup = BeautifulSoup(response.body.decode('utf-8'))
        table = soup.find('table')
        # Used below as a mapping of column header -> list of cell values.
        tdData = htmlTable.tableTrTdRegulationToList(table)

        def cell(row, *headers):
            # First header that has a non-empty column wins; '' otherwise.
            for header in headers:
                column = tdData.get(header)
                if column:
                    return column[row]
            return ''

        # The row count is taken from the first column of the mapping.
        for row in range(len(list(tdData.values())[0])):
            yield (
                # Lot number
                cell(row, '宗地编号'),
                # Lot location
                cell(row, '宗地位置', '土地位置'),
                # Land use
                cell(row, '土地用途', '规划土地用途'),
                # Land area
                cell(row, '土地面积(m2)', '出让土地面积(㎡)'),
                # Transfer term
                cell(row, '使用年限', '出让年限'),
                # Transaction price
                cell(row, '成交价(万元)', '成交价(人民币)'),
                # Transferee
                cell(row, '受让单位', '竞买人(单位)'),
                # Current land condition: not available in the table layout
                '',
                # Land-use conditions
                cell(row, '土地使用条件'),
                # Remarks: not available in the table layout
                '',
            )

    def _lotsFromNumberedText(self, items):
        """Yield one lot tuple per "地块编号" chunk found between "一" and "二、"."""
        section = re.findall(r'一([\s\S]*)二、', items)[0]
        for chunk in section.split('地块编号')[1:]:
            # Restore the label that split() consumed.
            item = '地块编号' + chunk
            yield (
                # Lot number
                reFunction(r'地块编号:*\s*([\w\-]*)\s', item),
                # Lot location
                self._lotField('地块位置', item),
                # Land use
                self._lotField('土地用途', item),
                # Land area (hectares)
                self._lotField(r'土地面积\(公顷\)', item),
                # Transfer term
                self._lotField('出让年限', item),
                # Transaction price (10k yuan)
                self._lotField(r'成交价\(万元\)', item),
                # Transferee
                self._lotField('受让单位', item),
                # Current land condition
                self._lotField('土地现状', item),
                # Land-use conditions
                self._lotField('土地使用条件', item),
                # Remarks
                self._lotField('备注', item),
            )

    def _lotsFromFreeText(self, items):
        """Yield the single lot described by a notice without table or numbered lots."""
        yield (
            # Lot number: not present in this layout
            '',
            # Lot location
            self._lotField('地理位置', items),
            # Land use / land area: not present in this layout
            '',
            '',
            # Transfer term
            self._lotField('出让年限', items),
            # Transaction price
            # NOTE(review): the parentheses around 人民币 are unescaped here, so
            # they form a capturing group rather than matching literal
            # brackets as the escaped labels elsewhere do - confirm intent.
            reFunction(
                r'成交价格(人民币):*\s*([()\w\.::—\¥ (\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                items),
            # Transferee
            self._lotField('竞得人名称', items),
            # Current land condition
            self._lotField('土地现状', items),
            # Land-use conditions / remarks: not present in this layout
            '',
            '',
        )
コード例 #15
0
    def parse_detail(self, response):
        """Parse a single land-listing notice page and append one CSV row.

        The page body is decoded as GBK, flattened to text, and each field
        is pulled out with ``reFunction`` using a phrase-anchored pattern.
        The resulting row is appended to ``self.pathDetail`` as a
        comma-terminated line.

        Any exception raised while parsing is logged and swallowed after
        printing the request URL, so no row is written for that page.

        Args:
            response: Response of the detail page.  ``response.meta`` is read
                for ``title`` and ``ND``.

        Yields:
            None, once after the row has been written.
        """
        try:
            data = Selector(text=response.body.decode('gbk'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')

            # Character class shared by most of the value captures below.
            value = r'[()\w\.::  \(\)〔〕㎡≤≥《》\-\/\%,、\.﹪]'

            # TODO: shared fields
            # File title
            WJBT_6 = response.meta.get('title')
            # Publication time
            FBBT_7 = response.meta.get('ND')
            # Body title: first <p> heading, else first <div> heading,
            # else fall back to the file title.
            ZWBT_8_ = data.xpath(
                '//*[@id="dl_news"]/tr/td/table[2]/tr/td/p[1]/b/span/text()'
            ).extract_first() or data.xpath(
                '//*[@id="dl_news"]/tr/td/table[2]/tr/td/div[1]/b/span/text()'
            ).extract_first()
            ZWBT_8 = ZWBT_8_ or WJBT_6

            # Notice type: explicit label, else the last four characters of
            # the title; anything outside the known set is blanked.
            GGNX_9_ = reFunction(
                rf'公告类型:\s*([{self.reStr}]*)\s', items) or WJBT_6[-4:]
            GGNX_9 = GGNX_9_ if GGNX_9_ in ['出让公告', '补充公告', '地块公告', '地块公示'
                                            ] else ''
            # Lot location
            DKZL_10 = reFunction(
                rf'地块坐落于\s*([{self.reStr}]*)。', items) or reFunction(
                    rf'地块位于\s*([{self.reStr}]*)四至为', items)
            # Boundaries on four sides
            SZ_11 = reFunction(rf'四至为:\s*([{self.reStr}]*)。', items)
            # Current land condition
            TDXZ_12 = reFunction(
                r'现状为\s*([()\w\.:: ,,、;, \(\)〔〕㎡≤≥《》\-\/\%,、\.﹪]*)。',
                items) or reFunction(
                    r'[,,,、。]现状\s*([()\w\.:: ,,、;, \(\)〔〕㎡≤≥《》\-\/\%,、\.﹪]*)。',
                    items)
            # Transferred land area
            CRTDMJ_13 = reFunction(rf'出让土地面积\s*({value}*)[,,]', items)
            # Land use
            TDYT_14 = reFunction(rf'土地用途为\s*({value}*)[,,]', items)
            # Floor-area ratio
            RJL_15 = reFunction(rf'容积率\s*({value}*)[,,]', items)
            # Building coefficient
            JZXS_16 = reFunction(rf'建筑系数\s*({value}*)[,,]', items)
            # Building density
            JZMD_17 = reFunction(rf'建筑密度\s*({value}*)[,,]', items)
            # Green-space ratio
            LDL_18 = reFunction(rf'绿地率[为]*\s*({value}*)[,,]', items)
            # Building height limit
            JZXG_19 = reFunction(rf'建筑限高\s*({value}*)[,。,]', items)
            # Admitted industry category
            ZRHYNB_20 = reFunction(rf'准入产业类别为\s*({value}*)[,,]', items)
            # Investment intensity
            TZQD_21 = reFunction(rf'投资强度\s*({value}*)。', items)
            # Transfer term
            CRNX_22 = reFunction(rf'出让年[限期]为\s*({value}*)[,,]', items)
            # Starting price (currency label stripped)
            QSJ_23 = reFunction(
                rf'起始价为\s*({value}*)。', items).replace('人民币', '')
            # Deposit
            BZJ_24 = reFunction(rf'保证金人民币\s*({value}*)[,,]', items)
            # Deposit arrival deadline
            BZJJZSJ_25 = reFunction(rf'保证金到账期限为\s*({value}*)[,,]', items)
            # Registration deadline
            BMJZSJ_26 = reFunction(rf'报名截止时间为\s*({value}*)[,。,]', items)
            # Announcement period
            GGQ_27 = reFunction(rf'公告日期为\s*({value}*)[,,]', items)
            # Listing time
            # NOTE(review): the fallback pattern is identical to the primary
            # one as written, so it cannot match anything the first call did
            # not - it presumably meant a different delimiter; confirm.
            GPSJ_28 = reFunction(
                rf'挂牌时间自\s*({value}*)[,]', items) or reFunction(
                    rf'挂牌时间自\s*({value}*)[,]', items)
            # Listing venue: the text following the listing date range
            GPDD_29_ = reFunction(
                r'挂牌时间自(?:\d{4}年\d{1,2}月\d{1,2}日至\d{4}年\d{1,2}月\d{1,2}日[上下午]*\d{1,2}:\d{1,2})\s*([()\w\.::  \(\)〔〕,,㎡≤≥《》\-\/\%,、\.﹪]*)。',
                items).strip(',').strip(',')
            GPDD_29 = GPDD_29_ if GPDD_29_ else reFunction(
                r'挂牌时间自(?:\d{4}年\d{1,2}月\d{1,2}日至\d{4}年\d{1,2}月\d{1,2}日\d{1,2}:\d{1,2})\s*([()\w\.::  \(\)〔〕,,㎡≤≥《》\-\/\%,、\.﹪]*)。',
                items).strip(',').strip(',')
            # Bid increment
            ZJFD_30 = reFunction(rf'增价幅度为[人民币]*\s*({value}*)。', items)
            # Contact organisation
            LXDW_31 = reFunction(rf'联系单位:\s*({value}*)\s', items)
            # Contact person (cut off at a trailing phone label)
            LXR_32 = reFunction(
                rf'联系人:\s*({value}*)\s', items).split('联系电话')[0]
            # Contact phone
            LXDH_33 = reFunction(rf'联系电话:\s*({value}*)\s', items)

            # Crawl timestamp
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # Crawled URL
            url = response.url
            # Unique fingerprint of the page
            md5Mark = encrypt_md5(url)

            # Store the row
            csvFile = [
                WJBT_6,
                FBBT_7,
                ZWBT_8,
                GGNX_9,
                DKZL_10,
                SZ_11,
                TDXZ_12,
                CRTDMJ_13,
                TDYT_14,
                RJL_15,
                JZXS_16,
                JZMD_17,
                LDL_18,
                JZXG_19,
                ZRHYNB_20,
                TZQD_21,
                CRNX_22,
                QSJ_23,
                BZJ_24,
                BZJJZSJ_25,
                BMJZSJ_26,
                GGQ_27,
                GPSJ_28,
                GPDD_29,
                ZJFD_30,
                LXDW_31,
                LXR_32,
                LXDH_33,
                crawlingTime,
                url,
                md5Mark,
            ]
            cells = []
            for field in csvFile:
                try:
                    # Empty values and values made only of "|" become empty
                    # cells; every cell ends with a comma.
                    if field and field != '|' * len(field):
                        cells.append(
                            field.replace(',', ' ').replace('\n', '')
                            .replace('\r', '').replace(r'\xa0', '')
                            .replace('\xa0', '') + ',')
                    else:
                        cells.append(',')
                except Exception as e:
                    cells.append(',')
                    self.log(
                        f'{getVariableName(field).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            results = ''.join(cells)
            with open(self.pathDetail, 'a+') as fp:
                fp.write(results)
                fp.write('\n')
            self.log('数据获取成功', level=logging.INFO)
            yield
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
コード例 #16
0
    def parse_detail(self, response):
        try:
            data = Selector(text=response.body.decode('gbk'))
            items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '')
            # 共有字段
            supplyLandTitle = response.meta.get('supplyLandTitle')
            administration =  response.meta.get('administration')
            publishTime =  response.meta.get('publishTime')
            # detailPage
            # 写入时, 没有的字段置为空
            totalSupplyLand = ''
            yearSupplyPlan = ''
            industrialLand = ''
            businessLand = ''
            totalHousionSupply = ''
            low_rentLand = ''
            affordableHousing = ''
            pengGaiLand = ''
            low_rentpengGaiLand = ''
            pengGaiAffordableHousing = ''
            pengGaiCommercialHousing = ''
            commercialHousing = ''
            ortherHousingLand = ''
            publicServiceLand = ''
            transportationLand = ''
            waterAreaLand = ''
            specialLand = ''
            publicRentalLand = ''
            limitCommercialLand = ''
            mediumCommercialLand = ''
            totalCommercialLand = ''
            commercialRatio = ''

            if '公共管理与公共服务用地' in items and '合计' in items and '特殊用地' in items and '水域及水利设施用地' in items and reFunction('经济适用房用[地](?:\s*)([\S\s]*)(?:\s*)棚改用地', items):
                # 文件标题
                fileTitle = data.xpath('//td[@class="fh tac bw fwb f18-0 pl2 b0"]/text()').extract_first()
                # 总供应面积合计
                totalSupplyLand = reFunction('合计(?:\s*)([\d\.]*)(?:\s*)', items)
                # 供应计划年度
                yearSupplyPlan = reFunction('(\d{4})年度国有建设用地供应计划', items)
                # 工矿仓储用地: 供应面积(公顷)、新增、存量、
                industrialLand = reFunction('工矿仓储用地(?:\s*)([\S\s]*)(?:\s*)商服用地', items)
                # 商服用地: 供应面积(公顷)、新增、存量
                businessLand = reFunction('商服用地(?:\s*)([\S\s]*)(?:\s*)住宅用地', items)
                # # 住房供地总量
                # totalHousionSupply = reFunction('小计(?:\s*)([\d\.]*)(?:\s*)([\d\.]*)(?:\s*)([\d\.]*)(?:\s*)', items)[0] if reFunction('小计(?:\s*)([\d\.]*)(?:\s*)([\d\.]*)(?:\s*)([\d\.]*)(?:\s*)', items) else ' '
                # 住宅用地 - 廉租房用地: 供应面积(公顷)、新增、存量、
                low_rentLand = '|'.join(reFunction('廉租房用地(?:\s*)([\S\s]*)(?:\s*)棚改用地', items))
                # 住宅用地经济适用房用地: 供应面积(公顷)、新增、存量
                affordableHousing = '|'.join(reFunction('经济适用房用[地](?:\s*)([\S\s]*)(?:\s*)棚改用地', items))
                # 住宅用地 - 棚改用地
                pengGaiLand = '|'.join(reFunction('棚改用地(?:\s*)([\S\s]*)(?:\s*)经济适用房用', items))
                # # 住宅用地棚改用地廉租房: 供应面积(公顷) ,新增、存量
                # low_rentpengGaiLand = '|'.join(reFunction('棚改用地(?:\s*)([\S\s]*)(?:\s*)经济适用房用', items))
                # 住宅用地 - 商品房用地: 供应面积(公顷)、新增、存量
                commercialHousing = '|'.join(reFunction('商品房用地(?:\s*)([\S\s]*)(?:\s*)其他用地', items))
                # 住宅用地 - 其他用地: 供应面积(公顷)、新增、存量
                ortherHousingLand = '|'.join(reFunction('其他用地(?:\s*)([\S\s]*)(?:\s*)小计', items))
                # 公共管理与公共服务用地: 供应面积(公顷)、新增、存量
                publicServiceLand = '|'.join(reFunction('公共管理与公共服务用地(?:\s*)([\S\s]*)(?:\s*)交通运输用地', items))
                # 交通运输用地: 供应面积(公顷)、新增、存量
                transportationLand = '|'.join(reFunction('交通运输用地(?:\s*)([\S\s]*)(?:\s*)水域及水利设施用地', items))
                # 水域及水利设施用地: 供应面积(公顷)、新增、存量
                waterAreaLand = '|'.join(reFunction('水域及水利设施用地(?:\s*)([\S\s]*)(?:\s*)特殊用地', items))
                # 特殊用地: 供应面积(公顷)、新增、存量
                specialLand = '|'.join(reFunction('特殊用地(?:\s*)([\S\s]*)(?:\s*)合计', items))
                # 唯一标识
                # 爬取时间
                crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # 爬取地址url
                url = response.url
                md5Mark = encrypt_md5(fileTitle + totalSupplyLand + yearSupplyPlan + url)
                csvFile = [administration, supplyLandTitle, publishTime, fileTitle, totalSupplyLand, yearSupplyPlan,
                           industrialLand, businessLand,
                           totalHousionSupply, low_rentLand, affordableHousing, pengGaiLand, low_rentpengGaiLand,
                           pengGaiAffordableHousing, pengGaiCommercialHousing, commercialHousing, ortherHousingLand,
                           publicServiceLand,
                           transportationLand, waterAreaLand, specialLand, publicRentalLand, limitCommercialLand,
                           mediumCommercialLand,
                           totalCommercialLand, commercialRatio, crawlingTime, url, md5Mark]
                self.fileDetail.write(','.join([_.replace(',', ' ').replace('\n', '').replace('\r', '') if _ else _ for _ in csvFile]))
                self.fileDetail.write('\n')
                yield

            elif '总供应面积合计:' in items and '各类棚户区改造用地' in items \
                    and reFunction('总供应面积合计:(?:\s*)([\S\s]*)(?:\s*)供应计划年度', items) \
                    and len(re.split('\s*', reFunction('商品住房(?:\s*)([\s\S]*)', reFunction('总 量(?:\s*)([\s\S]*)%', items)))) > 3:
                # 文件标题
                fileTitle = data.xpath('//td[@class="fh tac bw fwb f18-0 pl2 b0"]/text()').extract_first()
                for item in [reFunction('([\S\s]*)(?:[\d\.]*%)', '总供应面积合计:' + _) for _ in re.findall('([\s\S]*)', items)[0].split('总供应面积合计:')[1:]]:
                    # 总供应面积合计
                    totalSupplyLand = reFunction('总供应面积合计:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                    # 供应计划年度
                    yearSupplyPlan = reFunction('供应计划年度:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                    # 商服用地: 供应面积(公顷)、新增、存量
                    businessLand = reFunction('商服用地:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                    # 工矿仓储用地: 供应面积(公顷)、新增、存量、
                    industrialLand = reFunction('工矿仓储用地:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                    # businessLand
                    # 住房供地总量
                    totalHousionSupply = reFunction('住房供地总量:(?:\s*)([\S\s]*)(?:\s*)其中存量', item)
                    # 先获取数字在一一对应
                    dataList = re.split('\s*', reFunction('商品住房(?:\s*)([\s\S]*)', reFunction('总 量(?:\s*)([\s\S]*)[%]?', item)))
                    # 住宅用地 - 廉租房用地: 供应面积(公顷)、新增、存量、
                    low_rentLand = dataList[0]
                    # 住宅用地经济适用房用地: 供应面积(公顷)、新增、存量
                    affordableHousing = dataList[1]
                    # 住宅用地 - 棚改用地  - 总量
                    pengGaiLand = dataList[2]
                    # 住宅用地棚改用地廉租房: 供应面积(公顷) 新增、存量
                    low_rentpengGaiLand = dataList[3]
                    # 住宅用地 - 棚改用地经济适用房用地: 供应面积公顷)、新增、存量
                    pengGaiAffordableHousing = dataList[4]
                    # 住宅用地 - 棚改用地 - 中小套型商品住房: 供应面积(公顷)、新增、存量
                    pengGaiCommercialHousing = dataList[5]
                    # # 住宅用地 - 商品房用地: 供应面积(公顷)、新增、存量
                    # commercialHousing = dataList[9]
                    # 公共租赁房: 划拨用地面积、出让用地面积
                    publicRentalLand = dataList[6] + '|' + dataList[7]
                    # 限价商品房用地面积
                    limitCommercialLand = dataList[8]
                    # 商品住房用地 - 中小套型商品住房用地
                    mediumCommercialLand = dataList[10]
                    # 商品住房用地 - 总量
                    totalCommercialLand = dataList[9]
                    # 保障性安居工程和中小套型商品房用地占比( %)
                    commercialRatio = dataList[11] + '%' if '%' not in dataList[11] else dataList[11]
                    # 公共管理与公共服务用地: 供应面积(公顷)、新增、存量
                    publicServiceLand = reFunction('公共管理与服务用地:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                    # 交通运输用地: 供应面积(公顷)、新增、存量
                    transportationLand = reFunction('交通运输用地:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                    # 水域及水利设施用地: 供应面积(公顷)、新增、存量
                    waterAreaLand = reFunction('水域及水利设施用地:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                    # 特殊用地: 供应面积(公顷)、新增、存量
                    specialLand = reFunction('特殊用地:(?:\s*)([\d\w\.]*)(?:\s*)', item)
                    # 爬取时间
                    crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # 爬取地址url
                    url = response.url
                    md5Mark = encrypt_md5(fileTitle + totalSupplyLand + yearSupplyPlan + url)
                    csvFile = [administration, supplyLandTitle, publishTime, fileTitle, totalSupplyLand, yearSupplyPlan,
                               industrialLand, businessLand,
                               totalHousionSupply, low_rentLand, affordableHousing, pengGaiLand, low_rentpengGaiLand,
                               pengGaiAffordableHousing, pengGaiCommercialHousing, commercialHousing, ortherHousingLand,
                               publicServiceLand,
                               transportationLand, waterAreaLand, specialLand, publicRentalLand, limitCommercialLand,
                               mediumCommercialLand,
                               totalCommercialLand, commercialRatio, crawlingTime, url, md5Mark]
                    self.fileDetail.write(','.join([_.replace(',', ' ').replace('\n', '').replace('\r', '') if _ else _ for _ in csvFile]))
                    self.fileDetail.write('\n')
                    yield
            else:
                # 文件标题
                fileTitle = data.xpath('//td[@class="fh tac bw fwb f18-0 pl2 b0"]/text()').extract_first()
                # 商服用地:供应面积(公顷)、新增、存量
                businessLand = reFunction('商服用地[占]?(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)
                # 工矿仓储用地:供应面积(公顷)、新增、存量、
                industrialLand = reFunction('工矿仓储用地[占]?(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)
                # 住房供地总量
                totalHousionSupply = reFunction('住宅用地[占]?(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)
                # 公共管理与公共服务用地
                publicServiceLand = reFunction('公共管理与公共服务用地[占]?(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)
                # 交通运输用地:供应面积(公顷)、新增、存量
                transportationLand = reFunction('交通运输用地[占]?(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)
                # 水域及水利设施用地:供应面积(公顷)、新增、存量
                waterAreaLand = reFunction('水域及水利设施用地[占]?(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)
                # 特殊用地: 供应面积(公顷)、新增、存量
                specialLand = reFunction('特殊用地(?:\s*)([[\d}/\.{]*)(?:\s*)公顷', items)

                # 爬取时间
                crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # 爬取地址url
                url = response.url
                md5Mark = encrypt_md5(fileTitle + totalSupplyLand + yearSupplyPlan + url)
                csvFile = [administration,supplyLandTitle,publishTime,fileTitle,totalSupplyLand,yearSupplyPlan,industrialLand,businessLand,
                            totalHousionSupply,low_rentLand,affordableHousing,pengGaiLand,low_rentpengGaiLand,
                            pengGaiAffordableHousing,pengGaiCommercialHousing,commercialHousing,ortherHousingLand,publicServiceLand,
                            transportationLand,waterAreaLand,specialLand,publicRentalLand,limitCommercialLand,mediumCommercialLand,
                            totalCommercialLand,commercialRatio, crawlingTime, url, md5Mark]
                self.fileDetail.write(','.join([_.replace(',', ' ').replace('\n', '').replace('\r', '') if _ else _ for _ in csvFile]))
                self.fileDetail.write('\n')
                yield

        except Exception as e:
            self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
コード例 #17
0
    def parse_detail(self, response):
        """Parse a notice detail page and append one CSV row per parcel.

        The response body is decoded as GBK and flattened to plain text.
        Notice-level fields (document pick-up window, deposit deadline,
        contact and bank details) are extracted once with ``reFunction``.
        The text before "二、" is then split on "宗地编号"; every chunk is
        parsed as one parcel and written to ``self.fileDetail``.

        Args:
            response: Response whose ``meta`` carries ``noticeType``,
                ``administration``, ``supplyNoticeTitle`` and
                ``publishTime``.

        Yields:
            None, once, after the page has been processed without error.

        Any exception raised while parsing is logged at ERROR level and not
        propagated.
        """
        # Timestamp layout expected in the notice text,
        # e.g. 2020年05月19日09时00分.
        cnTimeFormat = "%Y年%m月%d日%H时%M分"

        def normalizeTime(raw):
            """Return *raw* as 'YYYY-MM-DD HH:MM', or *raw* itself if it does not parse."""
            try:
                return time.strftime("%Y-%m-%d %H:%M",
                                     time.strptime(raw, cnTimeFormat))
            except (ValueError, TypeError):
                # Keep the scraped text rather than losing the value.
                return raw

        try:
            data = Selector(text=response.body.decode('gbk'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')

            # Fields shared by every parcel on the page.
            fileTitle = data.xpath(
                '//td[@class="fh tac bw fwb f18-0 pl2 b0"]/text()'
            ).extract_first()
            textTitle = data.xpath(
                '//td[@class="fh vat bw f8-0 b1"]/table[1]//tr[1]/td[@align="center"]/text()'
            ).extract_first()
            noticeType = response.meta.get('noticeType').strip()
            administration = response.meta.get('administration').strip()
            supplyNoticeTitle = response.meta.get('supplyNoticeTitle').strip()
            publishTime = response.meta.get('publishTime').strip()

            # Each numbered section of the notice is located once and reused.
            sectionFour = reFunction(r'四、[\s\S]*五、', items)
            sectionFive = reFunction(r'五、[\s\S]*六、', items)
            sectionEight = reFunction(r'八、[\s\S]*', items)

            # Window and place for obtaining the transfer documents.
            transferTime = reFunction(
                r'申请人可于((?:[\w\s\u4e00-\u9fa5]*)至(?:[\s\w\u4e00-\u9fa5]*))到',
                sectionFour).strip()
            transferAddr = reFunction(
                r'申请人可于(?:[\w\s\u4e00-\u9fa5]*)至(?:[\s\w\u4e00-\u9fa5]*)到 ([\s\S\w\u4e00-\u9fa5.\n\r]*出让文件)',
                sectionFour).strip().replace('获取 挂牌 出让文件', '')

            # Deposit deadline and bidder-qualification confirmation time.
            # Each value is normalised on its own, so one unparsable
            # timestamp no longer forces the other back to raw text.
            depositTime = normalizeTime(
                reFunction(r'保证金的截止时间为([\w\s\u4e00-\u9fa5]*)。',
                           sectionFive).strip())
            affirmBuyTime = normalizeTime(
                reFunction(r'将在([\w\s\u4e00-\u9fa5]*)前确认其竞买资格',
                           sectionFive).strip())

            # Contact address.
            address = reFunction(r'联系地址:([\s\S]*)联 系 人',
                                 sectionEight).strip()
            # Contact phone.
            tel = reFunction(r'联系电话:([\s\S]*)开户单位',
                             sectionEight).strip()
            # Contact person: the text between the "联 系 人" and "联系电话"
            # labels. The previous code reused the phone pattern here, so
            # this column always duplicated ``tel``.
            # NOTE(review): label order is inferred from the address and
            # phone patterns above - confirm against a live page.
            linkman = reFunction(r'联 系 人:([\s\S]*)联系电话',
                                 sectionEight).strip()
            # Account holder.
            accountOpener = reFunction(r'开户单位:([\s\S]*)开户银行',
                                       sectionEight).strip()
            # Bank of deposit.
            depositBank = reFunction(r'开户银行:([\s\S]*)银行帐号',
                                     sectionEight).strip()
            # Bank account number.
            bankAccount = reFunction(r'银行帐号:([\w]*)(?:[\S]*)',
                                     sectionEight).strip()

            if '宗地编号' in items:
                # Everything before "二、" holds the parcel descriptions;
                # each "宗地编号" label starts a new parcel.
                parcelText = re.findall(r'([\s\S]*)二、', items)[0]
                for chunk in parcelText.split('宗地编号')[1:]:
                    item = '宗地编号' + chunk
                    # Parcel number.
                    parcelNumber = reFunction(
                        r'宗地编号:(?:\s*)([\s\S]*)宗地总面积', item).strip()
                    # Parcel area.
                    parcelArea = reFunction(
                        r'宗地总面积:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                    # Parcel location.
                    parcelLocation = reFunction(
                        r'宗地坐落:(?:\s*)([\s\S]*)出让年限', item).strip()
                    # Transfer term.
                    transferTimeLimit = reFunction(
                        r'出让年限:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                    # Plot ratio.
                    plotRatio = reFunction(
                        r'容积率:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                    # Building density (%).
                    buildingDensity = reFunction(
                        r'建筑密度\(%\):([\s\S]*)绿化率', item).strip()
                    # Green ratio (%).
                    greenRatio = reFunction(
                        r'绿[地化]率\(%\):([\s\S]*)建筑限高', item).strip()
                    # Building height limit (m).
                    buldingHP = reFunction(
                        r'建筑限高\(米\):(?:\s*)([\w}{/]*)主要用途',
                        item).strip()
                    # Main land use.
                    landPurpose = reFunction(
                        r'主要用途:(?:\s*)([\w}{/]*)(?:\s*)', item).strip()
                    # Investment intensity.
                    investmentIntensity = reFunction(
                        r'投资强度:(?:\s*)([\w}{/]*)(?:\s*)保证金',
                        item).strip()
                    # Cash deposit.
                    cashDeposit = reFunction(
                        r'保证金:(?:\s*)([\w}{/]*)(?:\s*)', item).strip()
                    # Valuation report filing number.
                    evaluateNum = reFunction(
                        r'估价报告备案号(?:\s*)([A-Za-z0-9_}{/]*)(?:\s*)',
                        item).strip()
                    # Current land condition: the leading run of Chinese
                    # characters, colons and spaces between the filing
                    # number label and the starting price label.
                    landCondition = reFunction(
                        r'([:\u4e00-\u9fa5 ]*)',
                        reFunction(r'估价报告备案号:([\s\S]*)起始价',
                                   item)).strip()
                    # TODO starting price
                    startPrice = reFunction(
                        r'起始价:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                    # Bid increment.
                    bidIncrenment = reFunction(
                        r'加价幅度:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                    # Listing deadline / start. On a parse failure the raw
                    # text stays in these two columns; previously the
                    # fallback overwrote depositTime and affirmBuyTime.
                    hangOutDeadTime = normalizeTime(reFunction(
                        r'挂牌[(竞价)]*截止时间:(?:\s*)([\w}/{]*)(?:\s*)',
                        item).strip())
                    hangOutStartTime = normalizeTime(reFunction(
                        r'挂牌[(竞价)]*开始时间:(?:\s*)([\w}/{]*)(?:\s*)',
                        item).strip())
                    # Supporting infrastructure.
                    supportingInfrastructure = reFunction(
                        r'基础设施配套:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                    # Whether the land has been levelled.
                    landItact = reFunction(
                        r'是否土地平整[: :](?:\s*)([\w}/{]*)(?:\s*)',
                        item).strip()
                    # Sewage facility status.
                    sewageDisposalFacility = reFunction(
                        r'排污设施状况:(?:\s*)([\w}/{]*)(?:\s*)', item).strip()
                    # Remark.
                    remark = reFunction(r'备注:([\s\S]*)(?:\s*)', item).strip()

                    # Crawl timestamp.
                    crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                                 time.localtime())
                    # Source URL.
                    url = response.url
                    # Record identifier.
                    md5Mark = encrypt_md5(fileTitle + publishTime +
                                          transferTime + url)

                    # The final '\n' cell is reduced to '' by the sanitiser
                    # below, so every row ends with one empty column; kept so
                    # the column count of the output does not change.
                    csvFile = [
                        fileTitle, textTitle, noticeType, administration,
                        supplyNoticeTitle, publishTime, transferTime,
                        transferAddr, depositTime, affirmBuyTime, address, tel,
                        linkman, accountOpener, depositBank, bankAccount,
                        parcelNumber, parcelArea, parcelLocation,
                        transferTimeLimit, plotRatio, buildingDensity,
                        greenRatio, buldingHP, landPurpose,
                        investmentIntensity, cashDeposit, evaluateNum,
                        landCondition, startPrice, bidIncrenment,
                        hangOutDeadTime, hangOutStartTime,
                        supportingInfrastructure, landItact,
                        sewageDisposalFacility, remark, crawlingTime, url,
                        md5Mark, '\n'
                    ]
                    # Store the row. Commas and line breaks inside a cell
                    # would break the CSV layout; a None cell (e.g. a missing
                    # textTitle) is written as empty instead of making
                    # str.join raise.
                    self.fileDetail.write(','.join(
                        (cell or '').replace(',', ' ').replace(
                            '\n', '').replace('\r', '')
                        for cell in csvFile))
                    self.fileDetail.write('\n')

            yield
            # TODO
        except Exception as e:
            self.log(f'详情页数据解析失败, 错误: {e}', level=logging.ERROR)
コード例 #18
0
    def parse_detail(self, response):
        # TODO 主动关闭爬虫问题
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            WJBT_27 = ''
            SJ_28 = ''
            LY_29 = ''
            WJBT_30 = ''
            ZDBH_31 = ''
            BH_32 = ''
            DKWZ_33 = ''
            TDWZ_34 = ''
            TDMJM_35 = ''
            TDMJPFM_36 = ''
            TDYT_37 = ''
            CJJ_38 = ''
            JDR_39 = ''
            GSQ_40 = ''
            LXDW_41 = ''
            DWDZ_42 = ''
            YZBM_43 = ''
            LXDH_44 = ''
            # TODO 共有字段  reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)
            # 文件标题
            WJBT_27 = response.meta.get('title')
            # 时间
            SJ_28 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()'
            ).extract_first()
            # 来源
            LY_29 = data.xpath(
                '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()'
            ).extract_first()
            # 文件编号
            WJBT_30 = data.xpath(
                '//div[@class="ztzx_frame_content"]/div[1]/text()'
            ).extract_first()
            # 公示期
            GSQ_40 = reFunction(
                f'公示期:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)。', items)
            # 联系单位
            LXDW_41 = reFunction(
                '联系单位:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 单位地址
            DWDZ_42 = reFunction(
                '单位地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 邮政编码
            YZBM_43 = reFunction(
                '邮政编码:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 联系电话
            LXDH_44 = reFunction(
                '联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
            # 爬取时间
            crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
            # 爬取地址url
            url = response.url
            # 唯一标识
            md5Mark = encrypt_md5(url + WJBT_27 + SJ_28)

            soup = BeautifulSoup(
                response.body.decode('utf-8').replace('thead', 'tbody'))
            table = soup.find('table')
            htmlTable = htmlTableTransformer()
            if table:
                if '竣工时间' in items:
                    try:
                        tdData = htmlTable.tableTrTdUNregulationToList(table)
                        for _ in range(len(list(tdData.values())[0])):
                            # 宗地编号
                            ZDBH_31 = tdData.get('地块编号')[_] if tdData.get(
                                '地块编号') else ''
                            # 地块位置
                            DKWZ_33 = tdData.get('位置')[_] if tdData.get(
                                '位置') else ''
                            # 土地位置
                            TDWZ_34 = tdData.get('位置')[_] if tdData.get(
                                '位置') else ''
                            # 土地面积(亩)
                            TDMJM_35 = tdData.get(
                                '出让面积平方米/亩')[_] if tdData.get(
                                    '出让面积平方米/亩') else ''
                            # 土地面积(平方米)
                            TDMJPFM_36 = tdData.get(list(
                                tdData.keys())[7])[_] if tdData.get(
                                    list(tdData.keys())[7]) else ''
                            # 土地用途
                            TDYT_37 = tdData.get('用途')[_] if tdData.get(
                                '用途') else ''
                            # 成交价(万元)
                            CJJ_38 = tdData.get('成交价(万元)')[_] if tdData.get(
                                '成交价(万元)') else tdData.get(
                                    '成交价(万元)')[_] if tdData.get(
                                        '成交价(万元)') else ''
                            # 竞得人
                            JDR_39 = tdData.get('受让人')[_] if tdData.get(
                                '受让人') else ''
                            # 写入数据
                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(
                                        md5Mark):  # 存在, 去重计数
                                    self.duplicateUrl += 1

                            if self.duplicateUrl < 50:
                                if TDYT_37:
                                    # 重复效验通过, 存储数据
                                    csvFile = [
                                        WJBT_27,
                                        SJ_28,
                                        LY_29,
                                        WJBT_30,
                                        ZDBH_31,
                                        BH_32,
                                        DKWZ_33,
                                        TDWZ_34,
                                        TDMJM_35,
                                        TDMJPFM_36,
                                        TDYT_37,
                                        CJJ_38,
                                        JDR_39,
                                        GSQ_40,
                                        LXDW_41,
                                        DWDZ_42,
                                        YZBM_43,
                                        LXDH_44,
                                        crawlingTime,
                                        url,
                                        md5Mark,
                                    ]
                                    results = ''
                                    for _ in csvFile:
                                        try:
                                            if _ and _ != '|' * len(_):
                                                results += _.replace(
                                                    ',', ' '
                                                ).replace('\n', '').replace(
                                                    '\r', '').replace(
                                                        r'\xa0', '').replace(
                                                            '\xa0', '') + ','
                                            else:
                                                results += ','
                                        except Exception as e:
                                            results += ','
                                            self.log(
                                                f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                level=logging.ERROR)
                                    with open(self.pathDetail, 'a+') as fp:
                                        fp.write(results)
                                        fp.write('\n')
                                    self.log(f'数据获取成功', level=logging.INFO)
                                    yield
                    except:
                        for tdData in table.find_all('tr')[2:]:
                            # 宗地编号
                            ZDBH_31 = tdData.find_all('td')[4].string.strip()
                            # 地块位置
                            DKWZ_33 = tdData.find_all('td')[5].string.strip()
                            # 土地位置
                            TDWZ_34 = tdData.find_all('td')[5].string.strip()
                            # 土地面积(亩)
                            TDMJM_35 = tdData.find_all('td')[6].string.strip()
                            # 土地面积(平方米)
                            TDMJPFM_36 = tdData.find_all(
                                'td')[7].string.strip()
                            # 土地用途
                            TDYT_37 = tdData.find_all('td')[8].string.strip()
                            # 成交价(万元)
                            CJJ_38 = tdData.find_all('td')[9].string.strip()
                            # 竞得人
                            JDR_39 = tdData.find_all('td')[3].string.strip()
                            # 写入数据
                            if self.name in DUPLICATE_SWITCH_LIST:
                                if self.redisClient.isExist(
                                        md5Mark):  # 存在, 去重计数
                                    self.duplicateUrl += 1

                            if self.duplicateUrl < 50:
                                if TDYT_37:
                                    # 重复效验通过, 存储数据
                                    csvFile = [
                                        WJBT_27,
                                        SJ_28,
                                        LY_29,
                                        WJBT_30,
                                        ZDBH_31,
                                        BH_32,
                                        DKWZ_33,
                                        TDWZ_34,
                                        TDMJM_35,
                                        TDMJPFM_36,
                                        TDYT_37,
                                        CJJ_38,
                                        JDR_39,
                                        GSQ_40,
                                        LXDW_41,
                                        DWDZ_42,
                                        YZBM_43,
                                        LXDH_44,
                                        crawlingTime,
                                        url,
                                        md5Mark,
                                    ]
                                    results = ''
                                    for _ in csvFile:
                                        try:
                                            if _ and _ != '|' * len(_):
                                                results += _.replace(
                                                    ',', ' '
                                                ).replace('\n', '').replace(
                                                    '\r', '').replace(
                                                        r'\xa0', '').replace(
                                                            '\xa0', '') + ','
                                            else:
                                                results += ','
                                        except Exception as e:
                                            results += ','
                                            self.log(
                                                f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                                level=logging.ERROR)
                                    with open(self.pathDetail, 'a+') as fp:
                                        fp.write(results)
                                        fp.write('\n')
                                    self.log(f'数据获取成功', level=logging.INFO)
                                    yield
                elif '转让方' not in items:
                    if len(table.find_all('tr')[1].find_all('td')) < 5:
                        table.find_all('tr')[1].extract()
                        table.find_all('tr')[0].find_all('td')[-1].extract()
                    tdData = htmlTable.tableTrTdRegulationToList(table)
                    for _ in range(len(list(tdData.values())[0])):
                        # 宗地编号
                        ZDBH_31 = tdData.get('宗地编号')[_] if tdData.get(
                            '宗地编号') else ''
                        # 编号
                        BH_32 = tdData.get('编号')[_] if tdData.get('编号') else ''
                        # 地块位置
                        DKWZ_33 = tdData.get('地块位置')[_] if tdData.get(
                            '地块位置') else ''
                        # 土地位置
                        TDWZ_34 = tdData.get('土地位置')[_] if tdData.get(
                            '土地位置') else ''
                        # 土地面积(亩)
                        TDMJM_35 = tdData.get('土地面积(亩)')[_] if tdData.get(
                            '土地面积(亩)') else ''
                        # 土地面积(平方米)
                        TDMJPFM_36 = tdData.get('土地面积(平方米)')[_] if tdData.get(
                            '土地面积(平方米)') else ''
                        # 土地用途
                        TDYT_37 = tdData.get('土地用途')[_] if tdData.get(
                            '土地用途') else ''
                        # 成交价(万元)
                        CJJ_38 = tdData.get('成交价(万元)')[_] if tdData.get(
                            '成交价(万元)') else tdData.get(
                                '成交价(万元)')[_] if tdData.get('成交价(万元)') else ''
                        # 竞得人
                        JDR_39 = tdData.get('竞得人')[_] if tdData.get(
                            '竞得人') else ''

                        # 写入数据
                        if self.name in DUPLICATE_SWITCH_LIST:
                            if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                self.duplicateUrl += 1

                        if self.duplicateUrl < 50:
                            if TDYT_37:
                                # 重复效验通过, 存储数据
                                csvFile = [
                                    WJBT_27,
                                    SJ_28,
                                    LY_29,
                                    WJBT_30,
                                    ZDBH_31,
                                    BH_32,
                                    DKWZ_33,
                                    TDWZ_34,
                                    TDMJM_35,
                                    TDMJPFM_36,
                                    TDYT_37,
                                    CJJ_38,
                                    JDR_39,
                                    GSQ_40,
                                    LXDW_41,
                                    DWDZ_42,
                                    YZBM_43,
                                    LXDH_44,
                                    crawlingTime,
                                    url,
                                    md5Mark,
                                ]
                                results = ''
                                for _ in csvFile:
                                    try:
                                        if _ and _ != '|' * len(_):
                                            results += _.replace(
                                                ',',
                                                ' ').replace('\n', '').replace(
                                                    '\r', '').replace(
                                                        r'\xa0', '').replace(
                                                            '\xa0', '') + ','
                                        else:
                                            results += ','
                                    except Exception as e:
                                        results += ','
                                        self.log(
                                            f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                            level=logging.ERROR)
                                with open(self.pathDetail, 'a+') as fp:
                                    fp.write(results)
                                    fp.write('\n')
                                self.log(f'数据获取成功', level=logging.INFO)
                                yield
                elif '地块基本情况' in items:
                    # 宗地编号
                    ZDBH_31 = reFunction(
                        '宗地编号\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        items)
                    # 地块位置
                    DKWZ_33 = reFunction(
                        '地块位置\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        items)
                    # 土地面积(亩)
                    TDMJM_35 = reFunction(
                        '土地面积\(公顷\)\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        items)
                    # 土地用途
                    TDYT_37 = reFunction(
                        '土地用途\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        items)
                    # 成交价(万元)
                    CJJ_38 = reFunction(
                        '成交价\(万元\)\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        items)
                    # 竞得人
                    JDR_39 = reFunction(
                        '受让单位\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                        items)

                    # 写入数据
                    if self.name in DUPLICATE_SWITCH_LIST:
                        if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                            self.duplicateUrl += 1

                    if self.duplicateUrl < 50:
                        if TDYT_37:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                WJBT_27,
                                SJ_28,
                                LY_29,
                                WJBT_30,
                                ZDBH_31,
                                BH_32,
                                DKWZ_33,
                                TDWZ_34,
                                TDMJM_35,
                                TDMJPFM_36,
                                TDYT_37,
                                CJJ_38,
                                JDR_39,
                                GSQ_40,
                                LXDW_41,
                                DWDZ_42,
                                YZBM_43,
                                LXDH_44,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                    else:
                        self.crawler.engine.close_spider(
                            self, 'response msg info %s, job duplicated!' %
                            response.url)
            elif '转让方' in items:
                # 编号
                BH_32 = reFunction(
                    '不动产权登记证号:([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s',
                    items)
                # 地块位置
                DKWZ_33 = reFunction(
                    '宗地位置:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # 土地面积(平方米)
                TDMJPFM_36 = reFunction(
                    '面\s*积:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # 土地用途
                TDYT_37 = reFunction(
                    '土地用途:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # 成交价(万元)
                # CJJ_38
                # 竞得人
                JDR_39 = reFunction(
                    '受让方:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items)
                # 写入数据
                if self.name in DUPLICATE_SWITCH_LIST:
                    if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                        self.duplicateUrl += 1

                if self.duplicateUrl < 50:
                    if TDYT_37:
                        # 重复效验通过, 存储数据
                        csvFile = [
                            WJBT_27,
                            SJ_28,
                            LY_29,
                            WJBT_30,
                            ZDBH_31,
                            BH_32,
                            DKWZ_33,
                            TDWZ_34,
                            TDMJM_35,
                            TDMJPFM_36,
                            TDYT_37,
                            CJJ_38,
                            JDR_39,
                            GSQ_40,
                            LXDW_41,
                            DWDZ_42,
                            YZBM_43,
                            LXDH_44,
                            crawlingTime,
                            url,
                            md5Mark,
                        ]
                        results = ''
                        for _ in csvFile:
                            try:
                                if _ and _ != '|' * len(_):
                                    results += _.replace(',', ' ').replace(
                                        '\n', '').replace('\t', '').replace(
                                            '\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                else:
                                    results += ','
                            except Exception as e:
                                results += ','
                                self.log(
                                    f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                    level=logging.ERROR)
                        with open(self.pathDetail, 'a+') as fp:
                            fp.write(results)
                            fp.write('\n')
                        self.log(f'数据获取成功', level=logging.INFO)
                        yield
                else:
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)

        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)
コード例 #19
0
    def parse_detail(self, response):
        """Parse one GBK-encoded detail page and append a CSV row to ``self.pathDetail``.

        The page is flattened to plain text (non-breaking and ideographic
        spaces removed) and each field is pulled out of that text with
        ``reFunction`` using a ``<label>:<value><whitespace>`` pattern whose
        value character set comes from ``self.reStr``.

        Args:
            response: Response whose ``body`` is decoded as GBK. ``meta`` is
                read for ``'title'`` and ``'ND'``; ``url`` is stored in the
                row and hashed into the row's unique mark.

        Yields:
            ``None`` once, after the row has been written. The bare ``yield``
            is what makes this callback a generator.

        Any exception is caught here: the URL is printed and the error is
        logged at ERROR level; nothing is re-raised.
        """
        try:
            data = Selector(text=response.body.decode('gbk'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')

            def labelled(label):
                # Raw f-string so the regex escapes (\s, \( ...) reach the
                # regex engine untouched instead of relying on Python keeping
                # unknown string escapes; only `label` and `self.reStr` are
                # interpolated.
                # NOTE(review): presumably reFunction returns the captured
                # text and something falsy on no match — confirm.
                return reFunction(rf'{label}:\s*([{self.reStr}]*)\s', items)

            # Shared fields
            # File title
            WJBT_34 = response.meta.get('title')
            # Publish time
            FBSJ_35 = response.meta.get('ND')

            # Parcel number
            DKBH_36 = labelled('地块编号')
            # Specific location
            JTWZ_37 = labelled('具体位置')
            # Transferred area
            CRMJ_38 = labelled('出让面积')
            # Plot ratio: fall back to the spaced-out spelling of the label
            RJL_39 = labelled('容积率') or labelled('容 积 率')
            # Land use: fall back to the spaced-out spelling of the label
            YT_40 = labelled('用途') or labelled('用  途')
            # Land supply method
            GDFS_41 = labelled('供地方式')
            # Term of use
            SYNX_42 = labelled('使用年限')
            # Winning bidder (the label contains literal parentheses)
            JDR_43 = labelled(r'竞得\(人\)')
            # Transaction price
            CJJG_44 = labelled('成交价格')
            # Transaction date
            CJRQ_45 = labelled('成交日期')

            # Crawl time
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # Crawled URL
            url = response.url
            # Unique mark for this record
            md5Mark = encrypt_md5(url)

            # Row to store, in column order
            csvFile = [
                WJBT_34,
                FBSJ_35,
                DKBH_36,
                JTWZ_37,
                CRMJ_38,
                RJL_39,
                YT_40,
                GDFS_41,
                SYNX_42,
                JDR_43,
                CJJG_44,
                CJRQ_45,
                crawlingTime,
                url,
                md5Mark,
            ]
            cells = []
            for field in csvFile:
                try:
                    # Empty values and values made only of '|' become blank cells.
                    if field and field != '|' * len(field):
                        # Commas would break the column layout; line breaks and
                        # both the literal text "\xa0" and real NBSPs are dropped.
                        cells.append(
                            field.replace(',', ' ').replace('\n', '').replace(
                                '\r', '').replace(r'\xa0', '').replace(
                                    '\xa0', ''))
                    else:
                        cells.append('')
                except Exception as e:
                    # A non-string value ends up here: keep the column, log it.
                    cells.append('')
                    self.log(
                        f'{getVariableName(field).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            with open(self.pathDetail, 'a+') as fp:
                # Every cell, including the last, is followed by a comma.
                fp.write(','.join(cells) + ',\n')
            self.log('数据获取成功', level=logging.INFO)
            yield
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
コード例 #20
0
    def parse_detail(self, response):
        """Parse a UTF-8 detail page and append one CSV row per ``<table>``.

        For every table in the page two independently parsed copies are used:
        one is cut down to its first two rows after the "公告时间" header row
        is found (announcement-level fields), the other has those two rows
        removed (parcel-level fields). Both are converted with
        ``htmlTableTransformer().tableTrTdRegulation`` and the row is appended
        to ``self.pathDetail``.

        Args:
            response: Response whose ``body`` is decoded as UTF-8. ``meta`` is
                read for ``'title'``; ``url`` is stored in the row and hashed,
                together with title and update time, into the unique mark.

        Yields:
            ``None`` once per table, after its row has been written.

        Any exception is caught here: the URL is printed and the error is
        logged at ERROR level; nothing is re-raised.
        """
        try:
            html = response.body.decode('utf-8')
            data = Selector(text=html)

            # Shared fields
            # File title
            WJBT_18 = response.meta.get('title')
            # Article source. Stored in LY_19 because that is the name written
            # to the row below; assigning any other name leaves the column empty.
            LY_19 = data.xpath('//div[@class="news_time"]/span[1]/text()'
                               ).extract_first().replace('文章来自:', '')
            # Update time.
            # NOTE(review): read from span[2], as the other parser in this
            # file for the same "news_time" header does; span[1] is the source
            # span and never carries the "更新时间:" label. This value feeds
            # md5Mark — confirm against a live page.
            GXSJ_20 = data.xpath('//div[@class="news_time"]/span[2]/text()'
                                 ).extract_first().replace('更新时间:', '')

            # Two separate parses give two independent trees, so rows can be
            # removed from one copy without affecting the other.
            tables = BeautifulSoup(html).find_all('table')
            tablesCopy = BeautifulSoup(html).find_all('table')

            notice_time = re.compile("公告时间")

            def drop_rows_before_header(tbl):
                # Walk a snapshot of the rows; until a row containing
                # "公告时间" (in a <td> or <p>) is reached, remove the table's
                # current first row.
                for row in tbl.tbody.find_all('tr'):
                    if row.find_all('td', text=notice_time) or row.find_all(
                            'p', text=notice_time):
                        break
                    tbl.tbody.find_all('tr')[0].extract()

            for table, tableCopy in zip(tables, tablesCopy):
                drop_rows_before_header(table)
                drop_rows_before_header(tableCopy)

                # Keep only the first two rows of the copy.
                for _ in range(2, len(tableCopy.tbody.find_all('tr'))):
                    try:
                        tableCopy.tbody.find_all('tr')[2].extract()
                    except IndexError:
                        # Fewer rows left than counted up front.
                        pass
                tdDataCopy = htmlTableTransformer().tableTrTdRegulation(
                    tableCopy)
                # Announcement time
                GGSJ_21 = tdDataCopy.get('公告时间')
                # Announcement media
                GGMT_22 = tdDataCopy.get('公告媒体')
                # Announcement number
                GGH_23 = tdDataCopy.get('公告号')
                # Transfer method
                CRFS_24 = tdDataCopy.get('出让方式')
                # Transaction time
                CJSJ_25 = tdDataCopy.get('成交时间')
                # Transaction place
                CJDD_26 = tdDataCopy.get('成交地点')

                # Second part: drop the two rows parsed above, parse the rest.
                for _ in range(2):
                    try:
                        table.tbody.find_all('tr')[0].extract()
                    except IndexError:
                        # Table has fewer than two rows.
                        pass
                tdData = htmlTableTransformer().tableTrTdRegulation(table)
                # Parcel number
                DKBH_27 = tdData.get('地块编号')
                # Parcel location
                DKWZ_28 = tdData.get('地块位置')
                # Land use
                TDYT_29 = tdData.get('土地用途')
                # Listing start price (either header spelling)
                GPQSJ_30 = tdData.get('挂牌起始价(万元)') or tdData.get(
                    '挂牌起始价(元)')
                # Winning bidder (either header spelling)
                JDDW_31 = tdData.get('竞得人(单位)') or tdData.get('竞得单位')
                # Transaction amount (first non-empty of three header spellings)
                CJJE_32 = (tdData.get('成交价(万元)')
                           or tdData.get('成交金额(元)')
                           or tdData.get('成交价'))
                # Crawl time
                crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                # Crawled URL
                url = response.url
                # Unique mark for this record
                md5Mark = encrypt_md5(url + WJBT_18 + GXSJ_20)

                # Row to store, in column order
                csvFile = [
                    WJBT_18,
                    LY_19,
                    GXSJ_20,
                    GGSJ_21,
                    GGMT_22,
                    GGH_23,
                    CRFS_24,
                    CJSJ_25,
                    CJDD_26,
                    DKBH_27,
                    DKWZ_28,
                    TDYT_29,
                    GPQSJ_30,
                    JDDW_31,
                    CJJE_32,
                    crawlingTime,
                    url,
                    md5Mark,
                ]
                cells = []
                for field in csvFile:
                    try:
                        # Empty values and values made only of '|' become
                        # blank cells.
                        if field and field != '|' * len(field):
                            cells.append(
                                field.replace(',', ' ').replace(
                                    '\n', '').replace('\r', '').replace(
                                        r'\xa0', '').replace('\xa0', ''))
                        else:
                            cells.append('')
                    except Exception as e:
                        # A non-string value ends up here: keep the column,
                        # log it.
                        cells.append('')
                        self.log(
                            f'{getVariableName(field).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                            level=logging.ERROR)
                with open(self.pathDetail, 'a+') as fp:
                    # Every cell, including the last, is followed by a comma.
                    fp.write(','.join(cells) + ',\n')
                self.log('数据获取成功', level=logging.INFO)
                yield
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
コード例 #21
0
    def parse_detail(self, response):
        """Parse a GBK supply-plan page and append one CSV row to ``self.pathDetail``.

        Four planned-supply areas (housing, commercial/service,
        industrial-mining-storage, allocated infrastructure/public-welfare
        land) are each looked up in three places; the first non-empty result,
        in this order, is kept:

        0. the text matched by the "指标安排 ... 布局" pattern (only when
           "指标安排" occurs in the page text);
        1. the first table with class ``MsoNormalTable``, else the first
           table with ``border="1"``;
        2. the whole flattened page text.

        Args:
            response: Response whose ``body`` is decoded as GBK. ``meta`` is
                read for ``'ND'``; ``url`` is stored in the row and hashed
                into the row's unique mark.

        Yields:
            ``None`` once, after the row has been written.

        Any exception is caught here: the URL is printed and the error is
        logged at ERROR level; nothing is re-raised.
        """
        try:
            html = response.body.decode('gbk')
            data = Selector(text=html)
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            # Shared fields
            # Year: the part of meta['ND'] before the first '-'
            nd = response.meta.get('ND')
            ND_1 = nd.split('-')[0] if nd else ''
            # Categories besides housing land: commercial/service land,
            # industrial-mining-storage land, allocated land for
            # infrastructure and public welfare.
            # Match the structured section first, then the table; the
            # structured section wins when it is non-empty.

            # Source 0: the "指标安排 ... 布局" section.
            ZFYD_2_0 = SFYD_3_0 = GKCC_4_0 = JCSS_5_0 = ''
            if '指标安排' in items:
                # Extract the section once and search all four areas in it.
                section = reFunction(r'指标安排[\s\S]*布局', items)
                # Housing land planned supply area (hectares)
                ZFYD_2_0 = reFunction(r'住房用地\s*([\d\万.]*)公顷[,,]', section)
                # Commercial/service land planned supply area (hectares)
                SFYD_3_0 = reFunction(r'商服用地\s*([\d\.万]*)公顷[,,]', section)
                # Industrial, mining and storage land planned supply area
                GKCC_4_0 = reFunction(r'工矿仓储用地\s*([\d\.万]*)公顷[,,]', section)
                # Allocated infrastructure/public-welfare land planned area
                JCSS_5_0 = reFunction(r'基础设施及公益事业类划拨用地\s*([\d\.万]*)公顷[,,]',
                                      section)

            # Source 1: the table.
            ZFYD_2_1 = SFYD_3_1 = GKCC_4_1 = JCSS_5_1 = ''
            if data.xpath(
                    '//table[@class="MsoNormalTable"][1] | //table[@border="1"]'
            ):
                soup = BeautifulSoup(html)
                table = soup.find('table', attrs={'class': 'MsoNormalTable'})
                if not table:
                    table = soup.find_all('table', border="1")[0]
                htmlTable = htmlTableTransformer()
                try:
                    tdData = htmlTable.tableTrTdRegulation(table)
                except Exception:
                    # Retry once with the first row removed; give up with an
                    # empty mapping if that fails too.
                    try:
                        table.tbody.find_all('tr')[0].extract()
                        tdData = htmlTable.tableTrTdRegulation(table)
                    except Exception:
                        tdData = {}
                # Housing land (incl. affordable and commercial housing land)
                ZFYD_2_1 = tdData.get('住房用地(含保障性住房用地、商品住房用地)')
                # Commercial/service land
                SFYD_3_1 = tdData.get('商服用地')
                # Industrial, mining and storage land
                GKCC_4_1 = tdData.get('工矿仓储用地')
                # Allocated infrastructure/public-welfare land
                JCSS_5_1 = tdData.get('基础设施及公益事业等划拨用地')

            # Source 2: the whole page text.
            # Housing land: first of three phrasings whose match is longer
            # than one character.
            housing_matches = [
                reFunction(pattern, items) for pattern in (
                    r'全国住房用地计划供应*\s*([\d\.万]*)公顷[,,]',
                    r'住房用地计划供应*\s*([\d\.万]*)公顷[,,]',
                    r'住宅用地*\s*([\d\.万]*)公顷[,,]',
                )
            ]
            ZFYD_2_2 = next((m for m in housing_matches if len(m) > 1), '')

            # Commercial/service land: first non-empty of three phrasings.
            SFYD_3_2 = (reFunction(r'商服用地\s*([\d\.万]*)公顷[,,]', items)
                        or reFunction(r'商业用地计划出让\s*([\d\.万]*)公顷[,,]', items)
                        or reFunction(r'商业住房用地计划供应\s*([\d\.万]*)公顷[,,]',
                                      items))

            # Industrial, mining and storage land
            GKCC_4_2 = reFunction(r'工矿仓储用地\s*([\d\.万]*)公顷[,,]', items)
            # Allocated infrastructure/public-welfare land
            JCSS_5_2 = reFunction(r'基础设施及公益事业类划拨用地\s*([\d\.万]*)公顷[,,]', items)

            # First truthy candidate per area, '' when every source is empty.
            ZFYD_2 = next(filter(None, (ZFYD_2_0, ZFYD_2_1, ZFYD_2_2)), '')
            SFYD_3 = next(filter(None, (SFYD_3_0, SFYD_3_1, SFYD_3_2)), '')
            GKCC_4 = next(filter(None, (GKCC_4_0, GKCC_4_1, GKCC_4_2)), '')
            JCSS_5 = next(filter(None, (JCSS_5_0, JCSS_5_1, JCSS_5_2)), '')

            # Crawl time
            crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # Crawled URL
            url = response.url
            # Unique mark for this record
            md5Mark = encrypt_md5(url)

            # Row to store, in column order
            csvFile = [
                ND_1,
                ZFYD_2,
                SFYD_3,
                GKCC_4,
                JCSS_5,
                crawlingTime,
                url,
                md5Mark,
            ]
            cells = []
            for field in csvFile:
                try:
                    # Empty values and values made only of '|' become blank cells.
                    if field and field != '|' * len(field):
                        cells.append(
                            field.replace(',', ' ').replace('\n', '').replace(
                                '\r', '').replace(r'\xa0', '').replace(
                                    '\xa0', ''))
                    else:
                        cells.append('')
                except Exception as e:
                    # A non-string value ends up here: keep the column, log it.
                    cells.append('')
                    self.log(
                        f'{getVariableName(field).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                        level=logging.ERROR)
            with open(self.pathDetail, 'a+') as fp:
                # Every cell, including the last, is followed by a comma.
                fp.write(','.join(cells) + ',\n')
            self.log('数据获取成功', level=logging.INFO)
            yield
        except Exception as e:
            print(response.url)
            self.log(f'详情页数据解析失败, 错误: {e}\n{traceback.format_exc()}',
                     level=logging.ERROR)
コード例 #22
0
    def parse_detail(self, response):
        try:
            data = Selector(text=response.body.decode('utf-8'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            BT_18 = ''
            LY_19 = ''
            SJ_20 = ''
            XZQ_21 = ''
            DZJGH_22 = ''
            XMMC_23 = ''
            XMWZ_24 = ''
            MJ_25 = ''
            TDLY_26 = ''
            TSYT_27 = ''
            GDFS_28 = ''
            TDSYNX_29 = ''
            HYFL_30 = ''
            TDJB_31 = ''
            CJJG_32 = ''
            ZFQH_33 = ''
            YDZFRQ_34 = ''
            YDZFJE_35 = ''
            BZ_36 = ''
            TDSTQR_37 = ''
            SX_38 = ''
            XX_39 = ''
            YDJDSJ_40 = ''
            YDKGSJ_41 = ''
            YDJGSJ_42 = ''
            SJKGSJ_43 = ''
            SJJGSJ_44 = ''
            PZDW_45 = ''
            HTQDRQ_46 = ''

            # TODO 共有字段
            # 标题
            BT_18 = response.meta.get('title')
            LY = data.xpath(
                '//div[@class="content-small-title"]/text()').extract_first()
            # 来源
            LY_19 = reFunction(f'来源:\s*([{self.reStr}]*)\s', LY)
            # 时间
            SJ_20 = reFunction(f'时间:\s*([{self.reStr}]*)\s', LY)

            # 解析 table 若出错 使用正则
            htmlTable = htmlTableTransformer()
            if '宗地编号' not in items and '行政区' not in items:
                try:
                    soup = BeautifulSoup(response.body.decode('utf-8'))
                    table = soup.find_all('table')[0]
                    if not table.tbody.find_all('tr')[0].find_all(
                            text=re.compile("用地单位|受让人")):
                        table.tbody.find_all('tr')[0].extract()
                    tdsData = htmlTable.tableTrTdRegulationToList(table)

                    for _ in range(len(list(tdsData.values())[0])):
                        # 项目位置
                        XMWZ_24 = tdsData.get('土地座落')[_] if tdsData.get(
                            '土地座落') else tdsData.get('宗地位置')[_] if tdsData.get(
                                '宗地位置') else ''
                        # 面积
                        MJ_25_0 = tdsData.get('出让面积(公顷)')
                        MJ_25_1 = tdsData.get('出让面积')
                        MJ_25_2 = tdsData.get('出让/划拨面积')
                        MJ_25_ = list(filter(None,
                                             [MJ_25_0, MJ_25_1, MJ_25_2]))
                        MJ_25 = MJ_25_[0][_] if MJ_25_ else ''
                        # 土地用途
                        TSYT_27 = tdsData.get('土地用途')[_] if tdsData.get(
                            '土地用途') else tdsData.get('用途明细')[_] if tdsData.get(
                                '用途明细') else ''
                        # 供地方式
                        GDFS_28 = tdsData.get('供应方式')[_] if tdsData.get(
                            '供应方式') else ''
                        # 土地级别
                        TDJB_31 = tdsData.get('土地级别')[_] if tdsData.get(
                            '土地级别') else ''
                        # 成交价格
                        CJJG_32_0 = tdsData.get('出让价款')
                        CJJG_32_1 = tdsData.get('出让价款(万元)')
                        CJJG_32_2 = tdsData.get('出让/划拨价歀')
                        CJJG_32_ = list(
                            filter(None, [CJJG_32_0, CJJG_32_1, CJJG_32_2]))
                        CJJG_32 = CJJG_32_[0][_] if CJJG_32_ else ''
                        # 土地使用权人
                        TDSTQR_37 = tdsData.get('用地单位')[_] if tdsData.get(
                            '用地单位') else tdsData.get('受让人')[_] if tdsData.get(
                                '受让人') else ''
                        # 合同签订日期
                        HTQDRQ_46 = tdsData.get('签订日期')[_] if tdsData.get(
                            '签订日期') else ''

                        # 爬取时间
                        crawlingTime = time.strftime("%Y-%m-%d",
                                                     time.localtime())
                        # 爬取地址url
                        url = response.url
                        # 唯一标识
                        md5Mark = encrypt_md5(url + LY_19 + SJ_20)

                        # 是否需要判断重复 请求
                        if DUPLICATE_SWITCH:
                            if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                                self.duplicateUrl += 1

                        if self.duplicateUrl < 50:
                            # 重复效验通过, 存储数据
                            csvFile = [
                                BT_18,
                                LY_19,
                                SJ_20,
                                XZQ_21,
                                DZJGH_22,
                                XMMC_23,
                                XMWZ_24,
                                MJ_25,
                                TDLY_26,
                                TSYT_27,
                                GDFS_28,
                                TDSYNX_29,
                                HYFL_30,
                                TDJB_31,
                                CJJG_32,
                                ZFQH_33,
                                YDZFRQ_34,
                                YDZFJE_35,
                                BZ_36,
                                TDSTQR_37,
                                SX_38,
                                XX_39,
                                YDJDSJ_40,
                                YDKGSJ_41,
                                YDJGSJ_42,
                                SJKGSJ_43,
                                SJJGSJ_44,
                                PZDW_45,
                                HTQDRQ_46,
                                crawlingTime,
                                url,
                                md5Mark,
                            ]
                            results = ''
                            for _ in csvFile:
                                try:
                                    if _ and _ != '|' * len(_):
                                        results += _.replace(',', ' ').replace(
                                            '\n',
                                            '').replace('\r', '').replace(
                                                r'\xa0', '').replace(
                                                    '\xa0', '') + ','
                                    else:
                                        results += ','
                                except Exception as e:
                                    results += ','
                                    self.log(
                                        f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                        level=logging.ERROR)
                            with open(self.pathDetail, 'a+') as fp:
                                fp.write(results)
                                fp.write('\n')
                            self.log(f'数据获取成功', level=logging.INFO)
                            yield
                        else:
                            self.crawler.engine.close_spider(
                                self, 'response msg info %s, job duplicated!' %
                                response.url)
                except Exception as e:
                    pass
            else:
                # 进行正则匹配
                # 行政区
                XZQ_21 = reFunction(f'行政区:([{self.reStr}]*)电子监管号', items)
                # 电子监管号
                DZJGH_22 = reFunction(f'电子监管号:([{self.reStr}]*)项目名称', items)
                # 项目名称
                XMMC_23_ = reFunction(f'项目名称:([{self.reStr}]*)项目位置', items)
                XMMC_23 = XMMC_23_ if XMMC_23_ else reFunction(
                    f'宗地编号([{self.reStr}]*)地块位置', items)
                # 项目位置
                XMWZ_24_ = reFunction(f'项目位置:([{self.reStr}]*)面积(公顷):	', items)
                XMWZ_24 = XMWZ_24_ if XMWZ_24_ else reFunction(
                    f'地块位置([{self.reStr}]*)土地用途', items)
                # 面积
                MJ_25_ = reFunction(f'面积\(公顷\):([{self.reStr}]*)土地来源', items)
                MJ_25 = MJ_25_ if MJ_25_ else reFunction(
                    f'土地面积\(公顷\)([{self.reStr}]*)出让年限', items)
                # 土地来源
                TDLY_26 = reFunction(f'土地来源:([{self.reStr}]*)土地用途', items)
                # 土地用途
                TSYT_27_ = reFunction(f'土地用途:([{self.reStr}]*)供地方式', items)
                TSYT_27 = TSYT_27_ if TSYT_27_ else data.xpath(
                    'string(//table/tbody/tr[5]/td[1])').extract_first()
                # 供地方式
                GDFS_28 = reFunction(f'供地方式:([{self.reStr}]*)土地使用年限', items)
                # 土地使用年限
                TDSYNX_29_ = reFunction(f'土地使用年限:([{self.reStr}]*)行业分类', items)
                TDSYNX_29 = TDSYNX_29_ if TDSYNX_29_ else reFunction(
                    f'出让年限([{self.reStr}]*)成交价\(万元\)', items)
                # 行业分类
                HYFL_30 = reFunction(f'行业分类:([{self.reStr}]*)土地级别', items)
                # 土地级别
                TDJB_31 = reFunction(f'土地级别:([{self.reStr}]*)成交价格\(万元\)',
                                     items)
                # 成交价格
                CJJG_32_ = reFunction(f'成交价格\(万元\):([{self.reStr}]*)分期支付约定',
                                      items)
                CJJG_32 = CJJG_32_ if CJJG_32_ else reFunction(
                    f'成交价格\(万元\)([{self.reStr}]*)明细用途', items)
                # 分期支付约定—支付期号
                ZFQH_33 = data.xpath(
                    '//table/tbody/tr[10]/td[1]/text()').extract_first()
                # 分期支付约定—约定支付日期
                YDZFRQ_34 = data.xpath(
                    '//table/tbody/tr[10]/td[2]/text()').extract_first()
                # 分期支付约定—约定支付金额
                YDZFJE_35 = data.xpath(
                    '//table/tbody/tr[10]/td[3]/text()').extract_first()
                # 分期支付约定—备注
                BZ_36 = data.xpath(
                    'string(//table/tbody/tr[10]/td[4])').extract_first()
                # 土地使用权人
                TDSTQR_37_ = reFunction(f'土地使用权人:([{self.reStr}]*)约定容积率',
                                        items)
                TDSTQR_37 = TDSTQR_37_ if TDSTQR_37_ else reFunction(
                    f'受让单位([{self.reStr}]*)备注', items)
                # 约定容积率——下限
                SX_38 = reFunction(f'下限:([{self.reStr}]*)上限', items)
                # 约定容积率——上限
                XX_39 = reFunction(f'上限:([{self.reStr}]*)约定交地时间', items)
                # 约定交地时间
                YDJDSJ_40 = reFunction(f'约定交地时间:([{self.reStr}]*)约定开工时间',
                                       items)
                # 约定开工时间
                YDKGSJ_41 = reFunction(f'约定开工时间:([{self.reStr}]*)约定竣工时间',
                                       items)
                # 约定竣工时间
                YDJGSJ_42 = reFunction(f'约定竣工时间:([{self.reStr}]*)实际开工时间',
                                       items)
                # 实际开工时间
                SJKGSJ_43 = reFunction(f'实际开工时间:([{self.reStr}]*)实际竣工时间',
                                       items)
                # 实际竣工时间
                SJJGSJ_44 = reFunction(f'实际竣工时间:([{self.reStr}]*)批准单位', items)
                # 批准单位
                PZDW_45 = reFunction(f'批准单位:([{self.reStr}]*)合同签订日期', items)
                # 合同签订日期
                HTQDRQ_46 = reFunction(f'合同签订日期:([{self.reStr}]*)\s', items)

                crawlingTime = time.strftime("%Y-%m-%d", time.localtime())
                # 爬取地址url
                url = response.url
                # 唯一标识
                md5Mark = encrypt_md5(url + LY_19 + SJ_20)

                # 是否需要判断重复 请求
                if DUPLICATE_SWITCH:
                    if self.redisClient.isExist(md5Mark):  # 存在, 去重计数
                        self.duplicateUrl += 1

                if self.duplicateUrl < 50:
                    # 重复效验通过, 存储数据
                    csvFile = [
                        BT_18,
                        LY_19,
                        SJ_20,
                        XZQ_21,
                        DZJGH_22,
                        XMMC_23,
                        XMWZ_24,
                        MJ_25,
                        TDLY_26,
                        TSYT_27,
                        GDFS_28,
                        TDSYNX_29,
                        HYFL_30,
                        TDJB_31,
                        CJJG_32,
                        ZFQH_33,
                        YDZFRQ_34,
                        YDZFJE_35,
                        BZ_36,
                        TDSTQR_37,
                        SX_38,
                        XX_39,
                        YDJDSJ_40,
                        YDKGSJ_41,
                        YDJGSJ_42,
                        SJKGSJ_43,
                        SJJGSJ_44,
                        PZDW_45,
                        HTQDRQ_46,
                        crawlingTime,
                        url,
                        md5Mark,
                    ]
                    results = ''
                    for _ in csvFile:
                        try:
                            if _ and _ != '|' * len(_):
                                results += _.replace(',', ' ').replace(
                                    '\n', '').replace('\r', '').replace(
                                        r'\xa0', '').replace('\xa0', '') + ','
                            else:
                                results += ','
                        except Exception as e:
                            results += ','
                            self.log(
                                f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}',
                                level=logging.ERROR)
                    with open(self.pathDetail, 'a+') as fp:
                        fp.write(results)
                        fp.write('\n')
                    self.log(f'数据获取成功', level=logging.INFO)
                    yield
                else:
                    self.crawler.engine.close_spider(
                        self,
                        'response msg info %s, job duplicated!' % response.url)

        except Exception as e:
            print(response.url)
            self.log(
                f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}',
                level=logging.ERROR)