Beispiel #1
0
def _fetch_apparatus_page(page):
    """Request one page of the 'apparatusProdBackList' listing.

    Makes up to 10 POST attempts through the last entry of ``proxys``.
    ``dl()`` is called whenever an attempt raises (presumably it refreshes
    the proxy list -- confirm against its definition). A non-200 answer is
    simply retried.

    Args:
        page: 1-based page number sent as 'currentPage'.

    Returns:
        The UTF-8 decoded body of the first HTTP 200 answer, or None when
        no attempt succeeded.
    """
    payload = {
        'currentPage': page,
        'pageSize': '10',
        'groupSize': '8',
        'pageName': 'apparatusProdBackList',
    }
    url = "http://xuke.yjj.sh.gov.cn/AppRoveManage/selectLicense/selectData"
    for _ in range(10):
        try:
            reply = requests.request(method='post',
                                     url=url,
                                     data=payload,
                                     headers=headers,
                                     proxies=proxys[-1],
                                     timeout=10)
            if reply.status_code == 200:
                return reply.content.decode('utf8')
        except Exception:
            dl()
    return None


def main():
    """Collect pages 1-10 of the 'apparatusProdBackList' listing and store them.

    Every row of a page becomes one ``Medicine`` object. A row whose
    certificate number ('ZSBH') is already contained in the collection
    returned by ``get_updated()`` is treated as an update: the matching
    ``Medicine`` rows are deleted through ``session`` before the replacement
    is queued. All queued objects are handed to ``write_db()`` at the end,
    and a summary of new versus replaced rows is printed.

    A page whose request never succeeds is reported and skipped; previously
    the leftover ``Response`` object (or empty string) was passed to
    ``json.loads`` and aborted the whole run.
    """
    known_numbers = get_updated()
    records = []
    replaced = []

    for page in range(1, 11):
        body = _fetch_apparatus_page(page)
        if body is None:
            print('第{}页数据获取失败,跳过!!!'.format(page))
            continue

        # 'rowData' may be missing or null; treat that as an empty page.
        rows = json.loads(body).get('rowData') or []
        # NOTE(review): a single-row page is re-fetched through reload();
        # the reason is not visible here -- confirm against reload().
        if len(rows) == 1:
            rows = reload(page)
            print('跳出reload函数!!!')

        for row in rows:
            record_id = row.get('ZSID')
            company_name = row.get('QYMC_ZW')
            legal_people = row.get('FRMC_ZW')
            company_principal = row.get('QYFZR_ZW')

            recent_record_date = row.get('QFRQ')
            if recent_record_date:
                recent_record_date = get_time(recent_record_date)

            record_mechanism = row.get('FZJG')

            # Both catalogue sections are always present; '无' marks an
            # empty one.
            business_range = (
                '【原《分类目录》分类编码区】:' + (row.get('CPFW_ZW') or '无')
                + '【新《分类目录》分类编码区】:' + (row.get('CPFW_YW') or '无')
            )

            home_addr = row.get('ZCDZ_ZW')
            record_num = row.get('ZSBH')

            # Concatenate all production addresses; null entries are skipped
            # instead of raising TypeError.
            busi_addr = "".join(
                entry.get('SCDZ') or ""
                for entry in (row.get('scdzList') or [])
            )

            produce_info = str(row.get('cpxxList'))
            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

            company_id = get_company_id(company_name)

            summary = (record_id, company_name, legal_people,
                       company_principal, record_num, recent_record_date,
                       record_mechanism, business_range, home_addr,
                       busi_addr, produce_info)
            fields = {
                'record_id': record_id,
                'company_name': company_name,
                'legal_people': legal_people,
                'company_principal': company_principal,
                'record_num': record_num,
                'recent_record_date': recent_record_date,
                'record_mechanism': record_mechanism,
                'business_range': business_range,
                'home_addr': home_addr,
                'busi_addr': busi_addr,
                'produce_info': produce_info,
                'gmt_created': times,
                'gmt_updated': times,
            }
            # company_id is only printed and stored when a match was found.
            if company_id:
                summary += (company_id,)
                fields['company_id'] = company_id
            print(summary)

            is_update = record_num in known_numbers
            if is_update:
                # Drop the stored rows for this certificate before queuing
                # the fresh copy.
                stale_rows = session.query(Medicine).filter(
                    Medicine.record_num == record_num).all()
                for stale in stale_rows:
                    session.delete(stale)

            record = Medicine(**fields)
            records.append(record)
            if is_update:
                replaced.append(record)

        time.sleep(8)

    if not records:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(records), len(replaced),
            len(records) - len(replaced)))
        write_db(records)
# Medicine keyword name -> key in a 'durgsList' row, in the order the values
# are printed.
_DRUGS_FIELD_KEYS = (
    ('record_id', 'ZSID'),
    ('company_name', 'QYMC_ZW'),
    ('area', 'ZCDZQX'),
    ('addr', 'QYZCDZ'),
    ('street', 'ZCDZJD'),
    ('warehouse', 'CFDZ'),
    ('legal_people', 'FDDBR'),
    ('company_principal', 'QYFZR'),
    ('quality_principal', 'ZLFZR'),
    ('business_way', 'JYFS'),
    ('business_range', 'JYFW'),
    ('license_num', 'ZSBH'),
    ('license_name', 'ZSMC'),
    ('license_mechanism', 'FZJG'),
    ('license_status', 'ZSZT'),
    ('license_valid_date', 'QFRQ'),
    ('license_invalid_date', 'YXQZ'),
    ('gsp_license_num', 'RZBH'),
    ('gsp_approve_valid_date', 'RZSJ'),
    ('gsp_approve_invalid_date', 'RZYXQZ'),
)

# Fields passed through get_time() when the row provides a value.
_DRUGS_DATE_FIELDS = (
    'license_valid_date',
    'license_invalid_date',
    'gsp_approve_valid_date',
    'gsp_approve_invalid_date',
)


def _fetch_drugs_page(page):
    """Request one page of the 'durgsList' listing.

    Makes up to 10 POST attempts through the last entry of ``proxys``.
    ``dl()`` is called whenever an attempt raises (presumably it refreshes
    the proxy list -- confirm against its definition). A non-200 answer is
    simply retried.

    Args:
        page: 1-based page number sent as 'currentPage'.

    Returns:
        The UTF-8 decoded body of the first HTTP 200 answer, or None when
        no attempt succeeded.
    """
    payload = {
        'currentPage': page,
        'pageSize': '10',
        'groupSize': '8',
        'pageName': 'durgsList'
    }
    url = "http://xuke.yjj.sh.gov.cn/AppRoveManage/selectLicense/selectData"
    for _ in range(10):
        try:
            reply = requests.request(method='post',
                                     url=url,
                                     data=payload,
                                     headers=headers,
                                     proxies=proxys[-1],
                                     timeout=10)
            if reply.status_code == 200:
                return reply.content.decode('utf8')
        except Exception:
            dl()
    return None


def main():
    """Collect pages 1-5 of the 'durgsList' listing and store them.

    Every row of a page becomes one ``Medicine`` object. A row whose licence
    number ('ZSBH') is already contained in the collection returned by
    ``get_updated()`` is treated as an update: the matching ``Medicine``
    rows are deleted through ``session`` before the replacement is queued.
    All queued objects are handed to ``write_db()`` at the end, and a
    summary of new versus replaced rows is printed.

    A page whose request never succeeds is reported and skipped; previously
    the leftover ``Response`` object (or empty string) was passed to
    ``json.loads`` and aborted the whole run.
    """
    known_numbers = get_updated()
    records = []
    replaced = []

    for page in range(1, 6):
        body = _fetch_drugs_page(page)
        if body is None:
            print('第{}页数据获取失败,跳过!!!'.format(page))
            continue

        # 'rowData' may be missing or null; treat that as an empty page.
        rows = json.loads(body).get('rowData') or []

        for row in rows:
            fields = {name: row.get(key) for name, key in _DRUGS_FIELD_KEYS}
            for name in _DRUGS_DATE_FIELDS:
                if fields[name]:
                    fields[name] = get_time(fields[name])

            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(fields['company_name'])

            # Build the printed tuple from the explicit field order rather
            # than relying on dict ordering.
            summary = tuple(fields[name] for name, _ in _DRUGS_FIELD_KEYS)

            fields['gmt_created'] = times
            fields['gmt_updated'] = times
            # company_id is only printed and stored when a match was found.
            if company_id:
                summary += (company_id,)
                fields['company_id'] = company_id
            print(summary)

            license_num = fields['license_num']
            is_update = license_num in known_numbers
            if is_update:
                # Drop the stored rows for this licence before queuing the
                # fresh copy.
                stale_rows = session.query(Medicine).filter(
                    Medicine.license_num == license_num).all()
                for stale in stale_rows:
                    session.delete(stale)

            record = Medicine(**fields)
            records.append(record)
            if is_update:
                replaced.append(record)

        time.sleep(5)

    if not records:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(records), len(replaced),
            len(records) - len(replaced)))
        write_db(records)
def parse(sum):
    all_data = []
    for i in sum:
        file_url = i['file_url']
        file_name = i['file_name']
        response = ""
        for IP in range(10):
            try:
                response = requests.get(url=file_url, headers=headers, proxies=proxys[-1], timeout=10)
                if response.status_code == 200:
                    response = response.content.decode('utf-8')
                    print('break')
                    break
            except Exception:
                dl()
        try:
            key1 = re.findall(r'申请单位/申请人:</span>(.*?)<br/>', response, re.S)
            key2 = re.findall(r'申请单位/申请人:(.*?)<br/>', response, re.S)
            key3 = re.findall(r'申请单位/申请人:</span>&nbsp;(.*?)</p>', response, re.S)

        except Exception as e:
            print('数据获取失败!!!可能出现404!!!跳过!')
            continue

        if len(key1) != 0 or len(key2) != 0:
            release_date = re.findall(r'发布日期:(.*?)</span>', response, re.S)[0]

            business_water_number = re.findall(r'业务流水号:</span>(.*?)<br/>', response, re.S)
            if len(business_water_number) != 0:
                business_water_number = business_water_number[0]
            else:
                lis = re.findall(r'业务流水号:(.*?)<br/>', response, re.S)
                business_water_number = lis[0]

            company_name = re.findall(r'申请单位/申请人:</span>(.*?)<br/>', response, re.S)
            if len(company_name) != 0:
                company_name = company_name[0]
            else:
                lis = re.findall(r'申请单位/申请人:(.*?)<br/>', response, re.S)
                company_name = lis[0]

            matter_name = re.findall(r'事项名称:</span>(.*?)<br/>', response, re.S)
            if len(matter_name) != 0:
                matter_name = matter_name[0]
            else:
                lis = re.findall(r'事项名称:(.*?)<br/>', response, re.S)
                matter_name = lis[0]

            project_name = re.findall(r'项目名称:</span>(.*?)<br/>', response, re.S)
            if len(project_name) != 0:
                project_name = project_name[0]
            else:
                lis = re.findall(r'项目名称:(.*?)<br/>', response, re.S)
                project_name = lis[0]

            business_type = re.findall(r'业务类型:</span>(.*?)<br/>', response, re.S)
            if len(business_type) != 0:
                business_type = business_type[0]
            else:
                lis = re.findall(r'业务类型:(.*?)<br/>', response, re.S)
                business_type = lis[0]

            accept_manager_people1 = re.findall(r'受理经办人:</span>(.*?)<br/>', response, re.S)
            accept_manager_people2 = re.findall(r'受理经办人:(.*?)<br/>', response, re.S)
            if len(accept_manager_people1) != 0:
                accept_manager_people = accept_manager_people1[0]
            elif len(accept_manager_people2) != 0:
                accept_manager_people = accept_manager_people2[0]
            else:
                accept_manager_people = 'null'

            current_type = re.findall(r'当前状态:</span>.*?<span .*?">(.*?)</span>', response, re.S)
            if len(current_type) != 0:
                current_type = current_type[0]
            else:
                lis = re.findall(r'当前状态:.*?<span .*?">(.*?)</span>', response, re.S)
                current_type = lis[0]

            approval_date = re.findall(r'<span style="font-size: 18px;">(.*?)</span>', response, re.S)
            if len(approval_date) != 0:
                approval_date = approval_date[-1]
            else:
                approval_date = re.findall(r'<span style=\\"font-size: 18px;\\">(.*?)</span>', response, re.S)[-1]

            data_source = '浙江省文化和旅游厅'

            file_url = file_url.strip()
            file_name = file_name.strip()
            company_name = company_name.strip()
            matter_name = matter_name.strip()
            approval_date = approval_date.strip()
            release_date = release_date.strip()
            business_water_number = business_water_number.strip()
            project_name = project_name.strip()
            business_type = business_type.strip()
            accept_manager_people = accept_manager_people.strip()
            current_type = current_type.strip()
            data_source = data_source.strip()

            administrative_license_matter = 'null'
            province = 'null'
            area = 'null'
            business_position = 'null'
            business_license_number = 'null'
            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

            company_id = get_company_id(company_name)
            if company_id:
                print((file_url, file_name, company_name, matter_name, approval_date, release_date,
                       business_water_number, project_name,
                       business_type, accept_manager_people, current_type, data_source, administrative_license_matter,
                       province, area, business_position,
                       business_license_number, company_id))
                zhilian = Medicine(file_url=file_url, company_name=company_name, matter_name=matter_name,
                                   approval_date=approval_date, release_date=release_date,
                                   business_water_number=business_water_number, project_name=project_name,
                                   business_type=business_type,
                                   accept_manager_people=accept_manager_people, current_type=current_type,
                                   data_source=data_source,
                                   administrative_license_matter=administrative_license_matter,
                                   province=province, area=area, business_position=business_position,
                                   business_license_number=business_license_number,
                                   gmt_created=times, gmt_updated=times, company_id=company_id)
                all_data.append(zhilian)
            else:
                print((file_url, file_name, company_name, matter_name, approval_date, release_date,
                       business_water_number, project_name,
                       business_type, accept_manager_people, current_type, data_source, administrative_license_matter,
                       province, area, business_position,
                       business_license_number))
                zhilian = Medicine(file_url=file_url, company_name=company_name, matter_name=matter_name,
                                   approval_date=approval_date, release_date=release_date,
                                   business_water_number=business_water_number, project_name=project_name,
                                   business_type=business_type,
                                   accept_manager_people=accept_manager_people, current_type=current_type,
                                   data_source=data_source,
                                   administrative_license_matter=administrative_license_matter,
                                   province=province, area=area, business_position=business_position,
                                   business_license_number=business_license_number,
                                   gmt_created=times, gmt_updated=times)
                all_data.append(zhilian)

        elif len(key3) != 0:
            release_date = re.findall(r'发布日期:(.*?)</span>', response, re.S)[0]

            business_water_number = re.findall(r'业务流水号:</span>(.*?)</span>', response, re.S)
            if len(business_water_number) != 0:
                business_water_number = business_water_number[0]
            else:
                lis = re.findall(r'业务流水号:(.*?)<br/>', response, re.S)
                business_water_number = lis[0]

            company_name = re.findall(r'申请单位/申请人:</span>(.*?)</p>', response, re.S)
            if len(company_name) != 0:
                company_name = company_name[0]
            else:
                lis = re.findall(r'申请单位/申请人:(.*?)<br/>', response, re.S)
                company_name = lis[0]

            matter_name = re.findall(r'事项名称:</span>(.*?)</p>', response, re.S)
            if len(matter_name) != 0:
                matter_name = matter_name[0]
            else:
                lis = re.findall(r'事项名称:(.*?)<br/>', response, re.S)
                matter_name = lis[0]

            project_name = re.findall(r'项目名称:</span>(.*?)</p>', response, re.S)
            if len(project_name) != 0:
                project_name = project_name[0]
            else:
                lis = re.findall(r'项目名称:(.*?)<br/>', response, re.S)
                project_name = lis[0]

            business_type = re.findall(r'业务类型:</span>(.*?)</p>', response, re.S)
            if len(business_type) != 0:
                business_type = business_type[0]
            else:
                lis = re.findall(r'业务类型:(.*?)<br/>', response, re.S)
                business_type = lis[0]

            accept_manager_people1 = re.findall(r'受理经办人:</span>(.*?)</p>', response, re.S)
            accept_manager_people2 = re.findall(r'受理经办人:(.*?)</p>', response, re.S)
            if len(accept_manager_people1) != 0:
                accept_manager_people = accept_manager_people1[0]
            elif len(accept_manager_people2) != 0:
                accept_manager_people = accept_manager_people2[0]
            else:
                accept_manager_people = 'null'

            current_type = re.findall(r'当前状态:</span>.*?<span .*?">(.*?)</span>', response, re.S)
            if len(current_type) != 0:
                current_type = current_type[0]
            else:
                lis = re.findall(r'当前状态:.*?<span .*?">(.*?)</span>', response, re.S)
                current_type = lis[0]

            approval_date = re.findall(r'浙江省文化厅</p><p .*?">(.*?)</p>', response, re.S)
            if len(approval_date) != 0:
                approval_date = approval_date[0]
            else:
                approval_date = re.findall(r'<span style=\\"font-size: 18px;\\">(.*?)</span>', response, re.S)[-1]

            data_source = '浙江省文化和旅游厅'

            file_url = file_url.strip()
            file_name = file_name.strip()
            company_name = company_name.strip().replace('&nbsp;', '')
            matter_name = matter_name.strip().replace('&nbsp;', '')
            approval_date = approval_date.strip()
            release_date = release_date.strip()
            business_water_number = business_water_number.strip().replace('&nbsp;', '')
            project_name = project_name.strip()
            business_type = business_type.strip()
            accept_manager_people = accept_manager_people.strip()
            current_type = current_type.strip()
            data_source = data_source.strip()

            administrative_license_matter = 'null'
            province = 'null'
            area = 'null'
            business_position = 'null'
            business_license_number = 'null'

            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(company_name)
            if company_id:
                print((file_url, file_name, company_name, matter_name, approval_date, release_date,
                       business_water_number, project_name,
                       business_type, accept_manager_people, current_type, data_source, administrative_license_matter,
                       province, area, business_position,
                       business_license_number, company_id))
                zhilian = Medicine(file_url=file_url, company_name=company_name, matter_name=matter_name,
                                   approval_date=approval_date, release_date=release_date,
                                   business_water_number=business_water_number, project_name=project_name,
                                   business_type=business_type,
                                   accept_manager_people=accept_manager_people, current_type=current_type,
                                   data_source=data_source,
                                   administrative_license_matter=administrative_license_matter,
                                   province=province, area=area, business_position=business_position,
                                   business_license_number=business_license_number,
                                   gmt_created=times, gmt_updated=times, company_id=company_id)
                all_data.append(zhilian)
            else:
                print((file_url, file_name, company_name, matter_name, approval_date, release_date,
                       business_water_number, project_name,
                       business_type, accept_manager_people, current_type, data_source, administrative_license_matter,
                       province, area, business_position,
                       business_license_number))
                zhilian = Medicine(file_url=file_url, company_name=company_name, matter_name=matter_name,
                                   approval_date=approval_date, release_date=release_date,
                                   business_water_number=business_water_number, project_name=project_name,
                                   business_type=business_type,
                                   accept_manager_people=accept_manager_people, current_type=current_type,
                                   data_source=data_source,
                                   administrative_license_matter=administrative_license_matter,
                                   province=province, area=area, business_position=business_position,
                                   business_license_number=business_license_number,
                                   gmt_created=times, gmt_updated=times)
                all_data.append(zhilian)

        else:
            release_date = re.findall(r'发布日期:(.*?)</span>', response, re.S)[0]
            administrative_license_matter = re.findall(r'行政许可事项:(.*?)<br/>', response, re.S)
            if len(administrative_license_matter) != 0:
                administrative_license_matter = administrative_license_matter[0]
            else:
                lis = re.findall(r'行政许可事项:(.*?)<br />', response, re.S)
                administrative_license_matter = lis[0]

            company_name = re.findall(r'单位名称:(.*?)<br/>', response, re.S)
            if len(company_name) != 0:
                company_name = company_name[0]
            else:
                lis = re.findall(r'单位名称:(.*?)<br />', response, re.S)
                company_name = lis[0]
            province = re.findall(r'省份:(.*?)<br/>', response, re.S)
            if len(province) != 0:
                province = province[0]
            else:
                lis = re.findall(r'省份:(.*?)<br />', response, re.S)
                province = lis[0]
            area = re.findall(r'地市:(.*?)<br/>', response, re.S)
            if len(area) != 0:
                area = area[0]
            else:
                lis = re.findall(r'地市:(.*?)<br />', response, re.S)
                area = lis[0]
            business_position = re.findall(r'营业场所:(.*?)<br/>', response, re.S)
            if len(business_position) != 0:
                business_position = business_position[0]
            else:
                lis = re.findall(r'营业场所:(.*?)<br />', response, re.S)
                business_position = lis[0]
            business_license_number = re.findall(r'营业许可证号:(.*?)<br/>', response, re.S)
            if len(business_license_number) != 0:
                business_license_number = business_license_number[0]
            else:
                lis = re.findall(r'营业许可证号:(.*?)<br />', response, re.S)
                business_license_number = lis[0]
            current_type = re.findall(r'当前状态:<span .*?">(.*?)</span>', response, re.S)[0]
            approval_date = re.findall(r'<span style="font-size: 18px;">(.*?)</span>', response, re.S)
            if len(approval_date) != 0:
                approval_date = approval_date[-1]
            else:
                approval_date = re.findall(r'<span style=\\"font-size: 18px;\\">(.*?)</span>', response, re.S)[-1]
            data_source = '浙江省文化和旅游厅'

            file_url = file_url.strip()
            file_name = file_name.strip()
            company_name = company_name.strip()
            matter_name = 'null'
            approval_date = approval_date.strip()
            release_date = release_date.strip()
            business_water_number = 'null'
            project_name = 'null'
            business_type = 'null'
            accept_manager_people = 'null'
            current_type = current_type.strip()
            data_source = data_source.strip()

            administrative_license_matter = administrative_license_matter.strip()
            province = province.strip()
            area = area.strip()
            business_position = business_position.strip()
            business_license_number = business_license_number.strip()
            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

            company_id = get_company_id(company_name)
            if company_id:
                print(
                    (file_url, file_name, company_name, matter_name, approval_date, release_date, business_water_number,
                     project_name,
                     business_type, accept_manager_people, current_type, data_source, administrative_license_matter,
                     province,
                     area, business_position,
                     business_license_number, company_id))
                zhilian = Medicine(file_url=file_url, company_name=company_name, matter_name=matter_name,
                                   approval_date=approval_date, release_date=release_date,
                                   business_water_number=business_water_number, project_name=project_name,
                                   business_type=business_type,
                                   accept_manager_people=accept_manager_people, current_type=current_type,
                                   data_source=data_source,
                                   administrative_license_matter=administrative_license_matter,
                                   province=province, area=area, business_position=business_position,
                                   business_license_number=business_license_number,
                                   gmt_created=times, gmt_updated=times, company_id=company_id)
                all_data.append(zhilian)
            else:
                print(
                    (file_url, file_name, company_name, matter_name, approval_date, release_date, business_water_number,
                     project_name,
                     business_type, accept_manager_people, current_type, data_source, administrative_license_matter,
                     province,
                     area, business_position,
                     business_license_number))
                zhilian = Medicine(file_url=file_url, company_name=company_name, matter_name=matter_name,
                                   approval_date=approval_date, release_date=release_date,
                                   business_water_number=business_water_number, project_name=project_name,
                                   business_type=business_type,
                                   accept_manager_people=accept_manager_people, current_type=current_type,
                                   data_source=data_source,
                                   administrative_license_matter=administrative_license_matter,
                                   province=province, area=area, business_position=business_position,
                                   business_license_number=business_license_number,
                                   gmt_created=times, gmt_updated=times)
                all_data.append(zhilian)
        time.sleep(0.8)

    write_db(all_data)
Beispiel #4
0
def main():
    """Scrape labour-dispatch licence records and store the unseen ones.

    The number of listing pages is read from the first listing page.  Every
    listing page is then fetched, the detail page of each listed unit is
    opened, and its fields are extracted with regular expressions.  A record
    is kept only when its licence number is not already contained in the
    object returned by ``get_updated()``.  All kept records are passed to
    ``write_db`` in a single call at the end.

    A page that cannot be fetched after ten attempts, or a detail page that
    lacks one of the expected fields, is reported on stdout and skipped so
    that one bad page does not abort the whole run.
    """
    bloom = get_updated()

    # NOTE(review): the jsessionid is hard-coded in the URL and has presumably
    # expired by now -- confirm whether the server still accepts it.
    list_url = "http://rsz.zjhz.hrss.gov.cn/jyhptweb/cycx/queryLwpqxkdw.action;jsessionid=5pN8HiGEIhh85GYcbqG_ldLunPVxbSP_viiSnSz35dxeS8LGMRY5!1300760266"
    detail_url = 'http://rsz.zjhz.hrss.gov.cn/jyhptweb/cycx/checkLwpqxkdw.action?dwid={}'

    # (record attribute, pattern) pairs.  The order is also the order in
    # which the values appear in the tuple printed for every new record.
    detail_patterns = (
        ('company_name', r'<h4>(.*?)</h4>'),
        ('register_position', r'注册地址:.*?<td>(.*?)&nbsp;</td>'),
        ('connect_people', r'联系人:.*?<td >(.*?)&nbsp;</td>'),
        ('connect_tel', r'联系电话:.*?<td>(.*?)&nbsp;</td>'),
        ('business_license_number', r'劳务派遣经营许可证号码:.*?<td >(.*?)&nbsp;</td>'),
        ('license_authority', r'许可机关.*?<td >(.*?)&nbsp;</td>'),
        ('valid_date', r'有效期:.*?<td >(.*?)&nbsp;</td>'),
        ('year_business_report', r'提交年度经营报告情况.*?<td >(.*?)&nbsp;</td>'),
    )

    def page_payload(page):
        """Return the POST form that selects listing page *page*."""
        return {
            '_currpage': page,
            '_pagelines': 20,
            '_rowcount': 56,
            '_selectpage': page,
        }

    def fetch_text(url, payload):
        """POST *payload* to *url* and return the decoded body.

        Up to ten attempts are made.  ``None`` is returned when none of them
        produced an HTTP 200 response, so callers never receive a
        ``Response`` object or an empty placeholder string.
        """
        for _ in range(10):
            try:
                reply = requests.request(method='post',
                                         url=url,
                                         headers=headers,
                                         data=payload,
                                         proxies=proxys[-1],
                                         timeout=15)
                if reply.status_code == 200:
                    return reply.content.decode('utf8')
                time.sleep(2)
            except Exception:
                # Best-effort retry: any failure triggers dl() and another
                # attempt.  NOTE(review): dl() presumably switches to a fresh
                # proxy -- confirm against its definition.
                dl()
                time.sleep(1.5)
        return None

    def scrape_unit(unit_id, payload):
        """Return a ``Medicine`` for unit *unit_id*, or ``None`` to skip it."""
        page_text = fetch_text(detail_url.format(unit_id), payload)
        if page_text is None:
            print('单位{}详情页请求失败,已跳过!!!'.format(unit_id))
            return None

        fields = {}
        for attr, pattern in detail_patterns:
            found = re.findall(pattern, page_text, re.S)
            if not found:
                print('单位{}详情页缺少字段{},已跳过!!!'.format(unit_id, attr))
                return None
            fields[attr] = found[0]

        # Records whose licence number was stored earlier are not re-inserted.
        if fields['business_license_number'] in bloom:
            return None

        stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        company_id = get_company_id(fields['company_name'])
        shown = tuple(fields.values())
        if company_id:
            # company_id is only printed and stored when the lookup found one.
            shown += (company_id,)
            fields['company_id'] = company_id
        print(shown)
        return Medicine(gmt_created=stamp, gmt_updated=stamp, **fields)

    first_page = fetch_text(list_url, page_payload(1))
    if first_page is None:
        print('列表首页请求失败,无法获取总页数!!!')
        return
    page_marks = re.findall(r"<font color='red'>(.*?)</font>页", first_page, re.S)
    if not page_marks:
        print('列表首页中未找到总页数!!!')
        return
    total_pages = int(page_marks[-1])

    records = []
    for page in range(1, total_pages + 1):
        payload = page_payload(page)
        listing = fetch_text(list_url, payload)
        if listing is None:
            print('第{}页列表请求失败,已跳过!!!'.format(page))
            continue

        calls = re.findall(r'javascript:toGetInfo(.*?);', listing, re.S)
        print(calls)
        for call in calls:
            # Each match looks like "(<id>)"; keep what is between the parentheses.
            unit_id = call.split('(')[-1].split(')')[0]
            record = scrape_unit(unit_id, payload)
            if record is not None:
                records.append(record)
            time.sleep(0.5)

    if not records:
        print('此次没有数据更新!!!')
    else:
        print('此次更新数据有{}条!!!'.format(len(records)))
        write_db(records)
def main():
    """Scrape entrusted-production records (pages 1-2) and queue them for storage.

    Each row of the JSON listing is turned into a ``Medicine`` object.  When
    the row's product licence number is already contained in the object
    returned by ``get_updated()``, the existing rows with that licence number
    are deleted through ``session`` before the fresh object is queued, and
    the row is counted as an update rather than an insert.  All queued
    objects are passed to ``write_db`` in a single call at the end.

    A page that cannot be fetched after ten attempts, or whose body is not
    valid JSON, is reported on stdout and skipped instead of aborting the run.
    """
    bloom = get_updated()
    url = "http://xuke.yjj.sh.gov.cn/AppRoveManage/selectLicense/selectData"

    # (record attribute, JSON key) pairs.  The order is also the order in
    # which the values appear in the tuple printed for every row.
    field_keys = (
        ('record_id', 'CPZCH_BAH'),
        ('entrust_company_name', 'QYMC'),
        ('entrust_license_num', 'XKZBH'),
        ('entrust_legal', 'FDDBR'),
        ('entrust_company_principal', 'QYFZR'),
        ('entrust_addr', 'QYZCDZ'),
        ('entrust_produce_addr', 'SCDZ'),
        ('be_entrust_company_name', 'SWTQYMC'),
        ('be_entrust_license_num', 'SWTQYSCXKZ'),
        ('be_entrust_legal', 'SWTFDDBR'),
        ('be_entrust_company_principal', 'SWTQYFZR'),
        ('be_entrust_addr', 'SWTQYZCDZ'),
        ('be_entrust_produce_addr', 'SWTQYSCDZ'),
        ('entrust_product_name', 'CPMC'),
        ('product_license_num', 'CPZCH_BAH'),
        ('record_date', 'QFRQ'),
        ('entrust_date', 'WTQXRQ'),
    )

    def fetch_text(payload):
        """POST *payload* to the listing URL and return the decoded body.

        Up to ten attempts are made; ``None`` is returned when none of them
        produced an HTTP 200 response.
        """
        for _ in range(10):
            try:
                reply = requests.request(method='post',
                                         url=url,
                                         data=payload,
                                         headers=headers,
                                         proxies=proxys[-1],
                                         timeout=10)
                if reply.status_code == 200:
                    return reply.content.decode('utf8')
            except Exception:
                # Best-effort retry.  NOTE(review): dl() presumably switches
                # to a fresh proxy -- confirm against its definition.
                dl()
        return None

    def load_rows(page):
        """Return the rows of listing page *page*; empty when the page is unusable."""
        body = fetch_text({
            'currentPage': page,
            'pageSize': '10',
            'groupSize': '8',
            'pageName': 'apparatusWTProdList',
        })
        if body is None:
            print('第{}页请求失败,已跳过!!!'.format(page))
            return []
        try:
            parsed = json.loads(body)
        except ValueError:
            print('第{}页返回内容不是合法JSON,已跳过!!!'.format(page))
            return []
        rows = parsed.get('rowData') or []
        if len(rows) == 1:
            # A single-row page is re-fetched through reload().
            # NOTE(review): the reason is not visible here -- presumably the
            # server answers with one row when it throttles; confirm.
            rows = reload(page)
            print('跳出reload函数!!!')
        return rows

    queued = []
    replaced = 0
    for page in range(1, 3):
        for row in load_rows(page):
            values = {attr: row.get(key) for attr, key in field_keys}
            for attr in ('record_date', 'entrust_date'):
                if values[attr]:
                    values[attr] = get_time(values[attr])

            stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(values['entrust_company_name'])
            shown = tuple(values.values())
            if company_id:
                # company_id is only printed and stored when the lookup found one.
                shown += (company_id,)
                values['company_id'] = company_id
            print(shown)

            licence = values['product_license_num']
            if licence in bloom:
                # Already stored: drop the old rows so the fresh one replaces them.
                stale_rows = session.query(Medicine).filter(
                    Medicine.product_license_num == licence).all()
                for stale in stale_rows:
                    session.delete(stale)
                replaced += 1

            queued.append(Medicine(gmt_created=stamp, gmt_updated=stamp, **values))

        time.sleep(5)

    if not queued:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(len(queued), replaced, len(queued) - replaced))
        write_db(queued)
Beispiel #6
0
def main():
    """Scrape device-dealer licence records (pages 1-5) and queue them for storage.

    Each row of the JSON listing is turned into a ``Medicine`` object.  When
    the row's licence number is already contained in the object returned by
    ``get_updated()``, the existing rows with that licence number are deleted
    through ``session`` before the fresh object is queued, and the row is
    counted as an update rather than an insert.  All queued objects are
    passed to ``write_db`` in a single call at the end.

    A page that cannot be fetched after ten attempts, or whose body is not
    valid JSON, is reported on stdout and skipped instead of aborting the run.
    """
    bloom = get_updated()
    url = "http://xuke.yjj.sh.gov.cn/AppRoveManage/selectLicense/selectData"

    def fetch_text(payload):
        """POST *payload* to the listing URL and return the decoded body.

        Up to ten attempts are made; ``None`` is returned when none of them
        produced an HTTP 200 response.
        """
        for _ in range(10):
            try:
                reply = requests.request(method='post',
                                         url=url,
                                         data=payload,
                                         headers=headers,
                                         proxies=proxys[-1],
                                         timeout=20)
                if reply.status_code == 200:
                    return reply.content.decode('utf8')
            except Exception:
                # Best-effort retry.  NOTE(review): dl() presumably switches
                # to a fresh proxy -- confirm against its definition.
                dl()
        return None

    def load_rows(page):
        """Return the rows of listing page *page*; empty when the page is unusable."""
        body = fetch_text({
            'currentPage': page,
            'pageSize': '10',
            'groupSize': '8',
            'pageName': 'apparatusDealList',
        })
        if body is None:
            print('第{}页请求失败,已跳过!!!'.format(page))
            return []
        try:
            parsed = json.loads(body)
        except ValueError:
            print('第{}页返回内容不是合法JSON,已跳过!!!'.format(page))
            return []
        rows = parsed.get('rowData') or []
        if len(rows) == 1:
            # A single-row page is re-fetched through reload().
            # NOTE(review): the reason is not visible here -- presumably the
            # server answers with one row when it throttles; confirm.
            rows = reload(page)
            print('跳出reload函数!!!')
        return rows

    def parse_row(row):
        """Map one listing row to ``Medicine`` keyword arguments.

        The insertion order of the returned dict is also the order of the
        values in the tuple printed for the row.
        """
        def as_time(key):
            # Only non-empty values are passed through get_time().
            raw = row.get(key)
            return get_time(raw) if raw else raw

        old_scope = row.get('JYFW')
        new_scope = row.get('CPFW_YW')
        return {
            'record_id': row.get('ZSID'),
            'area': row.get('JYDZQX'),
            'company_name': row.get('QYMC_ZW'),
            'license_name': row.get('ZSMC'),
            'license_num': row.get('ZSBH'),
            'business_type': row.get('JYMS'),
            'supervision_classification': row.get('FLJGJB'),
            'supervision_sort': row.get('ZDJGLB'),
            'legal_people': row.get('FDDBR'),
            'company_principal': row.get('QYFZR'),
            'addr': row.get('QYZSDZ'),
            'business_addr': row.get('JYCS'),
            'warehouse_addr': row.get('CFDZ'),
            'business_way': row.get('JYFS'),
            # Both catalogue sections are always present; a missing scope
            # is written as the placeholder '无'.
            'third_business_range': (
                '【原《分类目录》分类编码区】:' + (old_scope or '无')
                + '【新《分类目录》分类编码区】:' + (new_scope or '无')),
            'license_valid_date': as_time('QFRQ'),
            'license_invalid_date': as_time('YXQZ'),
            'two_business_range': row.get('BAJYFW'),
            'record_date': as_time('BAQFRQ'),
        }

    queued = []
    replaced = 0
    for page in range(1, 6):
        for row in load_rows(page):
            values = parse_row(row)
            stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

            company_id = get_company_id(values['company_name'])
            shown = tuple(values.values())
            if company_id:
                # company_id is only printed and stored when the lookup found one.
                shown += (company_id,)
                values['company_id'] = company_id
            print(shown)

            licence = values['license_num']
            if licence in bloom:
                # Already stored: drop the old rows so the fresh one replaces them.
                stale_rows = session.query(Medicine).filter(
                    Medicine.license_num == licence).all()
                for stale in stale_rows:
                    session.delete(stale)
                replaced += 1

            queued.append(Medicine(gmt_created=stamp, gmt_updated=stamp, **values))

        time.sleep(8)

    if not queued:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(queued), replaced,
            len(queued) - replaced))

        write_db(queued)
Beispiel #7
0
def main():
    """Scrape pages 1-2 of the 'drugProductList' listing and pass the rows to ``write_db``.

    Each page is requested with a POST through ``proxys[-1]``, with up to ten
    attempts; ``dl()`` is called whenever an attempt raises.  Every row of the
    returned ``rowData`` becomes a ``Medicine`` object.  When the row's social
    credit code is already contained in the value returned by
    ``get_updated()``, the matching ``Medicine`` rows are first passed to
    ``session.delete`` and the new object is also counted as an update of
    old data.

    A page for which no attempt yielded an HTTP 200 body is skipped with a
    message rather than being handed to ``json.loads``.
    """
    bloom = get_updated()
    records = []
    old_data = []
    url = "http://xuke.yjj.sh.gov.cn/AppRoveManage/selectLicense/selectData"

    for page in range(1, 3):
        data = {
            'currentPage': page,
            'pageSize': '10',
            'groupSize': '8',
            'pageName': 'drugProductList',
        }
        # The decoded text lives in its own variable: previously a non-200
        # reply left the Response object behind and json.loads() raised
        # TypeError on it.
        body = ""
        for _ in range(10):
            try:
                reply = requests.request(method='post', url=url, data=data,
                                         headers=headers, proxies=proxys[-1],
                                         timeout=10)
                if reply.status_code == 200:
                    body = reply.content.decode('utf8')
                    break
            except Exception:
                dl()
        if not body:
            print('第{}页请求失败,已跳过!!!'.format(page))
            time.sleep(8)
            continue

        response = json.loads(body)
        # A payload without 'rowData' is treated as an empty page.
        results = response.get('rowData') or []
        if len(results) == 1:
            results = reload(page)
            print('跳出reload函数!!!')

        for item in results:
            record_id = item.get('ZSBH')
            company_name = item.get('QYMC_ZW')
            license_num = item.get('ZSBH')
            classifi_num = item.get('CPFWLB')
            social_credit_code = item.get('SHXYDM')
            license_invalid_date = item.get('YXQZ')
            recent_update_date = item.get('QFRQ')

            legal_people = item.get('FRMC_ZW')
            company_principal = item.get('QYFZR_ZW')
            register_addr = item.get('ZCDZ_ZW')
            produce_addr_and_range = item.get('CPFW_ZW')

            quality_principal = item.get('ZLFZR')
            produce_principal = item.get('SCFZR')
            quality_authorization = item.get('ZLSQR')
            # Status code '10' is stored as '有效'; every other value as '注销'.
            license_type = '有效' if item.get('ZSZT') == '10' else '注销'

            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(company_name)

            summary = (record_id, company_name, license_num, classifi_num,
                       social_credit_code, license_invalid_date,
                       recent_update_date, legal_people, company_principal,
                       register_addr, produce_addr_and_range,
                       quality_principal, produce_principal,
                       quality_authorization, license_type)
            fields = dict(
                record_id=record_id,
                company_name=company_name,
                license_num=license_num,
                legal_people=legal_people,
                company_principal=company_principal,
                license_invalid_date=license_invalid_date,
                recent_update_date=recent_update_date,
                register_addr=register_addr,
                classifi_num=classifi_num,
                social_credit_code=social_credit_code,
                produce_addr_and_range=produce_addr_and_range,
                quality_principal=quality_principal,
                produce_principal=produce_principal,
                quality_authorization=quality_authorization,
                license_type=license_type,
                gmt_created=times,
                gmt_updated=times,
            )
            # company_id is only printed and stored when a lookup succeeded.
            if company_id:
                summary += (company_id,)
                fields['company_id'] = company_id
            print(summary)

            already_stored = social_credit_code in bloom
            if already_stored:
                # Drop the stale rows before the fresh copy is queued.
                stale_rows = session.query(Medicine).filter(
                    Medicine.social_credit_code == social_credit_code).all()
                for stale in stale_rows:
                    session.delete(stale)

            zhilian = Medicine(**fields)
            records.append(zhilian)
            if already_stored:
                old_data.append(zhilian)

        time.sleep(8)

    if not records:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(records), len(old_data), len(records) - len(old_data)))
        write_db(records)
Beispiel #8
0
def main():
    """Scrape every page of the 'drugProductList-Time-Limit' listing and store new rows.

    The first request is used only to read ``totalPage``; afterwards pages
    ``1..totalPage`` are fetched one by one.  Each row whose ``ZSBH`` value is
    not contained in the value returned by ``get_updated()`` becomes a
    ``Medicine`` object; all collected objects are passed to ``write_db`` at
    the end.

    A request for which none of the ten attempts yielded an HTTP 200 body is
    reported with a message: the run stops if it was the page-count request,
    otherwise that page is skipped.
    """
    bloom = get_updated()
    records = []
    url = "http://xuke.smda.sh.cn/AppRoveManage/selectLicense/selectData"

    def fetch_listing(current_page):
        """POST one listing page; return the decoded JSON, or None if no attempt got HTTP 200."""
        data = {
            'currentPage': current_page,
            'pageSize': '10',
            'groupSize': '8',
            'pageName': 'drugProductList-Time-Limit'
        }
        # The decoded text lives in its own variable: previously a non-200
        # reply left the Response object behind and json.loads() raised
        # TypeError on it.
        body = ""
        for _ in range(10):
            try:
                reply = requests.request(method='post', url=url, data=data,
                                         headers=headers, proxies=proxys[-1],
                                         timeout=10)
                if reply.status_code == 200:
                    body = reply.content.decode('utf8')
                    break
            except Exception:
                dl()
        return json.loads(body) if body else None

    first_page = fetch_listing('1')
    if first_page is None:
        print('获取总页数失败,本次终止!!!')
        return
    pages = int(first_page.get('totalPage'))
    print('数据总数:{}页!!!'.format(pages))
    time.sleep(3)

    for page in range(1, pages + 1):
        response = fetch_listing(page)
        if response is None:
            print('第{}页请求失败,已跳过!!!'.format(page))
            time.sleep(5)
            continue

        # A payload without 'rowData' is treated as an empty page.
        for item in response.get('rowData') or []:
            record_id = item.get('ZSBH')
            show_time = item.get('RESERVED4')
            license_num = item.get('ZSBH')
            company_name = item.get('QYMC_ZW')
            classifi_code = item.get('CPFWLB')
            social_credit_code = item.get('SHXYDM')
            addr = item.get('ZCDZ_ZW')
            legal_people = item.get('FRMC_ZW')
            company_principal = item.get('QYFZR_ZW')
            quality_principal = item.get('ZLFZR')
            produce_principal = item.get('SCFZR')
            quality_authorize = item.get('ZLSQR')
            produce_addr_range = item.get('CPFW_ZW')

            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(company_name)

            # quality_principal appears twice on purpose: the printed tuple
            # is kept identical to what this function has always printed.
            summary = (record_id, show_time, license_num, company_name,
                       classifi_code, social_credit_code, addr, legal_people,
                       company_principal, quality_principal,
                       produce_principal, quality_principal,
                       quality_authorize, produce_addr_range)
            fields = dict(
                record_id=record_id,
                show_time=show_time,
                license_num=license_num,
                company_name=company_name,
                classifi_code=classifi_code,
                social_credit_code=social_credit_code,
                addr=addr,
                legal_people=legal_people,
                company_principal=company_principal,
                quality_principal=quality_principal,
                produce_principal=produce_principal,
                quality_authorize=quality_authorize,
                produce_addr_range=produce_addr_range,
                gmt_created=times,
                gmt_updated=times,
            )
            # company_id is only printed and stored when a lookup succeeded.
            if company_id:
                summary += (company_id,)
                fields['company_id'] = company_id
            print(summary)

            # Rows that are already known are printed but not stored again.
            if record_id not in bloom:
                records.append(Medicine(**fields))
        time.sleep(5)

    if not records:
        print('本次无更新数据!!!')
    else:
        print('本地数据更新了{}条!!!'.format(len(records)))
        write_db(records)
Beispiel #9
0
    'spider_add_value_telecom_info', 'spider_company_city_level_makerspace',
    'spider_company_honor_data', 'spider_company_province_level_makerspace',
    'spider_company_province_tech_incubator',
    'spider_culture_business_license', 'spider_industry_information',
    'spider_outstand_talent', 'spider_radio_show_business_license',
    'spider_service_license', 'spider_high_talent', 'spider_talent_room'
]
# ,'spider_company_related_park']
# For every table, look up a company_id for each row that has none and
# write it back; all updates are committed together at the end.
for table in all_tables:
    cursor.execute(
        '''select id,company_name from {} where company_id is NULL '''.format(
            table))
    single_data = cursor.fetchall()
    print(single_data)
    for row in single_data:
        row_id, company_name = row[0], row[1]
        company_id = get_company_id(company_name)
        if not company_id:
            # No match for this company name: leave the row untouched.
            continue
        cursor.execute(
            '''update {} set company_id = {} where id = {}'''.format(
                table, company_id, row_id))
        print('{}表中,公司{}新增了company_id字段'.format(table, company_name))

conn.commit()
conn.close()
def main():
    """Scrape pages 1-2 of the 'ylqxwljyList' listing and pass the rows to ``write_db``.

    Each page is requested with a POST through ``proxys[-1]``, with up to ten
    attempts; ``dl()`` is called whenever an attempt raises.  Every row of the
    returned ``rowData`` becomes a ``Medicine`` object.  When the row's
    ``ZSBH`` value (``record_num``) is already contained in the value returned
    by ``get_updated()``, the matching ``Medicine`` rows are first passed to
    ``session.delete`` and the new object is also counted as an update of
    old data.

    A page for which no attempt yielded an HTTP 200 body is skipped with a
    message rather than being handed to ``json.loads``.
    """
    bloom = get_updated()
    records = []
    old_data = []
    url = "http://xuke.yjj.sh.gov.cn/AppRoveManage/selectLicense/selectData"

    for page in range(1, 3):
        data = {
            'currentPage': page,
            'pageSize': '10',
            'groupSize': '8',
            'pageName': 'ylqxwljyList',
        }
        # The decoded text lives in its own variable: previously a non-200
        # reply left the Response object behind and json.loads() raised
        # TypeError on it.
        body = ""
        for _ in range(10):
            try:
                reply = requests.request(method='post',
                                         url=url,
                                         data=data,
                                         headers=headers,
                                         proxies=proxys[-1],
                                         timeout=10)
                if reply.status_code == 200:
                    body = reply.content.decode('utf8')
                    break
            except Exception:
                dl()
        if not body:
            print('第{}页请求失败,已跳过!!!'.format(page))
            time.sleep(8)
            continue

        response = json.loads(body)
        # A payload without 'rowData' is treated as an empty page.
        results = response.get('rowData') or []
        if len(results) == 1:
            results = reload(page)
            print('跳出reload函数!!!')

        for item in results:
            record_id = item.get('ZSID')

            record_num = item.get('ZSBH')
            company_name = item.get('QYMC_ZW')
            addr = item.get('ZCDZ_ZW')
            business_addr = item.get('SCDZ_ZW')
            legal_people = item.get('FRMC_ZW')
            company_principal = item.get('QYFZR_ZW')
            quality_principal = item.get('ZLAQGLR')
            web_name = item.get('WZMC')
            web_program_name = item.get('WLKHD')
            domain_name = item.get('WZYM')
            ip = item.get('WZIPDZ')
            service_machine_addr = item.get('FWQCFDZ')
            non_profit_internet_service_record_num = item.get('FJYXHLWXXBABH')
            record_mechanism = item.get('FZJG')
            record_date = item.get('QFRQ')
            if record_date:
                record_date = get_time(record_date)

            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(company_name)

            summary = (record_id, record_num, company_name, addr,
                       business_addr, legal_people, company_principal,
                       quality_principal, web_name, web_program_name,
                       domain_name, ip, service_machine_addr,
                       non_profit_internet_service_record_num,
                       record_mechanism, record_date)
            fields = dict(
                record_id=record_id,
                record_num=record_num,
                company_name=company_name,
                addr=addr,
                business_addr=business_addr,
                legal_people=legal_people,
                company_principal=company_principal,
                quality_principal=quality_principal,
                web_name=web_name,
                web_program_name=web_program_name,
                domain_name=domain_name,
                ip=ip,
                service_machine_addr=service_machine_addr,
                non_profit_internet_service_record_num=(
                    non_profit_internet_service_record_num),
                record_mechanism=record_mechanism,
                record_date=record_date,
                gmt_created=times,
                gmt_updated=times,
            )
            # company_id is only printed and stored when a lookup succeeded.
            if company_id:
                summary += (company_id,)
                fields['company_id'] = company_id
            print(summary)

            already_stored = record_num in bloom
            if already_stored:
                # Drop the stale rows before the fresh copy is queued.
                stale_rows = session.query(Medicine).filter(
                    Medicine.record_num == record_num).all()
                for stale in stale_rows:
                    session.delete(stale)

            zhilian = Medicine(**fields)
            records.append(zhilian)
            if already_stored:
                old_data.append(zhilian)
        time.sleep(8)

    if not records:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(records), len(old_data),
            len(records) - len(old_data)))
        write_db(records)
def main():
    """Scrape result pages 1500-2097 of the MIIT search listing and store every table row.

    Each page is requested with a GET through ``proxys[-1]``, with up to
    twenty attempts; ``dl()`` is called whenever an attempt raises.  The
    ``title`` attributes of columns 2-6 of each result-table row are read and
    turned into a ``Medicine`` object.  ``write_db`` is always called at the
    end, even when nothing was collected.

    A page for which no attempt yielded an HTTP 200 body is skipped with a
    message rather than being handed to ``etree.HTML``.
    """
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    }
    records = []
    for page in range(1500, 2098):
        url = 'https://zwfw.miit.gov.cn/miit/resultSearch?wd=&categoryTreeId=302&categoryTreePid=&pagenow={}'.format(
            page)
        # The decoded text lives in its own variable: previously a non-200
        # reply left the Response object behind and was passed to the HTML
        # parser.
        html = ""
        for _ in range(20):
            try:
                reply = requests.request(method='get',
                                         url=url,
                                         headers=headers,
                                         proxies=proxys[-1],
                                         timeout=10)
                if reply.status_code == 200:
                    html = reply.content.decode('utf8')
                    print('获取信息成功!!!')
                    print('break!!!')
                    break
            except Exception:
                dl()
        if not html:
            print('第{}页请求失败,已跳过!!!'.format(page))
            time.sleep(2)
            continue

        tree = etree.HTML(html)
        element_list = tree.xpath(
            '//table[@class="table table-bordered table-responsive"]/tbody/tr')
        for ele in element_list:
            permit_number = ele.xpath('./td[2]/@title')[0]
            company_name = ele.xpath('./td[3]/@title')[0]
            busi_type = ele.xpath('./td[4]/@title')[0]
            busi_range = ele.xpath('./td[5]/@title')[0]
            invalid_date = ele.xpath('./td[6]/@title')[0]

            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(company_name)
            print(permit_number, company_name, busi_type, busi_range,
                  invalid_date)

            fields = dict(permit_number=permit_number,
                          company_name=company_name,
                          busi_type=busi_type,
                          busi_range=busi_range,
                          invalid_date=invalid_date,
                          gmt_created=times,
                          gmt_updated=times)
            # company_id is only stored when a lookup succeeded.
            if company_id:
                fields['company_id'] = company_id
            records.append(Medicine(**fields))
        time.sleep(2)

    write_db(records)
Beispiel #12
0
def parse(data):
    """Download the spreadsheets listed in *data*, look each permit up online and store the results.

    Args:
        data: iterable of mappings; each is read with
            ``.get('download_file_url')`` and ``.get('file_name')``.

    Steps:
        1. Recreate the ``files`` directory under ``file_path`` and download
           every spreadsheet into it.
        2. For each workbook, locate in row index 1 of the first sheet the
           first cell ending in '称' and use that column as the company name;
           column 0 holds the permit number.  Data rows start at index 2.
        3. Permit numbers already contained in the value returned by
           ``get_updated()`` are skipped.  The others are searched for on the
           search service; if nothing is found a minimal ``Medicine`` row is
           recorded, otherwise the first result's detail page is fetched and
           parsed into a full ``Medicine`` row.
        4. All collected rows are passed to ``write_db``.

    A workbook without a matching header cell and a detail page that never
    answered with HTTP 200 are both skipped with a message.
    """
    bloom = get_updated()
    # NOTE(review): 'If-None-Natch' looks like a typo of 'If-None-Match'; it is
    # kept unchanged because correcting it would alter the request that is sent.
    page_headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        'If-None-Natch': '',
        'If-Modified-Since': ''
    }
    search_headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '332',
        'Content-Type':
        'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie':
        'asopSearchUserName=C0E6C927-A7FA-F8E8-197C-339630838038; lastAccessTime=1599630838038; JSESSIONID=5E36F6AD6962754A4EF96AE3E5BBE0BE; lastLoginTime=1599634000115',
        'Host': '202.106.121.52:8580',
        'Origin': 'http://202.106.121.52:8580',
        'Referer': 'http://202.106.121.52:8580/searchweb/query.jsp',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    post_url = "http://202.106.121.52:8580/searchweb/search"

    download_dir = os.path.join(file_path, 'files')
    # rmtree() raises when the directory is missing (e.g. on the first run),
    # so only remove it when it is actually there, then start from empty.
    if os.path.exists(download_dir):
        shutil.rmtree(download_dir)
    os.mkdir(download_dir)

    records = []

    # Pass 1: download every spreadsheet and remember where it was saved.
    downloads = []
    for entry in data:
        url = entry.get('download_file_url')
        name = entry.get('file_name')
        if url.endswith('xls'):
            file_name = '{}/{}.xls'.format(download_dir, name)
        else:
            file_name = '{}/{}.xlsx'.format(download_dir, name)

        content = requests.request(method='get', url=url,
                                   headers=page_headers).content
        with open(file_name, 'wb') as fP:
            fP.write(content)
        downloads.append((name, file_name))

    # Pass 2: read each spreadsheet and resolve its rows.
    for name, file_name in downloads:
        workbook = open_workbook(file_name)  # open the Excel file
        sheet = workbook.sheet_by_index(0)

        # Reset per workbook so a sheet without a matching header cannot
        # silently reuse the column index found in the previous file.
        name_col = None
        for index, title in enumerate(sheet.row_values(1)):
            if title.endswith('称'):
                name_col = index
                break
        if name_col is None:
            print('{}: 未找到名称列,已跳过该文件!!!'.format(name))
            continue

        for row in range(2, sheet.nrows):
            permit_number = sheet.cell(row, 0).value.strip()
            if '不予' in permit_number and '浙' in permit_number:
                # Strip the rejection remarks so only the number remains.
                permit_number = ''.join(permit_number.split('不予许可')).strip()
                permit_number = ''.join(permit_number.split('不予受理')).strip()
            if permit_number in bloom:
                print(permit_number, name)
                continue

            company_name = sheet.cell(row, name_col).value.strip()

            # Named 'payload' so the *data* parameter is no longer rebound
            # while it is being processed.
            payload = {
                'pageSize': '10',
                'pageNow': '1',
                'zjbh': permit_number,
                'sortType': '0',
                'searchType': '0',
                'titleFoldBegin': '-1',
                'titleFoldPage': '-1',
                'urls': 'zjca.miit.gov.cn/n477169/n477283/'  # key field
            }
            matches = json.loads(
                requests.post(post_url,
                              data=payload,
                              headers=search_headers,
                              proxies=proxys[-1],
                              timeout=10).text).get('array')

            if not matches:
                times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                print('公司数据目前查询不到!!!')
                print(company_name, permit_number, name)
                records.append(Medicine(company_name=company_name,
                                        permit_number=permit_number,
                                        file_name=name,
                                        gmt_created=times,
                                        gmt_updated=times))
                continue

            detail_url = matches[0].get('url')
            # The decoded text lives in its own variable: previously a
            # non-200 reply left the Response object behind and was passed
            # to the HTML parser.
            html = ""
            for _ in range(10):
                try:
                    reply = requests.request(
                        method='get',
                        url=detail_url,
                        headers=page_headers,
                        proxies=proxys[-1],
                        timeout=10,
                    )
                    if reply.status_code == 200:
                        html = reply.content.decode('utf-8')
                        print('获取信息成功!!!')
                        print('break!!!')
                        break
                except Exception:
                    dl()
            if not html:
                # Nothing is recorded, so the permit is not treated as done.
                print('详情页请求失败,已跳过:{}'.format(detail_url))
                continue

            tree = etree.HTML(html)
            element_list = tree.xpath('//table[@class="table_biaoge"]/tbody')
            for ele in element_list:
                # From here on the values shown on the detail page replace
                # the ones read from the spreadsheet.
                permit_number = ele.xpath('./tr[2]/td[2]/a/text()')[0].strip()
                company_name = ele.xpath('./tr[3]/td[2]/a/text()')[0].strip()
                domain_info = ele.xpath('./tr[4]/td[2]/a/text()')[0]
                if '$content' in domain_info:
                    domain_info = '数据显示错误'
                business_type = ele.xpath('./tr[5]/td[2]/a/text()')[0]
                customer_service_tel = ele.xpath('./tr[6]/td[2]/a/text()')[0]
                if '$content' in customer_service_tel:
                    customer_service_tel = '数据显示错误'
                certificate_valid_date = ele.xpath('./tr[7]/td[2]/a/text()')[0]
                certificate_invalid_date = ele.xpath(
                    './tr[8]/td[2]/a/text()')[0]

                times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                company_id = get_company_id(company_name)

                summary = (permit_number, company_name, domain_info,
                           business_type, customer_service_tel,
                           certificate_valid_date, certificate_invalid_date,
                           name, detail_url)
                fields = dict(
                    company_name=company_name,
                    permit_number=permit_number,
                    domain_info=domain_info,
                    business_type=business_type,
                    customer_service_tel=customer_service_tel,
                    certificate_valid_date=certificate_valid_date,
                    certificate_invalid_date=certificate_invalid_date,
                    file_name=name,
                    company_url=detail_url,
                    gmt_created=times,
                    gmt_updated=times,
                )
                # company_id is only printed and stored when a lookup succeeded.
                if company_id:
                    summary += (company_id,)
                    fields['company_id'] = company_id
                print(summary)
                records.append(Medicine(**fields))

    if not records:
        print('本次无更新数据!!!')
    else:
        print('本地数据更新了{}条!!!'.format(len(records)))
        write_db(records)
Beispiel #13
0
def post_data(sum):
    """Look up each company's licence page and persist what was found.

    For every company name in ``sum`` a full-text search is posted to the
    search endpoint. When the search returns nothing, a stub ``Medicine`` row
    (name, timestamps and, when known, company id) is recorded. Otherwise the
    URL of the first hit is downloaded (up to 20 attempts) and each
    ``table_biaoge`` table on that page becomes one ``Medicine`` row. All rows
    are handed to ``write_db`` at the end.

    A company whose detail page cannot be downloaded is reported and skipped
    so the rows already collected for other companies are still written.

    Args:
        sum: Iterable of company names to search for. The parameter name
            shadows the builtin but is kept for caller compatibility.

    Returns:
        None.
    """
    # Request headers and the endpoint do not depend on the company, so they
    # are built once instead of on every iteration.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '332',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie':
        'asopSearchUserName=C0E6C927-A7FA-F8E8-197C-339630838038; lastAccessTime=1599630838038; JSESSIONID=5E36F6AD6962754A4EF96AE3E5BBE0BE; lastLoginTime=1599634000115',
        'Host': '202.106.121.52:8580',
        'Origin': 'http://202.106.121.52:8580',
        'Referer': 'http://202.106.121.52:8580/searchweb/query.jsp',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    # NOTE(review): 'If-None-Natch' looks like a typo for 'If-None-Match';
    # left unchanged because the server's reaction to the real header is
    # unknown from here -- confirm before correcting.
    head = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        'If-None-Natch': '',
        'If-None-Modified-Since-Placeholder': ''
    }
    # Restore the exact original header set (keys are runtime data).
    head = {
        'User-Agent': head['User-Agent'],
        'If-None-Natch': '',
        'If-Modified-Since': ''
    }
    post_url = "http://202.106.121.52:8580/searchweb/search"

    object_lists = []
    for company_name in sum:
        data = {
            'fullText': company_name,
            'pageSize': '10',
            'pageNow': '1',
            'sortType': '0',
            'searchType': '0',
            'titleFoldBegin': '-1',
            'titleFoldPage': '-1',
            'urls': 'zjca.miit.gov.cn/n477169/n477283/'  # key field
        }
        hits = json.loads(
            requests.post(post_url,
                          data=data,
                          headers=headers,
                          proxies=proxys[-1],
                          timeout=10).text).get('array')

        company_id = get_company_id(company_name)
        if not hits:
            # Nothing found: keep a stub row so the company is still recorded.
            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print('公司数据目前查询不到!!!')
            fields = {
                'company_name': company_name,
                'gmt_created': times,
                'gmt_updated': times,
            }
            if company_id:
                fields['company_id'] = company_id
            object_lists.append(Medicine(**fields))
            continue

        if len(hits) > 1:
            print('{}有多条信息'.format(company_name))
        # Only the first hit is followed, even when several were returned.
        detail_url = hits[0].get('url')

        page_html = None
        for _ in range(20):
            try:
                reply = requests.request(
                    method='get',
                    url=detail_url,
                    headers=head,
                    proxies=proxys[-1],
                    timeout=10,
                )
                if reply.status_code == 200:
                    page_html = reply.content.decode('utf-8')
            except Exception:
                # dl() is defined elsewhere; it is invoked after every failed
                # attempt (presumably to rotate the proxy -- confirm).
                dl()
                continue
            if page_html is not None:
                print('获取信息成功!!!')
                print('break!!!')
                break

        if not page_html:
            # Previously the empty string / non-200 Response object was fed
            # to the HTML parser and the whole batch aborted before write_db.
            print('{}详情页获取失败,跳过!!!'.format(company_name))
            continue

        tree = etree.HTML(page_html)
        for ele in tree.xpath('//table[@class="table_biaoge"]/tbody'):
            permit_number = ele.xpath('./tr[2]/td[2]/a/text()')[0].strip()
            # The name printed on the page is stored, not the search term.
            listed_name = ele.xpath('./tr[3]/td[2]/a/text()')[0].strip()
            busi_web = ele.xpath('./tr[4]/td[2]/a/text()')[0]
            if '$content' in busi_web:
                # An unrendered template placeholder leaked into the page.
                busi_web = '数据显示错误'
            busi_type = ele.xpath('./tr[5]/td[2]/a/text()')[0]
            customer_service_tel = ele.xpath('./tr[6]/td[2]/a/text()')[0]
            if '$content' in customer_service_tel:
                customer_service_tel = '数据显示错误'
            certificate_valid_date = ele.xpath('./tr[7]/td[2]/a/text()')[0]
            certificate_invalid_date = ele.xpath('./tr[8]/td[2]/a/text()')[0]

            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print((permit_number, listed_name, busi_web, busi_type,
                   customer_service_tel, certificate_valid_date,
                   certificate_invalid_date, company_id))

            fields = {
                'company_name': listed_name,
                'permit_number': permit_number,
                'busi_web': busi_web,
                'busi_type': busi_type,
                'customer_service_tel': customer_service_tel,
                'certificate_valid_date': certificate_valid_date,
                'certificate_invalid_date': certificate_invalid_date,
                'gmt_created': times,
                'gmt_updated': times,
            }
            # company_id is only passed when a match was found, exactly as
            # the two original constructor branches did.
            if company_id:
                fields['company_id'] = company_id
            object_lists.append(Medicine(**fields))

    if len(object_lists) == 0:
        print('本次无更新数据!!!')
    else:
        print('本地数据更新了{}条!!!'.format(len(object_lists)))
        write_db(object_lists)
def main():
    """Crawl the record listing and store one ``Medicine`` row per record.

    Pages 1-14 of the listing endpoint are fetched. For each row that carries
    an ``ylqxEnp`` object the record's detail JSON is downloaded, missing
    fields are filled from the detail's ``ylqxEnpJson`` payload, and a
    ``Medicine`` row is built. When the record's social credit code is already
    in the collection returned by ``get_updated()``, the existing rows with
    that code are deleted from ``session`` first and the new row is counted as
    a refresh. All rows are finally passed to ``write_db``.

    A listing page or record detail that cannot be downloaded after ten
    attempts is reported and skipped instead of aborting the run.

    Returns:
        None.
    """

    def fetch_text(url):
        """Return the UTF-8 decoded body of ``url``, or None if 10 tries fail."""
        for _ in range(10):
            try:
                reply = requests.request(method='get',
                                         url=url,
                                         headers=headers,
                                         proxies=proxys[-1],
                                         timeout=15)
                if reply.status_code == 200:
                    return reply.content.decode('utf8')
            except Exception:
                # dl() is defined elsewhere; it is called after every failed
                # attempt (presumably to rotate the proxy -- confirm).
                dl()
        return None

    # Membership of a social credit code in `bloom` decides insert vs. refresh.
    bloom = get_updated()
    records = []    # every row that will be written
    refreshed = []  # subset of `records` that replaced existing rows

    for page in range(1, 15):
        list_url = "http://wlxsba.smda.sh.cn/openApi/getRecordPage?search=false&nd=1600149091958&rows=20&page={}&sidx=&sord=desc&totalrows=2000".format(
            page)
        listing_text = fetch_text(list_url)
        if not listing_text:
            # Previously "" or a non-200 Response reached json.loads and the
            # whole run crashed before anything was written.
            print('第{}页列表获取失败,跳过!!!'.format(page))
            continue
        rows = json.loads(listing_text).get('rows') or []

        for row in rows:
            detail = row.get('ylqxEnp')
            if not detail:
                continue

            company_name = detail.get('baQymc')
            legal_people = detail.get('baFddbr')
            company_principal = detail.get('baQyfzr')
            home_addr = detail.get('baQyzcdz')
            business_addr = detail.get('baJycsdz')
            business_type = detail.get('baJyfs')
            business_range = detail.get('baJyfwzw')
            internet_sold_type = '入驻类'
            social_credit_code = detail.get('enpId')

            record_num = detail.get('baZsbh')
            license_num = detail.get('licNo')
            if record_num and license_num:
                business_license_num = ",".join([record_num, license_num])
            else:
                # One or neither number present. The original joined even
                # when both were missing, which raised TypeError on None.
                business_license_num = record_num or license_num

            record_id = row.get('riId')
            detail_url = 'http://wlxsba.smda.sh.cn/openApi/getRecordDetailData?recordId={}&isOpen=1&_=1600151345630'.format(
                record_id)
            print(detail_url)
            detail_text = fetch_text(detail_url)
            if not detail_text:
                print('备案详情{}获取失败,跳过!!!'.format(record_id))
                continue
            response = json.loads(detail_text)

            record_info = response.get('recordInfo')
            # ylqxEnpJson is a JSON document embedded as a string; the raw
            # string is also stored verbatim as other_info.
            other_info = record_info.get('ylqxEnpJson')
            flag = json.loads(other_info)

            # Fill gaps left by the listing row: first the same key in the
            # embedded document, then its alternative key.
            company_name = (company_name or flag.get('baQymc')
                            or flag.get('enpName'))
            legal_people = (legal_people or flag.get('baFddbr')
                            or flag.get('fddbr'))
            company_principal = (company_principal or flag.get('baQyfzr')
                                 or flag.get('qyfzr'))
            home_addr = (home_addr or flag.get('baQyzcdz')
                         or flag.get('homeAddr'))
            business_addr = (business_addr or flag.get('baJycsdz')
                             or flag.get('registerAddr'))
            business_type = (business_type or flag.get('baJyfs')
                             or flag.get('jyfsStr'))
            business_range = (business_range or flag.get('baJyfwzw')
                              or flag.get('jyfw'))
            social_credit_code = social_credit_code or flag.get('enpId')

            main_business = response.get('boornetParameter').get('paraName')
            record_date = (record_info.get('completeTime')
                           or record_info.get('createtime'))

            plat_form_info = [
                {
                    'join_platform_name': entry.get('svPfName'),
                    'platform_license_num': entry.get('svRecodeNum'),
                    'plat_form_domain': entry.get('wsDomainName'),
                    'plat_form_shop_add': entry.get('wsShopName'),
                }
                for entry in (response.get('enterRecordInfoList') or [])
            ]

            company_id = get_company_id(company_name)
            summary = (record_id, company_name, legal_people,
                       company_principal, home_addr, business_addr,
                       business_type, record_date, internet_sold_type,
                       social_credit_code, business_license_num,
                       main_business, business_range, plat_form_info,
                       other_info)
            if company_id:
                summary += (company_id, )
            print(summary)

            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            already_stored = social_credit_code in bloom
            if already_stored:
                # Replace rather than duplicate: drop the rows currently
                # stored under this credit code before adding the new one.
                stale_rows = session.query(Medicine).filter(
                    Medicine.social_credit_code ==
                    social_credit_code).all()
                for stale in stale_rows:
                    session.delete(stale)

            fields = {
                'record_id': record_id,
                'company_name': company_name,
                'legal_people': legal_people,
                'company_principal': company_principal,
                'home_addr': home_addr,
                'business_addr': business_addr,
                'business_type': business_type,
                'record_date': record_date,
                'internet_sold_type': internet_sold_type,
                'social_credit_code': social_credit_code,
                'business_license_num': business_license_num,
                'main_business': main_business,
                'business_range': business_range,
                'plat_form_info': str(plat_form_info),
                'gmt_created': times,
                'gmt_updated': times,
                'other_info': other_info,
            }
            # company_id is only passed when a match was found.
            if company_id:
                fields['company_id'] = company_id
            zhilian = Medicine(**fields)

            records.append(zhilian)
            # Only rows that replaced stored data count as refreshed. The
            # original also counted brand-new rows here whenever company_id
            # was set, which skewed the summary printed below.
            if already_stored:
                refreshed.append(zhilian)
            time.sleep(1.5)

    if len(records) == 0:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(records), len(refreshed),
            len(records) - len(refreshed)))
        write_db(records)