def main():
    """Crawl the ``apparatusProdBackList`` listing and store new or changed rows.

    Pages 1 to 10 of the ``selectData`` endpoint are POSTed through the proxy
    ``proxys[-1]``.  Every entry of the returned ``rowData`` becomes one
    ``Medicine`` object.  When the certificate number (``ZSBH``) is already
    contained in the collection returned by ``get_updated()``, the stored
    ``Medicine`` rows with that ``record_num`` are first passed to
    ``session.delete`` and the entry is counted as an update of old data;
    otherwise it is counted as new.  If anything was collected, the objects
    are handed to ``write_db``.

    A page that still has no HTTP 200 answer after ten attempts, or whose
    body is not a JSON object, is reported and skipped instead of aborting
    the whole run.
    """
    url = "http://xuke.yjj.sh.gov.cn/AppRoveManage/selectLicense/selectData"
    bloom = get_updated()
    collected = []
    refreshed = []

    def fetch_rows(page):
        """Return the ``rowData`` list of ``page``, or ``None`` on failure."""
        form = {
            'currentPage': page,
            'pageSize': '10',
            'groupSize': '8',
            'pageName': 'apparatusProdBackList',
        }
        body = None
        for _ in range(10):
            try:
                reply = requests.request(method='post', url=url, data=form,
                                         headers=headers, proxies=proxys[-1],
                                         timeout=10)
                if reply.status_code == 200:
                    body = reply.content.decode('utf8')
                    break
            except Exception:
                # Best effort: any request/decoding error calls dl()
                # (defined elsewhere; presumably swaps the proxy -- confirm)
                # and the request is retried.
                dl()
        if body is None:
            # Never hand a Response object or "" to json.loads.
            return None
        try:
            return json.loads(body).get('rowData') or []
        except (ValueError, AttributeError):
            return None

    for page in range(1, 11):
        rows = fetch_rows(page)
        if rows is None:
            print('page {} skipped: no usable response'.format(page))
            rows = []
        elif len(rows) == 1:
            # A single-row answer is re-requested through reload(), which is
            # defined elsewhere in the file.
            rows = reload(page)
            print('跳出reload函数!!!')

        for row in rows:
            company_name = row.get('QYMC_ZW')
            record_num = row.get('ZSBH')

            issued = row.get('QFRQ')
            if issued:
                issued = get_time(issued)

            business_range = (
                '【原《分类目录》分类编码区】:' + (row.get('CPFW_ZW') or '无')
                + '【新《分类目录》分类编码区】:' + (row.get('CPFW_YW') or '无')
            )

            # A missing SCDZ no longer breaks the concatenation.
            busi_addr = ''.join(
                site.get('SCDZ') or '' for site in (row.get('scdzList') or [])
            )

            # (name, value) pairs in the order the values are printed.
            columns = [
                ('record_id', row.get('ZSID')),
                ('company_name', company_name),
                ('legal_people', row.get('FRMC_ZW')),
                ('company_principal', row.get('QYFZR_ZW')),
                ('record_num', record_num),
                ('recent_record_date', issued),
                ('record_mechanism', row.get('FZJG')),
                ('business_range', business_range),
                ('home_addr', row.get('ZCDZ_ZW')),
                ('busi_addr', busi_addr),
                ('produce_info', str(row.get('cpxxList'))),
            ]
            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(company_name)
            if company_id:
                # company_id is only stored (and printed) when it was found.
                columns.append(('company_id', company_id))
            print(tuple(value for _, value in columns))

            already_stored = record_num in bloom
            if already_stored:
                # Replace the stored rows for this certificate number.
                stale_rows = session.query(Medicine).filter(
                    Medicine.record_num == record_num).all()
                for stale in stale_rows:
                    session.delete(stale)

            zhilian = Medicine(gmt_created=times, gmt_updated=times,
                               **dict(columns))
            collected.append(zhilian)
            if already_stored:
                refreshed.append(zhilian)

        # NOTE(review): the original indentation was lost; the pause is
        # assumed to be once per page -- confirm.
        time.sleep(8)

    if not collected:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(collected), len(refreshed), len(collected) - len(refreshed)))
        write_db(collected)
def main():
    """Crawl the ``durgsList`` listing and store new or changed licence rows.

    Pages 1 to 5 of the ``selectData`` endpoint are POSTed through the proxy
    ``proxys[-1]``.  Every entry of the returned ``rowData`` becomes one
    ``Medicine`` object.  When the licence number (``ZSBH``) is already
    contained in the collection returned by ``get_updated()``, the stored
    ``Medicine`` rows with that ``license_num`` are first passed to
    ``session.delete`` and the entry is counted as an update of old data;
    otherwise it is counted as new.  If anything was collected, the objects
    are handed to ``write_db``.

    A page that still has no HTTP 200 answer after ten attempts, or whose
    body is not a JSON object, is reported and skipped instead of aborting
    the whole run.
    """
    url = "http://xuke.yjj.sh.gov.cn/AppRoveManage/selectLicense/selectData"
    bloom = get_updated()
    collected = []
    refreshed = []

    def fetch_rows(page):
        """Return the ``rowData`` list of ``page``, or ``None`` on failure."""
        form = {
            'currentPage': page,
            'pageSize': '10',
            'groupSize': '8',
            'pageName': 'durgsList'
        }
        body = None
        for _ in range(10):
            try:
                reply = requests.request(method='post', url=url, data=form,
                                         headers=headers, proxies=proxys[-1],
                                         timeout=10)
                if reply.status_code == 200:
                    body = reply.content.decode('utf8')
                    break
            except Exception:
                # Best effort: any request/decoding error calls dl()
                # (defined elsewhere; presumably swaps the proxy -- confirm)
                # and the request is retried.
                dl()
        if body is None:
            # Never hand a Response object or "" to json.loads.
            return None
        try:
            return json.loads(body).get('rowData') or []
        except (ValueError, AttributeError):
            return None

    def to_date(raw):
        """Run ``get_time`` on non-empty values; pass empty ones through."""
        return get_time(raw) if raw else raw

    for page in range(1, 6):
        rows = fetch_rows(page)
        if rows is None:
            print('page {} skipped: no usable response'.format(page))
            rows = []

        for row in rows:
            company_name = row.get('QYMC_ZW')
            license_num = row.get('ZSBH')

            # (name, value) pairs in the order the values are printed.
            columns = [
                ('record_id', row.get('ZSID')),
                ('company_name', company_name),
                ('area', row.get('ZCDZQX')),
                ('addr', row.get('QYZCDZ')),
                ('street', row.get('ZCDZJD')),
                ('warehouse', row.get('CFDZ')),
                ('legal_people', row.get('FDDBR')),
                ('company_principal', row.get('QYFZR')),
                ('quality_principal', row.get('ZLFZR')),
                ('business_way', row.get('JYFS')),
                ('business_range', row.get('JYFW')),
                ('license_num', license_num),
                ('license_name', row.get('ZSMC')),
                ('license_mechanism', row.get('FZJG')),
                ('license_status', row.get('ZSZT')),
                ('license_valid_date', to_date(row.get('QFRQ'))),
                ('license_invalid_date', to_date(row.get('YXQZ'))),
                ('gsp_license_num', row.get('RZBH')),
                ('gsp_approve_valid_date', to_date(row.get('RZSJ'))),
                ('gsp_approve_invalid_date', to_date(row.get('RZYXQZ'))),
            ]
            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(company_name)
            if company_id:
                # company_id is only stored (and printed) when it was found.
                columns.append(('company_id', company_id))
            print(tuple(value for _, value in columns))

            already_stored = license_num in bloom
            if already_stored:
                # Replace the stored rows for this licence number.
                stale_rows = session.query(Medicine).filter(
                    Medicine.license_num == license_num).all()
                for stale in stale_rows:
                    session.delete(stale)

            zhilian = Medicine(gmt_created=times, gmt_updated=times,
                               **dict(columns))
            collected.append(zhilian)
            if already_stored:
                refreshed.append(zhilian)

        # NOTE(review): the original indentation was lost; the pause is
        # assumed to be once per page -- confirm.
        time.sleep(5)

    if not collected:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(collected), len(refreshed), len(collected) - len(refreshed)))
        write_db(collected)
def parse(sum):
    """Download each announcement in ``sum``, extract its fields and store them.

    Args:
        sum: iterable of mappings that provide ``'file_url'`` and
            ``'file_name'``.

    Every page is requested up to ten times through ``proxys[-1]``.  Three
    page layouts are recognised by how the applicant line is marked up; each
    layout fills a different subset of the columns and leaves the rest as
    the string ``'null'``.  One ``Medicine`` object is built per page and the
    whole list is finally passed to ``write_db``.

    A page that could not be downloaded, or that lacks one of the fields its
    layout requires, is reported and skipped so that the data already
    collected is still written.
    """
    plain_date = r'<span style="font-size: 18px;">(.*?)</span>'
    escaped_date = r'<span style=\\"font-size: 18px;\\">(.*?)</span>'
    status_patterns = (r'当前状态:</span>.*?<span .*?">(.*?)</span>',
                       r'当前状态:.*?<span .*?">(.*?)</span>')
    # Column order used when the record is printed.
    column_order = (
        'company_name', 'matter_name', 'approval_date', 'release_date',
        'business_water_number', 'project_name', 'business_type',
        'accept_manager_people', 'current_type', 'data_source',
        'administrative_license_matter', 'province', 'area',
        'business_position', 'business_license_number',
    )

    def download(link):
        """Return the decoded page behind ``link`` or ``None`` on failure."""
        for _ in range(10):
            try:
                reply = requests.get(url=link, headers=headers,
                                     proxies=proxys[-1], timeout=10)
                if reply.status_code == 200:
                    text = reply.content.decode('utf-8')
                    print('break')
                    return text
            except Exception:
                # Best effort: any request/decoding error calls dl()
                # (defined elsewhere; presumably swaps the proxy -- confirm)
                # and the request is retried.
                dl()
        return None

    def grab(page, patterns, pick=0, default=None):
        """Return match ``pick`` of the first pattern that matches ``page``.

        Falls back to ``default`` when given, otherwise raises ``IndexError``.
        """
        for pattern in patterns:
            hits = re.findall(pattern, page, re.S)
            if hits:
                return hits[pick]
        if default is None:
            raise IndexError('no match for {!r}'.format(patterns[0]))
        return default

    def layout_br(page):
        """Fields of pages whose labelled values end with ``<br/>``."""
        return {
            'release_date': grab(page, (r'发布日期:(.*?)</span>',)).strip(),
            'business_water_number': grab(page, (
                r'业务流水号:</span>(.*?)<br/>',
                r'业务流水号:(.*?)<br/>')).strip(),
            'company_name': grab(page, (
                r'申请单位/申请人:</span>(.*?)<br/>',
                r'申请单位/申请人:(.*?)<br/>')).strip(),
            'matter_name': grab(page, (
                r'事项名称:</span>(.*?)<br/>',
                r'事项名称:(.*?)<br/>')).strip(),
            'project_name': grab(page, (
                r'项目名称:</span>(.*?)<br/>',
                r'项目名称:(.*?)<br/>')).strip(),
            'business_type': grab(page, (
                r'业务类型:</span>(.*?)<br/>',
                r'业务类型:(.*?)<br/>')).strip(),
            'accept_manager_people': grab(page, (
                r'受理经办人:</span>(.*?)<br/>',
                r'受理经办人:(.*?)<br/>'), default='null').strip(),
            'current_type': grab(page, status_patterns).strip(),
            # The approval date is the last 18px span of the page.
            'approval_date': grab(page, (plain_date, escaped_date),
                                  pick=-1).strip(),
        }

    def layout_paragraph(page):
        """Fields of pages whose labelled values end with ``</p>``."""
        signed = re.findall(r'浙江省文化厅</p><p .*?">(.*?)</p>', page, re.S)
        if signed:
            approval_date = signed[0]
        else:
            approval_date = grab(page, (escaped_date,), pick=-1)
        return {
            'release_date': grab(page, (r'发布日期:(.*?)</span>',)).strip(),
            'business_water_number': grab(page, (
                r'业务流水号:</span>(.*?)</span>',
                r'业务流水号:(.*?)<br/>')).strip().replace(' ', ''),
            'company_name': grab(page, (
                r'申请单位/申请人:</span>(.*?)</p>',
                r'申请单位/申请人:(.*?)<br/>')).strip().replace(' ', ''),
            'matter_name': grab(page, (
                r'事项名称:</span>(.*?)</p>',
                r'事项名称:(.*?)<br/>')).strip().replace(' ', ''),
            'project_name': grab(page, (
                r'项目名称:</span>(.*?)</p>',
                r'项目名称:(.*?)<br/>')).strip(),
            'business_type': grab(page, (
                r'业务类型:</span>(.*?)</p>',
                r'业务类型:(.*?)<br/>')).strip(),
            'accept_manager_people': grab(page, (
                r'受理经办人:</span>(.*?)</p>',
                r'受理经办人:(.*?)</p>'), default='null').strip(),
            'current_type': grab(page, status_patterns).strip(),
            'approval_date': approval_date.strip(),
        }

    def layout_license(page):
        """Fields of pages that carry no applicant line at all."""
        return {
            'release_date': grab(page, (r'发布日期:(.*?)</span>',)).strip(),
            'administrative_license_matter': grab(page, (
                r'行政许可事项:(.*?)<br/>',
                r'行政许可事项:(.*?)<br />')).strip(),
            'company_name': grab(page, (
                r'单位名称:(.*?)<br/>',
                r'单位名称:(.*?)<br />')).strip(),
            'province': grab(page, (
                r'省份:(.*?)<br/>',
                r'省份:(.*?)<br />')).strip(),
            'area': grab(page, (
                r'地市:(.*?)<br/>',
                r'地市:(.*?)<br />')).strip(),
            'business_position': grab(page, (
                r'营业场所:(.*?)<br/>',
                r'营业场所:(.*?)<br />')).strip(),
            'business_license_number': grab(page, (
                r'营业许可证号:(.*?)<br/>',
                r'营业许可证号:(.*?)<br />')).strip(),
            'current_type': grab(page, (
                r'当前状态:<span .*?">(.*?)</span>',)).strip(),
            'approval_date': grab(page, (plain_date, escaped_date),
                                  pick=-1).strip(),
        }

    all_data = []
    for item in sum:
        file_url = item['file_url']
        file_name = item['file_name']

        page = download(file_url)
        if page is None:
            print('数据获取失败!!!可能出现404!!!跳过!')
            continue

        # The markup of the applicant line tells the three layouts apart.
        if (re.findall(r'申请单位/申请人:</span>(.*?)<br/>', page, re.S)
                or re.findall(r'申请单位/申请人:(.*?)<br/>', page, re.S)):
            extract = layout_br
        elif re.findall(r'申请单位/申请人:</span> (.*?)</p>', page, re.S):
            extract = layout_paragraph
        else:
            extract = layout_license

        record = dict.fromkeys(column_order, 'null')
        record['data_source'] = '浙江省文化和旅游厅'
        try:
            record.update(extract(page))
        except IndexError as exc:
            # One malformed page must not discard everything collected so far.
            print('skipping {}: {}'.format(file_url, exc))
        else:
            file_url = file_url.strip()
            file_name = file_name.strip()
            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(record['company_name'])

            shown = (file_url, file_name) + tuple(
                record[name] for name in column_order)
            if company_id:
                # company_id is only stored (and printed) when it was found.
                shown += (company_id,)
                record['company_id'] = company_id
            print(shown)

            # file_name is printed but is not a column of the stored record.
            all_data.append(Medicine(file_url=file_url, gmt_created=times,
                                     gmt_updated=times, **record))
        time.sleep(0.8)
    write_db(all_data)
def main():
    """Crawl the labour-dispatch licence listing and store unseen companies.

    The first listing page is requested to read the total page count, then
    every listing page is requested and each ``javascript:toGetInfo(...)``
    handle found on it is resolved to a detail page.  Eight fields are
    extracted from the detail page with regular expressions.  A ``Medicine``
    object is created only when the licence number is not contained in the
    collection returned by ``get_updated()``.  If anything was collected, the
    objects are handed to ``write_db``.

    All requests are POSTs through ``proxys[-1]`` with up to ten attempts.
    Pages that cannot be fetched and detail pages that lack a field are
    reported and skipped instead of aborting the whole run.
    """
    list_url = "http://rsz.zjhz.hrss.gov.cn/jyhptweb/cycx/queryLwpqxkdw.action;jsessionid=5pN8HiGEIhh85GYcbqG_ldLunPVxbSP_viiSnSz35dxeS8LGMRY5!1300760266"
    detail_url = 'http://rsz.zjhz.hrss.gov.cn/jyhptweb/cycx/checkLwpqxkdw.action?dwid={}'
    # (column, pattern) pairs in the order the values are printed.
    detail_patterns = (
        ('company_name', r'<h4>(.*?)</h4>'),
        ('register_position', r'注册地址:.*?<td>(.*?) </td>'),
        ('connect_people', r'联系人:.*?<td >(.*?) </td>'),
        ('connect_tel', r'联系电话:.*?<td>(.*?) </td>'),
        ('business_license_number',
         r'劳务派遣经营许可证号码:.*?<td >(.*?) </td>'),
        ('license_authority', r'许可机关.*?<td >(.*?) </td>'),
        ('valid_date', r'有效期:.*?<td >(.*?) </td>'),
        ('year_business_report', r'提交年度经营报告情况.*?<td >(.*?) </td>'),
    )

    def fetch(url, form):
        """POST ``form`` to ``url``; return the decoded body or ``None``."""
        for _ in range(10):
            try:
                reply = requests.request(method='post', url=url,
                                         headers=headers, data=form,
                                         proxies=proxys[-1], timeout=15)
                if reply.status_code == 200:
                    return reply.content.decode('utf8')
                # NOTE(review): indentation was lost in the source; the 2 s
                # pause is assumed to follow a non-200 answer -- confirm.
                time.sleep(2)
            except Exception:
                # Best effort: any request/decoding error calls dl()
                # (defined elsewhere; presumably swaps the proxy -- confirm),
                # waits, and the request is retried.
                dl()
                time.sleep(1.5)
        return None

    def listing_form(page):
        """Build the POST payload for listing page ``page``."""
        return {
            '_currpage': page,
            '_pagelines': 20,
            '_rowcount': 56,
            '_selectpage': page
        }

    bloom = get_updated()
    collected = []

    first_page = fetch(list_url, listing_form(1))
    counters = []
    if first_page is not None:
        counters = re.findall(r"<font color='red'>(.*?)</font>页",
                              first_page, re.S)
    try:
        page_total = int(counters[-1])
    except (IndexError, ValueError):
        # Without a page count there is nothing to iterate over.
        print('page count could not be determined, nothing crawled')
        return

    for page in range(1, page_total + 1):
        form = listing_form(page)
        listing = fetch(list_url, form)
        if listing is None:
            print('page {} skipped: no usable response'.format(page))
            continue

        handles = re.findall(r'javascript:toGetInfo(.*?);', listing, re.S)
        print(handles)
        for handle in handles:
            keyword = handle.split('(')[-1].split(')')[0]
            # The detail request carries the listing payload as well.
            detail = fetch(detail_url.format(keyword), form)
            try:
                if detail is None:
                    raise IndexError('no usable response')
                columns = [
                    (name, re.findall(pattern, detail, re.S)[0])
                    for name, pattern in detail_patterns
                ]
            except IndexError as exc:
                print('company {} skipped: {}'.format(keyword, exc))
            else:
                fields = dict(columns)
                times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                if fields['business_license_number'] not in bloom:
                    company_id = get_company_id(fields['company_name'])
                    shown = tuple(value for _, value in columns)
                    if company_id:
                        # company_id is only stored when it was found.
                        shown += (company_id,)
                        fields['company_id'] = company_id
                    print(shown)
                    collected.append(Medicine(gmt_created=times,
                                              gmt_updated=times, **fields))
            time.sleep(0.5)

    if not collected:
        print('此次没有数据更新!!!')
    else:
        print('此次更新数据有{}条!!!'.format(len(collected)))
        write_db(collected)
def main():
    """Crawl the ``apparatusWTProdList`` listing and store new or changed rows.

    Pages 1 and 2 of the ``selectData`` endpoint are POSTed through the proxy
    ``proxys[-1]``.  Every entry of the returned ``rowData`` becomes one
    ``Medicine`` object.  When the product number (``CPZCH_BAH``) is already
    contained in the collection returned by ``get_updated()``, the stored
    ``Medicine`` rows with that ``product_license_num`` are first passed to
    ``session.delete`` and the entry is counted as an update of old data;
    otherwise it is counted as new.  If anything was collected, the objects
    are handed to ``write_db``.

    A page that still has no HTTP 200 answer after ten attempts, or whose
    body is not a JSON object, is reported and skipped instead of aborting
    the whole run.
    """
    url = "http://xuke.yjj.sh.gov.cn/AppRoveManage/selectLicense/selectData"
    bloom = get_updated()
    collected = []
    refreshed = []

    def fetch_rows(page):
        """Return the ``rowData`` list of ``page``, or ``None`` on failure."""
        form = {
            'currentPage': page,
            'pageSize': '10',
            'groupSize': '8',
            'pageName': 'apparatusWTProdList',
        }
        body = None
        for _ in range(10):
            try:
                reply = requests.request(method='post', url=url, data=form,
                                         headers=headers, proxies=proxys[-1],
                                         timeout=10)
                if reply.status_code == 200:
                    body = reply.content.decode('utf8')
                    break
            except Exception:
                # Best effort: any request/decoding error calls dl()
                # (defined elsewhere; presumably swaps the proxy -- confirm)
                # and the request is retried.
                dl()
        if body is None:
            # Never hand a Response object or "" to json.loads.
            return None
        try:
            return json.loads(body).get('rowData') or []
        except (ValueError, AttributeError):
            return None

    for page in range(1, 3):
        rows = fetch_rows(page)
        if rows is None:
            print('page {} skipped: no usable response'.format(page))
            rows = []
        elif len(rows) == 1:
            # A single-row answer is re-requested through reload(), which is
            # defined elsewhere in the file.
            rows = reload(page)
            print('跳出reload函数!!!')

        for row in rows:
            entrust_company_name = row.get('QYMC')
            # The product number doubles as the record id.
            product_license_num = row.get('CPZCH_BAH')

            record_date = row.get('QFRQ')
            if record_date:
                record_date = get_time(record_date)
            entrust_date = row.get('WTQXRQ')
            if entrust_date:
                entrust_date = get_time(entrust_date)

            # (name, value) pairs in the order the values are printed.
            columns = [
                ('record_id', row.get('CPZCH_BAH')),
                ('entrust_company_name', entrust_company_name),
                ('entrust_license_num', row.get('XKZBH')),
                ('entrust_legal', row.get('FDDBR')),
                ('entrust_company_principal', row.get('QYFZR')),
                ('entrust_addr', row.get('QYZCDZ')),
                ('entrust_produce_addr', row.get('SCDZ')),
                ('be_entrust_company_name', row.get('SWTQYMC')),
                ('be_entrust_license_num', row.get('SWTQYSCXKZ')),
                ('be_entrust_legal', row.get('SWTFDDBR')),
                ('be_entrust_company_principal', row.get('SWTQYFZR')),
                ('be_entrust_addr', row.get('SWTQYZCDZ')),
                ('be_entrust_produce_addr', row.get('SWTQYSCDZ')),
                ('entrust_product_name', row.get('CPMC')),
                ('product_license_num', product_license_num),
                ('record_date', record_date),
                ('entrust_date', entrust_date),
            ]
            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(entrust_company_name)
            if company_id:
                # company_id is only stored (and printed) when it was found.
                columns.append(('company_id', company_id))
            print(tuple(value for _, value in columns))

            already_stored = product_license_num in bloom
            if already_stored:
                # Replace the stored rows for this product number.
                stale_rows = session.query(Medicine).filter(
                    Medicine.product_license_num == product_license_num).all()
                for stale in stale_rows:
                    session.delete(stale)

            zhilian = Medicine(gmt_created=times, gmt_updated=times,
                               **dict(columns))
            collected.append(zhilian)
            if already_stored:
                refreshed.append(zhilian)

        # NOTE(review): the original indentation was lost; the pause is
        # assumed to be once per page -- confirm.
        time.sleep(5)

    if not collected:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(collected), len(refreshed), len(collected) - len(refreshed)))
        write_db(collected)
def main():
    """Scrape the ``apparatusDealList`` listing and stage new or changed rows.

    Pages 1-5 are requested from the licence query endpoint.  Every row is
    turned into a ``Medicine`` object.  When the row's licence number is
    already contained in the collection returned by ``get_updated()``, the
    existing ``Medicine`` rows with that licence number are marked for
    deletion on ``session`` before the fresh object is queued.  Queued
    objects are passed to ``write_db`` when there is at least one.

    A page that cannot be fetched after ten attempts is reported and skipped,
    so one bad page no longer discards everything staged so far.
    """
    bloom = get_updated()
    staged = []
    replaced = []
    url = "http://xuke.yjj.sh.gov.cn/AppRoveManage/selectLicense/selectData"

    def fetch_page(page):
        """Return the decoded body of one listing page, or None if all attempts fail."""
        form = {
            'currentPage': page,
            'pageSize': '10',
            'groupSize': '8',
            'pageName': 'apparatusDealList',
        }
        for _ in range(10):
            try:
                reply = requests.request(method='post', url=url, data=form,
                                         headers=headers, proxies=proxys[-1],
                                         timeout=20)
                # Only a 200 answer yields a body; anything else is retried
                # and never leaks a Response object to the JSON parser.
                if reply.status_code == 200:
                    return reply.content.decode('utf8')
            except Exception:
                # dl() is defined elsewhere in the file; presumably it
                # refreshes the proxy list -- TODO confirm.
                dl()
        return None

    def as_time(raw):
        """Run get_time() on non-empty values and pass empty ones through."""
        return get_time(raw) if raw else raw

    for page in range(1, 6):
        body = fetch_page(page)
        if body is None:
            print('第{}页请求失败,已跳过!!!'.format(page))
            rows = []
        else:
            # A missing 'rowData' key is treated as an empty page.
            rows = json.loads(body).get('rowData') or []
        if len(rows) == 1:
            # A single-row answer is handed to reload(), defined elsewhere in
            # the file -- NOTE(review): presumably a blocked/placeholder reply.
            rows = reload(page)
            print('跳出reload函数!!!')

        for row in rows:
            license_num = row.get('ZSBH')
            company_name = row.get('QYMC_ZW')
            third_business_range = (
                '【原《分类目录》分类编码区】:' + (row.get('JYFW') or '无')
                + '【新《分类目录》分类编码区】:' + (row.get('CPFW_YW') or '无')
            )
            # (column name, value) pairs in the order they are printed.
            columns = [
                ('record_id', row.get('ZSID')),
                ('area', row.get('JYDZQX')),
                ('company_name', company_name),
                ('license_name', row.get('ZSMC')),
                ('license_num', license_num),
                ('business_type', row.get('JYMS')),
                ('supervision_classification', row.get('FLJGJB')),
                ('supervision_sort', row.get('ZDJGLB')),
                ('legal_people', row.get('FDDBR')),
                ('company_principal', row.get('QYFZR')),
                ('addr', row.get('QYZSDZ')),
                ('business_addr', row.get('JYCS')),
                ('warehouse_addr', row.get('CFDZ')),
                ('business_way', row.get('JYFS')),
                ('third_business_range', third_business_range),
                ('license_valid_date', as_time(row.get('QFRQ'))),
                ('license_invalid_date', as_time(row.get('YXQZ'))),
                ('two_business_range', row.get('BAJYFW')),
                ('record_date', as_time(row.get('BAQFRQ'))),
            ]
            stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(company_name)
            if company_id:
                # company_id is only stored (and printed) when it was resolved.
                columns.append(('company_id', company_id))
            print(tuple(value for _, value in columns))

            known = license_num in bloom
            if known:
                # Drop the previously stored rows so the fresh one replaces them.
                stale_rows = session.query(Medicine).filter(
                    Medicine.license_num == license_num).all()
                for stale in stale_rows:
                    session.delete(stale)
            record = Medicine(gmt_created=stamp, gmt_updated=stamp,
                              **dict(columns))
            staged.append(record)
            if known:
                replaced.append(record)
        time.sleep(8)

    if not staged:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(staged), len(replaced), len(staged) - len(replaced)))
        write_db(staged)
def main():
    """Scrape the ``drugProductList`` listing and stage new or changed rows.

    Pages 1-2 are requested from the licence query endpoint.  Every row is
    turned into a ``Medicine`` object.  When the row's social credit code is
    already contained in the collection returned by ``get_updated()``, the
    existing ``Medicine`` rows with that code are marked for deletion on
    ``session`` before the fresh object is queued.  Queued objects are passed
    to ``write_db`` when there is at least one.

    A page that cannot be fetched after ten attempts is reported and skipped
    instead of crashing the JSON parser.
    """
    bloom = get_updated()
    staged = []
    replaced = []
    url = "http://xuke.yjj.sh.gov.cn/AppRoveManage/selectLicense/selectData"

    def fetch_page(page):
        """Return the decoded body of one listing page, or None if all attempts fail."""
        form = {
            'currentPage': page,
            'pageSize': '10',
            'groupSize': '8',
            'pageName': 'drugProductList',
        }
        for _ in range(10):
            try:
                reply = requests.request(method='post', url=url, data=form,
                                         headers=headers, proxies=proxys[-1],
                                         timeout=10)
                # Keep the Response object local so a non-200 answer can never
                # reach json.loads().
                if reply.status_code == 200:
                    return reply.content.decode('utf8')
            except Exception:
                # dl() is defined elsewhere in the file; presumably it
                # refreshes the proxy list -- TODO confirm.
                dl()
        return None

    for page in range(1, 3):
        body = fetch_page(page)
        if body is None:
            print('第{}页请求失败,已跳过!!!'.format(page))
            rows = []
        else:
            # A missing 'rowData' key is treated as an empty page.
            rows = json.loads(body).get('rowData') or []
        if len(rows) == 1:
            # A single-row answer is handed to reload(), defined elsewhere in
            # the file -- NOTE(review): presumably a blocked/placeholder reply.
            rows = reload(page)
            print('跳出reload函数!!!')

        for row in rows:
            company_name = row.get('QYMC_ZW')
            social_credit_code = row.get('SHXYDM')
            # Status code '10' is reported as valid, anything else as cancelled.
            license_type = '有效' if row.get('ZSZT') == '10' else '注销'
            # (column name, value) pairs in the order they are printed.
            # The licence number doubles as the record id.
            columns = [
                ('record_id', row.get('ZSBH')),
                ('company_name', company_name),
                ('license_num', row.get('ZSBH')),
                ('classifi_num', row.get('CPFWLB')),
                ('social_credit_code', social_credit_code),
                ('license_invalid_date', row.get('YXQZ')),
                ('recent_update_date', row.get('QFRQ')),
                ('legal_people', row.get('FRMC_ZW')),
                ('company_principal', row.get('QYFZR_ZW')),
                ('register_addr', row.get('ZCDZ_ZW')),
                ('produce_addr_and_range', row.get('CPFW_ZW')),
                ('quality_principal', row.get('ZLFZR')),
                ('produce_principal', row.get('SCFZR')),
                ('quality_authorization', row.get('ZLSQR')),
                ('license_type', license_type),
            ]
            stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(company_name)
            if company_id:
                # company_id is only stored (and printed) when it was resolved.
                columns.append(('company_id', company_id))
            print(tuple(value for _, value in columns))

            known = social_credit_code in bloom
            if known:
                # Drop the previously stored rows so the fresh one replaces them.
                stale_rows = session.query(Medicine).filter(
                    Medicine.social_credit_code == social_credit_code).all()
                for stale in stale_rows:
                    session.delete(stale)
            record = Medicine(gmt_created=stamp, gmt_updated=stamp,
                              **dict(columns))
            staged.append(record)
            if known:
                replaced.append(record)
        time.sleep(8)

    if not staged:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(staged), len(replaced), len(staged) - len(replaced)))
        write_db(staged)
def main():
    """Scrape every page of ``drugProductList-Time-Limit`` and stage unseen rows.

    The first request only reads ``totalPage``; afterwards each page from 1 to
    that total is requested and every row whose licence number (used as the
    record id) is not contained in the collection returned by
    ``get_updated()`` is turned into a ``Medicine`` object.  Queued objects
    are passed to ``write_db`` when there is at least one.

    Raises:
        RuntimeError: when the first page, which carries the page count,
            cannot be fetched after ten attempts.
    """
    bloom = get_updated()
    staged = []
    url = "http://xuke.smda.sh.cn/AppRoveManage/selectLicense/selectData"

    def fetch_page(page):
        """Return the decoded body of one listing page, or None if all attempts fail."""
        form = {
            'currentPage': page,
            'pageSize': '10',
            'groupSize': '8',
            'pageName': 'drugProductList-Time-Limit'
        }
        for _ in range(10):
            try:
                reply = requests.request(method='post', url=url, data=form,
                                         headers=headers, proxies=proxys[-1],
                                         timeout=10)
                # Keep the Response object local so a non-200 answer can never
                # reach json.loads().
                if reply.status_code == 200:
                    return reply.content.decode('utf8')
            except Exception:
                # dl() is defined elsewhere in the file; presumably it
                # refreshes the proxy list -- TODO confirm.
                dl()
        return None

    first_body = fetch_page(1)
    if first_body is None:
        # Without the page count nothing else can be scraped.
        raise RuntimeError('could not fetch the first listing page')
    pages = int(json.loads(first_body).get('totalPage'))
    print('数据总数:{}页!!!'.format(pages))
    time.sleep(3)

    for page in range(1, pages + 1):
        body = fetch_page(page)
        if body is None:
            print('第{}页请求失败,已跳过!!!'.format(page))
            rows = []
        else:
            # A missing 'rowData' key is treated as an empty page.
            rows = json.loads(body).get('rowData') or []

        for row in rows:
            record_id = row.get('ZSBH')
            show_time = row.get('RESERVED4')
            license_num = row.get('ZSBH')
            company_name = row.get('QYMC_ZW')
            classifi_code = row.get('CPFWLB')
            social_credit_code = row.get('SHXYDM')
            addr = row.get('ZCDZ_ZW')
            legal_people = row.get('FRMC_ZW')
            company_principal = row.get('QYFZR_ZW')
            quality_principal = row.get('ZLFZR')
            produce_principal = row.get('SCFZR')
            quality_authorize = row.get('ZLSQR')
            produce_addr_range = row.get('CPFW_ZW')
            stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(company_name)

            # quality_principal is listed twice on purpose: the printed tuple
            # is kept identical to what this script has always emitted.
            shown = (record_id, show_time, license_num, company_name,
                     classifi_code, social_credit_code, addr, legal_people,
                     company_principal, quality_principal, produce_principal,
                     quality_principal, quality_authorize, produce_addr_range)
            if company_id:
                shown += (company_id,)
            print(shown)

            if record_id in bloom:
                # Already stored: this listing is add-only, nothing is replaced.
                continue
            fields = dict(
                record_id=record_id, show_time=show_time,
                license_num=license_num, company_name=company_name,
                classifi_code=classifi_code,
                social_credit_code=social_credit_code, addr=addr,
                legal_people=legal_people,
                company_principal=company_principal,
                quality_principal=quality_principal,
                produce_principal=produce_principal,
                quality_authorize=quality_authorize,
                produce_addr_range=produce_addr_range,
                gmt_created=stamp, gmt_updated=stamp)
            if company_id:
                # company_id is only stored when it was resolved.
                fields['company_id'] = company_id
            staged.append(Medicine(**fields))
        time.sleep(5)

    if not staged:
        print('本次无更新数据!!!')
    else:
        print('本地数据更新了{}条!!!'.format(len(staged)))
        write_db(staged)
'spider_add_value_telecom_info', 'spider_company_city_level_makerspace', 'spider_company_honor_data', 'spider_company_province_level_makerspace', 'spider_company_province_tech_incubator', 'spider_culture_business_license', 'spider_industry_information', 'spider_outstand_talent', 'spider_radio_show_business_license', 'spider_service_license', 'spider_high_talent', 'spider_talent_room' ] # ,'spider_company_related_park'] for table in all_tables: sql = '''select id,company_name from {} where company_id is NULL '''.format( table) cursor.execute(sql) single_data = cursor.fetchall() print(single_data) for i in single_data: id = i[0] company_name = i[1] company_id = get_company_id(company_name) if company_id: update_sql = '''update {} set company_id = {} where id = {}'''.format( table, company_id, id) cursor.execute(update_sql) print('{}表中,公司{}新增了company_id字段'.format(table, company_name)) else: continue # time.sleep(0.5) # break conn.commit() conn.close()
def main():
    """Scrape the ``ylqxwljyList`` listing and stage new or changed rows.

    Pages 1-2 are requested from the licence query endpoint.  Every row is
    turned into a ``Medicine`` object.  When the row's record number is
    already contained in the collection returned by ``get_updated()``, the
    existing ``Medicine`` rows with that record number are marked for
    deletion on ``session`` before the fresh object is queued.  Queued
    objects are passed to ``write_db`` when there is at least one.

    A page that cannot be fetched after ten attempts is reported and skipped
    instead of crashing the JSON parser.
    """
    bloom = get_updated()
    staged = []
    replaced = []
    url = "http://xuke.yjj.sh.gov.cn/AppRoveManage/selectLicense/selectData"

    def fetch_page(page):
        """Return the decoded body of one listing page, or None if all attempts fail."""
        form = {
            'currentPage': page,
            'pageSize': '10',
            'groupSize': '8',
            'pageName': 'ylqxwljyList',
        }
        for _ in range(10):
            try:
                reply = requests.request(method='post', url=url, data=form,
                                         headers=headers, proxies=proxys[-1],
                                         timeout=10)
                # Keep the Response object local so a non-200 answer can never
                # reach json.loads().
                if reply.status_code == 200:
                    return reply.content.decode('utf8')
            except Exception:
                # dl() is defined elsewhere in the file; presumably it
                # refreshes the proxy list -- TODO confirm.
                dl()
        return None

    for page in range(1, 3):
        body = fetch_page(page)
        if body is None:
            print('第{}页请求失败,已跳过!!!'.format(page))
            rows = []
        else:
            # A missing 'rowData' key is treated as an empty page.
            rows = json.loads(body).get('rowData') or []
        if len(rows) == 1:
            # A single-row answer is handed to reload(), defined elsewhere in
            # the file -- NOTE(review): presumably a blocked/placeholder reply.
            rows = reload(page)
            print('跳出reload函数!!!')

        for row in rows:
            record_num = row.get('ZSBH')
            company_name = row.get('QYMC_ZW')
            record_date = row.get('QFRQ')
            if record_date:
                record_date = get_time(record_date)
            # (column name, value) pairs in the order they are printed.
            columns = [
                ('record_id', row.get('ZSID')),
                ('record_num', record_num),
                ('company_name', company_name),
                ('addr', row.get('ZCDZ_ZW')),
                ('business_addr', row.get('SCDZ_ZW')),
                ('legal_people', row.get('FRMC_ZW')),
                ('company_principal', row.get('QYFZR_ZW')),
                ('quality_principal', row.get('ZLAQGLR')),
                ('web_name', row.get('WZMC')),
                ('web_program_name', row.get('WLKHD')),
                ('domain_name', row.get('WZYM')),
                ('ip', row.get('WZIPDZ')),
                ('service_machine_addr', row.get('FWQCFDZ')),
                ('non_profit_internet_service_record_num',
                 row.get('FJYXHLWXXBABH')),
                ('record_mechanism', row.get('FZJG')),
                ('record_date', record_date),
            ]
            stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(company_name)
            if company_id:
                # company_id is only stored (and printed) when it was resolved.
                columns.append(('company_id', company_id))
            print(tuple(value for _, value in columns))

            known = record_num in bloom
            if known:
                # Drop the previously stored rows so the fresh one replaces them.
                stale_rows = session.query(Medicine).filter(
                    Medicine.record_num == record_num).all()
                for stale in stale_rows:
                    session.delete(stale)
            record = Medicine(gmt_created=stamp, gmt_updated=stamp,
                              **dict(columns))
            staged.append(record)
            if known:
                replaced.append(record)
        time.sleep(8)

    if not staged:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(staged), len(replaced), len(staged) - len(replaced)))
        write_db(staged)
def main():
    """Scrape result pages 1500-2097 of the MIIT licence search and store the rows.

    Each table row of a result page is turned into a ``Medicine`` object
    (with ``company_id`` only when ``get_company_id`` resolves one) and all
    objects are passed to ``write_db`` once the page loop has finished.

    A page that cannot be fetched after twenty attempts is reported and
    skipped, so one bad page no longer discards everything staged so far.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    }
    staged = []
    for page in range(1500, 2098):
        url = 'https://zwfw.miit.gov.cn/miit/resultSearch?wd=&categoryTreeId=302&categoryTreePid=&pagenow={}'.format(
            page)
        html = None
        for _ in range(20):
            try:
                reply = requests.request(method='get', url=url,
                                         headers=headers, proxies=proxys[-1],
                                         timeout=10)
                # Keep the Response object separate from the decoded text so a
                # non-200 answer can never reach the HTML parser.
                if reply.status_code == 200:
                    html = reply.content.decode('utf8')
                    print('获取信息成功!!!')
                    print('break!!!')
                    break
            except Exception:
                # dl() is defined elsewhere in the file; presumably it
                # refreshes the proxy list -- TODO confirm.
                dl()

        if not html:
            print('第{}页请求失败,已跳过!!!'.format(page))
            time.sleep(2)
            continue

        tree = etree.HTML(html)
        table_rows = tree.xpath(
            '//table[@class="table table-bordered table-responsive"]/tbody/tr')
        for table_row in table_rows:
            # Columns 2-6 carry their value in the cell's title attribute.
            fields = {
                'permit_number': table_row.xpath('./td[2]/@title')[0],
                'company_name': table_row.xpath('./td[3]/@title')[0],
                'busi_type': table_row.xpath('./td[4]/@title')[0],
                'busi_range': table_row.xpath('./td[5]/@title')[0],
                'invalid_date': table_row.xpath('./td[6]/@title')[0],
            }
            stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            company_id = get_company_id(fields['company_name'])
            print(fields['permit_number'], fields['company_name'],
                  fields['busi_type'], fields['busi_range'],
                  fields['invalid_date'])
            if company_id:
                # company_id is only stored when it was resolved.
                fields['company_id'] = company_id
            staged.append(Medicine(gmt_created=stamp, gmt_updated=stamp,
                                   **fields))
        time.sleep(2)
    write_db(staged)
def parse(data):
    """Download the listed Excel files and stage one ``Medicine`` row per permit.

    Args:
        data: iterable of mappings; each is read with
            ``.get('download_file_url')`` and ``.get('file_name')``.

    The ``files`` folder under ``file_path`` is recreated, every file is
    downloaded into it, and then each workbook's first sheet is walked from
    its third row on.  Permit numbers already contained in the collection
    returned by ``get_updated()`` are skipped.  Every other permit is looked
    up on the search endpoint; when the search returns nothing a minimal
    record (company name, permit number, file name) is staged, otherwise the
    first hit's detail page is scraped into a full record.  Staged records are
    passed to ``write_db`` when there is at least one.

    Workbooks without a header cell ending in '称' and detail pages that
    cannot be fetched after ten attempts are reported and skipped.
    """
    bloom = get_updated()
    # NOTE(review): 'If-None-Natch' looks like a misspelling of
    # 'If-None-Match'; it is sent unchanged to keep the requests identical.
    plain_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        'If-None-Natch': '',
        'If-Modified-Since': ''
    }
    search_headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '332',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'asopSearchUserName=C0E6C927-A7FA-F8E8-197C-339630838038; lastAccessTime=1599630838038; JSESSIONID=5E36F6AD6962754A4EF96AE3E5BBE0BE; lastLoginTime=1599634000115',
        'Host': '202.106.121.52:8580',
        'Origin': 'http://202.106.121.52:8580',
        'Referer': 'http://202.106.121.52:8580/searchweb/query.jsp',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    post_url = "http://202.106.121.52:8580/searchweb/search"

    # Start from an empty folder.  The existence check has to come first:
    # shutil.rmtree() raises when the folder is not there yet.
    folder = os.path.join(file_path, 'files')
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.mkdir(folder)

    def local_path(entry):
        """Return where the workbook of *entry* is stored inside the folder."""
        suffix = 'xls' if entry.get('download_file_url').endswith('xls') else 'xlsx'
        return '{}/{}.{}'.format(folder, entry.get('file_name'), suffix)

    def fetch_detail(detail_url):
        """Return the decoded detail page, or None if all ten attempts fail."""
        for _ in range(10):
            try:
                reply = requests.request(
                    method='get',
                    url=detail_url,
                    headers=plain_headers,
                    proxies=proxys[-1],
                    timeout=10,
                )
                # Keep the Response object local so a non-200 answer can never
                # reach the HTML parser.
                if reply.status_code == 200:
                    page = reply.content.decode('utf-8')
                    print('获取信息成功!!!')
                    print('break!!!')
                    return page
            except Exception:
                # dl() is defined elsewhere in the file; presumably it
                # refreshes the proxy list -- TODO confirm.
                dl()
        return None

    staged = []

    # Phase 1: download every workbook.
    for entry in data:
        payload = requests.request(method='get',
                                   url=entry.get('download_file_url'),
                                   headers=plain_headers).content
        with open(local_path(entry), 'wb') as handle:
            handle.write(payload)

    # Phase 2: walk every workbook.
    for entry in data:
        name = entry.get('file_name')
        sheet = open_workbook(local_path(entry)).sheet_by_index(0)  # open the Excel file

        # The company-name column is the first header cell (second sheet row)
        # ending in '称'.  It is looked up afresh for every workbook so a
        # column index from a previous file is never reused.
        name_col = None
        for index, title in enumerate(sheet.row_values(1)):
            if isinstance(title, str) and title.endswith('称'):
                name_col = index
                break
        if name_col is None:
            print('{}未找到名称列,已跳过!!!'.format(name))
            continue

        for row_index in range(2, sheet.nrows):
            permit_number = sheet.cell(row_index, 0).value.strip()
            if '不予' in permit_number and '浙' in permit_number:
                # Strip the rejection remarks that share the cell with the number.
                permit_number = permit_number.replace('不予许可', '').strip()
                permit_number = permit_number.replace('不予受理', '').strip()
            if permit_number in bloom:
                print(permit_number, name)
                continue
            company_name = sheet.cell(row_index, name_col).value.strip()

            form = {
                'pageSize': '10',
                'pageNow': '1',
                'zjbh': permit_number,
                'sortType': '0',
                'searchType': '0',
                'titleFoldBegin': '-1',
                'titleFoldPage': '-1',
                'urls': 'zjca.miit.gov.cn/n477169/n477283/'  # key field
            }
            hits = json.loads(
                requests.post(post_url, data=form, headers=search_headers,
                              proxies=proxys[-1], timeout=10).text).get('array')
            if not hits:
                stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                print('公司数据目前查询不到!!!')
                print(company_name, permit_number, name)
                staged.append(Medicine(company_name=company_name,
                                       permit_number=permit_number,
                                       file_name=name,
                                       gmt_created=stamp,
                                       gmt_updated=stamp))
                continue

            # Only the first search hit is followed.
            detail_url = hits[0].get('url')
            page = fetch_detail(detail_url)
            if not page:
                print('{}详情页请求失败,已跳过!!!'.format(permit_number))
                continue

            tree = etree.HTML(page)
            for table in tree.xpath('//table[@class="table_biaoge"]/tbody'):
                # Values from the detail page replace the ones read from the sheet.
                permit_number = table.xpath('./tr[2]/td[2]/a/text()')[0].strip()
                company_name = table.xpath('./tr[3]/td[2]/a/text()')[0].strip()
                domain_info = table.xpath('./tr[4]/td[2]/a/text()')[0]
                if '$content' in domain_info:
                    # An unrendered template placeholder came back instead of data.
                    domain_info = '数据显示错误'
                business_type = table.xpath('./tr[5]/td[2]/a/text()')[0]
                customer_service_tel = table.xpath('./tr[6]/td[2]/a/text()')[0]
                if '$content' in customer_service_tel:
                    customer_service_tel = '数据显示错误'
                certificate_valid_date = table.xpath('./tr[7]/td[2]/a/text()')[0]
                certificate_invalid_date = table.xpath('./tr[8]/td[2]/a/text()')[0]
                stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                company_id = get_company_id(company_name)

                shown = (permit_number, company_name, domain_info,
                         business_type, customer_service_tel,
                         certificate_valid_date, certificate_invalid_date,
                         name, detail_url)
                fields = dict(
                    company_name=company_name,
                    permit_number=permit_number,
                    domain_info=domain_info,
                    business_type=business_type,
                    customer_service_tel=customer_service_tel,
                    certificate_valid_date=certificate_valid_date,
                    certificate_invalid_date=certificate_invalid_date,
                    file_name=name,
                    company_url=detail_url,
                    gmt_created=stamp,
                    gmt_updated=stamp)
                if company_id:
                    # company_id is only stored (and printed) when it was resolved.
                    shown += (company_id,)
                    fields['company_id'] = company_id
                print(shown)
                staged.append(Medicine(**fields))

    if not staged:
        print('本次无更新数据!!!')
    else:
        print('本地数据更新了{}条!!!'.format(len(staged)))
        write_db(staged)
def post_data(sum):
    """Look up each company name on the search endpoint and stage ``Medicine`` rows.

    Args:
        sum: iterable of company names to search for (the parameter name
            shadows the builtin but is kept for existing callers).

    For every name the search endpoint is queried.  When it returns nothing a
    minimal record holding just the company name is staged; otherwise the
    first hit's detail page is scraped into a full record.  ``company_id`` is
    resolved from the searched name and stored only when
    ``get_company_id`` returns a truthy value.  Staged records are passed to
    ``write_db`` when there is at least one.

    A detail page that cannot be fetched after twenty attempts is reported
    and skipped instead of crashing the HTML parser.
    """
    object_lists = []
    search_headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '332',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'asopSearchUserName=C0E6C927-A7FA-F8E8-197C-339630838038; lastAccessTime=1599630838038; JSESSIONID=5E36F6AD6962754A4EF96AE3E5BBE0BE; lastLoginTime=1599634000115',
        'Host': '202.106.121.52:8580',
        'Origin': 'http://202.106.121.52:8580',
        'Referer': 'http://202.106.121.52:8580/searchweb/query.jsp',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    # NOTE(review): 'If-None-Natch' looks like a misspelling of
    # 'If-None-Match'; it is sent unchanged to keep the requests identical.
    detail_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        'If-None-Natch': '',
        'If-Modified-Since': ''
    }
    post_url = "http://202.106.121.52:8580/searchweb/search"

    def fetch_detail(detail_url):
        """Return the decoded detail page, or None if all twenty attempts fail."""
        for _ in range(20):
            try:
                reply = requests.request(
                    method='get',
                    url=detail_url,
                    headers=detail_headers,
                    proxies=proxys[-1],
                    timeout=10,
                )
                # Keep the Response object local so a non-200 answer can never
                # reach the HTML parser.
                if reply.status_code == 200:
                    page = reply.content.decode('utf-8')
                    print('获取信息成功!!!')
                    print('break!!!')
                    return page
            except Exception:
                # dl() is defined elsewhere in the file; presumably it
                # refreshes the proxy list -- TODO confirm.
                dl()
        return None

    for searched_name in sum:
        form = {
            'fullText': searched_name,
            'pageSize': '10',
            'pageNow': '1',
            'sortType': '0',
            'searchType': '0',
            'titleFoldBegin': '-1',
            'titleFoldPage': '-1',
            'urls': 'zjca.miit.gov.cn/n477169/n477283/'  # key field
        }
        hits = json.loads(
            requests.post(post_url, data=form, headers=search_headers,
                          proxies=proxys[-1], timeout=10).text).get('array')
        company_id = get_company_id(searched_name)
        # company_id is only stored when it was resolved.
        id_field = {'company_id': company_id} if company_id else {}

        if not hits:
            stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print('公司数据目前查询不到!!!')
            object_lists.append(Medicine(company_name=searched_name,
                                         gmt_created=stamp,
                                         gmt_updated=stamp,
                                         **id_field))
            continue

        if len(hits) > 1:
            print('{}有多条信息'.format(searched_name))
        # Only the first search hit is followed.
        detail_url = hits[0].get('url')
        page = fetch_detail(detail_url)
        if not page:
            print('{}详情页请求失败,已跳过!!!'.format(searched_name))
            continue

        tree = etree.HTML(page)
        for table in tree.xpath('//table[@class="table_biaoge"]/tbody'):
            permit_number = table.xpath('./tr[2]/td[2]/a/text()')[0].strip()
            # The name shown on the detail page is what gets stored.
            company_name = table.xpath('./tr[3]/td[2]/a/text()')[0].strip()
            busi_web = table.xpath('./tr[4]/td[2]/a/text()')[0]
            if '$content' in busi_web:
                # An unrendered template placeholder came back instead of data.
                busi_web = '数据显示错误'
            busi_type = table.xpath('./tr[5]/td[2]/a/text()')[0]
            customer_service_tel = table.xpath('./tr[6]/td[2]/a/text()')[0]
            if '$content' in customer_service_tel:
                customer_service_tel = '数据显示错误'
            certificate_valid_date = table.xpath('./tr[7]/td[2]/a/text()')[0]
            certificate_invalid_date = table.xpath('./tr[8]/td[2]/a/text()')[0]
            stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print((permit_number, company_name, busi_web, busi_type,
                   customer_service_tel, certificate_valid_date,
                   certificate_invalid_date, company_id))
            object_lists.append(Medicine(
                company_name=company_name,
                permit_number=permit_number,
                busi_web=busi_web,
                busi_type=busi_type,
                customer_service_tel=customer_service_tel,
                certificate_valid_date=certificate_valid_date,
                certificate_invalid_date=certificate_invalid_date,
                gmt_created=stamp,
                gmt_updated=stamp,
                **id_field))

    if not object_lists:
        print('本次无更新数据!!!')
    else:
        print('本地数据更新了{}条!!!'.format(len(object_lists)))
        write_db(object_lists)
def _main_fetch_json(url, attempts=10):
    """GET *url* and return its JSON object, or None if every try fails.

    Each attempt uses the module-level ``headers``, the current
    ``proxys[-1]`` and a 15 second timeout.  ``dl()`` is called after any
    exception, including a body that is not valid JSON (presumably it
    refreshes the proxy list -- confirm against its definition).  A non-200
    reply is retried without calling ``dl()``.

    Args:
        url: Address to fetch.
        attempts: Maximum number of tries.

    Returns:
        The decoded JSON object as a ``dict``, or ``None`` when no attempt
        produced one.
    """
    for _ in range(attempts):
        try:
            reply = requests.request(method='get', url=url, headers=headers,
                                     proxies=proxys[-1], timeout=15)
            if reply.status_code == 200:
                payload = json.loads(reply.content.decode('utf8'))
                if isinstance(payload, dict):
                    return payload
        except Exception:
            # Retry boundary: network, decoding and JSON errors all lead to
            # dl() followed by another attempt.
            dl()
    return None


def main():
    """Crawl listing pages 1-14, build ``Medicine`` rows and store them.

    For every listed entry that has a ``ylqxEnp`` section, the entry's detail
    endpoint is fetched, fields missing from the listing are filled from the
    detail's ``ylqxEnpJson`` document, and a ``Medicine`` row is built.  When
    the entry's ``social_credit_code`` is already in the set returned by
    ``get_updated()``, the existing rows with that code are deleted from
    ``session`` and the new row is counted as an update; otherwise it counts
    as an insert.  All rows are finally passed to ``write_db``.

    Pages or entries whose data cannot be fetched are skipped with a message
    instead of aborting the run.

    Returns:
        None.
    """
    bloom = get_updated()
    records = []   # every row to be written
    old_data = []  # the subset of rows that replace existing ones

    for page in range(1, 15):
        url = "http://wlxsba.smda.sh.cn/openApi/getRecordPage?search=false&nd=1600149091958&rows=20&page={}&sidx=&sord=desc&totalrows=2000".format(
            page)
        listing = _main_fetch_json(url)
        if listing is None:
            print('第{}页列表获取失败,已跳过!!!'.format(page))
            continue

        for row in listing.get('rows') or []:
            detail = row.get('ylqxEnp')
            if not detail:
                continue

            internet_sold_type = '入驻类'

            record_num = detail.get('baZsbh')
            license_num = detail.get('licNo')
            # One number is used as-is; two are comma-joined; none gives
            # None (previously this case raised TypeError inside join).
            numbers = [n for n in (record_num, license_num) if n]
            if len(numbers) == 2:
                business_license_num = ",".join(numbers)
            else:
                business_license_num = numbers[0] if numbers else None

            record_id = row.get('riId')
            url = 'http://wlxsba.smda.sh.cn/openApi/getRecordDetailData?recordId={}&isOpen=1&_=1600151345630'.format(
                record_id)
            print(url)
            detail_payload = _main_fetch_json(url)
            if detail_payload is None:
                print('详情获取失败,已跳过:{}'.format(url))
                continue

            record_info = detail_payload.get('recordInfo') or {}
            # ylqxEnpJson is itself a JSON document stored as a string; the
            # raw string is also what gets saved as other_info.
            other_info = record_info.get('ylqxEnpJson')
            flag = json.loads(other_info) if other_info else {}

            # Listing value first, then the two alternative keys of the
            # embedded document.
            company_name = (detail.get('baQymc') or flag.get('baQymc')
                            or flag.get('enpName'))
            legal_people = (detail.get('baFddbr') or flag.get('baFddbr')
                            or flag.get('fddbr'))
            company_principal = (detail.get('baQyfzr') or flag.get('baQyfzr')
                                 or flag.get('qyfzr'))
            home_addr = (detail.get('baQyzcdz') or flag.get('baQyzcdz')
                         or flag.get('homeAddr'))
            business_addr = (detail.get('baJycsdz') or flag.get('baJycsdz')
                             or flag.get('registerAddr'))
            business_type = (detail.get('baJyfs') or flag.get('baJyfs')
                             or flag.get('jyfsStr'))
            business_range = (detail.get('baJyfwzw') or flag.get('baJyfwzw')
                              or flag.get('jyfw'))
            social_credit_code = detail.get('enpId') or flag.get('enpId')

            main_business = (
                detail_payload.get('boornetParameter') or {}).get('paraName')
            record_date = (record_info.get('completeTime')
                           or record_info.get('createtime'))

            plat_form_info = [
                {
                    'join_platform_name': entry.get('svPfName'),
                    'platform_license_num': entry.get('svRecodeNum'),
                    'plat_form_domain': entry.get('wsDomainName'),
                    'plat_form_shop_add': entry.get('wsShopName'),
                }
                for entry in detail_payload.get('enterRecordInfoList') or []
            ]

            company_id = get_company_id(company_name)

            summary = (record_id, company_name, legal_people,
                       company_principal, home_addr, business_addr,
                       business_type, record_date, internet_sold_type,
                       social_credit_code, business_license_num,
                       main_business, business_range, plat_form_info,
                       other_info)
            if company_id:
                summary += (company_id,)
            print(summary)

            times = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            fields = {
                'record_id': record_id,
                'company_name': company_name,
                'legal_people': legal_people,
                'company_principal': company_principal,
                'home_addr': home_addr,
                'business_addr': business_addr,
                'business_type': business_type,
                'record_date': record_date,
                'internet_sold_type': internet_sold_type,
                'social_credit_code': social_credit_code,
                'business_license_num': business_license_num,
                'main_business': main_business,
                'business_range': business_range,
                'plat_form_info': str(plat_form_info),
                'gmt_created': times,
                'gmt_updated': times,
                'other_info': other_info,
            }
            # company_id is only passed when truthy, as before.
            if company_id:
                fields['company_id'] = company_id

            already_stored = social_credit_code in bloom
            if already_stored:
                # Replace: drop the stored rows for this code first.
                stale_rows = session.query(Medicine).filter(
                    Medicine.social_credit_code == social_credit_code).all()
                for stale in stale_rows:
                    session.delete(stale)

            zhilian = Medicine(**fields)
            # Every row is queued for writing.  Previously a new row that had
            # a company_id was built but never appended.
            records.append(zhilian)
            if already_stored:
                old_data.append(zhilian)

            # NOTE(review): the original indentation was lost, so it is
            # unclear whether this pause ran per entry or per page; it is
            # applied per entry here, between detail requests -- confirm.
            time.sleep(1.5)

    if not records:
        print('本次无更新数据!!!')
    else:
        print('数据库更新数据:{}条,其中旧数据更新:{}条,新增数据:{}条!!!'.format(
            len(records), len(old_data), len(records) - len(old_data)))
        write_db(records)