def judgementnoinfo(jsons, varietyinfo):
    """Parse a variety-review JSON string and copy its fields into varietyinfo.

    Args:
        jsons: JSON text whose top level is a list; the first element carries
            the variety fields (VarietySource, VarietyCharacter, ...).
        varietyinfo: dict populated in place.

    Returns:
        varietyinfo, with the five text fields filled in. On any parse or
        lookup failure the error is printed/logged and every field is set
        to '' instead (same fallback as before, without the duplication).
    """
    # JSON key in the payload -> key used downstream (see to_db()).
    field_map = {
        'VarietySource': 'varietysource',              # 品种来源 / variety source
        'VarietyCharacter': 'varietycharacter',        # 品种特征 / characteristics
        'OutputExpression': 'outputexpression',        # 产量表现 / yield performance
        'PlantRequirment': 'plantrequirment',          # 栽培要求 / cultivation needs
        'JudgementSuggestion': 'judgementsuggestion',  # 审定意见 / review opinion
    }
    try:
        # 将字符串转化成json — decode the string; payload is a one-element list.
        record = json.loads(jsons)[0]
        for src, dst in field_map.items():
            varietyinfo[dst] = record[src]
    except Exception as e:  # malformed JSON, empty list, or missing key
        print(e)
        logger.warning(e)
        for dst in field_map.values():
            varietyinfo[dst] = ''
    return varietyinfo
def get_search(search_url, data):
    """Resolve a Baidu result link to its real target URL and store the page.

    Baidu result links either 302-redirect (Location header) or serve a 200
    page whose body contains a meta-refresh of the form URL='...'.

    Args:
        search_url: the Baidu redirect URL for one search hit.
        data: dict accumulating this hit's fields; 'search_url' is set here.

    Returns:
        (resolved_url, 1) when the target page was fetched and saved,
        (resolved_url, -1) on any failure.
    """
    response = requests.get(search_url, allow_redirects=False, headers=headers)
    try:
        if response.status_code == 200:
            # BUG FIX: the original searched the *encoded bytes* with a str
            # pattern (TypeError at runtime) and would have stored the Match
            # object rather than the captured URL. Search the text and take
            # group(1).
            match = re.search(r"URL='(.*?)'", response.text, re.S)
            if match:
                search_url = match.group(1)
        elif response.status_code == 302:
            search_url = response.headers.get('location')
        data['search_url'] = search_url
        # 将整个网页数据写入数据库,中间要加降噪算法处理,这里先放在这里
        # (store the whole page; a denoising step should be inserted later)
        # NOTE(review): this fixed fetch looks like leftover debug code, but
        # its status code currently gates data_save() below, so it is kept.
        response = requests.get('https://www.cmeii.com/xingdaoshulei/2418.html',
                                headers=baidu_headers,
                                allow_redirects=False,
                                timeout=1)
        print(chardet.detect(response.content))
        print(response.text.encode('UTF-8-SIG'))
        print('111')
        if response.status_code == 200:
            text = requests.get(search_url, headers=headers,
                                allow_redirects=False, timeout=1).text
            # 降噪,存储数据 — denoise and persist the fetched page
            data_save(data, text)
            return search_url, 1
        else:
            return search_url, -1
    except Exception as e:
        logger.warning(e)
        return search_url, -1
def get_city():
    """Fetch Boss Zhipin's city listing and map city name -> city code.

    Returns:
        dict of {name: code} covering every sub-level city, or None when the
        request or the JSON parse fails (the error is only logged).
    """
    # 获取json形式的编码 — the site exposes the codes as a JSON document.
    code_url = 'https://www.zhipin.com/common/data/city.json'
    try:
        payload = requests.get(code_url, headers=headers).json()
        return {
            sub['name']: sub['code']
            for city in payload['data']['cityList']
            for sub in city['subLevelModelList']
        }
    except Exception as e:
        logger.warning(e)
def parse(text):
    """Extract job-detail links from a Boss Zhipin list page and crawl each.

    Args:
        text: HTML of one search-result list page.

    Side effects:
        Fetches every job-detail page (sleeping 10.1 s between requests) and
        hands each detail page's HTML to parse_detail(). Errors are logged,
        not raised.
    """
    detail_url = 'https://www.zhipin.com'
    x_res = etree.HTML(text)
    try:
        job_items = x_res.xpath(
            '//*[@id="main"]/div/div[@class="job-list"]/ul/li')
    except Exception as e:
        # BUG FIX: the original left the result variable unbound on failure
        # and then crashed with NameError in the loop below; bail out here.
        logger.warning(e)
        return
    try:
        for item in job_items:
            # Long pause between detail fetches — presumably rate limiting;
            # TODO confirm the required interval.
            time.sleep(10.1)
            href = item.xpath('./div/div[@class="info-primary"]/h3/a/@href')[0]
            response = requests.get(detail_url + href, headers=headers)
            parse_detail(response.text)
    except Exception as e:
        logger.warning(e)
        print(e)
def to_db(varietyinfo):
    """Insert one variety record into t_crawl_seed.

    Args:
        varietyinfo: dict holding every column value. Note the 'grants'
            column is fed from varietyinfo['grant'] (singular key), matching
            the original mapping.

    Side effects:
        Builds the INSERT statement and executes it via insertTB(); failures
        are printed and logged, not re-raised.
    """
    columns = (
        "judgementno,cropID,varietyname,judgementregionID,judgementyear,"
        "applycompany,istransgenosis,varietyhaslincense,companyhaslincense,"
        "hasgrant,haspromotion,varietysource,varietycharacter,"
        "outputexpression,plantrequirment,judgementsuggestion,grants,promotion"
    )
    # Dict keys in column order (the last two differ from the column names).
    keys = ('judgementno', 'cropID', 'varietyname', 'judgementregionID',
            'judgementyear', 'applycompany', 'istransgenosis',
            'varietyhaslincense', 'companyhaslincense', 'hasgrant',
            'haspromotion', 'varietysource', 'varietycharacter',
            'outputexpression', 'plantrequirment', 'judgementsuggestion',
            'grant', 'promotion')
    # BUG FIX: values containing a single quote used to break the statement;
    # double them up — the same escaping the licence insert already applies.
    # NOTE(review): string-built SQL remains injection-prone; switch to a
    # parameterized API if insertTB() supports one.
    valuses = ",".join(
        "'%s'" % str(varietyinfo[k]).replace("'", "''") for k in keys)
    sql = "insert into t_crawl_seed(%s) values(%s);" % (columns, valuses)
    try:
        insertTB(sql)
    except Exception as e:
        print('写入数据库失败', e)
        logger.warning(e)
def get_baidu(search_data, page):
    """Scrape one Baidu results page and dispatch each hit to get_search().

    Args:
        search_data: the query string (wd= parameter).
        page: result offset (pn= parameter; Baidu uses multiples of 10).
    """
    print('aaa')
    start_url = 'https://www.baidu.com/s?wd={}&pn={}'.format(search_data, page)
    try:
        text = requests.get(start_url, headers=headers,
                            allow_redirects=False, timeout=1).text
    except requests.exceptions.RequestException as e:
        # BUG FIX: the original caught the builtin TimeoutError — which
        # requests does not raise — and then fell through to use the unbound
        # `text` (NameError). Catch the library's exception and bail out.
        logger.warning(e)
        return
    x_res = etree.HTML(text)
    # 存储爬取的数据,字典 — accumulates the fields saved for each result.
    data = {'search_data': search_data}
    # Organic results are numbered 1..10 via their id attribute.
    for i in range(1, 11):
        try:
            search_url = x_res.xpath('//*[@id="{}"]/h3/a/@href'.format(i))[0]
            title = x_res.xpath('//*[@id="{}"]/h3'.format(i))
            title = title[0].xpath(
                'string(.)').replace(' ', '').replace('\n', '')
            data['title'] = title
            get_search(search_url, data)
        except IndexError as e:
            # Fewer than 10 results on this page — stop scanning.
            print(e)
            break
def variety(jsons):
    """Parse seed-company licence records and store each in the database.

    Args:
        jsons: text representing a list of licence dicts (LicenceNo,
            ApplyCompanyName, ..., MainShow, DeputyShow).

    Side effects:
        Fetches the main/deputy certificate pages (0.3 s apart) and inserts
        one row per licence into t_crawl_company_licence. Records whose
        MainShow/DeputyShow is missing or not a string are skipped.
    """
    # SECURITY(review): eval() on crawled text can execute arbitrary code —
    # this should be json.loads(); kept as-is pending confirmation that the
    # upstream payload is strict JSON.
    jsons = eval(jsons)
    for record in jsons:
        licence_info = {}
        # 许可证号 / licence number
        licence_no = record['LicenceNo']
        # 公司名称 / company name
        apply_company_name = record['ApplyCompanyName']
        # 经营范围 / business scope
        production_manage_crops = record['ProductionManageCrops']
        # 发证机关 / issuing authority
        issuing_uthority_caption = record['IssuingAuthorityCaption']
        # 发证日期 / issue date
        publish_date = record['PublishDate']
        # 有效日期 / expiry date
        expire_date = record['ExpireDate']
        # 主证 / main certificate
        try:
            main_how = record['MainShow']
            # 主证id / main certificate id, embedded after 'id=' in the link
            main_id = main_how.split('id=')[-1]
            time.sleep(0.3)
            res = requests.get(
                'http://202.127.42.178:4000/SeedSearch/SeedSolution/Business/TempLicenseSelect.ashx',
                data={
                    'Type': 'SLImpLicence',
                    'LicenceID': main_id
                },
                headers=headers).json()
            main_info = re.findall(r"left: 48.5%;'\s?>(.*?)</span>",
                                   res['ResultData'])
        except AttributeError as e:  # MainShow is None/non-string
            logger.warning(e)
            continue
        # 副证 / deputy certificate
        try:
            deputy_show = record['DeputyShow']
            # 副证id / deputy certificate id
            deputy_id = deputy_show.split('id=')[-1]
            deputy_url = 'http://202.127.42.47:8016/TwoLicenceManage/MainLicence/TwoLincenceSubWordBigData.aspx?showall=1&id='
            url = deputy_url + deputy_id
            time.sleep(0.3)
            res = requests.get(url, headers=headers, data={
                'showall': '1',
                'id': deputy_id
            })
        except AttributeError as e:  # DeputyShow is None/non-string
            logger.warning(e)
            continue
        res_xpath = etree.HTML(res.text)
        text = res_xpath.xpath('/html/body/div[2]/div/div//text()')
        # 副证信息 / deputy certificate text, whitespace stripped.
        # BUG FIX: this inner loop used to reuse `i`, shadowing the outer
        # record variable; renamed for safety.
        deputy_info = []
        for fragment in text:
            fragment = fragment.replace(' ', '').replace(
                '\r', '').replace('\n', '')
            if fragment != '':
                deputy_info.append(fragment)
        # 写入数据库 / persist one row
        licence_info['licence_no'] = licence_no
        licence_info['apply_company_name'] = apply_company_name
        licence_info['production_manage_crops'] = production_manage_crops
        licence_info['issuing_uthority_caption'] = issuing_uthority_caption
        licence_info['publish_date'] = publish_date
        licence_info['expire_date'] = expire_date
        licence_info['main_info'] = main_info
        licence_info['deputy_info'] = deputy_info
        col_name = ("licence_no,apply_company_name,production_manage_crops,"
                    "issuing_uthority_caption,publish_date,expire_date,"
                    "main_info,deputy_info")
        valuses = "'%s','%s','%s','%s','%s','%s','%s','%s'" % \
            (licence_info['licence_no'],
             licence_info['apply_company_name'],
             licence_info['production_manage_crops'],
             licence_info['issuing_uthority_caption'],
             licence_info['publish_date'],
             licence_info['expire_date'],
             str(licence_info['main_info']).replace("'", "''"),
             str(licence_info['deputy_info']).replace("'", "''"))
        sql = "insert into t_crawl_company_licence(%s) values(%s);" % (
            col_name, valuses)
        try:
            insertTB(sql)
        except Exception as e:
            print('写入数据库失败', e)
            logger.warning(e)