def down_department_details(base, szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc,
                            time):
    """Scrape one department's detail page and persist its record.

    Fetches the detail page for department ``dwbh``, extracts the unit
    name, alternate name, leader count, rank, internal structure and main
    duties, parses the three staffing quotas (administrative 行政 /
    public-institution 事业 / logistics 工勤), downloads the person list
    for every quota row, and finally stores the department via
    ``save_department``.  On fetch or parse failure the department is
    recorded through ``get_department_err`` for a later retry and the
    function returns early.

    NOTE(review): a later function in this file re-defines this same name
    and therefore shadows this version — confirm which one should be live.
    """

    def parse_plan(cells):
        # Planned head-count sits in cells[1]; it may or may not be wrapped
        # in a <font> tag, and a blank / single-space value means zero.
        node = cells[1].font if cells[1].font is not None else cells[1]
        text = node.string.strip()
        return "0" if text in (" ", "") else text

    def parse_real_lone(cells):
        # Actual and borrowed-out head-counts are anchors in cells[3]; a
        # lone space means no data, a single anchor means no borrowed count.
        if cells[3].a.string.strip() == " ":
            return "0", "0"
        anchors = cells[3].find_all('a')
        real = anchors[0].string.strip()
        lone = anchors[1].string.strip() if len(anchors) > 1 else "0"
        return real, lone

    xz_plan_num = xz_real_num = xz_lone_num = '0'
    sy_plan_num = sy_real_num = sy_lone_num = '0'
    gq_plan_num = gq_real_num = gq_lone_num = '0'

    url = get_department_url(base, dwbh)
    try:
        response = requests.get(url, timeout=1000, headers=headers)
    except requests.exceptions.RequestException:
        # Network failure: record the department for a later retry.
        get_department_err(szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, base,
                           time)
        return
    response.encoding = 'utf-8'

    try:
        soup = BeautifulSoup(response.text, "html.parser").find(
            'div', style="width: 757; height: 582; background-color: #EFF8FF;"
        ).table.find_all('tr')[2].td.table
    except AttributeError:
        # Unexpected page layout: record the department for a later retry.
        get_department_err(szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, base,
                           time)
        return

    rows = soup.find_all('tr')

    # Unit name: either plain <span> text or nested <span><b><font> markup.
    name_span = rows[0].find_all('td')[1].span
    if name_span.string is not None:
        dwmc = name_span.string.strip()
    elif name_span.b.font.string is not None:
        dwmc = name_span.b.font.string.strip()
    else:
        dwmc = ''
    if dwmc == '':
        department_text('编号:' + dwbh + '-->不存在!')
        return

    # Alternate unit name; the placeholder "无" (none) counts as empty.
    qtmc_cell = rows[1].find_all('td')[1]
    qtmc = qtmc_cell.string.strip() if qtmc_cell.string is not None else ''
    if qtmc == "无":
        qtmc = ''

    # Leader count.
    ldzs_span = rows[2].find_all('td')[1].span
    ldzs = ldzs_span.string.strip() if ldzs_span.string is not None else ''

    # Unit rank: plain <span> text, or nested <b><font> markup when styled.
    jb_span = rows[2].find_all('td')[3].span
    if jb_span.string is None:
        jb = ''
    else:
        try:
            jb_font_string = jb_span.b.font.string
        except AttributeError:
            jb = jb_span.string.strip()
        else:
            jb = jb_font_string.strip() if jb_font_string is not None else ''

    # Internal structure; a stray quote artefact counts as empty.
    nsjg_node = soup.find_all(id="lblNeiSheJG")[0]
    nsjg = nsjg_node.string.strip() if nsjg_node.string is not None else ''
    if nsjg == "\'":
        nsjg = ''

    # Main duties.  On most pages this field is lazy-loaded and invisible
    # to a plain HTTP fetch (a headless browser would be required), so
    # fall back to an empty string when it is missing.
    zyzz_node = soup.find_all(id="lblMainDuty")[0]
    zyzz = zyzz_node.string.strip() if zyzz_node.string is not None else ''
    if zyzz == "\'":
        zyzz = ''

    quota_table = rows[4].td.div.table
    if quota_table is None:
        department_text(dwzd + ':' + dwbh + '-' + dwmc + '-' + '--->无编制人员!')
        return

    for row in quota_table.find_all('tr'):
        cells = row.find_all('td')
        label = cells[0].string.strip()
        if label.find("行政编制数") != -1:
            xz_plan_num = parse_plan(cells)
            xz_real_num, xz_lone_num = parse_real_lone(cells)
        elif label.find("事业编制数") != -1:
            sy_plan_num = parse_plan(cells)
            sy_real_num, sy_lone_num = parse_real_lone(cells)
        elif label.find("工勤编制数") != -1:
            gq_plan_num = parse_plan(cells)
            gq_real_num, gq_lone_num = parse_real_lone(cells)
        # Every row — recognised or not — links to its person list; the
        # link's BZLX query parameter selects the staffing category.
        lx = re.search(r'BZLX=.+?$', cells[3].a['href']).group(0)
        bzlx = lx[5:]
        down_person_list(base, szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, bzlx)

    save_department(
        get_department(szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, qtmc, ldzs,
                       jb, nsjg, zyzz, xz_plan_num, xz_real_num, xz_lone_num,
                       sy_plan_num, sy_real_num, sy_lone_num, gq_plan_num,
                       gq_real_num, gq_lone_num, url, time))
def down_department_details(base, szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc,
                            time):
    """Scrape one department's detail page and persist its record.

    Variant that locates the staffing quotas through element ids
    (``LabelXZ``/``RealXZ`` etc.) rather than by walking table rows.
    Fetches the detail page for department ``dwbh``, extracts the unit
    name, alternate name, leader count, rank, internal structure and main
    duties, parses the three staffing quotas (administrative 行政 /
    public-institution 事业 / logistics 工勤), downloads each category's
    person list, and stores the department via ``save_department``.  On
    fetch or parse failure the department is recorded through
    ``get_department_err`` for a later retry and the function returns.

    Bug fixed: the 事业 and 工勤 branches previously split the *xz* text
    (``xz.split("(")``) instead of their own ``sy`` / ``gq`` text, so
    those actual/borrowed counts were wrong whenever the value contained
    a full-width parenthesis.
    """

    def parse_quota(label_id, real_id):
        """Return (plan, real, lone) for one staffing category and fetch
        its person list when an actual-count link is present."""
        label_nodes = soup.find_all(id=label_id)
        plan = label_nodes[0].get_text() if label_nodes else "0"
        real_nodes = soup.find_all(id=real_id)
        if not real_nodes:
            return plan, "0", "0"
        text = real_nodes[0].get_text()
        # "N(其中…M)" carries the borrowed-out count in parentheses.
        if '(' in text:
            real = text.split("(")[0]
            lone = text.split("(")[1][3:-2]
        else:
            real = text
            lone = "0"
        # The link's BZLX query parameter selects the staffing category.
        lx = re.search(r'BZLX=.+?$', real_nodes[0].a['href']).group(0)
        down_person_list(base, szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc,
                         lx[5:])
        return plan, real, lone

    url = get_department_url(base, dwbh)
    try:
        response = requests.get(url, timeout=1000, headers=headers)
    except requests.exceptions.RequestException:
        # Network failure: record the department for a later retry.
        get_department_err(szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, base,
                           time)
        return
    response.encoding = 'utf-8'

    try:
        soup = BeautifulSoup(
            response.text, "html.parser").div.table.find_all('tr')[2].td.table
    except AttributeError:
        # Unexpected page layout: record the department for a later retry.
        get_department_err(szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, base,
                           time)
        return

    rows = soup.find_all('tr')

    # Unit name: either plain <span> text or nested <span><b><font> markup.
    name_span = rows[0].find_all('td')[1].span
    if name_span.string is not None:
        dwmc = name_span.string.strip()
    elif name_span.b.font.string is not None:
        dwmc = name_span.b.font.string.strip()
    else:
        dwmc = ''
    if dwmc == '':
        department_text('编号:' + dwbh + '-->不存在!')
        return

    # Alternate unit name; the placeholder "无" (none) counts as empty.
    qtmc_cell = rows[1].find_all('td')[1]
    qtmc = qtmc_cell.string.strip() if qtmc_cell.string is not None else ''
    if qtmc == "无":
        qtmc = ''

    # Leader count.
    ldzs_cell = rows[2].find_all('td')[1]
    ldzs = ldzs_cell.string.strip() if ldzs_cell.string is not None else ''

    # Unit rank: plain <span> text, or nested <b><font> markup when styled.
    jb_span = rows[2].find_all('td')[3].span
    if jb_span.string is None:
        jb = ''
    else:
        try:
            jb_font_string = jb_span.b.font.string
        except AttributeError:
            jb = jb_span.string.strip()
        else:
            jb = jb_font_string.strip() if jb_font_string is not None else ''

    # Internal structure; a stray quote artefact counts as empty.
    nsjg_node = soup.find_all(id="lblNeiSheJG")[0]
    nsjg = nsjg_node.string.strip() if nsjg_node.string is not None else ''
    if nsjg == "\'":
        nsjg = ''

    # Main duties.  On most pages this field is lazy-loaded and invisible
    # to a plain HTTP fetch (a headless browser would be required), so
    # fall back to an empty string when it is missing.
    zyzz_node = soup.find_all(id="lblMainDuty")[0]
    zyzz = zyzz_node.string.strip() if zyzz_node.string is not None else ''
    if zyzz == "\'":
        zyzz = ''

    xz_plan_num, xz_real_num, xz_lone_num = parse_quota("LabelXZ", "RealXZ")
    sy_plan_num, sy_real_num, sy_lone_num = parse_quota("LabelSY", "RealSY")
    gq_plan_num, gq_real_num, gq_lone_num = parse_quota("LabelGQ", "RealGQ")

    save_department(
        get_department(szcs, dwzd, dwlb, dwlx, sjdw, dwbh, dwmc, qtmc, ldzs,
                       jb, nsjg, zyzz, xz_plan_num, xz_real_num, xz_lone_num,
                       sy_plan_num, sy_real_num, sy_lone_num, gq_plan_num,
                       gq_real_num, gq_lone_num, url, time))