def get_ctitle(html_str, href, file_ad): bsObj = beautiful(html_str, "html.parser") #获取正文标题 try: ctitle = bsObj.find('h1', {'id': 'con_title'}).text except: try: ctitle = bsObj.find('span', {'class': 'titleFont'}).text except: ctitle = None # 获取附件信息,并下载 file_infos = bsObj.find_all( "a", {"href": re.compile(r'.doc$|.docx$|.pdf$|.xls$|.xlsx$')}) file_names = [] for each in file_infos: file_href = each['href'] file_adds = file_href.split('.')[-1] file_name = each.text if re.findall(file_adds, file_name): pass else: file_name = file_name + '.' + file_adds if re.findall('http', file_href): pass elif re.findall('/*/', file_href): file_href = 'http://www.gdstc.gov.cn' + file_href else: href_add = href.replace(href.split('/')[-1], '') file_href = href_add + file_href # print(file_href, file_name) file_loc = file_ad + file_name download_file(file_href, file_loc) file_names.append(file_name) return ctitle, file_names
def run():
    """Prompt for a gallery URL, fetch it, and hand it to the matching downloader."""
    target = input("desire:\n")
    # url = 'https://www.deviantart.com/monorirogue/gallery/'
    page = manipulate.get_url(target)
    parsed = beautiful(page.text, "html.parser")
    # Pick the site-specific downloader and run it on the parsed page.
    downloader = judger.detect_morpher(target)
    downloader.download(parsed)
def get_ctitle(html_str):  # 无: marked unused
    """Extract the article title from a miit.gov.cn page and download its attachments.

    Args:
        html_str: raw HTML of the article page.

    Returns:
        (ctitle, file_names): title text (or None) and downloaded attachment names.

    NOTE(review): this variant references ``file_ad`` which is not a parameter —
    it relies on a module-level ``file_ad`` or raises NameError; confirm before use.
    """
    bsObj = beautiful(html_str, "html.parser")
    # Article title: h1#con_title layout first, then span.titleFont, else None.
    try:
        ctitle = bsObj.find('h1', {'id': 'con_title'}).text
    except:
        try:
            ctitle = bsObj.find('span', {'class': 'titleFont'}).text
        except:
            ctitle = None
    # Attachment links (NOTE(review): unescaped dots in the pattern).
    file_infos = bsObj.find_all(
        "a", {"href": re.compile(r'.doc$|.docx$|.pdf$|.xls$|.xlsx$')})
    # href / link-text are pulled back out of the serialized tag with regexes.
    f1 = re.compile('href="(.*?)"')
    f2 = re.compile('">(.*?)</a>')
    file_names = []
    for each in file_infos:
        file_href = re.findall(f1, str(each))[0]
        file_name = re.findall(f2, str(each))[0]
        if file_name == '':
            continue
        # Absolute URLs pass through; relative ones are rooted at the miit host.
        if re.findall('http', file_href):
            pass
        else:
            file_href = 'http://www.miit.gov.cn/' + file_href.split('../')[-1]
        file_loc = file_ad + file_name
        download_file(file_href, file_loc)
        file_names.append(file_name)
    return ctitle, file_names
def zcjd_url(row, worksheet, url, href_bloom, file_ad, ProgramStarttime):
    """Scrape one MIIT (工信部) policy-interpretation list page.

    For every <li><a> entry: build the full URL, parse the publish date, and —
    unless the URL is already in ``href_bloom`` — either download it directly
    (link points straight at a document) or save the page HTML plus its
    attachments; every item is recorded via insertFile and save_excel.

    Args:
        row: next free worksheet row.
        worksheet: target excel worksheet.
        url: list-page URL.
        href_bloom: bloom-filter-like set of already-seen URLs (supports update()).
        file_ad: local directory prefix for downloads.
        ProgramStarttime: run timestamp passed through to the DB/excel rows.

    Returns:
        (row, chref_list): next free row and the URLs seen on this page.
    """
    source1 = '工信部'
    source2 = '政策解读'
    source3 = '政策解读'
    print("网站:", source1 + ' ' + source2 + ' ' + url)
    chref_list = []
    # Random delay to avoid hammering the server.
    time.sleep(random.randint(10, 20))
    # Detect the page encoding before decoding.
    reqt = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(reqt).read()
    chardit1 = chardet.detect(response)
    chardit = chardit1['encoding']
    print("编码格式" + chardit)
    # Decode and parse the list page.
    req = response.decode(chardit, 'ignore')
    soup = beautiful(req, 'lxml')
    lis = soup.find_all('li')
    for li in lis:
        a = li.find('a')
        href = a['href']
        # Relative links use '../'; root everything at the miit host.
        complete_href = 'http://www.miit.gov.cn/' + href.split('../')[-1]
        title = a.text
        span = li.find('span')
        date = span.text
        # Normalize the many date formats (2020.1.2 / 2020年1月2日 / 2020/1/2) to Y-m-d.
        date = date.replace('.', '-').replace('年', '-').replace('月', '-').replace('日', '').replace('/', '-')
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            # Already collected in a previous run.
            continue
        elif re.search(r'.doc$|.docx$|.pdf$|.xls$|.xlsx$', complete_href):
            # The list entry links directly to a document: download it as-is.
            print("正在采集:", complete_href)
            href_adds = complete_href.split('.')[-1]
            title = title + '.' + href_adds
            # '/' is illegal in file names; replace with '或'.
            title = title.replace('/', '或')
            html_name = file_ad + title
            download_file(complete_href, html_name)
            file_names = []
            img_names = []
            css_names = []
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names, css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1, source2, source3,
                       date, ProgramStarttime, complete_href, file_names, img_names, css_names, file_ad)
        else:
            # Regular article: fetch the page, save it, and pull attachments.
            print("正在采集:", complete_href)
            html, chard, html_str = getHtml_quiet(complete_href)
            html_name = saveHtml(title, html, file_ad)
            ctitle, file_names, img_names, css_names = get_ctitle(html_str, complete_href, file_ad)
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names, css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, ctitle, html_name, source1, source2, source3,
                       date, ProgramStarttime, complete_href, file_names, img_names, css_names, file_ad)
        row = row + 1
    return row, chref_list
def tztg_url(row, worksheet, url, href_bloom):
    """Scrape one MOST (国家科技部) notice list page and record each article.

    Args:
        row: next free worksheet row.
        worksheet: target excel worksheet.
        url: list-page URL.
        href_bloom: set-like container of already-seen URLs.

    Returns:
        (row, chref_list): next free row and the URLs seen on this page.
    """
    source1 = '国家科技部'
    source2 = '通知通告'
    print("栏目:", source2)
    chref_list = []
    # NOTE(review): the page is fetched twice — once with requests and once
    # with urllib just to sniff the encoding; one fetch would suffice.
    req = requests.get(url)
    response = urllib.request.urlopen(url).read()
    chardit1 = chardet.detect(response)
    chardit = chardit1['encoding']
    print("编码格式" + chardit)
    req.encoding = chardit1['encoding']
    soup = beautiful(req.text, 'lxml')
    item_list = soup.find_all('td', {'class': 'STYLE30'})
    for item in item_list:
        # href / title / "(date)" are regexed out of the serialized cell.
        href = re.findall('href="(.*?)"', str(item))[0]
        title = re.findall('target="_blank">(.*?)</a>', str(item))[0]
        date = re.findall('</a>\((.*?)\)', str(item))[0]
        # Resolve relative links against the site root or the list URL.
        if '../' in href:
            complete_href = 'http://www.most.gov.cn/' + href.replace('../', '')
        elif './' in href:
            complete_href = url + href.replace('./', '')
        else:
            complete_href = href
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            print("该页面已存在")
            continue
        else:
            # Fetch the article, save its HTML, then download attachments.
            html, chard, html_str = getHtml_quiet(complete_href)
            html_name = saveHtml(title, html)
            file_names = get_file(html_str, complete_href)
            save_excel(worksheet, row, title, title, html_name, source1, source2,
                       date, complete_href, file_names)
            row = row + 1
    return row, chref_list
def get_file(html_str, href):
    """Download every document attachment linked from a MOST article page.

    Args:
        html_str: raw HTML of the article page.
        href: URL of the article page, used to resolve relative links.

    Returns:
        list[str]: downloaded attachment names, de-duplicated, original order.

    NOTE(review): relies on a module-level ``file_ad`` directory prefix.
    """
    bsObj = beautiful(html_str, "html.parser")
    # Attachment links: dots escaped so only real extensions match
    # (the previous pattern '.doc$' matched any character before "doc").
    file_infos = bsObj.find_all(
        "a", {"href": re.compile(r'\.doc$|\.docx$|\.pdf$|\.xls$|\.xlsx$')})
    file_names = []
    for each in file_infos:
        file_href = each['href']
        file_adds = file_href.split('.')[-1]  # extension from the link target
        file_name = each.text
        # Page URL with the last path component removed, for relative links.
        href_add = href.replace(href.split('/')[-1], '')
        if file_name == '':
            continue
        # Append the extension only when the link text doesn't already contain it.
        if not re.findall(file_adds, file_name):
            file_name = file_name + '.' + file_adds
        # Resolve the attachment URL against the several layouts MOST uses.
        if re.findall('http', file_href):
            newfile_href = file_href
        elif '/u/' in file_href:
            newfile_href = 'http://service.most.gov.cn' + file_href
        elif re.findall('/.*?/', file_href):
            newfile_href = 'http://www.most.gov.cn/' + file_href.replace('../', '')
        elif './' in file_href:
            newfile_href = href_add + file_href.replace('./', '')
        else:
            newfile_href = file_href
        # '/' is illegal in file names; replace with '或'.
        file_name = file_name.replace('/', '或')
        # De-collide with files already on disk by appending '~' before the
        # extension. (Fixed: str.rstrip strips a character SET, so the old
        # rstrip('.' + ext) also ate trailing name characters like 'c'/'o'/'d'.)
        suffix = '.' + file_adds
        while file_name in os.listdir(file_ad):
            stem = file_name[:-len(suffix)] if file_name.endswith(suffix) else file_name
            file_name = stem + '~' + suffix
        file_loc = file_ad + file_name
        download_file(newfile_href, file_loc)
        file_names.append(file_name)
    # De-duplicate while preserving first-seen order.
    file_diff = sorted(set(file_names), key=file_names.index)
    return file_diff
def kjzcdt_url(row, worksheet, url, href_bloom):
    """Scrape one MOST (国家科技部) science-policy-news list page.

    Args:
        row: next free worksheet row.
        worksheet: target excel worksheet.
        url: list-page URL.
        href_bloom: set-like container of already-seen URLs.

    Returns:
        (row, chref_list): next free row and the URLs seen on this page.
    """
    source1 = '国家科技部'
    source2 = '科技政策动态'
    print("栏目:", source2)
    chref_list = []
    # NOTE(review): double fetch — requests for the body, urllib only to sniff
    # the encoding.
    req = requests.get(url)
    response = urllib.request.urlopen(url).read()
    chardit1 = chardet.detect(response)
    chardit = chardit1['encoding']
    print("编码格式" + chardit)
    req.encoding = chardit1['encoding']
    soup = beautiful(req.text, 'lxml')
    item_list = soup.find_all('a', {'target': '_blank'})
    for i in range(len(item_list)):
        item = item_list[i]
        href = item['href']
        # Link text has the form "title(date)"; split the two parts.
        title = item.text.split('(')[0]
        date = item.text.split('(')[-1][:-1]
        # Resolve relative links against the site root / section path.
        if '../' in href:
            complete_href = 'http://www.most.gov.cn/' + href.replace('../', '')
        elif './' in href:
            complete_href = 'http://www.most.gov.cn/kjzc/kjzcgzdt/' + href.replace(
                './', '')
        else:
            complete_href = href
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            print("该页面已存在")
            continue
        else:
            # Fetch the article, save its HTML, then download attachments.
            html, chard, html_str = getHtml_quiet(complete_href)
            html_name = saveHtml(title, html)
            file_names = get_file(html_str, complete_href)
            save_excel(worksheet, row, title, title, html_name, source1, source2,
                       date, complete_href, file_names)
            row = row + 1
    return row, chref_list
def xxgk_url(row, worksheet, url, href_bloom):
    """Scrape one MOST (国家科技部) government-information-disclosure list page.

    Args:
        row: next free worksheet row.
        worksheet: target excel worksheet.
        url: list-page URL.
        href_bloom: set-like container of already-seen URLs.

    Returns:
        (row, chref_list): next free row and the URLs seen on this page.
    """
    source1 = '国家科技部'
    source2 = '政府信息公开'
    print("栏目:", source2)
    chref_list = []
    # NOTE(review): double fetch — requests for the body, urllib only to sniff
    # the encoding.
    req = requests.get(url)
    response = urllib.request.urlopen(url).read()
    chardit1 = chardet.detect(response)
    chardit = chardit1['encoding']
    print("编码格式" + chardit)
    req.encoding = chardit1['encoding']
    soup = beautiful(req.text, 'lxml')
    item_list = soup.find_all('a', {'class': 'STYLE30'})
    # Publish dates live in separate table cells; regex them out of the raw
    # text and pair by index (assumes both lists align — confirm).
    date_list = re.findall('<B>发布日期:</B> (.*?)</td>', req.text)
    for i in range(len(item_list)):
        item = item_list[i]
        href = item['href']
        title = item.text
        date = date_list[i]
        # Resolve relative links against the section path.
        if '../' in href:
            complete_href = 'http://www.most.gov.cn/mostinfo/xinxifenlei/' + href.replace(
                '../', '')
        else:
            complete_href = href
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            print("该页面已存在")
            continue
        else:
            # Fetch the article, save its HTML, then download attachments.
            html, chard, html_str = getHtml_quiet(complete_href)
            html_name = saveHtml(title, html)
            file_names = get_file(html_str, complete_href)
            save_excel(worksheet, row, title, title, html_name, source1, source2,
                       date, complete_href, file_names)
            row = row + 1
    return row, chref_list
def get_soup(url, skill_dict):
    """Fetch ``url`` and return it parsed as BeautifulSoup, or None on failure.

    Two sentinel URLs short-circuit the fetch: the dice.com browse page prints
    the accumulated results and exits the program; the simplyhired roots return
    None immediately. On any network/parse error the partial ``skill_dict`` is
    dumped to a file and None is returned.

    Args:
        url: page to fetch.
        skill_dict: accumulated results, persisted on error/exit.

    Returns:
        BeautifulSoup of the page body, or None.
    """
    soup = None
    if url == 'http://dice.com/jobs/browsejobs':
        print_and_log(make_data_frame(skill_dict))
        sys.exit()
    # Fixed: the original tested substring membership in one concatenated
    # string ('url in "http://simplyhired.comhttps://www.simplyhired"'), so any
    # fragment such as 'http' matched; tuple membership checks the two
    # intended sentinel URLs exactly.
    elif url in ('http://simplyhired.com', 'https://www.simplyhired'):
        return soup
    else:
        print_and_log(f'Getting raw html from: {url}')
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:63.0) Gecko/20100101 Firefox/63.0'
        session = requests.Session()
        session.headers.update({'User-Agent': user_agent})
        try:
            response = session.get(url)
            body = response.text
            soup = beautiful(body, 'html.parser')
            print_and_log('Got raw html')
        except urllib3.exceptions.NewConnectionError as e:
            print_and_log(e, 'error')
            write_file(
                skill_dict,
                title='new_connection_error_encountered_captured_results')
        except socket.gaierror as s:
            print_and_log(s, 'error')
            write_file(skill_dict, title='socket_error_encountered_captured_results')
        except socket.error as e:
            print_and_log(e, 'error')
            write_file(skill_dict, title='socket_error_encountered_captured_results')
        except Exception as e:
            print_and_log(e, 'error')
            write_file(skill_dict, title='exception_encountered_captured_results')
        # Catches KeyboardInterrupt/SystemExit during the fetch so partial
        # results are still written before the program dies.
        except BaseException as b:
            print_and_log(b, 'error')
            write_file(skill_dict, title='exception_encountered_captured_results')
    return soup
def get_ctitle(html_str,href,file_ad): bsObj = beautiful(html_str, "html.parser") #获取正文标题 try: ctitle = bsObj.find('h1', {'id': 'con_title'}).text except: try: ctitle = bsObj.find('span', {'class': 'titleFont'}).text except: ctitle = None # 获取附件信息,并下载 file_infos = bsObj.find_all("a", {"href": re.compile(r'.doc$|.docx$|.pdf$|.xls$|.xlsx$')}) file_names = [] for each in file_infos: file_href = each['href'] file_adds = file_href.split('.')[-1] file_name = each.text href_add = href.replace(href.split('/')[-1], '') # print(file_href) if file_name =='': continue if re.findall(file_adds, file_name): pass else: file_name = file_name + '.' + file_adds if re.findall('http', file_href): newfile_href = file_href # print('1:', newfile_href) elif re.findall('/.*?/', file_href): newfile_href = 'http://www.miit.gov.cn/' + file_href.replace('../', '') # print('3:', newfile_href) elif './' in file_href: newfile_href = href_add + file_href.replace('./', '') # print('4:', newfile_href) else: newfile_href = file_href # print('5:', newfile_href) file_name = file_name.replace('/','或') while file_name in os.listdir(file_ad): file_name = file_name.rstrip('.'+file_adds)+'~.'+file_adds file_loc = file_ad + file_name try: download_file(newfile_href, file_loc) except Exception as e: print("下载附件出现问题:", e) continue file_names.append(file_name) file_diff = sorted(set(file_names), key=file_names.index) # # 获取图片信息,并下载 # img_infos = bsObj.find_all("img", {"src": re.compile(r'.jpg$|.png$')}) # img_names = [] # for each in img_infos: # img_href = each['src'] # # 附件后缀 # img_adds = img_href.split('.')[-1] # img_name = img_href.split('/')[-1] # if re.findall(img_adds, img_name): # pass # else: # img_name = img_name + '.' 
+ img_adds # if re.findall('http', img_href): # pass # elif re.findall('/.*/', img_href): # img_href = 'http://www.miit.gov.cn/' + img_href.replace('../','') # else: # href_add = href.replace(href.split('/')[-1], '') # img_href = href_add + img_href[2:] # print(img_href) # img_loc = file_ad + img_name # try: # download_file(img_href, img_loc) # except Exception as e: # print("下载图片出现问题:", e) # continue # img_names.append(img_name) # 获取css文件信息,并下载 css_infos = bsObj.find_all("link", {"type": "text/css", "href": re.compile(r'.css$')}) css_names = [] for each in css_infos: css_href = each['href'] if '../' in css_href: css_href = '/'+css_href.replace('../','') # 附件后缀 css_adds = css_href.split('.')[-1] css_name = css_href.replace('..', '').replace('/', '_') if re.findall(css_adds, css_name): pass else: css_name = css_name + '.' + css_adds if re.findall('http', css_href): pass elif re.findall('/.*/', css_href): css_href = 'http://www.miit.gov.cn' + css_href else: href_add = href.replace(href.split('/')[-1], '') css_href = href_add + css_href css_loc = file_ad + css_name try: download_file(css_href, css_loc) except Exception as e: print("下载css文件出现问题:", e) continue css_names.append(css_name) img_names = [] return ctitle,file_diff, img_names, css_names
def xxgk_url(row, worksheet, url, href_bloom, file_ad, ProgramStarttime):
    """Scrape one MOHURD (住建部) information-disclosure notice list page.

    For each table row: extract the full URL, full title (from the mouseover
    attribute), display title and publish date; skip URLs in ``href_bloom``;
    download direct document links as-is, otherwise save the page HTML plus
    attachments. Everything is recorded via insertFile and save_excel.

    Returns:
        (row, chref_list): next free worksheet row and the URLs seen.
    """
    source1 = '住建部'
    source2 = '信息公开'
    source3 = '通知公告'
    print("网站:", source1 + ' ' + source2 + ' ' + url)
    chref_list = []
    # Random delay to avoid hammering the server.
    time.sleep(random.randint(10, 20))
    # Detect the page encoding before decoding.
    reqt = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(reqt).read()
    chardit1 = chardet.detect(response)
    chardit = chardit1['encoding']
    print("编码格式" + chardit)
    req = response.decode(chardit, 'ignore')
    soup = beautiful(req, 'lxml')
    # Rows alternate between two CSS classes; collect both.
    item_list = soup.find_all('tr', {'class': 'item'})
    alitem_list = soup.find_all('tr', {'class': 'alitem'})
    al_list = item_list + alitem_list
    for al in al_list:
        al = str(al).replace('\n', '').replace('\r', '')
        # Full URL, full title, display title and publish date via regex on
        # the serialized row.
        complete_href = re.findall('<a href="(.*?)" message=', al)[0]
        ctitle = re.findall('&&(.*?)" onmousemove=', al)[0].replace('&', '')
        title = re.findall('target="_blank">(.*?)</a>', al)[0]
        date = re.findall('<td>(.*?)</td>', al)[0]
        # Normalize the date formats to Y-m-d.
        date = date.replace('.', '-').replace('年', '-').replace('月', '-').replace('日', '').replace('/', '-')
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            # Already collected in a previous run.
            continue
        elif re.search(r'.doc$|.docx$|.pdf$|.xls$|.xlsx$', complete_href):
            # Direct document link: download it as-is, using the full title.
            print("正在采集:", complete_href)
            href_adds = complete_href.split('.')[-1]
            title = ctitle + '.' + href_adds
            # '/' is illegal in file names; replace with '或'.
            title = title.replace('/', '或')
            html_name = file_ad + title
            download_file(complete_href, html_name)
            file_names = []
            img_names = []
            css_names = []
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names, css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1, source2, source3,
                       date, ProgramStarttime, complete_href, file_names, img_names, css_names, file_ad)
        else:
            # Regular article: fetch the page, save it, and pull attachments.
            print("正在采集:", complete_href)
            html, chard, html_str = getHtml_quiet(complete_href)
            html_name = saveHtml(title, html, file_ad)
            file_names, img_names, css_names = get_file(html_str, complete_href, file_ad)
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names, css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, ctitle, html_name, source1, source2, source3,
                       date, ProgramStarttime, complete_href, file_names, img_names, css_names, file_ad)
        row = row + 1
    return row, chref_list
def wjfb_url(row, worksheet, url, href_bloom, file_ad, ProgramStarttime):
    """Scrape one MOHURD (住建部) policy-publication list page.

    Title cells and date cells are separate <td> lists paired by index
    (assumes both lists align — confirm). Direct document links are
    downloaded as-is; regular articles are saved with their attachments.
    Everything is recorded via insertFile and save_excel.

    Returns:
        (row, chref_list): next free worksheet row and the URLs seen.
    """
    source1 = '住建部'
    source2 = '政策发布'
    source3 = '政策法规'
    print("网站:", source1 + ' ' + source2 + ' ' + url)
    chref_list = []
    # Random delay to avoid hammering the server.
    time.sleep(random.randint(10, 20))
    # Detect the page encoding before decoding.
    reqt = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(reqt).read()
    chardit1 = chardet.detect(response)
    chardit = chardit1['encoding']
    print("编码格式" + chardit)
    req = response.decode(chardit, 'ignore')
    soup = beautiful(req, 'lxml')
    # Titles and dates live in different cells, matched by inline style.
    item_list = soup.find_all('td', {'style': 'text-align:left;'})
    date_list = soup.find_all('td', {'style': 'width:86px;text-align:left; color:#ABABAB;'})
    for i in range(len(item_list)):
        item = str(item_list[i])
        # Full link, title and "[date]" via regex on the serialized cells.
        complete_href = re.findall('<a href="(.*?)"', item)[0]
        title = re.findall('>(.*?)</a>', item)[0]
        date = re.findall('>\[(.*?)\]</td>', str(date_list[i]))[0]
        # Normalize the date formats to Y-m-d.
        date = date.replace('.', '-').replace('年', '-').replace('月', '-').replace('日', '').replace('/', '-')
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            # Already collected in a previous run.
            continue
        elif re.search(r'.doc$|.docx$|.pdf$|.xls$|.xlsx$', complete_href):
            # Direct document link: download it as-is.
            print("正在采集:", complete_href)
            href_adds = complete_href.split('.')[-1]
            title = title + '.' + href_adds
            # '/' is illegal in file names; replace with '或'.
            title = title.replace('/', '或')
            html_name = file_ad + title
            download_file(complete_href, html_name)
            file_names = []
            img_names = []
            css_names = []
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names, css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1, source2, source3,
                       date, ProgramStarttime, complete_href, file_names, img_names, css_names, file_ad)
        else:
            # Regular article: fetch the page, save it, and pull attachments.
            print("正在采集:", complete_href)
            html, chard, html_str = getHtml_quiet(complete_href)
            html_name = saveHtml(title, html, file_ad)
            file_names, img_names, css_names = get_file(html_str, complete_href, file_ad)
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names, css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1, source2, source3,
                       date, ProgramStarttime, complete_href, file_names, img_names, css_names, file_ad)
        row = row + 1
    return row, chref_list
except HTTPError e; return none; try: bsObj = Beautifulsoupe(html.read()) title= bsObj. except AttributeError as e: return Title; title = getTitle("http:http://www.codecademy.com/exercise/dif.html") for sublime mathe in client ("tr: in statle->init line func(){@abaction(linea nerat->bower junk->linear) Obj.review'struction_@linea ninks {{link.respose().rewrite(consetent.txt/css;)}} from urllib.request import urlopen from urllib.error import url-beastudul .soupe html=urlopen(http://www/codecademy.com/index-innit.@line-height) bsObj=beautiful(html) try: # find.arr_available_languages # either:behaviror line link s ninishow line showlineshow # lineshowlinebehaiveiro slineshhow lineshowlinebehaiveiro rither: init line ->inisshow line(){ other inicialize show line ():{ benift()->get Title(iniline_ink){ try: if(init line->show line int show ){ return bsObj.beautiful(init linw); init show line shown link shiw show linesho; uner chinese waashiw shiline shil: int inishaolize the law shi ow the init shulie ;
def tztg_url(source1, source2, source3, row, worksheet, url, href_bloom,
             file_ad, ProgramStarttime):
    """Scrape one gdcic.gov.cn notice list page with headless Chrome.

    The page is rendered via Selenium (it needs JS), scrolled to force lazy
    content, then parsed. Direct document links are downloaded as-is; regular
    articles are saved with their attachments. Everything is recorded via
    insertFile and save_excel.

    Args:
        source1/source2/source3: site / column / sub-column labels.
        row: next free worksheet row.
        worksheet: target excel worksheet.
        url: list-page URL.
        href_bloom: set-like container of already-seen URLs.
        file_ad: local directory prefix for downloads.
        ProgramStarttime: run timestamp passed through to the DB/excel rows.

    Returns:
        (row, chref_list): next free row and the URLs seen on this page.
    """
    print("栏目:", source2)
    chref_list = []
    # Random delay to avoid hammering the server.
    time.sleep(random.randint(10, 20))
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run without a display
    options.add_argument('--disable-gpu')
    options.add_argument("window-size=1024,768")
    options.add_argument("--no-sandbox")
    driver = webdriver.Chrome('/home/260199/chrome/chromedriver',
                              chrome_options=options)
    # Fixed: the driver was never quit, leaking one headless Chrome process
    # per call; try/finally guarantees cleanup even if the page load fails.
    try:
        driver.maximize_window()
        driver.get(url)
        # Scroll to the bottom so lazily-loaded entries render.
        js = "var q=document.documentElement.scrollTop=10000"
        driver.execute_script(js)
        time.sleep(5)
        html_str = driver.page_source
    finally:
        driver.quit()
    soup = beautiful(html_str, 'lxml')
    ul = soup.find('ul', {'class': 'news-group-list onlyTitle'})
    # List entries: date, link, title per <li>.
    href_list = ul.find_all('li', {'class': None})
    for h in href_list:
        date = h.find_all('div', {'class': 'news-group-date'})
        date = re.findall('>(.*?) ', str(date[0]))[0]
        # Normalize the date formats to Y-m-d.
        date = date.replace('.', '-').replace('年', '-').replace('月', '-').replace(
            '日', '').replace('/', '-')
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
        href_re = h.a['href']
        complete_href = 'http://www.gdcic.gov.cn' + href_re
        down_href = 'http://www.gdcic.gov.cn'
        title = h.a.text
        print(complete_href, title, date)
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            # Already collected in a previous run.
            continue
        # The list entry links directly to a document file.
        elif re.search(r'.doc$|.docx$|.pdf$|.xls$|.xlsx$', complete_href):
            print("正在采集:", complete_href)
            href_adds = complete_href.split('.')[-1]
            title = title + '.' + href_adds
            # '/' is illegal in file names; replace with '或'.
            title = title.replace('/', '或')
            html_name = file_ad + title
            download_file(complete_href, html_name)
            file_names = []
            img_names = []
            css_names = []
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names,
                       css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1,
                       source2, source3, date, ProgramStarttime, complete_href,
                       file_names, img_names, css_names, file_ad)
            row = row + 1
        else:
            # Regular article: fetch the page, save it, and pull attachments.
            print("正在采集:", complete_href)
            html, chard, html_str = getHtml_quiet(complete_href)
            html_name = saveHtml(title, html, file_ad)
            file_names, img_names, css_names = get_file(
                html_str, complete_href, file_ad)
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names,
                       css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1,
                       source2, source3, date, ProgramStarttime, complete_href,
                       file_names, img_names, css_names, file_ad)
            row = row + 1
    return row, chref_list
def get_file(html_str, href, file_ad):
    """Download attachments, images and CSS linked from a gdcic.gov.cn article page.

    Args:
        html_str: raw HTML of the article page.
        href: URL of the article page, used to resolve relative links.
        file_ad: local directory prefix where downloads are written.

    Returns:
        (file_diff, img_names, css_names): de-duplicated attachment names,
        downloaded image names, and downloaded CSS file names.
    """
    bsObj = beautiful(html_str, "html.parser")
    # Attachment links (NOTE(review): unescaped dots in the pattern — matches
    # any char before the extension).
    file_infos = bsObj.find_all(
        "a", {"href": re.compile(r'.doc$|.docx$|.pdf$|.xls$|.xlsx$')})
    file_names = []
    for each in file_infos:
        file_href = each['href']
        file_adds = file_href.split('.')[-1]  # extension from the link target
        file_name = each.text
        # Append the extension only when the link text doesn't already contain it.
        if re.findall(file_adds, file_name):
            pass
        else:
            file_name = file_name + '.' + file_adds
        # Resolve the attachment URL: absolute passes through; rooted paths get
        # the gdcic host; otherwise resolve against the page URL (dropping the
        # leading './').
        if re.findall('http', file_href):
            pass
        elif re.findall('/.*/', file_href):
            file_href = 'http://www.gdcic.gov.cn' + file_href
        else:
            href_add = href.replace(href.split('/')[-1], '')
            file_href = href_add + file_href[2:]
        print(file_href, file_name)
        # '/' is illegal in file names; replace with '或'.
        file_name = file_name.replace('/', '或')
        # De-collide with existing files by appending '~' before the extension.
        # NOTE(review): rstrip strips a character SET and can over-trim names
        # ending in the extension's characters — confirm intended.
        while file_name in os.listdir(file_ad):
            file_name = file_name.rstrip('.' + file_adds) + '~.' + file_adds
        file_loc = file_ad + file_name
        try:
            download_file(file_href, file_loc)
        except Exception as e:
            print("下载附件出现问题:", e)
            continue
        file_names.append(file_name)
    # De-duplicate while preserving first-seen order.
    file_diff = sorted(set(file_names), key=file_names.index)
    # Download inline images so the saved page renders offline.
    img_infos = bsObj.find_all("img", {"src": re.compile(r'.jpg$|.png$')})
    img_names = []
    for each in img_infos:
        img_href = each['src'].replace('\\', '/')
        img_adds = img_href.split('.')[-1]
        img_name = img_href.split('/')[-1]
        if re.findall(img_adds, img_name):
            pass
        else:
            img_name = img_name + '.' + img_adds
        # Resolve the image URL the same way as attachments.
        if re.findall('http', img_href):
            pass
        elif re.findall('/.*/', img_href):
            img_href = 'http://www.gdcic.gov.cn' + img_href
        else:
            href_add = href.replace(href.split('/')[-1], '')
            img_href = href_add + img_href[2:]
        print(img_href)
        img_loc = file_ad + img_name
        try:
            download_file(img_href, img_loc)
        except Exception as e:
            print("下载图片出现问题:", e)
            continue
        img_names.append(img_name)
    # Download linked stylesheets.
    css_infos = bsObj.find_all("link", {
        "type": "text/css",
        "href": re.compile(r'.css$')
    })
    css_names = []
    for each in css_infos:
        css_href = each['href']
        css_adds = css_href.split('.')[-1]
        # Flatten the path into a file name.
        css_name = css_href.replace('..', '').replace('/', '_')
        if re.findall(css_adds, css_name):
            pass
        else:
            css_name = css_name + '.' + css_adds
        # Resolve the CSS URL.
        if re.findall('http', css_href):
            pass
        elif re.findall('/.*/', css_href):
            css_href = 'http://www.gdcic.gov.cn/' + css_href.replace('../', '')
        else:
            href_add = href.replace(href.split('/')[-1], '')
            css_href = href_add + css_href
        print(css_href)
        css_loc = file_ad + css_name
        try:
            download_file(css_href, css_loc)
        except Exception as e:
            print("下载css文件出现问题:", e)
            continue
        css_names.append(css_name)
    return file_diff, img_names, css_names
def str_html(scan):
    """Return the HTML held in a StringIO-like buffer, pretty-printed."""
    raw = scan.getvalue()
    parsed = beautiful(raw, 'html.parser')
    return parsed.prettify()
def kjzcdt_url(row, worksheet, url, href_bloom, file_ad, ProgramStarttime):
    """Scrape one MOST (科技部) science-policy work-news list page.

    Link text has the form "title(date)". Direct document links are downloaded
    as-is; regular articles are saved with their attachments. Everything is
    recorded via insertFile and save_excel.

    Returns:
        (row, chref_list): next free worksheet row and the URLs seen.
    """
    source1 = '科技部'
    source2 = '科技政策动态'
    source3 = '工作动态'
    print("网站:", source1 + ' ' + source2 + ' ' + url)
    chref_list = []
    # Random delay to avoid hammering the server.
    time.sleep(random.randint(10, 20))
    # Detect the page encoding before decoding.
    reqt = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(reqt).read()
    chardit1 = chardet.detect(response)
    chardit = chardit1['encoding']
    print("编码格式" + chardit)
    req = response.decode(chardit, 'ignore')
    soup = beautiful(req, 'lxml')
    item_list = soup.find_all('a', {'target': '_blank'})
    for i in range(len(item_list)):
        item = item_list[i]
        href = item['href']
        # Split "title(date)" into its two parts.
        title = item.text.split('(')[0]
        date = item.text.split('(')[-1][:-1]
        # Normalize the date formats to Y-m-d.
        date = date.replace('.', '-').replace('年', '-').replace('月', '-').replace(
            '日', '').replace('/', '-')
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
        # Resolve relative links against the site root / section path.
        if '../' in href:
            complete_href = 'http://www.most.gov.cn/' + href.replace('../', '')
        elif './' in href:
            complete_href = 'http://www.most.gov.cn/kjzc/kjzcgzdt/' + href.replace(
                './', '')
        else:
            complete_href = href
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            # Already collected in a previous run.
            continue
        elif re.search(r'.doc$|.docx$|.pdf$|.xls$|.xlsx$', complete_href):
            # Direct document link: download it as-is.
            print("正在采集:", complete_href)
            href_adds = complete_href.split('.')[-1]
            title = title + '.' + href_adds
            # '/' is illegal in file names; replace with '或'.
            title = title.replace('/', '或')
            html_name = file_ad + title
            download_file(complete_href, html_name)
            file_names = []
            img_names = []
            css_names = []
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names,
                       css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1, source2,
                       source3, date, ProgramStarttime, complete_href, file_names,
                       img_names, css_names, file_ad)
        else:
            # Regular article: fetch the page, save it, and pull attachments.
            print("正在采集:", complete_href)
            html, chard, html_str = getHtml_quiet(complete_href)
            html_name = saveHtml(title, html, file_ad)
            file_names, img_names, css_names = get_file(
                html_str, complete_href, file_ad)
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names,
                       css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1, source2,
                       source3, date, ProgramStarttime, complete_href, file_names,
                       img_names, css_names, file_ad)
        row = row + 1
    return row, chref_list
def tztg_url(source1, source2, source3, row, worksheet, url, href_bloom,
             file_ad, ProgramStarttime):
    """Scrape one gdcom.gov.cn notice list page (ul.comlist3 layout).

    Entries are regexed out of the serialized <ul> as
    (href, title-attr, link-text, date) tuples. Direct document links are
    downloaded as-is; regular articles are saved with their attachments.
    Everything is recorded via insertFile and save_excel.

    Returns:
        (row, chref_list): next free worksheet row and the URLs seen.
    """
    print("栏目:", source2)
    chref_list = []
    # Random delay to avoid hammering the server.
    time.sleep(random.randint(15, 30))
    # Detect the page encoding before decoding.
    reqt = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(reqt).read()
    chardit1 = chardet.detect(response)
    chardit = chardit1['encoding']
    print("编码格式" + chardit)
    req = response.decode(chardit, 'ignore')
    soup = beautiful(req, 'lxml')
    ul = soup.find('ul', {'class': 'comlist3'})
    # Each tuple: (href, title attribute, link text, date).
    href_list = re.findall(
        '<a href="(.*?)" title="(.*?)">(.*?)</a><span>(.*?) </span>', str(ul))
    for i in range(len(href_list)):
        date = href_list[i][-1]
        # Normalize the date formats to Y-m-d.
        date = date.replace('.', '-').replace('年', '-').replace('月', '-').replace('日', '').replace('/', '-')
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
        href_re = href_list[i][0]
        # Resolve relative links: '../../' roots at the site host; '../' strips
        # the last two path components of the list URL; './' strips one.
        if '../../' in href_re:
            complete_href = 'http://www.gdcom.gov.cn/' + href_re.replace('../', '')
        elif '../' in href_re:
            complete_href = url.replace(url.split('/')[-1], '')[:-1].replace(url.split('/')[-2], '') + href_re.replace('../', '')
        elif './' in href_re:
            complete_href = url.replace(url.split('/')[-1], '') + href_re.replace('./', '')
        else:
            complete_href = href_re
        down_href = 'http://www.gdcom.gov.cn'
        title = href_list[i][-2]
        print(complete_href, title, date)
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            # Already collected in a previous run.
            continue
        # The list entry links directly to a document file.
        elif re.search(r'.doc$|.docx$|.pdf$|.xls$|.xlsx$', complete_href):
            print("正在采集:", complete_href)
            href_adds = complete_href.split('.')[-1]
            title = title + '.' + href_adds
            # '/' is illegal in file names; replace with '或'.
            title = title.replace('/', '或')
            html_name = file_ad + title
            download_file(complete_href, html_name)
            file_names = []
            img_names = []
            css_names = []
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names, css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1, source2, source3,
                       date, ProgramStarttime, complete_href, file_names, img_names, css_names, file_ad)
            row = row + 1
        else:
            # Regular article: fetch the page, save it, and pull attachments.
            print("正在采集:", complete_href)
            html, chard, html_str = getHtml_quiet(complete_href)
            html_name = saveHtml(title, html, file_ad)
            file_names, img_names, css_names = get_file(html_str, complete_href, file_ad)
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names, css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1, source2, source3,
                       date, ProgramStarttime, complete_href, file_names, img_names, css_names, file_ad)
            row = row + 1
    return row, chref_list
def tztg_url(row, worksheet, url, source1, source2, source3, href_bloom,
             file_ad, ProgramStarttime):
    """Scrape one notice list page laid out as a table.ZIT.

    Full titles and dates come from the ZITI cells' title attribute
    ("<full title> (<date>)"); the display title is the full title truncated
    to 20 chars. Direct document links are downloaded as-is; regular articles
    are saved with their attachments. Everything is recorded via insertFile
    and save_excel.

    Returns:
        (row, chref_list): next free worksheet row and the URLs seen.
    """
    print("网站:", source1 + ' ' + source2 + ' ' + url)
    chref_list = []
    # Random delay to avoid hammering the server.
    time.sleep(random.randint(10, 20))
    # Detect the page encoding before decoding.
    reqt = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(reqt).read()
    chardit1 = chardet.detect(response)
    chardit = chardit1['encoding']
    print("编码格式" + chardit)
    req = response.decode(chardit, 'ignore')
    soup = beautiful(req, 'lxml')
    tab = soup.find('table', class_='ZIT')
    href_list = re.findall('<a href="(.*?)">', str(tab))
    # NOTE(review): the parens around the date are unescaped, so they form
    # nested regex groups; each match is a (title, date, date) tuple —
    # [0] is the full title, [-1] the date. Confirm this matches the markup.
    date_list = re.findall('<td class="ZITI" title="(.*?) ((.*?))">', str(tab))
    for i in range(len(href_list)):
        date = date_list[i][-1]
        # Normalize the date formats to Y-m-d.
        date = date.replace('.', '-').replace('年', '-').replace('月', '-').replace(
            '日', '').replace('/', '-')
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
        # Links starting with '.' are relative to the list URL.
        if href_list[i][0] == '.':
            complete_href = url + href_list[i][1:]
        else:
            complete_href = href_list[i]
        ctitle = date_list[i][0]
        # Display title: full title truncated to 20 chars.
        title = ctitle[:20] + '...'
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            # Already collected in a previous run.
            continue
        elif re.search(r'.doc$|.docx$|.pdf$|.xls$|.xlsx$', complete_href):
            # Direct document link: download it as-is, named by the full title.
            print("正在采集:", complete_href)
            href_adds = complete_href.split('.')[-1]
            title = ctitle + '.' + href_adds
            # '/' is illegal in file names; replace with '或'.
            title = title.replace('/', '或')
            html_name = file_ad + title
            download_file(complete_href, html_name)
            file_names = []
            img_names = []
            css_names = []
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names,
                       css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1, source2,
                       source3, date, ProgramStarttime, complete_href, file_names,
                       img_names, css_names, file_ad)
        else:
            # Regular article: fetch the page, save it, and pull attachments.
            print("正在采集:", complete_href)
            html, chard, html_str = getHtml_quiet(complete_href)
            html_name = saveHtml(title, html, file_ad)
            file_names, img_names, css_names = get_file(
                html_str, complete_href, file_ad)
            insertFile(source1, source2, source3, ctitle, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names,
                       css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, ctitle, html_name, source1, source2,
                       source3, date, ProgramStarttime, complete_href, file_names,
                       img_names, css_names, file_ad)
        row = row + 1
    return row, chref_list
def get_ctitle(html_str, href, file_ad):
    """Parse a mof.gov.cn detail page and download its attachments, images and CSS.

    (Marked "unused" in the original source. NOTE(review): the body title
    ``ctitle`` is extracted below but never returned — callers elsewhere in this
    file expect a 4-tuple from a same-named function; this variant returns 3
    values. Left as-is to preserve the interface.)

    Args:
        html_str: raw HTML of the detail page.
        href: URL of the detail page, used to resolve relative asset links.
        file_ad: local directory prefix for downloaded files.

    Returns:
        (file_diff, img_names, css_names) — de-duplicated attachment names
        (original order preserved), image names, and CSS file names.
    """
    bsObj = beautiful(html_str, "html.parser")
    # Extract the body title; fall back through the two known markup variants.
    try:
        ctitle = bsObj.find('h1', {'id': 'con_title'}).text
    except Exception:  # was bare except; .find() returning None raises here
        try:
            ctitle = bsObj.find('span', {'class': 'titleFont'}).text
        except Exception:
            ctitle = None
    # --- Attachments ------------------------------------------------------
    # Fixed: dots escaped so '.doc$' no longer matches e.g. "xdoc".
    file_infos = bsObj.find_all(
        "a", {"href": re.compile(r'\.doc$|\.docx$|\.pdf$|\.xls$|\.xlsx$')})
    f1 = re.compile('href="(.*?)"')
    f2 = re.compile('">(.*?)</a>')
    file_names = []
    for each in file_infos:
        file_href = re.findall(f1, str(each))[0]
        file_name = re.findall(f2, str(each))[0]
        if file_name == '':
            continue  # anchor without visible text — nothing to name the file
        if re.findall('http', file_href):
            pass  # already absolute
        else:
            # Resolve '../'-style links against the site root.
            file_href = 'http://www.mof.gov.cn/' + file_href.split('../')[-1]
        file_loc = file_ad + file_name
        try:
            download_file(file_href, file_loc)
        except Exception as e:
            print("下载附件出现问题:", e)
            continue
        file_names.append(file_name)
    # De-duplicate while keeping first-seen order.
    file_diff = sorted(set(file_names), key=file_names.index)
    # --- Images -----------------------------------------------------------
    img_infos = bsObj.find_all("img", {"src": re.compile(r'\.jpg$|\.png$')})
    img_names = []
    for each in img_infos:
        img_href = each['src']
        img_adds = img_href.split('.')[-1]  # file extension
        img_name = img_href.split('/')[-1]
        # Append the extension only if the name does not already contain it.
        if re.findall(img_adds, img_name):
            pass
        else:
            img_name = img_name + '.' + img_adds
        if re.findall('http', img_href):
            pass  # already absolute
        elif re.findall('/.*/', img_href):
            img_href = 'http://www.mof.gov.cn' + img_href
        else:
            # './'-style relative link: resolve against the page directory.
            href_add = href.replace(href.split('/')[-1], '')
            img_href = href_add + img_href[2:]
        print(img_href)
        img_loc = file_ad + img_name
        try:
            download_file(img_href, img_loc)
        except Exception as e:
            print("下载图片出现问题:", e)
            continue
        img_names.append(img_name)
    # --- CSS files --------------------------------------------------------
    css_infos = bsObj.find_all("link", {
        "type": "text/css",
        "href": re.compile(r'\.css$')
    })
    css_names = []
    for each in css_infos:
        css_href = each['href'].replace('../', '')
        css_adds = css_href.split('.')[-1]  # file extension
        # Flatten the path into a file name.
        css_name = css_href.replace('..', '').replace('/', '_')
        if re.findall(css_adds, css_name):
            pass
        else:
            css_name = css_name + '.' + css_adds
        if re.findall('http', css_href):
            pass  # already absolute
        elif re.findall('/.*/', css_href):
            css_href = 'http://www.mof.gov.cn/' + css_href
        else:
            href_add = href.replace(href.split('/')[-1], '')
            css_href = href_add + css_href
        css_loc = file_ad + css_name
        try:
            download_file(css_href, css_loc)
        except Exception as e:
            print("下载css文件出现问题:", e)
            continue
        css_names.append(css_name)
    return file_diff, img_names, css_names
def tztg_url(source1, source2, source3, row, worksheet, url, href_bloom,
             file_ad, ProgramStarttime):
    """Crawl one zhdpb.gov.cn notice-list page (div.list > li) and archive entries.

    For each <li>: parse the date from its <span> and the link/title from its
    <a>; skip URLs already in ``href_bloom``; download direct document links,
    otherwise save the detail page and its assets. Every processed entry is
    inserted into the database and written to the excel worksheet.

    Args:
        source1/source2/source3: source labels stored with each record.
        row: next free excel row index; the updated index is returned.
        worksheet: open excel worksheet passed through to ``save_excel``.
        url: listing-page URL to crawl.
        href_bloom: bloom filter of already-collected URLs (mutated in place).
        file_ad: local directory prefix for downloaded files.
        ProgramStarttime: crawl-run timestamp stored with each record.

    Returns:
        (row, chref_list) — the advanced excel row index and all child URLs seen.
    """
    print("栏目:", source2)
    chref_list = []
    # Fetch the page and detect its encoding before decoding.
    time.sleep(random.randint(15, 30))  # polite random delay between requests
    reqt = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(reqt).read()
    chardit1 = chardet.detect(response)
    chardit = chardit1['encoding']
    print("编码格式" + chardit)
    req = response.decode(chardit, 'ignore')
    soup = beautiful(req, 'html.parser')
    ul = soup.find('div', {'class': 'list'})
    # Each <li> holds date, link and title.
    href_list = ul.find_all('li')
    for h in href_list:
        # Normalize assorted date formats to ISO before parsing.
        date = h.find('span').text.strip(' ')
        date = date.replace('.', '-').replace('年', '-').replace('月', '-').replace(
            '日', '').replace('/', '-')
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
        href_re = h.a['href']
        # Resolve the different relative-link shapes seen on this site.
        if '../../' in href_re:
            complete_href = 'http://www.zhdpb.gov.cn/' + href_re.replace(
                '../', '')
        elif '../' in href_re:
            complete_href = url.replace(url.split('/')[-1], '')[:-1].replace(
                url.split('/')[-2], '') + href_re.replace('../', '')
        elif './' in href_re:
            complete_href = url.replace(url.split('/')[-1],
                                        '') + href_re.replace('./', '')
        else:
            complete_href = href_re
        title = h.a.text
        print(complete_href, title, date)
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            continue  # already collected on a previous run
        # Fixed: dots escaped — the original '.doc$' pattern matched any
        # character before "doc". The list entry itself may be a document.
        elif re.search(r'\.doc$|\.docx$|\.pdf$|\.xls$|\.xlsx$', complete_href):
            print("正在采集:", complete_href)
            href_adds = complete_href.split('.')[-1]
            title = title + '.' + href_adds
            title = title.replace('/', '或')  # '/' is illegal in file names
            html_name = file_ad + title
            download_file(complete_href, html_name)
            file_names = []
            img_names = []
            css_names = []
            # Record in the database.
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names,
                       css_names, file_ad)
            href_bloom.update([complete_href])
            # Record in the excel sheet.
            save_excel(worksheet, row, title, title, html_name, source1,
                       source2, source3, date, ProgramStarttime, complete_href,
                       file_names, img_names, css_names, file_ad)
            row = row + 1
        else:
            # Ordinary detail page: save the HTML and harvest its assets.
            print("正在采集:", complete_href)
            try:
                html, chard, html_str = getHtml_quiet(complete_href)
            except Exception as e:
                print('该页错误:', complete_href, e)
                continue  # best-effort: skip pages that fail to load
            html_name = saveHtml(title, html, file_ad)
            file_names, img_names, css_names = get_file(
                html_str, complete_href, file_ad)
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names,
                       css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1,
                       source2, source3, date, ProgramStarttime, complete_href,
                       file_names, img_names, css_names, file_ad)
            row = row + 1
    return row, chref_list
def wjgs_url(row, worksheet, url, href_bloom, file_ad, ProgramStarttime):
    """Crawl the MIIT "文件公示" listing page and archive each notice.

    Parses each <li> with a fixed regex (link, title, child URL, date), skips
    URLs already in ``href_bloom``, downloads direct document links, and for
    ordinary pages renders them with ``getHtml_move`` (dynamic page), saves the
    HTML, and collects attachments via ``get_ctitle``. Each processed entry is
    inserted into the database and written to the excel worksheet.

    Args:
        row: next free excel row index; the updated index is returned.
        worksheet: open excel worksheet passed through to ``save_excel``.
        url: listing-page URL to crawl.
        href_bloom: bloom filter of already-collected URLs (mutated in place).
        file_ad: local directory prefix for downloaded files.
        ProgramStarttime: crawl-run timestamp stored with each record.

    Returns:
        (row, chref_list) — the advanced excel row index and all child URLs seen.
    """
    source1 = '工信部'
    source2 = '文件公示'
    source3 = '通知公告'
    print("网站:", source1 + ' ' + source2 + ' ' + url)
    chref_list = []
    # Fetch the page and detect its encoding before decoding.
    time.sleep(random.randint(10, 20))  # polite random delay between requests
    reqt = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(reqt).read()
    chardit1 = chardet.detect(response)
    chardit = chardit1['encoding']
    print("编码格式" + chardit)
    req = response.decode(chardit, 'ignore')
    soup = beautiful(req, 'lxml')
    top = soup.find_all('td', valign='top')[1]
    li_list = top.find_all('li')
    # Hoisted out of the loop: the pattern is loop-invariant.
    a = re.compile(
        '<a href="(.*?)" target="_blank">(.*?)</a><span><a href="../../(.*?)" target="_blank">(.*?)</a></span></li>'
    )
    for li in li_list:
        # Collapse the item to one line so the regex can match across it.
        li = str(li).replace('\n', '').replace('\r', '')
        info_list = re.findall(a, li)[0]
        # Title shown on the listing page.
        title = info_list[1]
        # Child-page URL (relative, rooted at the site).
        href = info_list[2]
        complete_href = 'http://www.miit.gov.cn/' + href
        # Normalize assorted date formats to ISO before parsing.
        date = info_list[-1]
        date = date.replace('.', '-').replace('年', '-').replace(
            '月', '-').replace('日', '').replace('/', '-')
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            continue  # already collected on a previous run
        # Fixed: dots escaped — the original '.doc$' pattern matched any
        # character before "doc".
        elif re.search(r'\.doc$|\.docx$|\.pdf$|\.xls$|\.xlsx$', complete_href):
            # The list entry itself is a document: download it directly.
            print("正在采集:", complete_href)
            href_adds = complete_href.split('.')[-1]
            title = title + '.' + href_adds
            title = title.replace('/', '或')  # '/' is illegal in file names
            html_name = file_ad + title
            download_file(complete_href, html_name)
            file_names = []
            img_names = []
            css_names = []
            # Record in the database.
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names,
                       css_names, file_ad)
            href_bloom.update([complete_href])
            # Record in the excel sheet.
            save_excel(worksheet, row, title, title, html_name, source1,
                       source2, source3, date, ProgramStarttime, complete_href,
                       file_names, img_names, css_names, file_ad)
        else:
            # Dynamic detail page: render, save, and harvest assets.
            print("正在采集:", complete_href)
            html, html_str = getHtml_move(complete_href)
            html_name = saveHtml(title, html, file_ad)
            # Full title plus attachments collected from the detail page.
            ctitle, file_names, img_names, css_names = get_ctitle(
                html_str, complete_href, file_ad)
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names,
                       css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, ctitle, html_name, source1,
                       source2, source3, date, ProgramStarttime, complete_href,
                       file_names, img_names, css_names, file_ad)
        # Advance once per processed item so the document branch does not
        # overwrite the same excel row (the original only advanced after the
        # else branch).
        row = row + 1
    return row, chref_list
def tztg_url(row, worksheet, url, href_bloom, file_ad, ProgramStarttime):
    """Crawl the MOST "通知通告" listing page and archive each notice.

    Each td.STYLE30 item yields a link, a title, and a "(date)" suffix. URLs
    already in ``href_bloom`` are skipped; direct document links are downloaded;
    ordinary pages are fetched statically, saved as HTML, and their assets
    collected via ``get_file``. Each processed entry is inserted into the
    database and written to the excel worksheet.

    Args:
        row: next free excel row index; the updated index is returned.
        worksheet: open excel worksheet passed through to ``save_excel``.
        url: listing-page URL to crawl.
        href_bloom: bloom filter of already-collected URLs (mutated in place).
        file_ad: local directory prefix for downloaded files.
        ProgramStarttime: crawl-run timestamp stored with each record.

    Returns:
        (row, chref_list) — the advanced excel row index and all child URLs seen.
    """
    source1 = '科技部'
    source2 = '通知通告'
    source3 = '通知公告'
    print("网站:", source1 + ' ' + source2 + ' ' + url)
    chref_list = []
    # Fetch the page and detect its encoding before decoding.
    time.sleep(random.randint(10, 20))  # polite random delay between requests
    reqt = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(reqt).read()
    chardit1 = chardet.detect(response)
    chardit = chardit1['encoding']
    print("编码格式" + chardit)
    req = response.decode(chardit, 'ignore')
    soup = beautiful(req, 'lxml')
    item_list = soup.find_all('td', {'class': 'STYLE30'})
    for item in item_list:
        href = re.findall('href="(.*?)"', str(item))[0]
        title = re.findall('target="_blank">(.*?)</a>', str(item))[0]
        # Date appears in parentheses right after the closing </a>.
        date = re.findall(r'</a>\((.*?)\)', str(item))[0]
        # Normalize assorted date formats to ISO before parsing.
        date = date.replace('.', '-').replace('年', '-').replace('月', '-').replace(
            '日', '').replace('/', '-')
        date = datetime.datetime.strptime(date, '%Y-%m-%d')
        # Resolve the relative-link shapes seen on this site.
        if '../' in href:
            complete_href = 'http://www.most.gov.cn/' + href.replace('../', '')
        elif './' in href:
            complete_href = url + href.replace('./', '')
        else:
            complete_href = href
        chref_list.append(complete_href)
        if complete_href in href_bloom:
            continue  # already collected on a previous run
        # Fixed: dots escaped — the original '.doc$' pattern matched any
        # character before "doc".
        elif re.search(r'\.doc$|\.docx$|\.pdf$|\.xls$|\.xlsx$', complete_href):
            # The list entry itself is a document: download it directly.
            print("正在采集:", complete_href)
            href_adds = complete_href.split('.')[-1]
            title = title + '.' + href_adds
            title = title.replace('/', '或')  # '/' is illegal in file names
            html_name = file_ad + title
            download_file(complete_href, html_name)
            file_names = []
            img_names = []
            css_names = []
            # Record in the database.
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names,
                       css_names, file_ad)
            href_bloom.update([complete_href])
            # Record in the excel sheet.
            save_excel(worksheet, row, title, title, html_name, source1,
                       source2, source3, date, ProgramStarttime, complete_href,
                       file_names, img_names, css_names, file_ad)
        else:
            # Static detail page: save the HTML and harvest its assets.
            print("正在采集:", complete_href)
            html, chard, html_str = getHtml_quiet(complete_href)
            html_name = saveHtml(title, html, file_ad)
            file_names, img_names, css_names = get_file(
                html_str, complete_href, file_ad)
            insertFile(source1, source2, source3, title, date, complete_href,
                       ProgramStarttime, html_name, file_names, img_names,
                       css_names, file_ad)
            href_bloom.update([complete_href])
            save_excel(worksheet, row, title, title, html_name, source1,
                       source2, source3, date, ProgramStarttime, complete_href,
                       file_names, img_names, css_names, file_ad)
        # Advance once per processed item so the document branch does not
        # overwrite the same excel row (the original only advanced after the
        # else branch).
        row = row + 1
    return row, chref_list