def parse_requests(links):
    # 'links' is the URL of a search-results page handed in from the overridden 'parse' method.
    search_urls = requests.get(links).text
    html_link_parent = bb(search_urls, 'lxml').findAll('h3')
    results = {}
    for link_title in html_link_parent:
        link_title = link_title.find_next()
        url_strip = link_title.get('href')
        if url_strip is None:
            continue
        if '/url?q=' in url_strip and 'stackoverflow.com/questions/tagged' in url_strip:
            # The URL points to a tag listing rather than a question page, so skip it.
            pass
        elif '/url?q=' in url_strip and 'stackoverflow.com/questions/' in url_strip:
            url_strip = url_strip[7:]  # drop the leading '/url?q=' wrapper
            if 'webcache' not in url_strip:  # filter out cached copies; legitimate links share the same namespacing
                req = requests.get(url_strip).text
                soup = bb(req, 'lxml').find('div', {'class': 'container'})  # section containing the question text
                title = soup.find('a', {'class': 'question-hyperlink'}).text
                body = soup.find('div', {'class': 'post-text'}).text
                results[title] = body
                time.sleep(3)  # wait before requesting the next site
    return results
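# The function above assumes these imports, which are not shown in this section:
#   import time
#   import requests
#   from bs4 import BeautifulSoup as bb
# Minimal usage sketch (not part of the original): parse_requests expects the URL of a
# search-results page, as produced by the overridden 'parse' method mentioned in the
# comment above. The query string below is a hypothetical example.
if __name__ == "__main__":
    results = parse_requests(
        "https://www.google.com/search?q=site:stackoverflow.com+python+requests+timeout")
    for question_title, question_body in results.items():
        print(question_title)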
def getHref(cityList):
    global sum
    sum = 0
    header = [{
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/68.0.3440.84 Safari/537.36'
    }, {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0'
    }]
    global fw
    fw = open('../fo.json', 'w', encoding='utf-8')
    for provinceItem in cityList.keys():
        for cityItem in cityList[provinceItem].keys():
            cityCode = cityList[provinceItem][cityItem]  # code for this city
            try:
                href = urllib.request.Request(
                    "https://search.51job.com/list/" + cityCode + ",000000,0000,00,9,99,%2520,2,1.html",
                    headers=header[random.randint(0, 1)])
                b = bb(urllib.request.urlopen(href).read(), 'lxml')
                total_page = b.find('div', {'class': 'p_in'}).find('span', {'class': 'td'}).text  # total number of pages for this city
                total_page = int(re.findall(r"\d+\.?\d*", total_page)[0])
                for i in range(1, total_page + 1):  # result pages are numbered from 1
                    href = urllib.request.Request(
                        "https://search.51job.com/list/" + cityCode + ",000000,0000,00,9,99,%2520,2," + "%d.html" % i,
                        headers=header[random.randint(0, 1)])
                    b = bb(urllib.request.urlopen(href).read(), 'lxml')
                    items = b.findAll('p', {'class': 't1'})
                    for item in items:
                        href = urllib.request.Request(item.find('span').find('a')['href'],
                                                      headers=header[random.randint(0, 1)])
                        Thread(target=getInformation,
                               args=(bb(urllib.request.urlopen(href).read(), 'lxml'),
                                     provinceItem, cityItem, sum)).start()
                        sum += 1
                    sleep(2)  # sleep 2 seconds after each page of data
            except Exception as e:
                print("1:%s" % e)
                continue
    fw.close()
    dataFormat("../fo.json")
def site2():
    s = 1
    q = ''
    e = {}
    j = ''
    t = 0
    test = 9
    backdir9 = os.getcwd()
    fine = backdir9 + '\\Data\\name.txt'
    fine1 = backdir9 + '\\Data\\count.txt'
    with open(fine, 'r+') as f:
        f.truncate()
    with open(fine1, 'r+') as f:
        f.truncate()
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }
    popup.lead2()  # presumably prompts for a book title, which is read back from name.txt below
    with open(fine, 'r+') as f:
        j = f.read()
        f.seek(0)
        f.truncate()
    book = j
    name = book.replace(' ', '+')
    web = 'http://www.ebook777.com'
    r = requests.get('http://www.ebook777.com/?s=' + name, headers=headers)
    soup = bb(r.text, 'lxml')
    match = soup.findAll('div', class_='content')
    x = 1
    for a in match:  # collect result title -> result URL
        g = a.find('a', {'class': 'title'}).text
        o = a.find('a', {'class': 'title'})['href']
        e[g] = o
    with open(fine, 'r+') as f:
        for a in e.keys():  # write a numbered list of results for the user to pick from
            m = str(x)
            f.write(m + '. ' + a + '\n\n')
            x += 1
    os.startfile(fine)
    popup.lead3()  # presumably prompts for the chosen result number, stored in count.txt
    with open(fine1, 'r+') as f:
        t = int(f.readline())
        f.truncate()
    for d in e.keys():  # map the chosen number back to a result title
        if s == t:
            q = d
            test = 10
        s += 1
    if test == 10:
        new = e[q]
        newr = requests.get(new, headers=headers)  # pass headers as a keyword; positionally it would be treated as params
        soup1 = bb(newr.text, 'lxml')
        match1 = soup1.find('span', class_='download-links')
        match2 = match1.find('a')['href']
        wb.open_new(match2)  # open the download link in the default browser
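# site2 above (and site1 further down) lean on a few names that are never defined in this
# section. A hedged sketch of the assumed bindings: 'popup' is presumed to be a small GUI
# helper module whose lead2()/lead3() calls prompt the user and leave the book title in
# Data\name.txt and the chosen result number in Data\count.txt; its exact behaviour is an
# assumption, so only the imports that can be stated safely are sketched here.
import io
import os
import webbrowser as wb          # wb.open_new(...) opens the final download link
import requests
from bs4 import BeautifulSoup as bb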
def getHref(cityList):
    global sum
    sum = 0
    header = [{
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/68.0.3440.84 Safari/537.36'
    }, {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0'
    }]
    global fw
    fw = open('../leiPin.json', 'w', encoding='utf-8')
    for provinceItem in cityList.keys():  # every province
        for cityItem in cityList[provinceItem].keys():  # every city
            cityCode = cityList[provinceItem][cityItem]  # code for this city
            nextPage = "https://www.liepin.com/zhaopin/?dqs=%s&curPage=1" % cityCode
            while True:  # walk through every result page
                try:
                    href = urllib.request.Request(nextPage,
                                                  headers=header[random.randint(0, 1)])  # spoof a browser
                    b = bb(urllib.request.urlopen(href).read(), 'lxml')
                    items = b.find('ul', {'class': 'sojob-list'}).findAll('li')  # job list on this page
                    for item in items:
                        href = urllib.request.Request(
                            item.find('div', {'class': 'job-info'}).find('h3').find('a')['href'],
                            headers=header[random.randint(0, 1)])
                        Thread(target=getInformation,
                               args=(bb(urllib.request.urlopen(href).read(), 'lxml'),
                                     provinceItem, cityItem, sum)).start()
                        sum += 1
                    nextPage = "https://www.liepin.com/" + b.find('div', {'class': 'pagerbar'}).findAll('a')[-3]['href']  # link to the next page
                except Exception as e:
                    if str(e) == 'HTTP Error 404: Not Found':
                        break
                    print("1:%s" % e)
                    continue
def getHref(cityList):
    global sum, fw, proxy_list
    sum = 0
    header = [{
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/68.0.3440.84 Safari/537.36'
    }, {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0'
    }]
    fw = open('../fe.json', 'w', encoding='utf-8')
    for provinceItem in cityList.keys():
        for cityItem in cityList[provinceItem].keys():
            cityCode = cityList[provinceItem][cityItem]  # code for this city
            for careerItem in career.keys():  # append the code for each career
                try:
                    for i in range(1):
                        href = requests.get("https://%s.58.com/%s/pn%s" % (cityCode, career[careerItem], i),
                                            headers=header[random.randint(0, 1)])
                        b = bb(href.text, 'lxml')
                        items = b.findAll('li', {'class': 'job_item clearfix'})
                        for item in items:
                            href = requests.get(item.find('div', {'class': 'job_name clearfix'}).find('a')['href'],
                                                headers=header[random.randint(0, 1)])
                            Thread(target=getInformation,
                                   args=(bb(href.text, 'lxml'), provinceItem, cityItem, sum)).start()
                            sum += 1
                        sleep(2)  # sleep 2 seconds after each page of data
                except Exception as e:
                    print("1:%s" % e)
                    continue
def getHref(cityList):
    global sum
    sum = 0
    header = [{
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/68.0.3440.84 Safari/537.36'
    }, {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0'
    }]
    global fw
    fw = open('../boss.json', 'w', encoding='utf-8')
    for provinceItem in cityList.keys():  # every province
        for cityItem in cityList[provinceItem].keys():  # every city
            cityCode = cityList[provinceItem][cityItem]  # code for this city
            for i in range(5):  # walk through each result page
                try:
                    href = urllib.request.Request("https://www.zhipin.com/c%s/?page=%s" % (cityCode, i + 1),
                                                  headers=header[random.randint(0, 1)])  # spoof a browser
                    b = bb(urllib.request.urlopen(href).read(), 'lxml')
                    items = b.find('div', {'class': 'job-list'}).findAll('li')  # job list on this page
                    for item in items:
                        href = urllib.request.Request(
                            "https://www.zhipin.com" + item.find('div', {'class': 'info-primary'}).find('h3').find('a')['href'],
                            headers=header[random.randint(0, 1)])
                        Thread(target=getInformation,
                               args=(bb(urllib.request.urlopen(href).read(), 'lxml'),
                                     provinceItem, cityItem, sum)).start()
                        sum += 1
                except Exception as e:
                    print("1:%s" % e)
                    continue
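# Every getHref variant above hands the detail-page soup to a getInformation worker and a
# global 'fw' JSON file, neither of which appears in this section. Below is a minimal
# sketch of such a worker under stated assumptions: the job title is taken from the page's
# first <h1>, records are written one JSON object per line, and a lock guards the shared
# file handle because getHref starts one thread per job posting. The selector and field
# names are assumptions, not the original implementation.
import json
import threading

_fw_lock = threading.Lock()

def getInformation(soup, provinceItem, cityItem, index):
    try:
        title_tag = soup.find('h1')  # assumed location of the job title on the detail page
        record = {
            'index': index,
            'province': provinceItem,
            'city': cityItem,
            'title': title_tag.text.strip() if title_tag else '',
        }
        with _fw_lock:  # 'fw' is the global file handle opened inside getHref
            fw.write(json.dumps(record, ensure_ascii=False) + '\n')
    except Exception as e:
        print("getInformation: %s" % e)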
def get_nav_links(index_url):
    # Collect every album link found on the index page.
    html = requests.get(index_url, headers=headers)
    html.encoding = 'gbk'
    soup = bb(html.text, 'lxml')
    neirong = soup.find_all('h2')
    print(neirong)
    with open(os.path.join(FIG_BASE, "page_urls.txt"), 'a') as f:
        for i in neirong:
            print(i.a.get("href"), i.text)
            link = f"{main_domain}/" + i.a.get("href")
            f.write(f"{link} {i.text} \n")
            get_figs(link)
def get_nav_links(index_url):
    # Variant with the host hard-coded: collect every album link found on the index page.
    html = requests.get(index_url, headers=headers)
    html.encoding = 'gbk'
    soup = bb(html.text, 'lxml')
    neirong = soup.find_all('h2')
    print(neirong)
    with open("page_urls.txt", 'a') as f:
        for i in neirong:
            print(i.a.get("href"), i.text)
            link = "https://qqh225.com/" + i.a.get("href")
            f.write(f"{link} {i.text} \n")
            get_figs(link)
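# The figure-scraping functions in this part of the section refer to several module-level
# names (headers, main_domain, FIG_BASE) that are never defined here. The sketch below is
# one plausible setup; the domain is taken from the hard-coded get_nav_links variant above
# and the download root is an assumption.
import os
import re
import socket
import urllib.request
import requests
from bs4 import BeautifulSoup as bb
from tqdm import tqdm

main_domain = "https://qqh225.com"             # assumed from the hard-coded variant above
FIG_BASE = os.path.join(os.getcwd(), "figs")   # assumed download root
os.makedirs(FIG_BASE, exist_ok=True)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/68.0.3440.84 Safari/537.36'
}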
def get_figs(page_url):
    for page_i in range(1, 26):
        if page_i == 1:
            multi_page = page_url
        else:
            multi_page = "{}_{}.html".format(page_url.split(".html")[0], page_i)
        page = get_response(multi_page, useproxy=False, retry_counter=10)
        if page.url == f"{main_domain}/cip.asp":  # redirected to cip.asp, so there are no more pages
            break
        page.encoding = 'gbk'
        soup = bb(page.text, 'lxml')
        dir_name_pattern = re.search(r'([\S*\]*]\S+\s*\S+[\]])(_*)(\W*)', soup.title.text)  # extract the album name from the title
        if dir_name_pattern:
            dir_name = dir_name_pattern.group(1)
        else:
            dir_name = soup.title.text
        dir_name = re.sub(r'[/:*?"<>|]', '-', dir_name)  # replace characters that are illegal in directory names
        xiangce_dir = os.path.join(FIG_BASE, dir_name)
        if not os.path.exists(xiangce_dir):
            os.mkdir(xiangce_dir)
        neirong = soup.find_all('p')
        with open(os.path.join(xiangce_dir, "fig_urls.txt"), 'a') as f:
            for i in tqdm(neirong):  # tqdm shows the download progress for the current page
                try:
                    fig_url = i.img.get("src")
                    # Skip the file if it already exists locally.
                    fig_path = os.path.join(xiangce_dir, f'{fig_url.split("/")[-1]}')
                    if os.path.exists(fig_path):
                        continue
                    with open(fig_path, 'wb') as fig_f:
                        fig_f.write(get_response(fig_url, useproxy=False, retry_counter=3).content)
                    f.write(f"{fig_url} \n")
                except requests.Timeout:
                    continue
                except AttributeError:
                    # The first <p> tag holds the profile text rather than an image.
                    with open(os.path.join(xiangce_dir, "profile.txt"), "a", encoding="utf-8") as profile_f:
                        profile_f.writelines(i.text)
                    continue
def get_figs(page_url):
    for page_i in range(1, 26):
        if page_i == 1:
            multi_page = page_url
        else:
            multi_page = "{}_{}.html".format(page_url.split(".html")[0], page_i)
        page = get_response(multi_page, useproxy=False, retry_counter=10)
        page.encoding = 'gbk'
        soup = bb(page.text, 'lxml')
        dir_name = soup.title.text.split("P]")[0]
        dir_name = re.sub(r'[/:*?"<>|]', '-', dir_name)  # replace characters that are illegal in directory names
        if page.url == f"{main_domain}/cip.asp":  # redirected to cip.asp, so there are no more pages
            break
        xiangce_dir = os.path.join(FIG_BASE, dir_name)
        if not os.path.exists(xiangce_dir):
            os.mkdir(xiangce_dir)
        neirong = soup.find_all('p')
        with open(os.path.join(xiangce_dir, "fig_urls.txt"), 'a') as f:
            for i in neirong:
                try:
                    print(i.img.get("src"))
                    fig_url = i.img.get("src")
                    # Skip the file if it already exists locally.
                    fig_path = os.path.join(xiangce_dir, f'{fig_url.split("/")[-1]}')
                    if os.path.exists(fig_path):
                        continue
                    socket.setdefaulttimeout(300)
                    opener = urllib.request.build_opener()
                    opener.addheaders = [('User-Agent', UserAgent(verify_ssl=False).chrome)]
                    urllib.request.install_opener(opener)
                    urllib.request.urlretrieve(fig_url, fig_path, progress)
                    f.write(fig_url)
                    f.write("\r\n")
                except Exception:
                    continue
def get_figs(page_url):
    for page_i in range(1, 26):
        if page_i == 1:
            multi_page = page_url
        else:
            multi_page = "{}_{}.html".format(page_url.split(".html")[0], page_i)
        page = requests.get(multi_page, headers=headers)
        page.encoding = 'gbk'
        soup = bb(page.text, 'lxml')
        dir_name = soup.title.text.split("P]")[0]
        dir_name = re.sub(r'[/:*?"<>|]', '-', dir_name)  # replace characters that are illegal in directory names
        if page.url == "https://qqi668.com/cip.asp":  # redirected to cip.asp, so there are no more pages
            break
        xiangce_dir = os.path.join(os.getcwd(), dir_name)
        if not os.path.exists(xiangce_dir):
            os.mkdir(xiangce_dir)
        neirong = soup.find_all('p')
        with open(os.path.join(xiangce_dir, "fig_urls.txt"), 'a') as f:
            for i in neirong:
                try:
                    print(i.img.get("src"))
                    fig_url = i.img.get("src")
                    # Skip the file if it already exists locally.
                    fig_path = os.path.join(xiangce_dir, f'{fig_url.split("/")[-1]}')
                    if os.path.exists(fig_path):
                        continue
                    socket.setdefaulttimeout(300)
                    opener = urllib.request.build_opener()
                    opener.addheaders = [(
                        'User-Agent',
                        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36'
                    )]
                    urllib.request.install_opener(opener)
                    urllib.request.urlretrieve(fig_url, fig_path, progress)
                    f.write(fig_url)
                    f.write("\r\n")
                except Exception:
                    continue
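# The get_figs variants above that call get_response(...) rely on a retrying fetch helper
# that is not shown in this section. A minimal sketch under assumed semantics: retry a
# plain requests.get up to retry_counter times, optionally through a proxy drawn from a
# proxy_list global that is also undefined here. The timeout, back-off, and proxy handling
# are assumptions rather than the original implementation.
import random
import time
import requests

def get_response(url, useproxy=False, retry_counter=3):
    last_error = None
    for _ in range(retry_counter):
        try:
            proxies = None
            if useproxy and proxy_list:  # proxy_list is assumed to hold proxy URL strings
                proxies = {'https': random.choice(proxy_list)}
            return requests.get(url, headers=headers, proxies=proxies, timeout=30)
        except requests.RequestException as e:
            last_error = e
            time.sleep(2)  # brief back-off before retrying
    raise last_error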
def site1():
    j = ''
    e = {}
    p = []
    t = 0
    s = 1
    q = ''
    backdir9 = os.getcwd()
    fine = backdir9 + '\\Data\\name.txt'
    fine1 = backdir9 + '\\Data\\count.txt'
    with open(fine, 'r+') as f:
        f.truncate()
    with open(fine1, 'r+') as f:
        f.truncate()
    popup.lead2()  # presumably prompts for a book title, which is read back from name.txt below
    with open(fine, 'r+') as f:
        j = f.read()
        f.seek(0)
        f.truncate()
    web = 'http://gen.lib.rus.ec/search.php?req='
    web1 = 'http://gen.lib.rus.ec/'
    web2 = 'http://93.174.95.29'
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    }
    book = j
    name = book.replace(' ', '+')
    r = requests.get(web + name, headers=headers)
    soup = bb(r.text, 'lxml')
    match = soup.findAll('td', width='500')
    x = 1
    for d in match:  # collect result title -> relative result URL
        g = d.a.text
        o = d.a['href']
        e[g] = o
    with io.open(fine, 'r+', encoding='utf-8') as f:
        for a in e.keys():  # write a numbered list of results for the user to pick from
            m = str(x)
            f.write(m + '. ' + a + '\n\n')
            x += 1
    os.startfile(fine)
    popup.lead3()  # presumably prompts for the chosen result number, stored in count.txt
    with open(fine1, 'r+') as f:
        t = int(f.readline())
        f.truncate()
    for d in e.keys():  # map the chosen number back to a result title
        if s == t:
            q = d
        s += 1
    new = e[q]
    new2 = web1 + new
    new3 = requests.get(new2, headers=headers)
    soup1 = bb(new3.text, 'lxml')
    match1 = soup1.find('a', {'title': 'Gen.lib.rus.ec'})['href']
    new4 = requests.get(match1, headers=headers)
    soup2 = bb(new4.text, 'lxml')
    latest = web2 + soup2.h2.a['href']
    wb.open_new(latest)  # open the direct download link in the default browser
p = predict_image(cv2.imread(r"E:\system\Pictures\ml project\{}".format(lki)))
print(categories[p])

# In[131]:

from bs4 import BeautifulSoup as bb
import requests as rr

# In[132]:

res = rr.get("https://en.wikipedia.org/wiki/{}".format(categories[p]))

# In[133]:

bes = bb(res.text, 'lxml')

# In[134]:

# Grab the taxonomy infobox from the species' Wikipedia article.
lin = bes.find('table', class_='infobox biota')

# In[135]:

# Pull the first image link out of the infobox and keep its src URL.
logo = lin.find("a", class_="image")
tree = logo.img.get('src')

# In[136]:

from IPython.display import Image
from IPython.core.display import HTML

print("The Image of Tree is :")
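# In[137]:

# A hedged sketch of the likely next cell (not in the original source): Wikipedia infobox
# image 'src' values are protocol-relative ("//upload.wikimedia.org/..."), so prefixing
# "https:" is assumed here before handing the URL to IPython's Image for inline display.
Image(url="https:" + tree)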