async def active_name_generator():
    a = True
    i = 1
    links = []
    # Walk the paginated EDSM expedition list, collecting summary links.
    while a:
        async with aiohttp.ClientSession() as session:
            async with session.get(
                    f"https://www.edsm.net/en/expeditions/p/{i}") as response:
                async with session.get(
                        f"https://www.edsm.net/en/expeditions/p/{i - 1}"
                ) as pre_response:
                    if await response.text() == await pre_response.text() or i == 1:
                        html = Bs4(await response.text(), "html.parser")
                        for link in html.find_all("a"):
                            formated = link.get("href")
                            if formated is not None:
                                if "/en/expeditions/summary/" in formated:
                                    if formated not in links:
                                        links.append(formated)
                        i += 1
                    else:
                        a = False
    # Keep only expeditions that are not marked as finished.
    ongoing = []
    for link in links:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"https://www.edsm.net/{link}") as response:
                raw_html = await response.text()
                html = Bs4(raw_html, "html.parser")
                if "This expedition is finished." not in html.get_text():
                    ongoing.append(link)
    # Map each commander name to the ongoing expeditions they take part in.
    names = {}
    for link in ongoing:
        expedition = link[link.find("name") + 5:]
        expedition_clean = unquote(expedition).replace("+", " ")
        link = link.replace("summary", "participants")
        async with aiohttp.ClientSession() as session:
            async with session.get(f"https://www.edsm.net/{link}") as response:
                raw_html = await response.text()
                html = Bs4(raw_html, "html.parser")
                for text in html.find_all("a"):
                    formated = text.get("href")
                    if formated is not None:
                        if "/en/user/profile/" in formated:
                            spot = formated.find("cmdr")
                            name = formated[spot + 5:]
                            name_clean = unquote(name).replace("+", " ")
                            if name_clean not in names:
                                names[name_clean] = []
                            names[name_clean].append(expedition_clean)
    return names
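A minimal driver for the coroutine above, assuming it lives in a module alongside the same aiohttp, BeautifulSoup, and unquote imports it uses; the entry point itself is illustrative and not part of the original code.

# Hypothetical entry point; the imports mirror what active_name_generator() depends on.
import asyncio

import aiohttp
from bs4 import BeautifulSoup as Bs4
from urllib.parse import unquote

if __name__ == "__main__":
    # Run the coroutine to completion and print each commander with their expeditions.
    names = asyncio.run(active_name_generator())
    for commander, expeditions in names.items():
        print(commander, "->", ", ".join(expeditions))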
def get_next_url(urllist):
    url_list = []
    for url in urllist:
        response = requests.get(url, headers=headers)
        soup = Bs4(response.text, "html.parser")
        urls = soup.find_all("a")
        if urls:
            for url2 in urls:
                url2_1 = url2.get("href")
                if url2_1:
                    if url2_1[0] == "/":
                        url2_1 = head_url + url2_1
                        url_list.append(url2_1)
                        # The original sliced comparison (url2_1[0:24] == "http://192.168...")
                        # could never match the full 35-character prefix; a prefix check is
                        # what was evidently intended.
                        if url2_1.startswith("http://192.168.6.27:6030/portals/hd"):
                            url_list.append(url2_1)
    url_list2 = set(url_list)
    for url_ in url_list2:
        res = requests.get(url_)
        if res.status_code == 200:
            print(url_)
    print(len(url_list2))
    # Note: this recursion has no termination condition.
    get_next_url(url_list2)
def get_next_url(urllist):
    url_list = []
    for url in urllist:
        response = requests.get(url, headers=headers)
        soup = Bs4(response.text, "lxml")
        urls = soup.find_all("a")
        if urls:
            for url2 in urls:
                url2_1 = url2.get("href")
                if url2_1:
                    if url2_1[0] == "/":
                        url2_1 = head_url + url2_1
                        url_list.append(url2_1)
                        # As in the variant above, the broken sliced comparison is replaced
                        # by the prefix check that was evidently intended.
                        if url2_1.startswith("http://www.xxx.com.cn"):
                            url_list.append(url2_1)
    url_list2 = set(url_list)
    for url_ in url_list2:
        res = requests.get(url_)
        if res.status_code == 200:
            print(url_)
    print(len(url_list2))
    # Note: this recursion has no termination condition.
    get_next_url(url_list2)
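Both get_next_url variants above expect module-level head_url and headers names and recurse with no stopping condition; the seed call below is a sketch only, with placeholder values for those names.

# Hypothetical seed for the crawler above; head_url and headers are placeholders.
import requests
from bs4 import BeautifulSoup as Bs4

head_url = "http://www.xxx.com.cn"        # placeholder site root
headers = {"User-Agent": "Mozilla/5.0"}   # minimal request header

if __name__ == "__main__":
    get_next_url([head_url])              # the recursion above never terminates on its own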
def get_title_link(item_url):
    """Record the URLs of the data pages."""
    links_file = open('links.txt', 'a')
    # data_list = []
    html = requests.get(item_url, headers=head)
    html.encoding = 'utf-8'
    soup = Bs4(html.text, 'lxml')
    # Find the pagination links and read the last-page number from them
    aa_list = soup.select('a')
    useable_a = []
    for aa in aa_list:
        if aa.get('href') and aa.get('href').startswith('/news/?page='):
            useable_a.append(aa.get('href'))
    if len(useable_a) == 0:
        return
    i_need = useable_a[len(useable_a) - 1]
    pages = int(i_need[12:i_need.find('&')])
    params = i_need[i_need.find('&'):]
    # First page
    links_file.write(item_url + '\n')
    # get_item(data_list, item_url)
    # Remaining pages
    for i in range(2, pages + 1):
        uri = 'https://grain.sci99.com/news/?page=' + str(i) + params
        print(uri)
        links_file.write(uri + '\n')
        # get_item(data_list, uri)
    links_file.close()
def main():
    global response
    photo_dir_name = keyword.replace(' ', '-') + '-photos'
    chrome.get(f'https://www.google.com/imghp?hl=en&q={keyword}')
    chrome.find_element_by_css_selector('input[name=q]').send_keys(Keys.ENTER)
    scroll_pause_time = 0.10
    scroll_length = 200
    scroll_position = 0
    # Scroll the results page in small steps so lazy-loaded thumbnails appear.
    for _ in range(scroll_to_bottom_number):
        time.sleep(1.5)
        page_height = int(
            chrome.execute_script('return document.body.scrollHeight'))
        while scroll_position < page_height:
            scroll_position = scroll_position + scroll_length
            chrome.execute_script('window.scrollTo(0, ' +
                                  str(scroll_position) + ');')
            time.sleep(scroll_pause_time)
    time.sleep(1.5)
    source = chrome.page_source
    chrome.close()
    soup = Bs4(source, 'html.parser')
    photos = [
        photo for photo in soup.find(attrs={'id': 'islmp'}).find_all('img')
    ]
    if not os.path.exists(photo_dir_name):
        os.mkdir(photo_dir_name)
    os.chdir(photo_dir_name)
    with open('sources.txt', 'a') as sources:
        for x, photo in enumerate(photos):
            try:
                key = 'src' if 'src' in photo.attrs else 'data-src'
                if photo[key].startswith('data'):
                    # Inline base64 thumbnail: read the MIME type from the data URI.
                    mime_type = re.search(':(.+);',
                                          photo[key]).group(1).split('/')[1]
                else:
                    sources.write(photo[key] + '\n')
                    response = requests.get(photo[key], stream=True)
                    if not response.ok:
                        continue
                    mime_type = response.headers['content-type'].split('/')[1]
                with open(f'{keyword.replace(" ", "-")}-{x}.{mime_type}',
                          'wb') as handle:
                    if photo[key].startswith('data'):
                        handle.write(
                            base64.decodebytes(photo[key].split('base64,')
                                               [1].encode('unicode_escape')))
                    else:
                        for block in response.iter_content(1024):
                            if not block:
                                break
                            handle.write(block)
            except Exception as e2:
                print(f'C\'è stato un errore: {e2}')  # "There was an error"
                continue
    os.chdir('..')
    # Zip the downloaded photos and remove the working directory.
    shutil.make_archive(photo_dir_name, 'zip', photo_dir_name)
    shutil.rmtree(photo_dir_name)
    print('Fatto!')  # "Done!"
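main() above reads the module-level names keyword, chrome, and scroll_to_bottom_number; the setup below is a sketch of what such a module preamble might look like, with placeholder values rather than the original configuration.

# Hypothetical module preamble for main(); values are placeholders.
import base64
import os
import re
import shutil
import time

import requests
from bs4 import BeautifulSoup as Bs4
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

keyword = 'mountain lake'        # placeholder search term
scroll_to_bottom_number = 3      # how many scroll passes over the results page
chrome = webdriver.Chrome()      # assumes a matching chromedriver is installed

if __name__ == '__main__':
    main()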
def get_banner_page_link():
    file = open('banner_link.txt', 'a')
    for i in range(1, 82):
        html = requests.get(head_url + '/page/' + str(i), headers=head)
        html.encoding = 'utf-8'
        soup = Bs4(html.text, 'lxml')
        a_list = soup('a', attrs={'class': 'item-href'})
        for a in a_list:
            print(a.get('href'))
            file.write(a.get('href') + '\n')
def parser_notice(self, html):
    soup = Bs4(html, 'html.parser')
    notices = soup.select(
        '#content > div:nth-of-type(1) > div:nth-of-type(1) > div > table > tbody > tr > td.board-title > span')
    pks = soup.select(
        '#content > div:nth-of-type(1) > div:nth-of-type(1) > div > table > tbody > tr')
    for index, pk in enumerate(pks):
        title = notices[index].text
        url = 'https://cafe.bithumb.com/view/board-contents/' + re.findall(
            r"'([a-zA-Z0-9,\s]*)'", pk['onclick'])[0]
        self.current_notice_list.append(
            '<a href="{}">{}</a>'.format(url, self.to_html(title)))
def get_first_url():
    list_href = []
    response = requests.get(head_url, headers=headers, cookies=cookies)
    soup = Bs4(response.text, "html.parser")
    urls_li = soup.select("#mainmenu_top > div > div > ul > li")
    for url_li in urls_li:
        urls = url_li.select("a")
        for url in urls:
            url_href = url.get("href")
            list_href.append(head_url + url_href)
    out_url = list(set(list_href))
    return out_url
def parser_notice(self, html):
    soup = Bs4(html, 'html.parser')
    pre = soup.select_one('body > pre')
    html = pre.text
    json_val = json.loads(html)
    notices = json_val.get('results')
    for notice in notices:
        title = notice.get('title')
        url = 'https://coinone.co.kr/talk/notice/detail/{}'.format(
            notice.get('id'))
        self.current_notice_list.append('<a href="{}">{}</a>'.format(
            url, self.to_html(title)))
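The two parser_notice methods above (Bithumb and Coinone) assume an instance with a current_notice_list list and a to_html helper; a minimal sketch of such a host class follows, where the class name and the escaping behavior are assumptions, not the original implementation.

# Hypothetical host class for the parser_notice methods; names other than
# current_notice_list and to_html are not from the original code.
import html as html_lib

class NoticeWatcher:
    def __init__(self):
        self.current_notice_list = []   # accumulated '<a href=...>' notice links

    def to_html(self, text):
        # Escape the notice title so it can be embedded inside the anchor tag.
        return html_lib.escape(text)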
def html3_handle(html, courseid, coursename):
    soup = bs3.BeautifulSoup(html)   # BeautifulSoup 3 parser used for the tag rewriting below
    soup1 = Bs4(html, 'lxml')        # BeautifulSoup 4 parser used only to detect images
    if len(soup1.findAll("img")):
        # Record the course that could not be scraped ("未爬取科目" = subjects not crawled).
        with open('未爬取科目.txt', 'w+') as f:
            str1 = courseid + "|" + coursename + "\r\n"
            f.write(str1)
        time.sleep(1)
        raise Exception("有图片,无法解析")  # page contains images and cannot be parsed
    # Mark status/count/score tags so they become "1*" separators in the extracted text.
    for tag in soup.findAll('status'):
        tag.string = "1*"
    for tag in soup.findAll('count'):
        tag.string = "1*"
    for tag in soup.findAll('score'):
        tag.string = "1*"
    ali = soup.findAll(text=True)
    bli = []
    for x in ali:
        if x.isspace():
            pass
        else:
            x += '||'
            bli.append(x)
    astr = ' '.join(bli)
    bstr = p1.sub('', astr)
    cli = bstr.split("1*||")
    array1 = []
    p = re.compile(r"<.*?>")
    # Strip "see textbook page ..." references from the question text.
    p2 = re.compile(r"参见教材.*?。|参见教材P\d*|参考教材.*?。|参考教材P\d*")
    for x in cli:
        if x.isspace():
            pass
        else:
            xli = x.split("||")
            dli = []
            for x in xli:
                if len(x):
                    a = p2.sub('', x)
                    str1 = p.sub('', a).strip()
                    if not str1.isspace():
                        dli.append(str1)
            if len(dli) > 3:
                array1.append(dli)
    print(array1)
    return array1
def get_more_urls(m_head_url, back_url):
    """Collect every "更多" (more) link on the page."""
    html = requests.get(m_head_url + back_url, headers=head)
    html.encoding = 'utf-8'
    soup = Bs4(html.text, 'lxml')
    a_list = soup.select('a')
    all_url_list = []
    for a_tag in a_list:
        if str(a_tag).find("更多") != -1:
            url_href = a_tag.get('href')
            if url_href and url_href.startswith('/news'):
                print(m_head_url + url_href)
                all_url_list.append(m_head_url + url_href)
    return all_url_list
def get_first_url():
    list_href = []
    response = requests.get(head_url, headers=headers)
    print(response.text)
    soup = Bs4(response.text, "lxml")
    urls_li = soup.select("#__next > div > div > header > div > div > div")
    print(urls_li)
    for url_li in urls_li:
        urls = url_li.select("a")
        for url in urls:
            url_href = url.get("href")
            list_href.append(head_url + url_href)
    out_url = list(set(list_href))
    print(out_url)
    return out_url
def get_item(url):
    data_con = []
    html = requests.get(url, headers=head)
    html.encoding = 'utf-8'
    soup = Bs4(html.text, 'lxml')
    ul_list = soup.select('ul')
    for ul in ul_list:
        ul_id = ul.get('id')
        if ul_id and ul_id == 'list':
            a_list = ul.select('a')
            for a_item in a_list:
                href = a_item.get('href')
                title = list(a_item.select('h2')[0].stripped_strings)[0]
                data_con.append([title, href])
    return data_con
def get_li_a_link():
    html = requests.get(head_url, headers=head)
    html.encoding = 'utf-8'
    soup = Bs4(html.text, 'lxml')
    ul_con = soup.select('ul')
    a_list = []
    for ul in ul_con:
        if not ul.get('class'):
            continue
        if ul.get('class')[0] == 'menu_ul_left' or ul.get(
                'class')[0] == 'menu_ul_right':
            a_s = ul.select('a')
            for a in a_s:
                a_list.append(a.get('href'))
    return a_list
def get_item(uri):
    """Read the entries on a data page given its URL."""
    data_con = []
    html = requests.get(uri, headers=head)
    html.encoding = 'utf-8'
    soup = Bs4(html.text, 'lxml')
    ul_list = soup.select('ul')
    for ul in ul_list:
        class_name = ul.get('class')
        if class_name and class_name[0].startswith('ul_w488'):
            a_list = ul.select('a')
            for a_item in a_list:
                href = head_url + '/news/' + a_item.get('href')
                title = a_item.string.strip()
                data_con.append([title, href])
    return data_con
def get_other_page(sheet, page_url):
    my_file = open('data.txt', 'a')
    html = requests.get(page_url, headers=head)
    html.encoding = 'utf-8'
    soup = Bs4(html.text, 'lxml')
    # Grab all divs and keep the ones in the left-hand content column
    my_div_list = soup.select('div')
    for div in my_div_list:
        if div.get('class') and div.get('class')[0] == 'side_con_left':
            title = div.select('a')[0].get('title')
            href = head_url + div.select('a')[0].get('href')
            data = [title, href]
            global row
            row = row + 1
            excel_write.write_excel(sheet, row, data)
            my_file.write(str(data) + '\n')
    my_file.close()
def main(options):
    # Handle DEBUG
    logger.info("Parsing Eng Phys Grad Students")
    if options.debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("Debug Enabled")
        pkl_file = os.path.join(os.path.dirname(__file__), PKL_FILE)
        if not os.path.exists(pkl_file):
            logger.debug("Debug HTML file does not exist, parsing web site")
            r = requests.get(GRAD_LIST_URL)
            html_text = r.text
            with open(pkl_file, 'w') as f:
                f.write(html_text)
        else:
            logger.debug("Reading Debug HTML file")
            with open(pkl_file, 'r') as f:
                html_text = f.read()
    else:
        logger.info("Parsing Web Site")
        r = requests.get(GRAD_LIST_URL)
        html_text = r.text

    bs = Bs4(html_text)

    # Student table
    students = bs.find(id="bottom_content").table

    if options.file is not None:
        filename = options.file
    else:
        filename = DEFAULT_DATA_CSV

    logger.info("Saving student data to {}".format(filename))
    with open(filename, 'w') as f:
        writer = csv.DictWriter(
            f, ["first", "last", "level", "email", "supervisor", "room", "ext"])
        writer.writeheader()
        for row in students.find_all("tr")[1:]:
            data = strip_row(row)
            writer.writerow(data)
    logger.info("Finished parsing student data")
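main(options) only consults options.debug and options.file; below is a sketch of matching argparse wiring, where the flag names are assumptions rather than the original CLI.

# Hypothetical CLI wiring; main() reads only options.debug and options.file.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Parse the Eng Phys grad student listing")
    parser.add_argument("--debug", action="store_true",
                        help="cache and reuse a local HTML snapshot")
    parser.add_argument("--file", default=None,
                        help="output CSV path (falls back to DEFAULT_DATA_CSV)")
    main(parser.parse_args())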
def get_first_url():
    list_href = []
    responses = requests.get(head_url)
    # print(responses.text)
    soup = Bs4(responses.text, "lxml")
    # print(soup)
    urls_li = soup.find_all(["a", "br", "tr"])
    # urls_li = soup["href"]
    # print(urls_li)
    # urls_li = soup.select(selector = ".a")
    # print(urls_li)
    for url_li in urls_li:
        urls = url_li.select("a")
        # print(urls)
        for url in urls:
            url_href = url.get("href")
            list_href.append(head_url + url_href)
    out_url = list(set(list_href))
    # print(out_url)
    return out_url
print("图片已存在") except: print("图片获取失败") def get_banner_page_link(): file = open('banner_link.txt', 'a') for i in range(1, 82): html = requests.get(head_url + '/page/' + str(i), headers=head) html.encoding = 'utf-8' soup = Bs4(html.text, 'lxml') a_list = soup('a', attrs={'class': 'item-href'}) for a in a_list: print(a.get('href')) file.write(a.get('href') + '\n') if __name__ == '__main__': banner = open('banner.txt', 'a') banner_file = open('banner_link.txt') link_con = banner_file.readlines() for link in link_con: html = requests.get(link[0:-1], headers=head) html.encoding = 'utf-8' soup = Bs4(html.text, 'lxml') div = soup('div', attrs={'class': 'inspiration-images'})[0] image_links = div.select('img') for image_link in image_links: print(image_link.get('src')) save_image(image_link.get('src'))
    os.mkdir(os.path.join(os.getcwd(), 'images'))
except:
    pass
os.chdir(os.path.join(os.getcwd(), 'images'))

profile_url = 'https://www.instagram.com/pic.the.nature/'
driver.get(profile_url)
driver.implicitly_wait(10)
scroll(driver, 4)

source = driver.page_source
html_soup = Bs4(source, 'lxml')
html_soup.prettify()

images = html_soup.find_all('img', {'class': 'FFVAD'})
for image in images:
    name = image['alt'][:10]
    link = image['src']
    try:
        with open(name.replace('.', '-').replace('_', '-').replace(' ', '-').replace('/', '') + '.png', 'wb') as f:
            img = requests.get(link)
            f.write(img.content)
    except:
        print('fail')
base_google_url = "https://www.google.ca/search?q="
base_duck_duck_go = "https://duckduckgo.com/html?q="

terme = "pizza toute garnie"
terme_converti_en_html = parse.quote_plus(terme)

url_de_recherche_google = base_google_url + terme_converti_en_html
url_de_recherche_duck_duck_go = base_duck_duck_go + terme_converti_en_html

request_google = url.Request(url_de_recherche_google, None, headers)
request_duck_duck_go = url.Request(url_de_recherche_duck_duck_go, None, headers)

with url.urlopen(request_google) as response:
    html = response.read().decode('utf8')

soup = Bs4(html, 'html.parser')
g = soup.find_all("div", {"class": "g"})

h3_results = []
links_results = []
print(g)
g_list = list(g)
descriptions_list = []

for div_g in g:
    h3s = div_g.find_all("h3", {"class": "LC20lb"})
    for h3 in h3s:
        a = h3.text
        h3_results.append(a)
    hrefs = div_g.find_all("a")
    c = []
    for href in hrefs:
import requests
from bs4 import BeautifulSoup as Bs4
from selenium import webdriver

head = {
    'User-Agent':
    'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
}
head_url = 'https://www.lawtime.cn/dongchengqu/lawfirm/'

if __name__ == '__main__':
    driver = webdriver.Chrome()
    file = open('dongcheng.txt', 'a')
    # 79
    for i in range(1, 23):
        driver.get(head_url + 'p' + str(i) + '?order=1')
        soup = Bs4(driver.page_source, 'lxml')
        lay_info_div_con = soup('div', attrs={'class': 'law-info'})
        for lay_info_div in lay_info_div_con:
            lay_name = lay_info_div.select('a')[0].text
            lay_phone = lay_info_div.select('a')[1].text
            people = lay_info_div.select('span')[0].text
            if 10 <= int(people) <= 50:
                file.write(lay_name + ' ' + lay_phone + ' ' + people + '人\n')
                print(lay_name, lay_phone, people)
    file.close()
def main():
    url = 'http://ketqua.net/'
    r = requests.get(url)
    soup = Bs4(r.content, 'html.parser')
    target = soup.find('td', attrs={'class': 'bor f2 db'})
    return target.get_text()
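A short usage sketch for the function above, assuming the requests and BeautifulSoup imports it relies on; printing the returned cell text is illustrative.

# Minimal driver; imports mirror what main() uses.
import requests
from bs4 import BeautifulSoup as Bs4

if __name__ == '__main__':
    print(main())   # text of the first <td class="bor f2 db"> cell on ketqua.net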
async def db_builder(host: str,
                     database: str,
                     table: str = "Articles",
                     create_table=True,
                     user: str = "postgres",
                     passfile=None,
                     password: str = None,
                     ssl=False,
                     port: int = None):
    """Builds an article database, with all articles to date."""
    # Establishing DB Connection
    connection = await asyncpg.connect(host=host,
                                       port=port,
                                       user=user,
                                       password=password,
                                       passfile=passfile,
                                       database=database,
                                       ssl=ssl)
    # Make table if one is not provided
    if create_table:
        table = table.strip()
        await connection.execute(f"""
            CREATE TABLE "{table}" (
                "ID" serial NOT NULL,
                "Title" text,
                "UID" text,
                "dateReleased" date,
                "dateAdded" date,
                "Text" text,
                PRIMARY KEY ("ID"));
            ALTER TABLE "{table}" OWNER to "{user}";
            """)
    # Collecting Links and articles
    links = []
    date_now = datetime.datetime.now().strftime("%Y-%m-%d")
    async with aiohttp.ClientSession() as session:
        async with session.get(
                "https://community.elitedangerous.com/#") as response:
            bs4 = Bs4(await response.text(), "html.parser")
            for entry in bs4.find_all(
                    id="block-frontier-galnet-frontier-galnet-block-filter"):
                for link in entry.find_all("a"):
                    links.append(link.get("href"))
    links.reverse()
    for result in links:
        date_article = datetime.datetime.strptime(
            result.replace("#", "")[re.search("^/galnet/", result).end():],
            "%d-%b-%Y")
        if date_article.year >= 3300:
            date_article = date_article.replace(
                year=(date_article.year - articlesearch.GAME_YEAR_OFFSET))
        date_article = date_article.strftime("%Y-%m-%d")
        async with aiohttp.ClientSession() as session:
            async with session.get(
                    f"https://community.elitedangerous.com{result}"
            ) as response:
                bs4 = Bs4(await response.text(), "html.parser")
                for entry in bs4.find_all(
                        "h3", {"class": "hiLite galnetNewsArticleTitle"}):
                    entry_title = entry.get_text().strip().replace("'", "''")
                    if entry_title == "" or entry_title is None:
                        entry_title = "No Title Available"
                    entry_uid = entry.find("a").get("href")[re.search(
                        "^/galnet/uid/", entry.find("a").get("href")).end():]
                    async with aiohttp.ClientSession() as session:
                        async with session.get(
                                f"https://community.elitedangerous.com/galnet/uid/{entry_uid}/"
                        ) as response:
                            bs4 = Bs4(await response.text(), "html.parser")
                            text = unquote(
                                bs4.find_all("p")[1].get_text().replace("'", "''"))
                    await connection.execute(
                        f"""
                        INSERT INTO "{table}"("Title", "UID", "dateReleased", "dateAdded", "Text")
                        VALUES($1, $2, $3, $4, $5);""", entry_title, entry_uid,
                        date_article, date_now, text)
    await connection.close()
    # Dumping Settings For Future Use
    if os.path.exists("Settings.json"):
        os.remove("Settings.json")
    settings = await articlesearch.fetch_settings()
    settings["previous version"] = settings["version"]
    settings["host"] = host
    settings["database"] = database
    settings["table"] = table
    settings["user"] = user
    settings["passfile"] = passfile
    settings["password"] = password
    settings["ssl"] = ssl
    settings["port"] = port
    with open("Settings.json", "w+") as settings_file:
        json.dump(settings, settings_file, indent=2)
async def update():
    """Looks for new articles."""
    # Load Settings
    settings = await fetch_settings()
    table = settings["table"]
    async with aiohttp.ClientSession() as session:
        async with session.get(
                "https://community.elitedangerous.com/") as response:
            html = Bs4(await response.text(), "html.parser")
    connection = await connect()
    uids = []
    new_articles = set()
    uid_records = await connection.fetch(f"""
        SELECT "UID" FROM "{table}" ORDER BY "dateReleased" DESC LIMIT 50;
        """)
    for record in uid_records:
        uids.append(record["UID"])
    for entry in html.find_all("h3", {"class": "hiLite galnetNewsArticleTitle"}):
        entry = entry.find("a").get("href")[re.search(
            "^/galnet/uid/", entry.find("a").get("href")).end():]
        if entry not in uids:
            new_articles.add(entry)
    added = []
    for article in new_articles:
        date_today = datetime.datetime.now()
        async with aiohttp.ClientSession() as session:
            async with session.get(
                    f"https://community.elitedangerous.com/galnet/uid/{article}"
            ) as response:
                bs4 = Bs4(await response.text(), "html.parser")
                entry = bs4.find("h3", {"class": "hiLite galnetNewsArticleTitle"})
        # Article Content
        entry_title = entry.get_text().strip().replace("'", "''")
        if entry_title == "" or entry_title is None:
            entry_title = "No Title Available"
        text = unquote(bs4.find_all("p")[1].get_text().replace("'", "''"))
        # Date info
        date_article = bs4.find("p").get_text()
        date_article = datetime.datetime.strptime(date_article, "%d %b %Y")
        if date_article.year >= 3300:
            date_article = date_article.replace(year=(date_article.year -
                                                      GAME_YEAR_OFFSET))
        added.append(article)
        await connection.execute(
            f"""
            INSERT INTO "{table}"("Title", "UID", "dateReleased", "dateAdded", "Text")
            VALUES ( $1, $2, $3, $4, $5);
            """, entry_title, article, date_article, date_today, text)
    await connection.close()
    if len(new_articles) > 0:
        return len(added), added
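A hedged entry point for update(); it assumes connect(), fetch_settings(), and GAME_YEAR_OFFSET are available in the same module, as the function body implies, and the runner itself is not part of the original code.

# Hypothetical runner for the updater above.
import asyncio

if __name__ == "__main__":
    result = asyncio.run(update())
    if result:
        count, uids = result
        print(f"Added {count} new article(s): {uids}")
    else:
        print("No new articles found.")   # update() returns None when nothing is new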
find_password_element.send_keys(Keys.ENTER)
sleep(6)

group_url = 'https://www.facebook.com/groups/playstoreappofficial/members'
driver.get(group_url)
driver.implicitly_wait(10)
scroll(driver, 2)

names = []
final_names = []

src = driver.page_source
html_soup = Bs4(src, 'lxml')
html_soup.prettify()

for name in html_soup.find_all(
        'a', {
            'class':
            "oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 nc684nl6 p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl oo9gr5id gpro0wi8 lrazzd5p"
        }):
    text = name.get_text()
    names.append(text)

for final_name in names[1:]:
    final_names.append(final_name)

df = pd.DataFrame(final_names)