import argparse

import requests
from bs4 import BeautifulSoup as beauty


def getting_url():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--season", help="If you need to download only a specific season", action="store")
    parser.add_argument("-l", "--link", help="The path of the series", action="store")
    args = parser.parse_args()
    r = requests.get(args.link)
    soup = beauty(r.content, 'html.parser')
    links = soup.find_all("a")
    urls = []
    if args.season:
        season = 'S0' + args.season
        for i in range(len(links)):
            val = links[i]['href']
            # Keep only .mkv links that also belong to the requested season.
            if '.mkv' in val and season in val:
                urls.append(args.link + val)
    else:
        for i in range(len(links)):
            val = links[i]['href']
            if '.mkv' in val:
                urls.append(args.link + val)
    return urls
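# A hedged usage sketch for getting_url() (the script name, URL, and season value
# below are placeholders, not from the source). Invoked as
# `python grab.py -l http://example.com/Series/ -s 3`, it returns absolute .mkv
# links for season S03 only; without -s it returns every .mkv link on the page.
if __name__ == "__main__":
    for url in getting_url():
        print(url)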
from urllib.request import urlopen

from bs4 import BeautifulSoup as beauty


def parsetext(html_page):
    # urllib2 exists only on Python 2; urllib.request.urlopen is its Python 3 equivalent.
    url = urlopen(html_page)
    bs = beauty(url, "lxml")
    for link in bs.find_all('a'):
        print(link.get('href'))
    for img in bs.find_all('img'):
        print(img.get('src'))
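# Example call (the URL is a placeholder, not from the source): prints every
# <a href> and <img src> found on the fetched page.
parsetext('https://example.com')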
def parse(self, current_html, current_url):
    # Guard against empty input.
    if current_html is None or current_url is None:
        return
    # "html.parser" is the concrete parser BS4 uses here; a different parser can be specified.
    soup = beauty(current_html, "html.parser")
    self.new_urls = self._get_new_urls(soup, current_url)
    self.new_datas = self._get_new_datas(soup, current_url)
    return self.new_datas, self.new_urls
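# _get_new_urls and _get_new_datas are referenced above but not shown in the source.
# A minimal sketch of what a _get_new_urls method might do, assuming it collects the
# absolute URLs of all links on the page (the body below is an assumption, not the
# original implementation):
from urllib.parse import urljoin

def _get_new_urls(self, soup, current_url):
    # Resolve every href against the current page so relative links become absolute.
    return {urljoin(current_url, a['href']) for a in soup.find_all('a', href=True)}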
import requests
from bs4 import BeautifulSoup as beauty
from csv import writer

response = requests.get('https://datosmacro.expansion.com/deuda')
html = response.text
soup = beauty(html, 'html.parser')
table_debt = soup.select('.table-responsive')
table_debt = table_debt[0].find_all('tr')

with open("table_debt.csv", "w") as file:
    csv_writer = writer(file, lineterminator="\n")
    csv_writer.writerow(["Country", "Year", "Debt"])
    # Skip the header row, then pull country, year, and debt from each table row.
    for tr in table_debt[1:]:
        tds = tr.find_all('td')
        pais = tds[0].text[:-4]  # drop the trailing four-character " [+]" suffix from the country cell
        year = tds[1].text
        deuda = tds[2].text.replace('.', '')  # strip thousands separators
        csv_writer.writerow([pais, year, deuda])
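# Optional sanity check (not in the source): read the file back to confirm the
# three-column layout written above.
from csv import reader
with open("table_debt.csv") as file:
    for row in reader(file):
        print(row)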
from bs4 import BeautifulSoup as beauty
import requests

source = requests.get('https://inshorts.com/en/read').text
soup = beauty(source, 'lxml')
for news in soup.find_all('div', class_='news-card z-depth-1'):
    news_headline = news.find('div', class_='news-card-title news-right-box')
    news_headline = news_headline.a.span.text
    print(news_headline)
    news_content = news.find('div', class_='news-card-content news-right-box')
    news_content = news_content.div.text
    print(news_content)
    print()
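# The same headlines can be selected with BS4's CSS-selector API instead of
# find_all with class_; the class names are taken from the snippet above.
for news in soup.select('div.news-card.z-depth-1'):
    print(news.select_one('div.news-card-title a span').text)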
# ... (the opening `if` branch of this fragment is truncated in the source)
    link = str('http://' + domain.strip())
elif ("https://" in str(domain.strip())) or ("http://" in str(domain.strip())):
    link = str(domain.strip())
else:
    link = str('https://' + domain.strip())
# print(link)
response = requests.get(link, headers=header, timeout=10, allow_redirects=False)
status_code = response.status_code
content_type = response.headers["Content-Type"]
content_length = response.headers["Content-Length"]
soup = beauty(response.content, 'html.parser')
title = soup.find('title')
try:
    if int(status_code) <= 300:
        print(link,
              "[\033[92m{}\033[0m]".format(int(status_code)),
              "[\033[93m{}\033[0m]".format(str(title.text.strip())),
              "[\033[34m{}\033[0m]".format(str(content_type)),
              "[\033[35m{}\033[0m]".format(int(content_length)))
    elif (int(status_code) == 301) or (int(status_code) == 302) or (int(status_code) == 307):
        # ... (body truncated in the source)
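# A minimal, self-contained sketch of the same probing idea, assuming the truncated
# fragment classifies responses by status code and colour-prints the result. The
# function name and redirect handling here are assumptions, not the original code.
import requests

def probe(url, headers=None):
    resp = requests.get(url, headers=headers, timeout=10, allow_redirects=False)
    code = resp.status_code
    if code < 300:
        print(url, "[\033[92m{}\033[0m]".format(code))
    elif code in (301, 302, 307):
        # Redirect responses expose the target URL in the Location header.
        print(url, "[\033[91m{}\033[0m] ->".format(code), resp.headers.get("Location", ""))
    else:
        print(url, "[\033[90m{}\033[0m]".format(code))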