Example #1
import argparse

import requests
from bs4 import BeautifulSoup as beauty


def getting_url():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s",
                        "--season",
                        help="Download only the given season",
                        action="store")
    parser.add_argument("-l",
                        "--link",
                        help="The URL of the series",
                        action="store")
    args = parser.parse_args()
    r = requests.get(args.link)
    soup = beauty(r.content, 'html.parser')
    links = soup.find_all("a")
    urls = []
    if args.season:
        # Keep only .mkv links that belong to the requested season.
        season = 'S0' + args.season
        for anchor in links:
            val = anchor['href']
            if '.mkv' in val and season in val:
                urls.append(args.link + val)
    else:
        # No season given: collect every .mkv link on the page.
        for anchor in links:
            val = anchor['href']
            if '.mkv' in val:
                urls.append(args.link + val)
    return urls
Example #2
from urllib.request import urlopen

from bs4 import BeautifulSoup as beauty


def parsetext(html_page):
    # Fetch the page and parse it with the lxml parser.
    url = urlopen(html_page)
    bs = beauty(url, "lxml")

    # Print the target of every link and the source of every image.
    for link in bs.find_all('a'):
        print(link.get('href'))

    for img in bs.find_all('img'):
        print(img.get('src'))
Example #3
    def parse(self, current_html, current_url):
        # Guard against empty input.
        if current_html is None or current_url is None:
            return

        # "html.parser" is the concrete parser BeautifulSoup uses here;
        # a different parser (e.g. "lxml") could be specified instead.
        soup = beauty(current_html, "html.parser")

        self.new_urls = self._get_new_urls(soup, current_url)
        self.new_datas = self._get_new_datas(soup, current_url)

        return self.new_datas, self.new_urls
Example #4
from csv import writer

import requests
from bs4 import BeautifulSoup as beauty

response = requests.get('https://datosmacro.expansion.com/deuda')
html = response.text
soup = beauty(html, 'html.parser')

# The debt figures live in the rows of the first responsive table on the page.
table_debt = soup.select('.table-responsive')
table_debt = table_debt[0].find_all('tr')

with open("table_debt.csv", "w", newline="") as file:
    csv_writer = writer(file, lineterminator="\n")
    csv_writer.writerow(["Country", "Year", "Debt"])
    # Skip the header row, then write one row per country.
    for tr in table_debt[1:]:
        tds = tr.find_all('td')
        pais = tds[0].text[:-4]                # country name, last four characters trimmed
        year = tds[1].text
        deuda = tds[2].text.replace('.', '')   # drop the thousands separators
        csv_writer.writerow([pais, year, deuda])
Example #5
from bs4 import BeautifulSoup as beauty
import requests

source = requests.get('https://inshorts.com/en/read').text
soup = beauty(source, 'lxml')

# Each news card holds a headline block and a content block.
for news in soup.find_all('div', class_='news-card z-depth-1'):
    news_headline = news.find('div', class_='news-card-title news-right-box')
    news_headline = news_headline.a.span.text
    print(news_headline)

    news_content = news.find('div', class_='news-card-content news-right-box')
    news_content = news_content.div.text
    print(news_content)

    print()


Example #6
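 # Normalise the stripped domain into a full URL, adding a scheme if one is missing.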
     link = str('http://' + domain.strip())
 elif ("https://" in str(
         domain.strip())) or ("http://" in str(
             domain.strip())):
     link = str(domain.strip())
 else:
     link = str('https://' + domain.strip())
 #print(link)
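 # Fetch the page without following redirects so 3xx responses are reported as-is.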
 response = requests.get(link,
                         headers=header,
                         timeout=10,
                         allow_redirects=False)
 status_code = response.status_code
 content_type = response.headers["Content-Type"]
 content_length = response.headers["Content-Length"]
 soup = beauty(response.content, 'html.parser')
 title = soup.find('title')
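 # Colour-coded report: green status code, yellow page title, blue content type, magenta content length.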
 try:
     if int(status_code) <= 300:
         print(
             link,
             "[\033[92m{}\033[0m]".format(int(status_code)),
             "[\033[93m{}\033[0m]".format(
                 str(title.text.strip())),
             "[\033[34m{}\033[0m]".format(
                 str(content_type)),
             "[\033[35m{}\033[0m]".format(
                 int(content_length)))
     elif (int(status_code)
           == 301) or (int(status_code)
                       == 302) or (int(status_code) == 307):