Example #1
def extract_info(soup):
    for link in soup.find_all("a"):
        if link.get("href") is None:
            continue
        if not link["href"].startswith(configs.web_path):
            continue
        print(link.get("href"))
        url = str(link["href"])
        # Keep everything from the last "/" onwards as the file name
        name = url[url.rindex("/"):]
        # name = name[:name.rindex('.')]

        with open("url_name.txt", "a+") as output:
            # "a+" leaves the file position at the end, so rewind before
            # reading to check whether this URL was already recorded
            output.seek(0)
            if url not in output.read():
                if configs.domain_included:
                    output.write(url + ", " + name.strip("/") + "\n")
                else:
                    output.write(configs.domain + url + ", " +
                                 name.strip("/") + "\n")
    print("Done")


try:
    os.remove("url_name.txt")
except FileNotFoundError:
    pass

extract_info(soup)
get_files(save_dir, configs.sleep_time, configs.debug)
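The snippets on this page rely on a configs module that is not shown. Below is a minimal sketch of what it could contain; only the attribute names come from the examples, and every value is a placeholder.

# configs.py -- hypothetical values; only the attribute names are taken from the examples
webpage = "https://example.com/downloads"  # page that gets scraped
web_path = "/downloads/"                   # only hrefs starting with this are kept
domain = "https://example.com"             # prepended when hrefs are relative
domain_included = True                     # True if hrefs already contain the domain
sleep_time = 1                             # seconds to wait between downloads
debug = False                              # extra output in get_files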
Example #2
        name = str(link.string)
        if name == "None":
            # The <a> tag has no text of its own; fall back to the first
            # <span class="hyperlink"> on the page for a usable name
            try:
                name_table = []
                for link_2 in soup.find_all("span"):
                    if "hyperlink" in str(link_2.get("class")):
                        name_table.append(link_2.string)
                name = name_table[0]
            except IndexError:
                print("No fallback name found")
        # name = name[:name.rindex('.')]
        with open("url_name.txt", "a") as output:
            if "https" in link["href"]:
                output.write(url + ", " + name.strip("/") + ".pdf" + "\n")
            else:
                # href is relative, so prepend the domain before writing
                output.write(domain + url + ", " + name.strip("/") + ".pdf" + "\n")
    print("Done")


try:
    os.remove("url_name.txt")
except FileNotFoundError:
    pass
extract_info(soup)
get_files(save_dir, sleep_time)
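Both examples only record URLs and names in url_name.txt; the downloading itself happens in get_files, whose source is not shown here. The sketch below is a hypothetical stand-in rather than the real get_files (its actual signature and behaviour may differ): it simply reads back the "url, name" pairs and fetches each file.

import time
import urllib.request

def download_listed_files(save_dir, sleep_time):
    # Hypothetical replacement for get_files: parse the "url, name" pairs
    # written by extract_info and download each file into save_dir
    with open("url_name.txt") as listing:
        for line in listing:
            url, name = line.strip().split(", ", 1)
            urllib.request.urlretrieve(url, save_dir + name)
            # Pause between requests to avoid hammering the server
            time.sleep(sleep_time)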
Example #3
import os
import re
import sys
import time
import urllib
import requests
import configs
from bs4 import BeautifulSoup
from pathlib import Path

p = Path(__file__).resolve().parents[3]
sys.path.insert(1, str(p) + "/common")
from bs_scrapers.get_files import get_files
from bs_scrapers.extract_info import extract_info

save_dir = "./data/"

if not os.path.exists(save_dir):
    os.makedirs(save_dir)

html_page = requests.get(configs.webpage).text
soup = BeautifulSoup(html_page, "html.parser")

url_name = []

try:
    os.remove("url_name.txt")
except FileNotFoundError:
    pass
extract_info(soup, configs)
get_files(save_dir, configs.sleep_time)
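The sys.path manipulation above assumes that a shared common/bs_scrapers package sits at the project root, three directories above the one containing this script. A layout along these lines would satisfy the imports; every directory name except common and bs_scrapers is purely illustrative:

project/
    common/
        bs_scrapers/
            extract_info.py
            get_files.py
    scrapers/
        site_a/
            pdf/
                scrape.py    # this script; Path(__file__).resolve().parents[3] is project/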