# extract_info.py -- collect matching links from the parsed page and record
# them as "url, name" pairs in url_name.txt for the downloader to consume.
def extract_info(soup, configs):
    for link in soup.find_all("a"):
        if link.get("href") is None:
            continue
        if not link["href"].startswith(configs.web_path):
            continue
        print(link.get("href"))
        url = str(link["href"])
        # The file name is everything after the last "/" in the URL.
        name = url[url.rindex("/"):]
        # name = name[:name.rindex('.')]
        with open("url_name.txt", "a+") as output:
            # This isn't really needed, but it's nice to have when debug is True.
            # "a+" opens with the cursor at the end of the file, so rewind before
            # reading or the duplicate check always sees an empty string.
            output.seek(0)
            if url not in output.read():
                if configs.domain_included:
                    output.write(url + ", " + name.strip("/") + "\n")
                else:
                    output.write(configs.domain + url + ", " + name.strip("/") + "\n")
    print("Done")
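# A minimal sketch of the configs module the code above assumes. The attribute
# names all come from references in this repo; every value below is a
# hypothetical placeholder, not the real target site's configuration.
#
# configs.py
webpage = "https://example.com/reports"  # page to scrape (placeholder)
web_path = "/reports/"                   # href prefix worth following (placeholder)
domain = "https://example.com"           # prepended to relative hrefs (placeholder)
domain_included = False                  # True when hrefs already carry the domain
sleep_time = 2                           # seconds to wait between downloads
debug = False                            # enables extra logging in the helpers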
# Name-resolution variant used by another scraper in this repo. This fragment
# runs inside the same `for link in soup.find_all("a")` loop as above, after
# url = str(link["href"]); here `domain` is that site's base URL.
name = str(link.string)
if name == "None":
    # The anchor itself has no text, so fall back to the text of the page's
    # "hyperlink" spans.
    try:
        name_table = []
        for link_2 in soup.find_all("span"):
            if "hyperlink" in str(link_2.get("class")):
                name_table.append(link_2.string)
        name = name_table[0]
        # print(link)
    except IndexError:
        # No matching span was found.
        print("IndexError")
# print("Else " + name)
# name = name[:name.rindex('.')]
with open("url_name.txt", "a") as output:
    if "https" in link["href"]:
        # href is already an absolute URL.
        output.write(url + ", " + name.strip("/") + ".pdf" + "\n")
    else:
        # href is relative, so prepend the site's domain.
        output.write(domain + url + ", " + name.strip("/") + ".pdf" + "\n")
print("Done")
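# Self-contained sketch of the span fallback above, runnable against an inline
# HTML snippet; the markup here is invented purely to exercise the logic.
from bs4 import BeautifulSoup

_demo_html = """
<a href="/reports/2021.pdf"></a>
<span class="file-hyperlink">Annual report 2021</span>
"""
_demo_soup = BeautifulSoup(_demo_html, "html.parser")
_link = _demo_soup.find("a")
_name = str(_link.string)
if _name == "None":
    # Anchor text is empty, so borrow the text of the first "hyperlink" span.
    _spans = [s.string for s in _demo_soup.find_all("span")
              if "hyperlink" in str(s.get("class"))]
    if _spans:
        _name = _spans[0]
print(_name)  # -> Annual report 2021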
# Driver script: fetch the configured page, pull out the download links, then
# fetch the files themselves.
import os
import sys
from pathlib import Path

import requests
from bs4 import BeautifulSoup

import configs

# Make the repository's shared scraper helpers importable.
p = Path(__file__).resolve().parents[3]
sys.path.insert(1, str(p) + "/common")
from bs_scrapers.get_files import get_files
from bs_scrapers.extract_info import extract_info

save_dir = "./data/"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

html_page = requests.get(configs.webpage).text
soup = BeautifulSoup(html_page, "html.parser")

# Start from a clean slate so entries from a previous run are not kept around.
try:
    os.remove("url_name.txt")
except FileNotFoundError:
    pass

extract_info(soup, configs)
get_files(save_dir, configs.sleep_time)
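# get_files is imported from the repository's common helpers, but its source is
# not part of this section. The function below is only a sketch of what it
# presumably does, reconstructed from the call site above: a hypothetical
# stand-in, not the repo's actual implementation.
import os
import time

import requests


def get_files_sketch(save_dir, sleep_time):
    # Read the "url, name" pairs written by extract_info and download each
    # file into save_dir.
    with open("url_name.txt") as pairs:
        for line in pairs:
            url, name = line.strip().split(", ", 1)
            response = requests.get(url)
            with open(os.path.join(save_dir, name), "wb") as out:
                out.write(response.content)
            time.sleep(sleep_time)  # be polite: pause between downloads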