def list_pdf_v3(
    configs,
    save_dir,
    debug=False,
    important=False,
    try_overwite=False,  # try_overwite is for get_files
    name_in_url=True,
    add_date=False,
    extract_name=False,
    no_overwrite=False,
):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    html_page = requests.get(configs.webpage).text
    soup = BeautifulSoup(html_page, "html.parser")

    # Start from a clean slate so extract_info appends to a fresh url_name.txt.
    try:
        os.remove("url_name.txt")
    except FileNotFoundError:
        pass
    extract_info(soup, configs, extract_name=extract_name)

    if not important:
        # Drop any line containing one of the configured non_important keywords.
        non_important = configs.non_important
        with open("url_name.txt", "r") as og_file, open("2url_name.txt", "w") as new_file:
            for line in og_file:
                if not any(keyword in line.lower() for keyword in non_important):
                    new_file.write(line)
    else:
        # Keep only lines containing one of the configured important keywords.
        try:
            important = configs.important
        except AttributeError:
            print("")
            print("Important is still named `non_important`")
            print("")
            important = configs.non_important
        with open("url_name.txt", "r") as og_file, open("2url_name.txt", "w") as new_file:
            for line in og_file:
                if any(keyword in line.lower() for keyword in important):
                    new_file.write(line)
                    print(line)

    if not debug:
        # Replace the unfiltered list with the filtered one.
        try:
            os.remove("url_name.txt")
        except FileNotFoundError:
            pass
        os.rename("2url_name.txt", "url_name.txt")

    get_files(
        save_dir,
        configs.sleep_time,
        debug=debug,
        try_overwite=try_overwite,
        name_in_url=name_in_url,
        add_date=add_date,
    )
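# A minimal usage sketch for list_pdf_v3 (illustrative only). The function reads
# `webpage`, `sleep_time`, and `non_important` off the configs object, so building
# one with SimpleNamespace here is an assumption about how callers supply configs:
#
#     from types import SimpleNamespace
#
#     configs = SimpleNamespace(
#         webpage="https://example.com/public-records",
#         sleep_time=5,
#         non_important=["agenda", "minutes"],
#     )
#     list_pdf_v3(configs, "./data/pdfs", debug=True)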
def list_pdf_v2(
    configs,
    save_dir,
    name_in_url=True,
    extract_name=False,
    add_date=False,
    try_overwite=False,
    no_overwrite=False,
):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    html_page = requests.get(configs.webpage).text
    soup = BeautifulSoup(html_page, "html.parser")
    url_name = []
    try:
        os.remove("url_name.txt")
    except FileNotFoundError:
        pass
    extract_info(soup, configs, extract_name=extract_name)
    get_files(save_dir, configs.sleep_time, name_in_url=name_in_url, add_date=add_date)
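# Usage sketch for list_pdf_v2 (illustrative): unlike v3, there is no keyword
# filtering, so everything extract_info records is passed straight to get_files.
# The configs object is assumed to carry at least `webpage` and `sleep_time`:
#
#     configs = SimpleNamespace(webpage="https://example.com/reports", sleep_time=5)
#     list_pdf_v2(configs, "./data/reports", add_date=True)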
soup = BeautifulSoup(html_page, "html.parser")
# print(soup)
url_name = []


def extract_info(soup):
    # Record every link under web_path as "<url>, <file name>" in url_name.txt.
    for link in soup.findAll("a"):
        if link.get("href") is None:
            continue
        if not link["href"].startswith(web_path):
            continue
        print(link.get("href"))
        url = str(link["href"])
        name = url[url.rindex("/"):]
        # name = name[:name.rindex('.')]
        with open("url_name.txt", "a") as output:
            output.write(url + ", " + name.strip("/") + "\n")
            # Uncomment the following line if the domain is not in the href,
            # and comment out the line above:
            # output.write(domain + web_path + ", " + name.strip("/") + "\n")
    print("Done")


try:
    os.remove("url_name.txt")
except FileNotFoundError:
    pass

extract_info(soup)
get_files(save_dir, sleep_time)
# import etl.py
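# For reference, each line extract_info appends to url_name.txt has the form
# "<href>, <file name>", e.g. (hypothetical values):
#
#     /docs/2020/annual_report.pdf, annual_report.pdf
#
# get_files is then expected to read this file back and download each entry
# into save_dir, pausing sleep_time between requests.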
dumped = json.dumps(parsed, indent=4, sort_keys=True)
with open("response.json", "w") as output:
    output.write(dumped)

try:
    os.remove("url_name.txt")
except FileNotFoundError:
    pass

with open("response.json", "r") as output:
    data = json.load(output)

# Write "<frontend_url>, <name>" for every media entry in the response.
with open("url_name.txt", "w+") as outfile:
    for media_dict in data["media"]:
        outfile.write(
            str(media_dict["frontend_url"]) + ", " + str(media_dict["name"]) + "\n"
        )

try:
    os.remove("response.json")
except FileNotFoundError:
    pass

get_files(save_dir, configs.sleep_time)
# import etl.py
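# The block above assumes `parsed` carries a top-level "media" list whose entries
# have "frontend_url" and "name" keys; a hypothetical example of that shape:
#
#     {
#         "media": [
#             {"frontend_url": "https://example.com/media/123", "name": "report_2020.pdf"},
#             {"frontend_url": "https://example.com/media/124", "name": "report_2021.pdf"}
#         ]
#     }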