from typing import List

import scraper  # article-level scraper; defined elsewhere in this repo


def crawl(links, pass_args) -> List[str]:
    """
    Constructs the appropriate href and passes it to the scrapers for information.

    :param links: The list of 'links', i.e. anchor elements with attributes
        according to the website
    :param pass_args: List of arguments based on the input
    :return: Information collected by the scrapers, concatenated together
    """
    used_href = set()
    headers = []
    for link in links:
        href = link.get("href")
        if href and href not in used_href:  # skip anchors without an href, and duplicates
            used_href.add(href)
            headers.append(scraper.main([href] + pass_args).replace("\xa0", "") + " ")
    return headers
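# A minimal usage sketch for crawl(), hypothetical: fetch a listing page, keep
# only anchors whose href is already an absolute story URL (crawl passes hrefs
# to scraper.main unmodified), and print the result. The URL filter below is
# illustrative, not the repo's actual configuration.
import requests
from bs4 import BeautifulSoup

if __name__ == "__main__":
    page = requests.get("https://abcnews.go.com/US")
    soup = BeautifulSoup(page.text, "html.parser")
    story_links = [a for a in soup.find_all("a")
                   if a.get("href", "").startswith("https://abcnews.go.com/US")]
    print(crawl(story_links, pass_args=["-t"]))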
from typing import List

import requests
from bs4 import BeautifulSoup

import scraper  # article-level scraper; defined elsewhere in this repo
# setup_arguments (the CLI parser) is likewise assumed to be in scope here;
# a sketch of it appears at the end of this section.


def main(args) -> List[str]:
    """Collect headlines from the Wired front page."""
    args = setup_arguments(args)
    html_code = requests.get(args.html_link)
    plain_text = html_code.text
    soup = BeautifulSoup(plain_text, "lxml")

    pass_args = []
    if args.title:
        pass_args.append("-t")
    if args.body:
        pass_args.append("-b")

    used_href = set()
    headers = []
    for link in soup.find_all("a"):
        href = link.get("href")
        # Wired story links are relative paths containing "/story/";
        # guard against anchors that have no href at all.
        if href and "/story/" in href and href not in used_href:
            used_href.add(href)
            full_href = "https://www.wired.com" + href
            headers.append(
                scraper.main([full_href] + pass_args)
                .replace("\n", "")
                .replace("| WIRED", "")
                .replace("\xa0", ""))
    return headers
# Site-specific variant for ABC News; same imports as the Wired module above.
def main(args) -> List[str]:
    """Collect headlines from the ABC News US section."""
    args = setup_arguments(args)
    html_code = requests.get(args.html_link)
    plain_text = html_code.text
    soup = BeautifulSoup(plain_text, "html.parser")

    pass_args = []
    if args.title:
        pass_args.append("-t")
    if args.body:
        pass_args.append("-b")

    used_href = set()
    headers = []
    for link in soup.find_all("a", {"class": "white-ln"}):
        href = link.get("href")
        # ABC News headline anchors carry absolute URLs into the US section.
        if href and "https://abcnews.go.com/US" in href and href not in used_href:
            used_href.add(href)
            headers.append(
                scraper.main([href] + pass_args)
                .replace("\n", "")
                .replace("- ABC News", "")
                .replace("\xa0", ""))
    return headers
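# Because ABC News hrefs are already absolute, this loop could delegate to
# crawl() from above. A refactoring sketch under that assumption —
# main_via_crawl is a hypothetical name, and the suffix cleanup that crawl()
# does not perform moves to the caller:
def main_via_crawl(args) -> List[str]:
    args = setup_arguments(args)
    soup = BeautifulSoup(requests.get(args.html_link).text, "html.parser")
    pass_args = (["-t"] if args.title else []) + (["-b"] if args.body else [])
    story_links = [a for a in soup.find_all("a", {"class": "white-ln"})
                   if a.get("href", "").startswith("https://abcnews.go.com/US")]
    return [h.replace("\n", "").replace("- ABC News", "")
            for h in crawl(story_links, pass_args)]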
# Site-specific variant for The Hill; same imports as the Wired module above.
def main(args) -> List[str]:
    """Collect headlines from The Hill."""
    args = setup_arguments(args)
    html_code = requests.get(args.html_link)
    plain_text = html_code.text
    soup = BeautifulSoup(plain_text, "html.parser")

    pass_args = []
    if args.title:
        pass_args.append("-t")
    if args.body:
        pass_args.append("-b")

    used_href = set()
    headers = []
    # The Hill wraps each headline link in an <h4> element.
    for heading in soup.find_all("h4"):
        anchor = heading.find("a")
        if anchor is None:  # guard against h4 elements without a link
            continue
        href = anchor.get("href")
        if href and href not in used_href:
            used_href.add(href)
            full_href = "https://thehill.com" + href  # hrefs here are relative paths
            headers.append(
                scraper.main([full_href] + pass_args)
                .replace("\n", "")
                .replace("| TheHill", "")
                .replace("\xa0", ""))
    return headers
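# All three site scrapers call setup_arguments(args) and then read
# args.html_link, args.title, and args.body. A minimal sketch of that helper,
# assuming it wraps argparse; the repo's real implementation may differ (this
# reconstruction only mirrors the attributes and the -t / -b flags that the
# code above demonstrably passes along).
import argparse

def setup_arguments(args):
    parser = argparse.ArgumentParser(description="Scrape headlines from a news page")
    parser.add_argument("html_link", help="URL of the listing page to crawl")
    parser.add_argument("-t", "--title", action="store_true",
                        help="collect article titles")
    parser.add_argument("-b", "--body", action="store_true",
                        help="collect article bodies")
    return parser.parse_args(args)

# Example invocation of a site module's main():
#     headers = main(["https://thehill.com", "-t"])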