Example #1
0
def crawl(links, pass_args) -> List[str]:
    """
    Pass each unique link's href to the scraper and collect the results.

    :param links: The list of 'links' — element objects exposing ``.get("href")``
        (e.g. BeautifulSoup ``Tag`` objects)
    :param pass_args: List of scraper arguments based on the input flags
    :return: Information collected by the scrapers, one entry per unique href,
        each with a trailing space appended
    """
    seen = set()  # hrefs already scraped — a set is the idiomatic dedup structure
    headers = []
    for link in links:
        href = link.get("href")
        # Guard: .get("href") returns None for elements without the attribute;
        # the original forwarded that None straight to the scraper.
        if href is not None and href not in seen:
            seen.add(href)
            headers.append(scraper.main([href] + pass_args).replace("\xa0", "") + " ")
    return headers
Example #2
0
def main(args) -> List[str]:
    """
    Scrape headline information for every unique Wired story linked from a page.

    :param args: Raw CLI arguments; parsed by ``setup_arguments`` (expects an
        ``html_link`` URL plus optional ``title``/``body`` flags)
    :return: List of cleaned header strings, one per unique ``/story/`` link
    """
    args = setup_arguments(args)
    html_code = requests.get(args.html_link)
    plain_text = html_code.text
    soup = BeautifulSoup(plain_text, 'lxml')
    seen = set()  # hrefs already scraped — a set is the idiomatic dedup structure
    headers = []
    pass_args = []
    if args.title:
        pass_args.append("-t")
    if args.body:
        pass_args.append("-b")
    for link in soup.findAll("a"):
        href = link.get("href")
        # BUG FIX: .get("href") returns None for <a> tags without an href
        # attribute; the original '"/story/" in href' then raised TypeError.
        if href and "/story/" in href and href not in seen:
            seen.add(href)
            full_url = "https://www.wired.com" + href  # hrefs on the page are site-relative
            headers.append(
                scraper.main([full_url] + pass_args)
                .replace("\n", "")
                .replace("| WIRED", "")
                .replace("\xa0", ""))
    return headers
Example #3
0
def main(args) -> List[str]:
    """
    Scrape headline information for every unique ABC News /US story linked
    from a page via its ``white-ln`` anchors.

    :param args: Raw CLI arguments; parsed by ``setup_arguments`` (expects an
        ``html_link`` URL plus optional ``title``/``body`` flags)
    :return: List of cleaned header strings, one per unique matching link
    """
    args = setup_arguments(args)
    html_code = requests.get(args.html_link)
    plain_text = html_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    seen = set()  # hrefs already scraped — a set is the idiomatic dedup structure
    headers = []
    pass_args = []
    if args.title:
        pass_args.append("-t")
    if args.body:
        pass_args.append("-b")
    for link in soup.findAll("a", {"class": "white-ln"}):
        href = link.get("href")
        # BUG FIX: .get("href") returns None for <a> tags without an href
        # attribute; the original membership test 'in href' then raised TypeError.
        if href and "https://abcnews.go.com/US" in href and href not in seen:
            seen.add(href)
            headers.append(
                scraper.main([href] + pass_args)
                .replace("\n", "")
                .replace("- ABC News", "")
                .replace("\xa0", ""))
    return headers
Example #4
0
def main(args) -> List[str]:
    """
    Scrape headline information for every unique TheHill article linked from
    the <h4> headings of a page.

    :param args: Raw CLI arguments; parsed by ``setup_arguments`` (expects an
        ``html_link`` URL plus optional ``title``/``body`` flags)
    :return: List of cleaned header strings, one per unique article link
    """
    args = setup_arguments(args)
    html_code = requests.get(args.html_link)
    plain_text = html_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    seen = set()  # hrefs already scraped — a set is the idiomatic dedup structure
    headers = []
    pass_args = []
    if args.title:
        pass_args.append("-t")
    if args.body:
        pass_args.append("-b")
    for heading in soup.findAll("h4"):
        anchor = heading.find("a")
        # BUG FIX: find("a") returns None for an <h4> with no anchor; the
        # original then crashed on .get("href") with AttributeError.
        if anchor is None:
            continue
        href = anchor.get("href")
        # Also skip anchors without an href attribute (.get returns None).
        if href and href not in seen:
            seen.add(href)
            full_url = "https://thehill.com" + href  # hrefs on the page are site-relative
            headers.append(
                scraper.main([full_url] + pass_args)
                .replace("\n", "")
                .replace("| TheHill", "")
                .replace("\xa0", ""))
    return headers