Example #1
    def parse(self, response):
        url = response.url
        self.visited_links.append(url)
        print(
            magenta(
                f"{get_time()} ({len(self.visited_links)}) Scraping {response.url}"
            ))

        page_links_items = scrape_links(html=response.body, url=url)

        # Shuffle the links so the crawl order varies between runs.
        random.shuffle(page_links_items)

        # Save all the links in the database.
        for page_link_item in page_links_items:
            yield page_link_item

        print(
            f"({get_domain(url)} - {len(self.visited_links)}) {len(page_links_items)} links saved."
        )

        # Analyze each link found in the page.
        for (i, page_link_item) in enumerate(page_links_items):
            link_url = add_scheme(page_link_item["link_url"])

            # If the link has not been visited yet, visit it.
            if (link_url not in self.visited_links
                    and self.allowed_domains[0] in link_url):
                yield response.follow(link_url, callback=self.parse)
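
This parse callback presumably lives in a Scrapy spider that keeps track of the URLs it has already visited; the spider itself is not shown. A minimal sketch of what it might look like (the class name, constructor, and placeholder URLs are assumptions; the spider name "links" is taken from Example #5):

import random

import scrapy


class LinksSpider(scrapy.Spider):  # hypothetical class; the real spider is not shown
    name = "links"  # matches the "scrapy crawl links" command in Example #5
    allowed_domains = ["example.com"]  # placeholder domain
    start_urls = ["https://example.com"]  # placeholder start URL

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # URLs already scheduled for crawling, as used by parse() above.
        self.visited_links = []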
Example #2
def analyze_page(url):
    # Check in the database if the web page has already been visited.
    get_new_page = False
    result = db_get_page(url=url)

    # Delete, if present, an obsolete version of the page.
    if result is not None and not helper.is_action_recent(
            timestamp=result[6], days=0, minutes=1):
        db_delete_page(url=url)
        db_delete_text_links(url=url)
        get_new_page = True

    # If the page is not present in the database.
    if result is None:
        get_new_page = True

    if get_new_page:
        # Save the info in the DB.
        url = add_scheme(url)
        simple_html = get_simple_html(url=url)
        db_insert_page(url=url, simple_html=simple_html)
        db_add_topic_to_page(url=url, topic=get_topic(url))
        db_add_language_to_page(url=url, language=get_language(url))
        # Finish analysing the web page (with rendering) in the background.
        threading.Thread(target=finish_page_analysis, args=(url, )).start()
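
The freshness check relies on helper.is_action_recent, which is not included in these examples. A rough sketch of what it might do, assuming the stored timestamp is a Unix epoch (both the signature and the storage format are assumptions):

from datetime import datetime, timedelta


def is_action_recent(timestamp, days=0, minutes=0):
    # Hypothetical helper: True if the stored timestamp is still inside the
    # freshness window, meaning the cached page does not need re-scraping.
    age = datetime.now() - datetime.fromtimestamp(timestamp)
    return age < timedelta(days=days, minutes=minutes)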
Example #3
def finish_page_analysis(url):
    url = add_scheme(url)
    print(f"{get_time()} [{url}] PARSED HTML code started.")
    parsed_html = scrape_page(url)
    db_add_parsed_html_to_page(url=url, parsed_html=parsed_html)
    # Extract clean main text.
    extract_main_text(url)
    print(f"{get_time()} [{url}] PARSED HTML code finished.")
Example #4
def scrape_links(html, url):
    # Extract all links from the page.
    links = BeautifulSoup(html, "lxml").find_all("a")
    links = list(filter(lambda x: x.get("href") is not None, links))

    page_links_items = []
    for i, link in enumerate(links):
        try:
            href = add_scheme(urljoin(url, link.get("href")))
            text = strip_html_tags(link.text)
            # True if the element is contained in a list container.
            try:
                in_list = "li" in [parent.name for parent in links[i].parents]
            except IndexError:
                in_list = False

            # True if the element is contained in a nav container.
            try:
                in_nav = "nav" in [parent.name for parent in links[i].parents]
            except IndexError:
                in_nav = False

            # Skip PDF and image files.
            if href[-3:] in ["pdf", "jpg", "png"]:
                continue

            # If the link points to the same page (an in-page anchor), discard it.
            hash_position = href.find("/#")
            if href[:hash_position] == add_scheme(url):
                continue

            # The link is okay; send it to the pipeline.
            page_link_item = PageLinkItem()
            page_link_item["link_url"] = href
            page_link_item["link_text"] = text
            page_link_item["page_url"] = url
            page_link_item["in_list"] = in_list
            page_link_item["in_nav"] = in_nav

            # Save the link in the DB only if it belongs to the same domain as the page.
            if get_domain(url) in href:
                page_links_items.append(page_link_item)
        except StaleElementReferenceException:
            continue

    return page_links_items
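
PageLinkItem is defined elsewhere in the project; a minimal sketch consistent with the fields assigned above (the class definition itself is an assumption):

import scrapy


class PageLinkItem(scrapy.Item):
    # Fields inferred from the assignments in scrape_links(); the real item
    # may define more fields than these.
    link_url = scrapy.Field()
    link_text = scrapy.Field()
    page_url = scrapy.Field()
    in_list = scrapy.Field()
    in_nav = scrapy.Field()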
Example #5
    def run(self):
        # Insert domain into the website database.
        db_insert_website(domain=self.start_url)

        dir_path = pathlib.Path(__file__).parent.absolute()
        # Change working directory to the folder of this file.
        os.chdir(dir_path)
        # Open a shell in the scrapy directory and start crawling in a new subprocess.
        path = str(Path(os.getcwd()))
        homepage_url = add_scheme(self.start_url)
        command = f"scrapy crawl links -s url={homepage_url}"
        subprocess.call(command, cwd=path, shell=True)
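
Since run() is the standard entry point of a threading.Thread subclass, the surrounding class plausibly looks like the sketch below; the class name and constructor are assumptions:

import threading


class CrawlerThread(threading.Thread):  # hypothetical wrapper class
    def __init__(self, start_url):
        super().__init__()
        self.start_url = start_url


# Usage sketch: crawl a domain in the background without blocking the caller.
# CrawlerThread("example.com").start()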
Example #6
def get_simple_html(url):
    """
    This method requests the HTML code of the web page.
    For speed purposes, Javascript is not supported.
    """
    url = add_scheme(url)
    print(f"{get_time()} [{url}] SIMPLE HTML code started.")
    try:
        html = requests.get(url).text
    except Exception:
        raise PageRequestError
    print(f"{get_time()} [{url}] SIMPLE HTML code finished.")
    return html
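
A usage sketch, assuming PageRequestError is a custom exception defined elsewhere in the project:

try:
    html = get_simple_html("example.com")  # placeholder URL
except PageRequestError:
    # The request failed (network error, DNS failure, etc.); fall back to an empty document.
    html = "<html></html>"

Note that requests.get() is called without a timeout here, so a hanging server can block the caller indefinitely; passing a timeout argument is usually advisable.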
Example #7
    def homepage(self):
        self.navigation.url = add_scheme(get_domain(self.navigation.url))
        text_response = self.visit_page()
        return text_response
Example #8
def scrape_page(url):
    """
    Given a URL, it adds in the database all the links contained in that web page.
    :param url: A string containing the URL of the web page to analyse.
    :return: None.
    """
    print(f"{get_time()} [SELENIUM] Page rendering started.")
    browser = webdriver.Firefox(executable_path=f"{dir_path}/geckodriver.exe",
                                options=get_firefox_options(),
                                firefox_profile=get_firefox_profile(),
                                service_log_path=os.devnull)
    try:
        browser.get(url)
        body = browser.page_source

        links = browser.find_elements(By.XPATH, '//a[@href]')

        links_bs4 = BeautifulSoup(body, "lxml").find_all("a")
        links_bs4 = list(filter(lambda x: x.get("href") is not None,
                                links_bs4))

        # Delete all the old crawler links of the page.
        db_delete_all_page_links(url=url)

        for i, link in enumerate(links):
            try:
                href = add_scheme(link.get_attribute("href"))
                text = strip_html_tags(link.get_attribute("innerHTML"))
                x_position = str(link.location.get('x'))
                y_position = str(link.location.get('y'))
                # True if the element is contained in a list container.
                parents = [parent.name for parent in links_bs4[i].parents]
                in_list = int("li" in parents)
                in_nav = int("nav" in parents)

                # Skip PDF and image files.
                if href[-3:] in ["pdf", "jpg", "png"]:
                    continue

                # If the link points to the same page or has no text, discard it.
                hash_position = href.find("/#")
                if href[:hash_position] == url or len(text) == 0:
                    continue

            except StaleElementReferenceException:
                continue
            # Insert the link into the database.
            db_insert_page_link(page_url=url,
                                link_url=href,
                                link_text=text,
                                x_position=x_position,
                                y_position=y_position,
                                in_list=in_list,
                                in_nav=in_nav)

    except Exception as e:
        print(red(f"[SELENIUM] Can't access this website: {url}"))
        print(e)
        body = "<html></html>"

    print(f"{get_time()} [SELENIUM] Page rendering finished.")
    browser.quit()
    return body
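
scrape_page depends on get_firefox_options() and get_firefox_profile(), which are not included in these examples. A minimal sketch of a headless configuration matching this constructor style (executable_path / firefox_profile keywords); both function bodies are assumptions:

from selenium.webdriver import FirefoxProfile
from selenium.webdriver.firefox.options import Options


def get_firefox_options():
    # Run Firefox without a visible window so pages can be rendered on a server.
    options = Options()
    options.add_argument("--headless")
    return options


def get_firefox_profile():
    # Skip image downloads to speed up rendering.
    profile = FirefoxProfile()
    profile.set_preference("permissions.default.image", 2)
    return profile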