Example #1
import json


class WebCrawler:
    def __init__(self):
        self.url_util = UrlUtil()
        self.html_requester = HtmlRequester()
        self.html_parser = HtmlParser()

    def crawl(self, url):
        """
        Returns the URLs reachable from the parameter URL
        The assets of each URL are also returned.
        Only URLs with the same hostname including subdomain as the parameter URL are returned.
        """

        url = self.url_util.normalise_url(url)
        hostname = self.url_util.get_hostname(url)

        urls_to_visit = [url]
        urls_visited = []
        output = []
        # Each iteration of this loop processes the next URL to visit.
        while urls_to_visit:

            url = urls_to_visit.pop(0)
            urls_visited.append(url)

            html = self.html_requester.get_html(url)
            links = self.html_parser.get_links(html)
            same_hostname_urls = self.html_parser.get_same_hostname_urls(
                hostname, links)
            assets = self.html_parser.get_assets(same_hostname_urls)
            web_pages = self.html_parser.get_web_pages(same_hostname_urls)

            entry = {"url": url, "assets": assets}
            output.append(entry)
            print(json.dumps(entry, indent=4))

            for web_page in web_pages:
                # Do not visit a page more than once.
                if web_page not in urls_to_visit and web_page not in urls_visited:
                    urls_to_visit.append(web_page)

        return json.dumps(output, indent=4).splitlines()
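
The crawler delegates to three collaborators that this example does not define: UrlUtil, HtmlRequester, and HtmlParser (the last is shown in Example #2). As a rough sketch only, the two missing collaborators might look like the following; everything beyond the method names used by crawl is an assumption, in particular that normalise_url forces an http:// scheme and strips any trailing slash (matching the string-prefix comparisons above) and that get_html returns the page as a list of lines (matching how HtmlParser.get_links iterates over it).

import urllib.request
from urllib.parse import urlparse


class UrlUtil:
    def normalise_url(self, url):
        # Assumed behaviour: force an http:// scheme and drop any trailing
        # slash, so URLs can be compared by string prefix.
        if url.startswith("https://"):
            url = "http://" + url[len("https://"):]
        elif not url.startswith("http://"):
            url = "http://" + url
        return url.rstrip("/")

    def get_hostname(self, url):
        # Assumed behaviour: return the bare hostname, e.g.
        # "http://example.com/about" -> "example.com".
        return urlparse(url).netloc


class HtmlRequester:
    def get_html(self, url):
        # Assumed behaviour: fetch the page and return it as a list of
        # lines, since HtmlParser.get_links consumes it line by line.
        with urllib.request.urlopen(url) as response:
            return response.read().decode("utf-8", errors="replace").splitlines()


# Hypothetical usage: crawl a site and print the JSON output line by line.
crawler = WebCrawler()
for line in crawler.crawl("http://example.com"):
    print(line)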
Example #2
import os


class HtmlParser:
    def __init__(self):
        self.url_util = UrlUtil()

    def get_links(self, html):
        """
        Extracts and returns the links (href/src attribute values) in the given HTML lines
        """
        links = []
        link_prefixes = ["href=\"", "src=\""]
        for line in html:
            link_found = True
            # Each iteration of this loop finds a link in the line.
            while link_found:
                link_found = False
                start = len(line)
                end = len(line)
                # The next link in the line could have any link_prefix.
                for link_prefix in link_prefixes:
                    if link_prefix in line:
                        link_prefix_start = line.index(link_prefix) + len(
                            link_prefix)
                        link_prefix_end = line.find("\"", link_prefix_start)
                        # Skip a prefix with no closing quote; str.index
                        # would raise ValueError on such malformed HTML.
                        if link_prefix_end == -1:
                            continue
                        link_found = True
                        # Keep this link if it is the closest one to the
                        # start of the line found so far.
                        if link_prefix_start < start:
                            start = link_prefix_start
                            end = link_prefix_end
                if link_found:
                    links.append(line[start:end])
                    line = line[end:]
        return links

    def get_same_hostname_urls(self, hostname, links):
        """
        Returns the links from the given list whose hostname matches the
        given hostname. The returned links are converted to absolute URLs.
        """

        hostname = self.url_util.normalise_url(hostname)

        same_hostname_urls = []
        for link in links:

            # Normalise the link.
            if link.endswith("/"):
                link = link.rstrip("/")
            if link.startswith("https://"):
                # str.lstrip strips a set of characters, not a prefix, so
                # slice the "https://" prefix off instead.
                link = "http://" + link[len("https://"):]
            if link.startswith("//"):
                link = "http:" + link

            if link.startswith(hostname) and (
                    len(link) == len(hostname) or link[len(hostname)] in "/?#"):
                # Link starts with the hostname; the boundary check stops
                # e.g. http://example.commerce.com matching http://example.com.
                same_hostname_urls.append(link)
            elif not link.startswith("http://"):
                # Link is relative, so prefix the link with hostname
                if link != "" and not link.startswith("/"):
                    link = "/" + link
                same_hostname_urls.append(hostname + link)

        return same_hostname_urls

    def get_assets(self, urls):
        """
        Returns the URLs from the given list that refer to assets (judged by file extension)
        """
        assets = []
        asset_extensions = self.get_asset_extensions()
        for url in urls:
            for asset_extension in asset_extensions:
                if url.endswith(asset_extension):
                    # Assets should be unique.
                    if url not in assets:
                        assets.append(url)
                    break
        return assets

    def get_web_pages(self, urls):
        """
        Returns the URLs from the given list that refer to web pages (i.e. that have no asset extension)
        """
        web_pages = []
        asset_extensions = self.get_asset_extensions()
        for url in urls:
            is_web_page = True
            for asset_extension in asset_extensions:
                if url.endswith(asset_extension):
                    is_web_page = False
                    break
            # Web pages should be unique.
            if is_web_page and url not in web_pages:
                web_pages.append(url)
        return web_pages

    def get_asset_extensions(self):
        """
        Returns the file extensions that identify assets, read from resources/asset_extensions.txt
        """
        asset_extensions = []
        file_path = os.path.join(os.path.dirname(__file__), 'resources',
                                 'asset_extensions.txt')
        with open(file_path, 'r') as extensions_file:
            for line in extensions_file:
                asset_extensions.append(line.rstrip('\n'))
        return asset_extensions
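
As a quick illustration (the markup and URLs below are invented), here is how get_links and get_same_hostname_urls might combine on a few lines of HTML. The expected results assume a UrlUtil.normalise_url that behaves like the sketch after Example #1; get_assets and get_web_pages additionally require the resources/asset_extensions.txt file read above.

parser = HtmlParser()

html = [
    '<a href="http://example.com/about">About</a>',
    '<img src="/images/logo.png">',
    '<a href="https://other.com/page">External</a>',
]

links = parser.get_links(html)
# links == ['http://example.com/about', '/images/logo.png',
#           'https://other.com/page']

urls = parser.get_same_hostname_urls("example.com", links)
# urls == ['http://example.com/about',
#          'http://example.com/images/logo.png']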