Example #1
    def check_url(url, check_cache={}):
        """
        Checks whether the given url should be downloaded. If it should be,
        returns the url with the necessary adjustments applied; otherwise
        returns None.
        """
        try:
            return check_cache[url]
        except KeyError:
            if not url.startswith(initial_url):
                check_cache[url] = None
                return None
            final_location = HTTPutils.getFinalUrl(url)

            if not final_location.startswith(initial_url):
                check_cache[url] = None
                return None
            new_link = myurlparse(final_location).getUrlWithoutFragments()
            check_cache[url] = new_link
            return new_link
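
The check_cache={} default argument is evaluated only once, when the function is defined, so the same dict persists across calls and serves as a memoization cache for urls that have already been checked. A minimal sketch of the same idiom, separate from the example above (lookup and its body are hypothetical stand-ins):

def lookup(key, cache={}):
    # The default dict is created once at definition time,
    # so cached results survive between calls.
    try:
        return cache[key]
    except KeyError:
        result = key.upper()  # stand-in for the real, slow work
        cache[key] = result
        return result

lookup("spam")  # computed
lookup("spam")  # returned from the cache
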
Example #2
def main(initial_url):

    # List of 3-item tuples:
    # (file_path, encoding, base_url)
    to_be_processed = []
        
    queue = DownloadQueue()
    
    init_url = myurlparse(initial_url)

    if init_url.path == "":
        initial_url += "/"
        init_url = myurlparse(initial_url)

    final_location = HTTPutils.getFinalUrl(init_url.geturl())

    if not final_location.startswith(initial_url):
        main_logger.critical("The page you have given redirects to %s" % final_location)
        main_logger.critical("Aborting...")
        return
    final_location = myurlparse(final_location)
    
    queue.append(final_location.getUrlWithoutFragments())
    
    download_dir = os.path.join(os.getcwd(), init_url.netloc.replace(".", "_"))


    if not os.path.isdir(download_dir):
        os.mkdir(download_dir)

    def check_url(url, check_cache={}):
        """
        Checks whether the given url should be downloaded. If it should be,
        returns the url with the necessary adjustments applied; otherwise
        returns None.
        """
        try:
            return check_cache[url]
        except KeyError:
            if not url.startswith(initial_url):
                check_cache[url] = None
                return None
            final_location = HTTPutils.getFinalUrl(url)

            if not final_location.startswith(initial_url):
                check_cache[url] = None
                return None
            new_link = myurlparse(final_location).getUrlWithoutFragments()
            check_cache[url] = new_link
            return new_link

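    # Crawl loop: the queue grows while it is being iterated, because newly
    # discovered links are appended below; DownloadQueue is assumed to
    # support appending during iteration.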
    for link in queue:
        
        link = myurlparse(link)
        
        if link.netloc != init_url.netloc:
            main_logger.info("Skipping link from another network location: %s" % link.geturl())
            continue
        
        content = HTTPutils.getContentType(link.geturl())
        if not content:
            main_logger.warning("Couldn't get content type for %s, skipping" % link.geturl())
            continue
        
        if content == "text/html" and not link.geturl().startswith(initial_url):
            main_logger.info("Skipping %s because it is not in the download subdirectory." % link.geturl())
            continue

        if content not in allowed_downloads:
            main_logger.info("Skipping %s because it is not in allowed downloads." % link.geturl())
            continue
        
        try:
            url = urlopen(link.geturl(), timeout=5)

        except HTTPError as e:
            main_logger.warning("Server couldn't fulfill the request [%s], skipping" % e.code)
            continue

        except URLError as e:
            main_logger.warning("We failed to reach %s because %s" % (link.geturl(), e.reason))
            main_logger.warning("Skipping %s" % link.geturl())
            continue
        
        main_logger.info("Downloading %s" % link.geturl())
        response = url.read()
        url.close()
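        # Map the url path onto the local download directory,
        # one path segment per directory level.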
        file_path = os.path.join(download_dir, *link.path.split("/"))

        # Handle directory urls: a trailing slash maps to an index.html file.
        if link.path.endswith("/"):
            file_path = os.path.join(file_path, "index.html")

        if not os.path.isdir(os.path.dirname(file_path)):
            os.makedirs(os.path.dirname(file_path))

        with open(file_path, "wb") as output_file:
            output_file.write(response)
            
        if content == "text/html":
            main_logger.info("Parsing page for further links, could take a while.")

            link_collect = LinkCollector()
            encoding = HTTPutils.getEncoding(link.geturl())
            if not encoding:
                main_logger.debug("Couldn't get encoding in http headers for %s" % link.geturl())
                # If the http headers don't mention a charset,
                # parse the html itself for the meta tags.
                finder = encodingFinder()
                finder.feed(response)
                encoding = finder.encoding
            if not encoding:
                main_logger.debug("Set default encoding for %s" % link.geturl())
                encoding = "iso-8859-1"

            try:
                response_to_be_parsed = response.decode(encoding)
            except (LookupError, UnicodeDecodeError):
                main_logger.debug("Decoding failed for %s, feeding raw binary data" % link.geturl())
                response_to_be_parsed = response

            try:
                link_collect.feed(response_to_be_parsed)
            except HTMLParseError:
                main_logger.warning("HTML parse error, couldn't get all the links.")

            for new_link in link_collect.links:
                new_link = check_url(urljoin(link.geturl(), new_link))
                if new_link:
                    queue.append(new_link)

            base_url = link.geturl()
            if base_url.endswith("/"):
                base_url += "index.html"
            to_be_processed.append((file_path, encoding, base_url))
            main_logger.info("Done parsing for links.")

    main_logger.info("Starting to fix references, this could take a while...")

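    # Second pass: rewrite the references inside each saved html page so
    # they point at the local copies.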
    for file_path, encoding, url in to_be_processed:
        main_logger.info("processing %s" % file_path)
        with open(file_path, "r") as html_file:
            html_contents = html_file.read()

        fixer = HTMLReferenceFixer()
        fixer.setbaseurl(url)
        fixer.filepath = file_path

        try:
            fixer.feed(unicode(html_contents, encoding))
        except HTMLParseError:
            main_logger.warning("Couldn't parse %s, skipping" % file_path)
            continue

        with open(file_path, "w") as processed_file:
            processed_file.write(fixer.output.encode(encoding))
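
Both examples rely on module-level names such as main_logger, allowed_downloads, initial_url, HTTPutils and myurlparse that are defined elsewhere in the project. Assuming the surrounding module provides them, a minimal, hypothetical driver could look like this (the command-line handling is an assumption, not part of the original code):

if __name__ == "__main__":
    import logging
    import sys

    logging.basicConfig(level=logging.INFO)
    main(sys.argv[1])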