Example #1
def download_resource(url):
    """Download a single resource URL to disk, skipping files that already exist."""
    try:
        # Skip the download if the file was saved on a previous run.
        exist_filename = fs.get_exist_file_name(url)
        if exist_filename:
            logger.warning("file already exists: " + exist_filename)
            return exist_filename
        # Fetch the raw bytes and persist them under the output directory.
        content = http.get_blob(url)
        out_path = fs.save_blob(url, content)
        logger.info('RESOURCE: %s => %s' % (url, out_path))
        Db.add_done_item(url)
        return out_path
    except Exception as err:
        logger.error('error downloading resource %r' % (err,))
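This snippet relies on project-level helpers (fs, http, Db) and a shared logger that are not shown on this page. Purely as an assumption, and covering only the pieces these snippets touch, minimal stand-ins might look like this:

# Hypothetical stand-ins for the project's fs/http/Db helpers; the real
# modules in the crawler are assumed to behave roughly like this.
import logging
import os
import urllib.request
from urllib.parse import urlparse

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('crawler')

class fs:
    output_dir = 'output'

    @staticmethod
    def _local_path(url):
        parsed = urlparse(url)
        name = os.path.basename(parsed.path) or 'index.html'
        return os.path.join(fs.output_dir, name)

    @staticmethod
    def get_exist_file_name(url):
        path = fs._local_path(url)
        return path if os.path.exists(path) else None

    @staticmethod
    def save_blob(url, content):
        path = fs._local_path(url)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'wb') as f:
            f.write(content)
        return path

class http:
    @staticmethod
    def get_blob(url):
        with urllib.request.urlopen(url) as resp:
            return resp.read()

class Db:
    @staticmethod
    def add_done_item(url):
        pass  # the real implementation records finished URLs

    @staticmethod
    def add_error_link(url, message):
        pass  # the real implementation records failed URLs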
Example #2
def run_in_threads(data, action, thread_count):
    """Run `action` over every item in `data` using a pool of worker threads."""
    results = {}
    if not data:
        return results

    with concurrent.futures.ThreadPoolExecutor(
            max_workers=thread_count) as executor:
        # Start the load operations and map each future back to its URL.
        future_to_url = {executor.submit(action, url): url for url in data}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            result = ''
            try:
                result = future.result()
                # Store paths relative to the configured output directory.
                results[url] = result.replace(args.output, '')
            except Exception as exc:
                logger.error('%r generated an exception: %s' % (url, exc))
                Db.add_error_link(url, str(exc))
            else:
                logger.info('FETCHED: %s => %s' % (url, result))
                Db.add_done_item(url)
    return results
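With the stand-ins above plus a hypothetical args namespace, the thread runner could be exercised roughly like this (the URLs and attribute values are invented for the demo, not taken from the project):

# Illustrative wiring only; `args` and the URL list are made up for the example.
import argparse
import concurrent.futures

# Attributes used across these examples: output dir, base URL, resource flag.
args = argparse.Namespace(output='output',
                          url='https://example.com',
                          download_resources=True)

resource_urls = [
    'https://example.com/a.png',
    'https://example.com/b.css',
]

# Each URL is handled in its own worker thread; a failure is logged and
# recorded via Db.add_error_link instead of aborting the whole batch.
results = run_in_threads(resource_urls, download_resource, thread_count=4)
for url, path in results.items():
    print(url, '=>', path)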
Example #3
def crawl_page(url):
    global visited_links
    global downloaded_links
    global html_queue
    global resource_queue
    try:
        html = http.get_html(url)
        html_links = HtmlParser.get_links(args.url, html)
        resource_links = HtmlParser.get_resource_urls(args.url, html)
        # Record newly discovered pages and queue them for crawling.
        with lock:
            unique_links = list(set(html_links) - set(visited_links))
            visited_links = visited_links + unique_links
            Db.add_links(unique_links)
            for link in unique_links:
                html_queue.tasks.crawl_page(link)
        # Record newly discovered resources (images, scripts, stylesheets).
        with lock2:
            unique_resource_links = list(
                set(resource_links) - set(downloaded_links))
            downloaded_links = downloaded_links + unique_resource_links
        if args.download_resources:
            # Point resource URLs in the HTML at their local filenames,
            # then queue the resources themselves for download.
            resources = dict([(resource_url,
                               fs.get_filename_from_url(resource_url))
                              for resource_url in resource_links])
            html = HtmlParser.replace_resource_url(resources, html)

            for resource_link in unique_resource_links:
                resource_queue.tasks.download_resource(resource_link)

        output_path = fs.save_html(url, html)
        logger.info('HTML : %s -> %s' % (url, output_path))
        Db.add_done_item(url)
        return output_path
    except Exception as err:
        logger.error('error crawling page %s: %r' % (url, err))
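crawl_page also depends on two work queues whose .tasks attribute turns an attribute access into an enqueued call, plus shared link lists and two locks. The project's actual queue class is not shown here; one speculative way such a wrapper could sit on top of a thread pool is sketched below (HtmlParser and the remaining fs helpers are still assumed from the project):

# Hypothetical scaffolding for the module-level state crawl_page relies on;
# the real queue implementation in the project is assumed, not shown.
import threading
from concurrent.futures import ThreadPoolExecutor

visited_links = []
downloaded_links = []
lock = threading.Lock()
lock2 = threading.Lock()

class TaskQueue:
    """Tiny stand-in whose `.tasks.<name>(...)` call submits work to a pool."""

    class _Proxy:
        def __init__(self, queue):
            self._queue = queue

        def __getattr__(self, name):
            func = self._queue._functions[name]
            return lambda *a, **kw: self._queue._pool.submit(func, *a, **kw)

    def __init__(self, functions, workers=4):
        self._functions = {f.__name__: f for f in functions}
        self._pool = ThreadPoolExecutor(max_workers=workers)
        self.tasks = TaskQueue._Proxy(self)

# One queue crawls newly found pages, the other downloads page resources;
# in a real run the pools would also need an orderly shutdown.
html_queue = TaskQueue([crawl_page])
resource_queue = TaskQueue([download_resource])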