def download_resource(url):
    """Download the resource at *url* and persist it via fs.save_blob.

    Returns the local output path, the pre-existing file name when the
    resource was already downloaded, or None when any step fails (the
    error is logged rather than raised -- best-effort semantics).
    """
    try:
        exist_filename = fs.get_exist_file_name(url)
        if exist_filename:
            # Already on disk from a previous run -- skip the network call.
            # logging's warn() is deprecated; warning() is the supported name.
            logger.warning("file is existed: " + exist_filename)
            return exist_filename
        content = http.get_blob(url)
        out_path = fs.save_blob(url, content)
        logger.info('RESOURCE: %s => %s' % (url, out_path))
        Db.add_done_item(url)
        return out_path
    except Exception as err:
        # Swallow and log so one bad resource doesn't abort the crawl;
        # the implicit None return signals failure to the caller.
        logger.error('error download resource %r' % (err))
def run_in_threads(data, action, thread_count):
    """Run *action* over every item in *data* using a thread pool.

    Parameters:
        data: iterable of URLs (or other single-argument inputs).
        action: callable invoked as ``action(url)`` in a worker thread.
        thread_count: maximum number of worker threads.

    Returns a dict mapping each successfully processed url to its result
    path with the configured output prefix (``args.output``) stripped.
    Failed items are logged and recorded via ``Db.add_error_link``.
    """
    results = {}
    # `not data` already covers None and empty containers; the extra
    # len(data) == 0 check was redundant.
    if not data:
        return results
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=thread_count) as executor:
        # Submit everything up front and remember which future maps to
        # which input so results can be attributed as they complete.
        future_to_url = {executor.submit(action, url): url for url in data}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            result = ''
            try:
                result = future.result()
                # NOTE(review): when action returns None this raises
                # AttributeError, which the except below treats as a
                # failed item -- presumably intentional; confirm.
                results[url] = result.replace(args.output, '')
            except Exception as exc:
                logger.error('%r generated an exception: %s' % (url, exc))
                Db.add_error_link(url, str(exc))
            else:
                logger.info('FECHED: %s => %s' % (url, result))
                Db.add_done_item(url)
    return results
def crawl_page(url):
    """Fetch *url*, enqueue newly discovered pages/resources, save the HTML.

    Returns the saved file path, or None on failure (the error is logged
    and swallowed).  The shared link lists are guarded by the module-level
    locks; the queue objects fan work out to other workers.
    """
    global visited_links
    global downloaded_links
    global html_queue
    global resource_queue
    try:
        html = http.get_html(url)
        html_links = HtmlParser.get_links(args.url, html)
        resource_links = HtmlParser.get_resource_urls(args.url, html)
        # BUGFIX: the original acquire()/release() pairs had no
        # try/finally, and the outer `except Exception` swallows any
        # error raised in between -- the lock stayed held forever and
        # every later crawl_page call deadlocked.  `with` guarantees
        # release on all paths.
        with lock:
            unique_links = list(set(html_links) - set(visited_links))
            visited_links = visited_links + unique_links
            Db.add_links(unique_links)
            for link in unique_links:
                html_queue.tasks.crawl_page(link)
        with lock2:
            unique_resource_links = list(
                set(resource_links) - set(downloaded_links))
            downloaded_links = downloaded_links + unique_resource_links
        if args.download_resources:
            # Rewrite every resource URL (not just the new ones) to its
            # local filename so the saved HTML references local copies,
            # then enqueue only the not-yet-downloaded resources.
            resources = {
                resource_url: fs.get_filename_from_url(resource_url)
                for resource_url in resource_links
            }
            html = HtmlParser.replace_resource_url(resources, html)
            for resource_link in unique_resource_links:
                resource_queue.tasks.download_resource(resource_link)
        output_path = fs.save_html(url, html)
        logger.info('HTML : %s -> %s' % (url, output_path))
        Db.add_done_item(url)
        return output_path
    except Exception as err:
        # Best-effort crawl: log and return None for this page.
        logger.error(err)