def download_imgs():
    with open(os.path.join(COMMON_FLAGS.json_dir, 'selected_keywords.json'), 'r') as fp:
        keywords = json.load(fp)
    print("keywords:", type(keywords), len(keywords), type(keywords[0]))

    api_keys = {
        'flickr': ('3845aa5608781b176e74bedd2a653b78', '19192eb5251a4809')
    }  # replace XXX.. and YYY.. by your own keys
    # images_nbr = 10000  # number of images to fetch
    images_nbr = 200  # 200 * 200 = 40k

    ### Crawl and download images ###
    from web_crawler import WebCrawler
    crawler = WebCrawler(api_keys, mindate=mindate, maxdate=maxdate)

    # 1. Crawl the web and collect URLs:
    crawler.collect_links_from_web(keywords, images_nbr, remove_duplicated_links=True)

    # 2. (alternative to the previous line) Load URLs from a file instead of the web:
    # crawler.load_urls(download_folder + "/links.txt")
    # crawler.load_urls_from_json(download_folder + "/links.json")

    # 3. Save URLs to download them later (optional):
    # crawler.save_urls(os.path.join(download_folder, "links.txt"))
    crawler.save_urls_to_json(
        os.path.join(url_folder, "links-%s-%s.json" % (mindate, maxdate)))
def build(cls):
    command_print("Build started")
    cls.crawler = WebCrawler()
    # run method for index
    cls.crawler.scrape_index_pages()
    # run method for all country pages
    cls.crawler.scrape_country_pages()
    # run method for all continent pages
    cls.crawler.scrape_continent_pages()
    # create the index from memory
    cls.crawler.create_index_file()
    command_print("Build completed")
def test_crawl(self):
    """
    Tests crawl method

    The get_html method of the html_requester class is mocked to return the
    contents of html_test_data.html. This mocking allows for inputting test
    html data without having to host it online.
    """
    file_util = FileUtil()
    expected_result = file_util.get_file_contents("crawl_test_data.txt")
    web_crawler = WebCrawler()
    web_crawler.html_requester.get_html = lambda url: self.mock_get_html(url)
    actual_result = web_crawler.crawl("http://www.domain.com")
    self.assertEqual(expected_result, actual_result)
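The mock_get_html helper referenced above is not shown in this snippet. A minimal sketch of what it might look like, assuming the html_test_data.html fixture sits alongside the other test data and that FileUtil.get_file_contents can read it; the name and body here are illustrative, not the project's actual implementation:

def mock_get_html(self, url):
    # Return the saved fixture instead of fetching the URL,
    # so the test never touches the network.
    return FileUtil().get_file_contents("html_test_data.html")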
def __init__(self, target):
    super().__init__()
    self.crawler = WebCrawler()
    target_config = getConfig().get("targets", {}).get(target)
    self.logger = getLogger(self.__class__.__name__)
    if not target_config:
        self.logger.error("target is not found in config.")
        raise Exception("target is not found in config.")
    self.logger.info(f"Application is processing target {target}")
    self.target_config = target_config
    self.max_threads = int(getConfig()["configs"]["max_threads"])
    self.sleep_time = int(self.target_config["sleep"])
    self.detail_urls = []
    self.items = []
def main(args):
    logging.basicConfig(filename='web_crawler_' + str(datetime.now()) + '.log',
                        filemode='w',
                        format='%(name)s - %(levelname)s - %(message)s',
                        level=logging.DEBUG)
    try:
        web_crawler_instance = WebCrawler(args[1])
        web_crawler_instance.crawl()
    except IndexError as ex:
        logging.error(
            f"An error has occurred whilst running the crawler, no URL was provided: {str(ex)}")
    except DBConnectionError as ex:
        logging.error(
            f"An error has occurred whilst connecting to the DB: {str(ex)}")
    except Exception as ex:
        logging.error(
            f"An error has occurred whilst running the crawler: {str(ex)}")
    logging.info("Program finished running.")
def test_crawler(self):
    web_crawler = WebCrawler(url="http://localhost:5000",
                             max_threads=32,
                             max_pages=float("inf"))
    self.assertEqual(
        sorted(list(web_crawler.crawl())),
        sorted([
            "http://localhost:5000/com",
            "http://localhost:5000/test",
            "http://localhost:5000/test123",
        ]),
    )
    self.assertNotEqual(
        sorted(list(web_crawler.crawl())),
        sorted([
            "https://google.com",
            "http://localhost:5000/com",
            "/test",
            "/test123",
        ]),
    )
def main():
    parser = ArgumentParser()
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-d", "--depth", type=int,
                       help="limit crawling by depth of directory tree (default: 10)")
    group.add_argument("-c", "--count", type=int,
                       help="limit crawling by number of pages")
    parser.add_argument("url_list",
                        help="file containing urls separated by newlines")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="set verbosity of program")
    parser.add_argument("-p", "--max-processes", type=int,
                        help="maximum number of processes to run in parallel (default is 10)")
    parser.add_argument("-t", "--max-threads", type=int,
                        help="maximum number of threads per process (default is 20)")
    args = parser.parse_args()

    # check that the url_list file exists and that the user has permission to read it
    if not os.path.isfile(args.url_list) or not os.access(args.url_list, os.R_OK):
        print("[-] File does not exist: {}".format(args.url_list))
        sys.exit(1)

    # get url list
    urls = list()
    with open(args.url_list, "r") as url_list_file:
        for url in url_list_file:
            urls.append(url.strip())

    crawler = WebCrawler(urls)

    # set custom parameters
    if args.max_processes:
        crawler.max_processes = args.max_processes
    if args.max_threads:
        crawler.max_threads = args.max_threads
    if args.verbose:
        crawler.verbose = True
    if args.depth:
        crawler.limit = "depth"
        crawler.limit_param = args.depth
    elif args.count:
        crawler.limit = "count"
        crawler.limit_param = args.count

    crawler.start()
    sys.exit(0)
def web_crawler_main():
    """check user input and start WebCrawler"""
    opts, args = get_args()
    logger = get_logger()

    url = add_valid_protocol_prefix(opts.url)
    depth_limit = opts.depth_limit if 0 < opts.depth_limit <= DEFAULT__DEPTH_LIMIT else None
    time_out = opts.time_out if 0 < opts.time_out else None

    if not url or not depth_limit or not time_out:
        if not url:
            logger.error("invalid page address")
        if not depth_limit:
            logger.error("invalid depth limit")
        if not time_out:
            logger.error("invalid time out")
        raise SystemExit(1)

    domain_name = get_sub_domain_name(url)
    web_crawler = WebCrawler(url, domain_name, depth_limit, time_out, logger)
    web_crawler.start()
def __init__(self):
    super().__init__()
    self.logger = getLogger(self.__class__.__name__)
    self.reviewUrl = getConfig()['task1']['review_url']
    self.web_crawler = WebCrawler()
def __init__(self):
    self.logger = getLogger(self.__class__.__name__)
    self.commentUrl = getConfig()['task2']['comment_url']
    self.web_crawler = WebCrawler()
    self.db_helper = DbHelper()
    self.__comments = []
def __init__(self):
    self.web_crawler = WebCrawler()
def __init__(self, base_url):
    super().__init__()
    self.logger = getLogger(self.__class__.__name__)
    self.crawler = WebCrawler()
    self.base_url = base_url
"""on importe notre module web crawler qu'on a créé """ import sys from web_crawler import WebCrawler DATA_TYPE = sys.argv[1] CRAWLING_ACTIVATED = True if sys.argv[2] == '1' else False # On prend en entrée un URL if DATA_TYPE == '1': print("Starting crawler on URL") if not CRAWLING_ACTIVATED: print("(Crawling deactivated)") STARTING_URL = sys.argv[3] CRAWLER = WebCrawler() CRAWLER.crawl_site(STARTING_URL, CRAWLING_ACTIVATED) CRAWLER.print_report() # On prend en entrée un fichier local (crawling désactivé) elif DATA_TYPE == '2': print("Starting crawler on local file") print("(Crawling deactivated)") CRAWLING_ACTIVATED = False LOCAL_FILE = sys.argv[3] CRAWLER = WebCrawler() CRAWLER.crawl_local_file(LOCAL_FILE) CRAWLER.print_report() # On prend en entrée des données en stdin elif DATA_TYPE == '3': print("What type of std:in do you want to use \
    )
    parser.add_argument(
        "--max_pages",
        help="limit the maximum pages that the crawler can parse")
    parser.add_argument("-v", "--verbose",
                        help="increase the verbose output",
                        type=bool,
                        default=False)
    args = parser.parse_args()

    return {
        "url": args.url if args.url else "https://www.scrapehero.com/",
        "max_threads": args.max_threads if args.max_threads else min(
            32, os.cpu_count() + 4),  # the default max_workers value in Python
        "max_pages": args.max_pages if args.max_pages else float("inf"),
        "verbosity": args.verbose,
    }


if __name__ == "__main__":
    args = initialise_arguments()
    web_crawler = WebCrawler(**args)
    pprint(web_crawler.crawl())
#!/usr/bin/env python

keywords = ["cats", "dogs", "birds"]
api_keys = {
    'google': ('XXXXXXXXXXXXXXXXXXXXXXXX', 'YYYYYYYYY'),
    'flickr': ('XXXXXXXXXXXXXXXXXXXXXXXX', 'YYYYYYYYY')
}
images_nbr = 10  # number of images to fetch
download_folder = "./data"  # folder in which the images will be stored

### Crawl and download images ###
from web_crawler import WebCrawler
crawler = WebCrawler(api_keys)

# 1. Crawl the web and collect URLs:
crawler.collect_links_from_web(keywords, images_nbr, remove_duplicated_links=True)

# 2. (alternative to the previous line) Load URLs from a file instead of the web:
# crawler.load_urls(download_folder + "/links.txt")
# crawler.load_urls_from_json(download_folder + "/links.json")

# 3. Save URLs to download them later (optional):
crawler.save_urls(download_folder + "/links.txt")
# crawler.save_urls_to_json(download_folder + "/links.json")

# 4. Download the images:
crawler.download_images(target_folder=download_folder)
from web_crawler import WebCrawler
from worker import Worker

config = ConfigParser.RawConfigParser()
config.read('/etc/calfresh/calfresh.conf')

logger = logging.getLogger('root')

if __name__ == '__main__':
    logger.info('starting...')

    datapath = None
    for table in table_url_map.keys():
        try:
            crawler = WebCrawler(table, table_url_map[table])
            new_table_data = crawler.crawl()
            if new_table_data:
                worker = Worker(new_table_data)
                datapath = worker.work()
        except Exception as ex:
            logger.exception(ex)

    if datapath:
        loader = DataLoader()
        loader.load(datapath)

    crawler.clean_up()
    logger.info('finished')
def __init__(self):
    self._crawler = WebCrawler()
    self._parser = None
def setUp(self):
    self.crawler = WebCrawler(
        table='tbl_dfa256',
        url=table_url_map['tbl_dfa256'],
    )
def main():
    crawl = WebCrawler(input())
    crawl.spider()
def __init__(self):
    self.inverted_index = InvertedIndex()
    self.crawler = WebCrawler()