def __init__(self, cfg_file_path, json_file_path, site_index,
             shall_resume, daemonize, library_mode=False):
    # set up logging before it is configured via the config file; this level
    # will be overwritten once the config is loaded, and the other levels
    # will then be emitted as well.
    configure_logging({"LOG_LEVEL": "CRITICAL"})
    self.log = logging.getLogger(__name__)

    self.cfg_file_path = cfg_file_path
    self.json_file_path = json_file_path
    self.site_number = int(site_index)
    self.shall_resume = shall_resume \
        if isinstance(shall_resume, bool) else literal_eval(shall_resume)
    self.daemonize = daemonize \
        if isinstance(daemonize, bool) else literal_eval(daemonize)

    # set up the config file
    self.cfg = CrawlerConfig.get_instance()
    self.cfg.setup(self.cfg_file_path)
    self.log.debug("Config initialized - Further initialisation.")

    self.cfg_crawler = self.cfg.section("Crawler")

    # load the URL-input JSON file or, if in library mode, take
    # json_file_path as the site information directly (kind of hacky..)
    if not library_mode:
        self.json = JsonConfig.get_instance()
        self.json.setup(self.json_file_path)

        sites = self.json.get_site_objects()
        site = sites[self.site_number]
    else:
        sites = [json_file_path]
        site = json_file_path

    if "ignore_regex" in site:
        ignore_regex = "(%s)" % site["ignore_regex"]
    else:
        ignore_regex = "(%s)" % \
            self.cfg.section('Crawler')['ignore_regex']

    # Get the default crawler. The crawler can be overwritten by fallbacks.
    if "additional_rss_daemon" in site and self.daemonize:
        self.crawler_name = "RssCrawler"
    elif "crawler" in site:
        self.crawler_name = site["crawler"]
    else:
        self.crawler_name = self.cfg.section("Crawler")["default"]
    # Get the real crawler class (already "fallen back")
    crawler_class = self.get_crawler(self.crawler_name, site["url"])

    if not self.cfg.section('Files')['relative_to_start_processes_file']:
        relative_to_path = os.path.dirname(self.cfg_file_path)
    else:
        # absolute dir this script is in
        relative_to_path = os.path.dirname(__file__)

    self.helper = Helper(self.cfg.section('Heuristics'),
                         self.cfg.section("Files")["local_data_directory"],
                         relative_to_path,
                         self.cfg.section('Files')['format_relative_path'],
                         sites,
                         crawler_class,
                         self.cfg.get_working_path())

    self.__scrapy_options = self.cfg.get_scrapy_options()

    self.update_jobdir(site)

    # make sure the crawler does not resume crawling
    # if not stated otherwise in the arguments passed to this script
    self.remove_jobdir_if_not_resume()

    self.load_crawler(crawler_class, site["url"], ignore_regex)

    # start the job. in library_mode, do not stop the reactor after this job
    # has finished, so that further jobs can be executed. the job also needs
    # to run in a thread since the reactor.run method does not seem to
    # return. in addition, scrapy will attempt to start a new reactor, which
    # fails with an exception while the code continues to run; we catch this
    # exception in the function 'start_process'.
    if library_mode:
        start_new_thread(start_process, (self.process, False,))
    else:
        self.process.start()
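# --- Illustration (not part of the project code) ---
# A minimal sketch of the bool-coercion pattern used above: when shall_resume
# or daemonize arrives as a string (e.g. from a command-line argument),
# ast.literal_eval turns "True"/"False" into real booleans, while values that
# are already booleans pass through unchanged. The helper name `to_bool` is
# made up for this example.
from ast import literal_eval

def to_bool(value):
    return value if isinstance(value, bool) else literal_eval(value)

assert to_bool(True) is True
assert to_bool("False") is False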
def __init__(self, cfg_directory_path, is_resume, is_reset_elasticsearch,
             is_reset_json, is_reset_mysql, is_no_confirm, library_mode=False):
    """
    The constructor of the main class, thus the real entry point to the tool.
    :param cfg_directory_path:
    :param is_resume:
    :param is_reset_elasticsearch:
    :param is_reset_json:
    :param is_reset_mysql:
    :param is_no_confirm:
    :param library_mode:
    """
    configure_logging({"LOG_LEVEL": "ERROR"})
    self.log = logging.getLogger(__name__)

    # other parameters
    self.shall_resume = is_resume
    self.no_confirm = is_no_confirm
    self.library_mode = library_mode

    # Set an environment variable called 'CColon' so scripts can import
    # modules of this project relative to this script's directory.
    # example: sitemap_crawler can import UrlExtractor via
    # from newsplease.helper_classes.url_extractor import UrlExtractor
    os.environ['CColon'] = os.path.abspath(os.path.dirname(__file__))

    # set stop handlers
    self.set_stop_handler()

    # threading
    self.thread_event = threading.Event()

    # Get & set CFG and JSON locally.
    if cfg_directory_path:
        # if a path was given by the user
        self.cfg_directory_path = self.get_expanded_path(cfg_directory_path)
    else:
        # if no path was given by the user, use default
        self.cfg_directory_path = self.get_expanded_path(
            self.config_directory_default_path)

    # init cfg path if empty
    self.init_config_file_path_if_empty()

    self.cfg_file_path = self.cfg_directory_path + self.config_file_default_name

    # config
    self.cfg = CrawlerConfig.get_instance()
    self.cfg.setup(self.cfg_file_path)
    self.mysql = self.cfg.section("MySQL")
    self.elasticsearch = self.cfg.section("Elasticsearch")

    # perform reset if given as parameter
    if is_reset_mysql:
        self.reset_mysql()
    if is_reset_json:
        self.reset_files()
    if is_reset_elasticsearch:
        self.reset_elasticsearch()
    # close the process
    if is_reset_elasticsearch or is_reset_json or is_reset_mysql:
        sys.exit(0)

    self.json_file_path = self.cfg_directory_path + \
        self.cfg.section('Files')['url_input_file_name']
    self.json = JsonConfig.get_instance()
    self.json.setup(self.json_file_path)

    self.crawler_list = self.CrawlerList()
    self.daemon_list = self.DaemonList()

    self.__single_crawler = self.get_abs_file_path("./single_crawler.py",
                                                   True, False)

    self.manage_crawlers()
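# --- Illustration (not part of the project code) ---
# A hedged sketch of what a stop handler wired to the threading.Event above
# could look like: SIGINT/SIGTERM set the event so long-running loops can
# check it and shut down cleanly. The names `thread_event` and
# `request_graceful_stop` are assumptions for this example; the project's
# actual set_stop_handler() may differ.
import signal
import threading

thread_event = threading.Event()

def request_graceful_stop(signal_number, stack_frame):
    thread_event.set()

signal.signal(signal.SIGINT, request_graceful_stop)
signal.signal(signal.SIGTERM, request_graceful_stop)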
def url_to_request_with_agent(url):
    # Build a urllib2 request carrying the USER_AGENT from the configured
    # Scrapy options, so manual fetches identify themselves like the crawler.
    options = CrawlerConfig.get_instance().get_scrapy_options()
    user_agent = options['USER_AGENT']
    return urllib2.Request(url, headers={'user-agent': user_agent})
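# --- Illustration (not part of the project code) ---
# A hedged usage sketch: open the request built by url_to_request_with_agent
# with urllib2 so the fetch is sent with the configured USER_AGENT. The URL
# and timeout are placeholders; on Python 3 urllib2 would be urllib.request.
import urllib2

request = url_to_request_with_agent("https://example.com/")
response = urllib2.urlopen(request, timeout=10)
html = response.read()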
def __init__(
    self,
    is_resume,
    is_reset_elasticsearch,
    is_reset_json,
    is_reset_mysql,
    is_no_confirm,
):
    """
    The constructor of the main class, thus the real entry point to the tool.
    :param is_resume:
    :param is_reset_elasticsearch:
    :param is_reset_json:
    :param is_reset_mysql:
    :param is_no_confirm:
    """
    # other parameters
    self.shall_resume = is_resume
    self.no_confirm = is_no_confirm

    # set stop handlers
    self.set_stop_handler()

    # threading
    self.thread_event = threading.Event()

    self.cfg_directory_path = self.get_expanded_path(
        self.config_directory_default_path)

    # config
    self.cfg = CrawlerConfig.get_instance()
    self.cfg_file_path = self.cfg_directory_path + self.config_file_default_name
    self.cfg.setup(self.cfg_file_path)
    self.elasticsearch = self.cfg.section("Elasticsearch")
    self.mysql = self.cfg.section("MySQL")

    # perform reset if given as parameter
    if is_reset_mysql:
        self.reset_mysql()
    if is_reset_json:
        self.reset_files()
    if is_reset_elasticsearch:
        self.reset_elasticsearch()
    # close the process
    if is_reset_elasticsearch or is_reset_json or is_reset_mysql:
        sys.exit(0)

    self.json_file_path = (
        self.cfg_directory_path
        + self.cfg.section("Files")["url_input_file_name"])
    self.json = JsonConfig.get_instance()
    self.json.setup(self.json_file_path)

    self.crawler_list = self.CrawlerList()
    self.daemon_list = self.DaemonList()

    self.__single_crawler = self.get_abs_file_path("./single_crawler.py",
                                                   True, False)

    self.manage_crawlers()
def __init__(self, cfg_file_path, json_file_path, site_index,
             shall_resume, daemonize, library_mode=False):
    # set up logging before it is configured via the config file; this level
    # will be overwritten once the config is loaded, and the other levels
    # will then be emitted as well.
    configure_logging({"LOG_LEVEL": "CRITICAL"})
    self.log = logging.getLogger(__name__)

    self.cfg_file_path = cfg_file_path
    self.json_file_path = json_file_path
    self.site_number = int(site_index)
    self.shall_resume = shall_resume \
        if isinstance(shall_resume, bool) else literal_eval(shall_resume)
    self.daemonize = daemonize \
        if isinstance(daemonize, bool) else literal_eval(daemonize)

    # set up the config file
    self.cfg = CrawlerConfig.get_instance()
    self.cfg.setup(self.cfg_file_path)
    self.log.debug("Config initialized - Further initialisation.")

    self.cfg_crawler = self.cfg.section("Crawler")

    # load the URL-input JSON file or, if in library mode, take
    # json_file_path as the site information directly (kind of hacky..)
    if not library_mode:
        self.json = JsonConfig.get_instance()
        self.json.setup(self.json_file_path)

        sites = self.json.get_site_objects()
        site = sites[self.site_number]
    else:
        sites = [json_file_path]
        site = json_file_path

    if "ignore_regex" in site:
        ignore_regex = "(%s)|" % site["ignore_regex"]
    else:
        ignore_regex = "(%s)|" % \
            self.cfg.section('Crawler')['ignore_regex']

    # Get the default crawler. The crawler can be overwritten by fallbacks.
    if "additional_rss_daemon" in site and self.daemonize:
        self.crawler_name = "RssCrawler"
    elif "crawler" in site:
        self.crawler_name = site["crawler"]
    else:
        self.crawler_name = self.cfg.section("Crawler")["default"]
    # Get the real crawler class (already "fallen back")
    crawler_class = self.get_crawler(self.crawler_name, site["url"])

    if not self.cfg.section('Files')['relative_to_start_processes_file']:
        relative_to_path = os.path.dirname(self.cfg_file_path)
    else:
        # absolute dir this script is in
        relative_to_path = os.path.dirname(__file__)

    self.helper = Helper(self.cfg.section('Heuristics'),
                         self.cfg.section("Files")["local_data_directory"],
                         relative_to_path,
                         self.cfg.section('Files')['format_relative_path'],
                         sites,
                         crawler_class,
                         self.cfg.get_working_path())

    self.__scrapy_options = self.cfg.get_scrapy_options()

    self.update_jobdir(site)

    # make sure the crawler does not resume crawling
    # if not stated otherwise in the arguments passed to this script
    self.remove_jobdir_if_not_resume()

    self.load_crawler(crawler_class, site["url"], ignore_regex)

    # start the job. in library_mode, do not stop the reactor after this job
    # has finished, so that further jobs can be executed. the job also needs
    # to run in a thread since the reactor.run method does not seem to
    # return. in addition, scrapy will attempt to start a new reactor, which
    # fails with an exception while the code continues to run; we catch this
    # exception in the function 'start_process'.
    if library_mode:
        start_new_thread(start_process, (self.process, False,))
    else:
        self.process.start()
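# --- Illustration (not part of the project code) ---
# A hedged sketch of a library-mode call to the constructor above: as the
# comment notes, json_file_path is reused as the site description itself, so
# a dict with at least a "url" key (and optionally "crawler"/"ignore_regex")
# can be passed directly. The class name `SingleCrawler`, the config path,
# and the example values are assumptions for this illustration.
site = {
    "url": "https://example.com/",
    "crawler": "RecursiveCrawler",        # optional; otherwise the config default is used
    "ignore_regex": r"/(login|logout)/",  # optional; otherwise taken from the config
}
SingleCrawler(
    cfg_file_path="config/config.cfg",  # assumed location
    json_file_path=site,                # the site dict doubles as the JSON "file" argument
    site_index=0,
    shall_resume=False,
    daemonize=False,
    library_mode=True,
)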