Example #1
    def __init__(self, cfg_file_path, json_file_path,
                 site_index, shall_resume, daemonize, library_mode=False):
        # set up logging before it is configured via the config file;
        # this setting will be overwritten once the config is loaded, and
        # other log levels will be emitted as well if the level is changed there.
        configure_logging({"LOG_LEVEL": "CRITICAL"})
        self.log = logging.getLogger(__name__)

        self.cfg_file_path = cfg_file_path
        self.json_file_path = json_file_path
        self.site_number = int(site_index)
        self.shall_resume = shall_resume \
            if isinstance(shall_resume, bool) else literal_eval(shall_resume)
        self.daemonize = daemonize \
            if isinstance(daemonize, bool) else literal_eval(daemonize)

        # set up the config file
        self.cfg = CrawlerConfig.get_instance()
        self.cfg.setup(self.cfg_file_path)
        self.log.debug("Config initialized - Further initialisation.")

        self.cfg_crawler = self.cfg.section("Crawler")

        # load the URL input JSON file or, if in library mode, take
        # json_file_path itself as the site information (kind of hacky)
        if not library_mode:
            self.json = JsonConfig.get_instance()
            self.json.setup(self.json_file_path)
            sites = self.json.get_site_objects()
            site = sites[self.site_number]
        else:
            sites = [json_file_path]
            site = json_file_path

        if "ignore_regex" in site:
            ignore_regex = "(%s)" % site["ignore_regex"]
        else:
            ignore_regex = "(%s)" % \
                self.cfg.section('Crawler')['ignore_regex']

        # Get the default crawler. The crawler can be overwritten by fallbacks.
        if "additional_rss_daemon" in site and self.daemonize:
            self.crawler_name = "RssCrawler"
        elif "crawler" in site:
            self.crawler_name = site["crawler"]
        else:
            self.crawler_name = self.cfg.section("Crawler")["default"]
        # Get the actual crawler class (with any fallbacks already applied)
        crawler_class = self.get_crawler(self.crawler_name, site["url"])

        if not self.cfg.section('Files')['relative_to_start_processes_file']:
            relative_to_path = os.path.dirname(self.cfg_file_path)
        else:
            # absolute dir this script is in
            relative_to_path = os.path.dirname(__file__)

        self.helper = Helper(self.cfg.section('Heuristics'),
                             self.cfg.section("Files")["local_data_directory"],
                             relative_to_path,
                             self.cfg.section('Files')['format_relative_path'],
                             sites,
                             crawler_class,
                             self.cfg.get_working_path())

        self.__scrapy_options = self.cfg.get_scrapy_options()

        self.update_jobdir(site)

        # make sure the crawler does not resume crawling
        # if not stated otherwise in the arguments passed to this script
        self.remove_jobdir_if_not_resume()

        self.load_crawler(crawler_class,
                          site["url"],
                          ignore_regex)

        # start the job. If in library_mode, do not stop the reactor after this
        # job has finished, so that further jobs can be executed. The job also
        # needs to run in a thread since reactor.run does not seem to return.
        # In addition, Scrapy will attempt to start a new reactor, which fails
        # with an exception, but the code continues to run; we catch this
        # exception in the function 'start_process'.
        if library_mode:
            start_new_thread(start_process, (self.process, False,))
        else:
            self.process.start()
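
When this constructor is started as a separate process, shall_resume and daemonize may arrive as strings rather than booleans, which is why they are passed through literal_eval above. A minimal sketch of that coercion (illustrative only; to_bool is not part of the project, and literal_eval is assumed to be ast.literal_eval):

    from ast import literal_eval

    def to_bool(value):
        # command-line arguments arrive as strings such as "True"/"False";
        # booleans passed directly (e.g. in library mode) are returned unchanged
        return value if isinstance(value, bool) else literal_eval(value)

    assert to_bool(True) is True
    assert to_bool("False") is False
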
    def __init__(self,
                 cfg_directory_path,
                 is_resume,
                 is_reset_elasticsearch,
                 is_reset_json,
                 is_reset_mysql,
                 is_no_confirm,
                 library_mode=False):
        """
        The constructor of the main class, thus the real entry point to the tool.
        :param cfg_directory_path:
        :param is_resume:
        :param is_reset_elasticsearch:
        :param is_reset_json:
        :param is_reset_mysql:
        :param is_no_confirm:
        :param library_mode:
        """
        configure_logging({"LOG_LEVEL": "ERROR"})
        self.log = logging.getLogger(__name__)

        # other parameters
        self.shall_resume = is_resume
        self.no_confirm = is_no_confirm
        self.library_mode = library_mode

        # Sets an environment variable called 'CColon' so that scripts can import
        # modules of this project relative to this script's directory,
        # for example, sitemap_crawler can import UrlExtractor via
        #   from newsplease.helper_classes.url_extractor import UrlExtractor
        os.environ['CColon'] = os.path.abspath(os.path.dirname(__file__))

        # set stop handlers
        self.set_stop_handler()

        # threading
        self.thread_event = threading.Event()

        # Get & set CFG and JSON locally.
        if cfg_directory_path:
            # if a path was given by the user
            self.cfg_directory_path = self.get_expanded_path(
                cfg_directory_path)
        else:
            # if no path was given by the user, use default
            self.cfg_directory_path = self.get_expanded_path(
                self.config_directory_default_path)
        # init cfg path if empty
        self.init_config_file_path_if_empty()
        self.cfg_file_path = self.cfg_directory_path + self.config_file_default_name

        # config
        self.cfg = CrawlerConfig.get_instance()
        self.cfg.setup(self.cfg_file_path)
        self.mysql = self.cfg.section("MySQL")
        self.elasticsearch = self.cfg.section("Elasticsearch")

        # perform reset if given as parameter
        if is_reset_mysql:
            self.reset_mysql()
        if is_reset_json:
            self.reset_files()
        if is_reset_elasticsearch:
            self.reset_elasticsearch()
        # close the process
        if is_reset_elasticsearch or is_reset_json or is_reset_mysql:
            sys.exit(0)

        self.json_file_path = self.cfg_directory_path + self.cfg.section(
            'Files')['url_input_file_name']

        self.json = JsonConfig.get_instance()
        self.json.setup(self.json_file_path)

        self.crawler_list = self.CrawlerList()
        self.daemon_list = self.DaemonList()

        self.__single_crawler = self.get_abs_file_path("./single_crawler.py",
                                                       True, False)

        self.manage_crawlers()
Example #3
def url_to_request_with_agent(url):
    # build a urllib2.Request that carries the USER_AGENT configured in the
    # Scrapy options, so manually issued requests use the same user agent
    options = CrawlerConfig.get_instance().get_scrapy_options()
    user_agent = options['USER_AGENT']
    return urllib2.Request(url, headers={'user-agent': user_agent})
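
A possible use of the helper above (illustrative only; the URL is a placeholder, and urllib2 implies Python 2 here):

    request = url_to_request_with_agent("https://example.org/article.html")
    response = urllib2.urlopen(request)  # the request carries the configured USER_AGENT
    html = response.read()
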
    def __init__(self, cfg_directory_path, is_resume, is_reset_elasticsearch, is_reset_json, is_reset_mysql,
                 is_no_confirm, library_mode=False):
        """
        The constructor of the main class, thus the real entry point to the tool.
        :param cfg_directory_path:
        :param is_resume:
        :param is_reset_elasticsearch:
        :param is_reset_json:
        :param is_reset_mysql:
        :param is_no_confirm:
        :param library_mode:
        """
        configure_logging({"LOG_LEVEL": "ERROR"})
        self.log = logging.getLogger(__name__)

        # other parameters
        self.shall_resume = is_resume
        self.no_confirm = is_no_confirm
        self.library_mode = library_mode

        # Sets an environment variable called 'CColon' so that scripts can import
        # modules of this project relative to this script's directory,
        # for example, sitemap_crawler can import UrlExtractor via
        #   from newsplease.helper_classes.url_extractor import UrlExtractor
        os.environ['CColon'] = os.path.abspath(os.path.dirname(__file__))

        # set stop handlers
        self.set_stop_handler()

        # threading
        self.thread_event = threading.Event()

        # Get & set CFG and JSON locally.
        if cfg_directory_path:
            # if a path was given by the user
            self.cfg_directory_path = self.get_expanded_path(cfg_directory_path)
        else:
            # if no path was given by the user, use default
            self.cfg_directory_path = self.get_expanded_path(self.config_directory_default_path)
        # init cfg path if empty
        self.init_config_file_path_if_empty()
        self.cfg_file_path = self.cfg_directory_path + self.config_file_default_name

        # config
        self.cfg = CrawlerConfig.get_instance()
        self.cfg.setup(self.cfg_file_path)
        self.mysql = self.cfg.section("MySQL")
        self.elasticsearch = self.cfg.section("Elasticsearch")

        # perform reset if given as parameter
        if is_reset_mysql:
            self.reset_mysql()
        if is_reset_json:
            self.reset_files()
        if is_reset_elasticsearch:
            self.reset_elasticsearch()
        # close the process
        if is_reset_elasticsearch or is_reset_json or is_reset_mysql:
            sys.exit(0)

        self.json_file_path = self.cfg_directory_path + self.cfg.section('Files')['url_input_file_name']

        self.json = JsonConfig.get_instance()
        self.json.setup(self.json_file_path)

        self.crawler_list = self.CrawlerList()
        self.daemon_list = self.DaemonList()

        self.__single_crawler = self.get_abs_file_path("./single_crawler.py", True, False)

        self.manage_crawlers()
Example #5
    def __init__(
        self,
        is_resume,
        is_reset_elasticsearch,
        is_reset_json,
        is_reset_mysql,
        is_no_confirm,
    ):
        """
        The constructor of the main class, thus the real entry point to the tool.
        :param is_resume:
        :param is_reset_elasticsearch:
        :param is_reset_json:
        :param is_reset_mysql:
        :param is_no_confirm:
        """

        # other parameters
        self.shall_resume = is_resume
        self.no_confirm = is_no_confirm

        # set stop handlers
        self.set_stop_handler()

        # threading
        self.thread_event = threading.Event()

        self.cfg_directory_path = self.get_expanded_path(
            self.config_directory_default_path)
        # config
        self.cfg = CrawlerConfig.get_instance()
        self.cfg_file_path = self.cfg_directory_path + self.config_file_default_name
        self.cfg.setup(self.cfg_file_path)
        self.elasticsearch = self.cfg.section("Elasticsearch")
        self.mysql = self.cfg.section("MySQL")

        # perform reset if given as parameter
        if is_reset_mysql:
            self.reset_mysql()
        if is_reset_json:
            self.reset_files()
        if is_reset_elasticsearch:
            self.reset_elasticsearch()
        # close the process
        if is_reset_elasticsearch or is_reset_json or is_reset_mysql:
            sys.exit(0)

        self.json_file_path = (
            self.cfg_directory_path +
            self.cfg.section("Files")["url_input_file_name"])

        self.json = JsonConfig.get_instance()
        self.json.setup(self.json_file_path)

        self.crawler_list = self.CrawlerList()
        self.daemon_list = self.DaemonList()

        self.__single_crawler = self.get_abs_file_path("./single_crawler.py",
                                                       True, False)

        self.manage_crawlers()
    def __init__(self, cfg_file_path, json_file_path,
                 site_index, shall_resume, daemonize, library_mode=False):
        # set up logging before it is configured via the config file;
        # this setting will be overwritten once the config is loaded, and
        # other log levels will be emitted as well if the level is changed there.
        configure_logging({"LOG_LEVEL": "CRITICAL"})
        self.log = logging.getLogger(__name__)

        self.cfg_file_path = cfg_file_path
        self.json_file_path = json_file_path
        self.site_number = int(site_index)
        self.shall_resume = shall_resume \
            if isinstance(shall_resume, bool) else literal_eval(shall_resume)
        self.daemonize = daemonize \
            if isinstance(daemonize, bool) else literal_eval(daemonize)

        # set up the config file
        self.cfg = CrawlerConfig.get_instance()
        self.cfg.setup(self.cfg_file_path)
        self.log.debug("Config initialized - Further initialisation.")

        self.cfg_crawler = self.cfg.section("Crawler")

        # load the URL input JSON file or, if in library mode, take
        # json_file_path itself as the site information (kind of hacky)
        if not library_mode:
            self.json = JsonConfig.get_instance()
            self.json.setup(self.json_file_path)
            sites = self.json.get_site_objects()
            site = sites[self.site_number]
        else:
            sites = [json_file_path]
            site = json_file_path

        if "ignore_regex" in site:
            ignore_regex = "(%s)|" % site["ignore_regex"]
        else:
            ignore_regex = "(%s)|" % \
                           self.cfg.section('Crawler')['ignore_regex']

        # Get the default crawler. The crawler can be overwritten by fallbacks.
        if "additional_rss_daemon" in site and self.daemonize:
            self.crawler_name = "RssCrawler"
        elif "crawler" in site:
            self.crawler_name = site["crawler"]
        else:
            self.crawler_name = self.cfg.section("Crawler")["default"]
        # Get the actual crawler class (with any fallbacks already applied)
        crawler_class = self.get_crawler(self.crawler_name, site["url"])

        if not self.cfg.section('Files')['relative_to_start_processes_file']:
            relative_to_path = os.path.dirname(self.cfg_file_path)
        else:
            # absolute dir this script is in
            relative_to_path = os.path.dirname(__file__)

        self.helper = Helper(self.cfg.section('Heuristics'),
                             self.cfg.section("Files")["local_data_directory"],
                             relative_to_path,
                             self.cfg.section('Files')['format_relative_path'],
                             sites,
                             crawler_class,
                             self.cfg.get_working_path())

        self.__scrapy_options = self.cfg.get_scrapy_options()

        self.update_jobdir(site)

        # make sure the crawler does not resume crawling
        # if not stated otherwise in the arguments passed to this script
        self.remove_jobdir_if_not_resume()

        self.load_crawler(crawler_class,
                          site["url"],
                          ignore_regex)

        # start the job. If in library_mode, do not stop the reactor after this
        # job has finished, so that further jobs can be executed. The job also
        # needs to run in a thread since reactor.run does not seem to return.
        # In addition, Scrapy will attempt to start a new reactor, which fails
        # with an exception, but the code continues to run; we catch this
        # exception in the function 'start_process'.
        if library_mode:
            start_new_thread(start_process, (self.process, False,))
        else:
            self.process.start()
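
For illustration, this is roughly how an ignore_regex pattern built as above could be used to skip unwanted URLs (a sketch only; the pattern and URLs are made up, and the actual filtering is done by the crawler classes, which are not shown in these examples):

    import re

    ignore_regex = "(%s)" % r"/impressum/|\.pdf$"   # hypothetical site/config value
    ignore = re.compile(ignore_regex)

    urls = ["https://example.org/politics/article-1.html",
            "https://example.org/impressum/"]
    crawlable = [url for url in urls if not ignore.search(url)]
    # crawlable -> ["https://example.org/politics/article-1.html"]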