def _compile_regex(self, yamlconfig, re_module):
    """Build a PastieSearch object for every entry under the 'search' key.

    Args:
        yamlconfig: parsed configuration mapping. Must contain 'search'
            (a sized iterable of pattern entries); 'strict_regex' is
            optional and defaults to False.
        re_module: regular expression module handed to each PastieSearch.

    Returns:
        List of the PastieSearch objects that were built successfully.

    Raises:
        PystemonConfigException: when 'strict_regex' is true and an entry
            has no 'search' key or cannot be turned into a PastieSearch.
        KeyError: when yamlconfig itself has no 'search' key.
    """
    patterns = []
    strict = yamlconfig.get('strict_regex', False)
    regexes = yamlconfig['search']
    logger.debug("compiling {} regexes ...".format(len(regexes)))
    for regex in regexes:
        # Start from the raw entry so the generic handler below always has
        # something meaningful to report. Without this, a failure of
        # regex['search'] that is not a KeyError (e.g. the entry is not a
        # mapping) left 'search' unbound or pointing at the previous entry.
        search = regex
        try:
            search = regex['search']
            patterns.append(PastieSearch(re_module, regex))
        except KeyError:
            # NOTE: a KeyError raised from inside PastieSearch ends up here
            # too and is reported as a missing pattern.
            if strict:
                raise PystemonConfigException("Missing search pattern")
            logger.error("Error: skipping empty search pattern entry")
        except Exception as e:
            if strict:
                raise PystemonConfigException(
                    "Unable to parse regex '%s': %s" % (search, e))
            logger.error("Error: Unable to parse regex '%s': %s" % (search, e))
    logger.debug("successfully compiled {0}/{1} regexes".format(
        len(patterns), len(regexes)))
    return patterns
def _load_regex_engine(self, yamlconfig):
    """Import and return the regular expression module named by 'engine'.

    Args:
        yamlconfig: parsed configuration mapping; the optional 'engine' key
            selects the module and defaults to 're'.

    Returns:
        The imported module object.

    Raises:
        PystemonConfigException: when the engine is neither 're' nor
            'regex', or when importing it fails with ImportError.
    """
    supported = ('re', 'regex')
    engine = yamlconfig.get('engine', 're')
    if engine not in supported:
        raise PystemonConfigException(
            "only 're' or 'regex' supported, not '{0}'".format(engine))

    try:
        logger.debug("Loading regular expression engine '{0}'".format(engine))
        module = importlib.import_module(engine)
        if engine == 'regex':
            # The 'regex' module is switched to its VERSION1 behaviour.
            logger.debug("Setting regex DEFAULT_VERSION to VERSION1")
            module.DEFAULT_VERSION = module.VERSION1
    except ImportError:
        raise PystemonConfigException(
            "unable to import module '{0}'".format(engine))
    return module
def _load_storage_engines(self, yamlconfig):
    """Initialise the storage backends listed under the 'storage' key.

    The 'archive' backend is loaded first because its save_dir and
    archive_dir are passed on to every other backend.

    Args:
        yamlconfig: parsed configuration mapping. 'storage' maps a backend
            name to the keyword arguments for PastieStorage.load_storage.

    Returns:
        dict with the keys 'save_dir', 'archive_dir', 'compress' (taken
        from the archive backend, or None/None/False when load_storage
        returned None for it) and 'engines' (list of loaded backends,
        archive first).

    Raises:
        PystemonConfigException: when a KeyError occurs while loading the
            'archive' backend, typically because 'storage' has no
            'archive' entry.
    """
    # Work on a shallow copy: removing 'archive' below must not alter the
    # caller's configuration, otherwise a second pass over the same mapping
    # would no longer find the archive entry. `or {}` covers an empty
    # 'storage:' key, which YAML parses as None.
    storage_yamlconfig = dict(yamlconfig.get('storage') or {})
    storage_engines = []
    save_dir = None
    archive_dir = None
    compress = False

    # File storage is the default and is initialised first so that
    # save_dir and archive_dir are known before the other backends load.
    try:
        storage_file = PastieStorage.load_storage(
            'archive', **storage_yamlconfig.pop('archive'))
        if storage_file is not None:
            save_dir = storage_file.save_dir
            archive_dir = storage_file.archive_dir
            compress = storage_file.compress
            storage_engines.append(storage_file)
    except KeyError:
        raise PystemonConfigException(
            'archive was not found under storage, old pystemon.yaml config?'
        )

    for storage, settings in storage_yamlconfig.items():
        engine = PastieStorage.load_storage(
            storage, save_dir=save_dir, archive_dir=archive_dir, **settings)
        if engine is not None:
            storage_engines.append(engine)

    return {
        'save_dir': save_dir,
        'archive_dir': archive_dir,
        'compress': compress,
        'engines': storage_engines,
    }
def reload(self):
    """(Re)load the configuration and publish it on this instance.

    While holding self.lock, calls self._preload() and self._reload(), then
    copies each known setting from the returned dict onto the matching
    underscore-prefixed attribute (missing settings become None) and
    recomputes self._max_throttling from the loaded sites.

    Returns:
        True once the configuration has been loaded.

    Raises:
        PystemonConfigException: re-raised unchanged, or wrapping any other
            exception raised while loading.
    """
    # Copied in this order onto self._<name>.
    settings = (
        'ip_addr', 'sendmail', 'save_thread', 'user_agents_list',
        'storage_engines', 'save_dir', 'archive_dir', 'compress',
        'proxies_list', 're_module', 'patterns', 'sites', 'threads',
        'pidfile',
    )
    try:
        with self.lock:
            if not self._reload_count:
                logger.debug("loading configuration file '{0}'".format(self._configfile))
            else:
                logger.debug("reloading configuration file '{0}'".format(self._configfile))
                # Drop the cached YAML before _preload() runs
                # (presumably so the file is parsed again -- confirm in _preload).
                self._yamlconfig = None
            self._reload_count += 1

            self._preload()
            config = self._reload()
            for key in settings:
                setattr(self, '_' + key, config.get(key))

            # Highest throttling value among the sites, never below 0.
            self._max_throttling = 0
            for site in self._sites:
                if self._max_throttling < site.throttling:
                    self._max_throttling = site.throttling
    except PystemonConfigException:
        raise
    except Exception as e:
        raise PystemonConfigException('Unable to parse configuration: {}'.format(e))

    logger.debug("configuration loaded")
    return True
def _reload(self):
    """Translate the parsed YAML in self._yamlconfig into a settings dict.

    Returns:
        dict that always contains 'save_thread', 'sendmail',
        'storage_engines', 'save_dir', 'archive_dir', 'compress',
        're_module', 'patterns', 'threads' and 'sites', and contains
        'proxies_list', 'user_agents_list' and 'ip_addr' only when the
        corresponding options are configured.

    Raises:
        PystemonConfigException: when a random user-agent is requested
            without a file, or when one of the loader helpers raises it.
    """
    logger.debug("parsing yaml configuration from file '{}'".format(
        self._configfile))
    config = {}
    yamlconfig = self._yamlconfig

    # Random proxy list is optional: any missing key simply disables it.
    try:
        if yamlconfig['proxy']['random']:
            config['proxies_list'] = ProxyList(yamlconfig['proxy']['file'])
    except KeyError:
        pass

    config['save_thread'] = yamlconfig.get('save-thread', False)

    uaconfig = yamlconfig.get('user-agent', {})
    if uaconfig.get('random', False):
        try:
            config['user_agents_list'] = self._load_user_agents_from_file(
                yamlconfig['user-agent']['file'])
        except KeyError:
            raise PystemonConfigException(
                'random user-agent requested but no file provided')

    # Store the address in the result: reload() reads config['ip_addr'],
    # and keeping it in a throw-away local meant the setting was lost.
    try:
        config['ip_addr'] = yamlconfig['network']['ip']
    except KeyError:
        logger.debug("Using default IP address")

    config['sendmail'] = self._load_email(yamlconfig)

    res = self._load_storage_engines(yamlconfig)
    config['storage_engines'] = res['engines']
    config['save_dir'] = res['save_dir']
    config['archive_dir'] = res['archive_dir']
    config['compress'] = res['compress']

    config['re_module'] = self._load_regex_engine(yamlconfig)
    config['patterns'] = self._compile_regex(yamlconfig, config['re_module'])

    # An unusable thread count is not fatal: log it and fall back to 1.
    try:
        config['threads'] = int(yamlconfig.get('threads', 1))
        if config['threads'] < 1:
            raise Exception("minimum acceptable value is 1")
    except Exception as e:
        logger.error("invalid threads value specified: {0}".format(e))
        config['threads'] = 1

    config['sites'] = self._load_sites(yamlconfig)

    # The configured level is ignored when self.debug is set.
    if not self.debug and 'logging-level' in yamlconfig:
        if yamlconfig['logging-level'] in [
                'NOTSET', 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
        ]:
            logger.setLevel(
                logging.getLevelName(yamlconfig['logging-level']))
        else:
            logger.error("logging level \"%s\" is invalid" %
                         (yamlconfig['logging-level']))

    logger.debug("yaml configuration parsed")
    return config
def _load_user_agents_from_file(self, filename):
    """Read one user-agent string per non-blank line of *filename*.

    Args:
        filename: path of the text file to read.

    Returns:
        List of the stripped, non-empty lines in file order.

    Raises:
        PystemonConfigException: when the file holds no non-blank line.
    """
    logger.debug('Loading user-agent from file "{file}" ...'.format(file=filename))
    with open(filename) as handle:
        stripped = (raw.strip() for raw in handle)
        # Consumed inside the with-block, while the file is still open.
        agents = [entry for entry in stripped if entry]

    if not agents:
        raise PystemonConfigException("found zero valid UserAgents")

    logger.debug("Found {count} UserAgents in file '{file}'".format(
        file=filename, count=len(agents)))
    return agents
def _load_yamlconfig(self, configfile):
    """Parse *configfile* as YAML and merge in every file under 'includes'.

    Args:
        configfile: path of the main YAML configuration file.

    Returns:
        dict holding the main configuration, updated in order with the
        top-level keys of each included file.

    Raises:
        PystemonConfigException: when the main file is not valid YAML, when
            it does not contain a mapping (e.g. it is empty), or when an
            included file cannot be loaded.
    """
    yamlconfig = None
    try:
        # The with-block closes the handle even when parsing fails.
        with open(configfile) as stream:
            if self._recent_pyyaml():
                # https://github.com/yaml/pyyaml/wiki/PyYAML-yaml.load(input)-Deprecation
                # only for 5.1+
                yamlconfig = yaml.load(stream, Loader=yaml.SafeLoader)
            else:
                yamlconfig = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        logger.error("Error in configuration file {0}:".format(configfile))
        mark = getattr(exc, 'problem_mark', None)
        if mark is not None:
            raise PystemonConfigException(
                "error position: (%s:%s)" % (mark.line + 1, mark.column + 1))
        # Errors without a position used to be swallowed here, which then
        # crashed further down on a None configuration.
        raise PystemonConfigException(
            "unable to parse '{0}': {1}".format(configfile, exc))

    # An empty file parses to None; anything but a mapping cannot be used
    # for the key lookups below.
    if not isinstance(yamlconfig, dict):
        raise PystemonConfigException(
            "configuration file '{0}' does not contain a YAML mapping".format(
                configfile))

    # `or []` covers an empty 'includes:' key, which YAML parses as None.
    for includes in yamlconfig.get("includes") or []:
        try:
            logger.debug("loading include '{0}'".format(includes))
            with open(includes) as stream:
                yamlconfig.update(yaml.safe_load(stream))
        except Exception as e:
            raise PystemonConfigException(
                "failed to load '{0}': {1}".format(includes, e))
    return yamlconfig