def init(self):
    """Load and validate the parser settings found on ``self.parameters``.

    Sets ``columns``, ``type_translation``, ``data_type``,
    ``column_regex_search``, ``time_format``, ``filter_text`` and
    ``filter_type`` on the instance.

    ``type_translation`` and ``data_type`` may be given either as a JSON
    string or as an already decoded object; a missing or empty value becomes
    an empty dict.

    Raises:
        InvalidArgument: if ``time_format`` is not a key of
            ``TIME_CONVERSIONS``, or if ``filter_type`` is set to anything
            other than ``'blacklist'`` or ``'whitelist'``.
    """
    def load_mapping(name):
        # Decode JSON text, pass decoded objects through untouched.
        raw = getattr(self.parameters, name, None)
        if not raw:
            return {}
        if isinstance(raw, (str, bytes, bytearray)):
            return json.loads(raw)
        return raw

    columns = self.parameters.columns
    # convert columns to an array
    if isinstance(columns, str):
        columns = [column.strip() for column in columns.split(",")]
    self.columns = columns

    self.type_translation = load_mapping('type_translation')
    self.data_type = load_mapping('data_type')

    # prevents empty strings:
    self.column_regex_search = getattr(self.parameters, 'column_regex_search', None) or {}

    self.time_format = getattr(self.parameters, "time_format", None)
    if self.time_format not in TIME_CONVERSIONS:
        raise InvalidArgument('time_format', got=self.time_format,
                              expected=list(TIME_CONVERSIONS.keys()),
                              docs='docs/Bots.md')

    self.filter_text = getattr(self.parameters, 'filter_text', None)
    self.filter_type = getattr(self.parameters, 'filter_type', None)
    if self.filter_type and self.filter_type not in ('blacklist', 'whitelist'):
        raise InvalidArgument('filter_type', got=self.filter_type,
                              expected=("blacklist", "whitelist"),
                              docs='docs/Bots.md')
def init(self):
    """Load and validate the parser settings found on ``self.parameters``.

    Sets ``columns``, ``type_translation``, ``data_type``,
    ``column_regex_search``, ``time_format``, ``filter_text``,
    ``filter_type`` and ``columns_required`` on the instance.

    ``type_translation`` and ``data_type`` may be given either as JSON text
    or as an already decoded object; a missing or empty value becomes an
    empty dict. ``columns_required`` defaults to ``True`` for every column
    when it is missing or ``None``.

    Raises:
        InvalidArgument: if ``time_format`` is not a key of
            ``TIME_CONVERSIONS``, or if ``filter_type`` is set to anything
            other than ``'blacklist'`` or ``'whitelist'``.
        ValueError: if ``columns`` and ``columns_required`` differ in length.
    """
    params = self.parameters

    columns = params.columns
    # convert columns to an array
    if isinstance(columns, str):
        columns = [column.strip() for column in columns.split(",")]
    self.columns = columns

    type_translation = getattr(params, 'type_translation', {})
    if not type_translation:  # empty string / None
        type_translation = {}
    elif isinstance(type_translation, str):  # not-empty string
        type_translation = json.loads(type_translation)
    self.type_translation = type_translation

    # Same leniency as type_translation: an already decoded object is used
    # as is instead of being handed to json.loads (which would raise).
    data_type = getattr(params, 'data_type', None)
    if not data_type:
        data_type = {}
    elif isinstance(data_type, (str, bytes, bytearray)):
        data_type = json.loads(data_type)
    self.data_type = data_type

    # prevents empty strings:
    self.column_regex_search = getattr(params, 'column_regex_search', None) or {}

    self.time_format = getattr(params, "time_format", None)
    if self.time_format not in TIME_CONVERSIONS:
        raise InvalidArgument('time_format', got=self.time_format,
                              expected=list(TIME_CONVERSIONS.keys()),
                              docs='docs/Bots.md')

    self.filter_text = getattr(params, 'filter_text', None)
    self.filter_type = getattr(params, 'filter_type', None)
    if self.filter_type and self.filter_type not in ('blacklist', 'whitelist'):
        raise InvalidArgument('filter_type', got=self.filter_type,
                              expected=("blacklist", "whitelist"),
                              docs='docs/Bots.md')

    # An explicit null is treated like a missing parameter instead of
    # failing in len() below.
    columns_required = getattr(params, 'columns_required', None)
    if columns_required is None:
        columns_required = [True] * len(self.columns)
    self.columns_required = columns_required
    if len(self.columns) != len(self.columns_required):
        raise ValueError("Length of parameters 'columns' (%d) and 'columns_required' (%d) "
                         "needs to be equal." % (len(self.columns), len(self.columns_required)))
def init(self):
    """Validate ``repository``, ``regex`` and ``extra_fields`` after the parent init.

    Stores the GitHub contents-API URL built from ``self.repository`` and the
    stripped, comma-split ``self.extra_fields`` (an empty list when it is
    ``None``) in private attributes.

    Raises:
        InvalidArgument: if ``repository`` or ``regex`` is ``None``, if
            ``regex`` cannot be compiled, or if ``extra_fields`` cannot be
            split on commas.
    """
    super().init()

    if self.repository is None:
        raise InvalidArgument('repository', expected='string')
    self.__base_api_url = 'https://api.github.com/repos/{}/contents'.format(self.repository)

    if self.regex is None:
        raise InvalidArgument('regex', expected='string', got=None)
    try:
        # Compiled only to validate the pattern; the result is discarded.
        re.compile(self.regex)
    except Exception:
        raise InvalidArgument('regex', expected='string', got=self.regex)

    if self.extra_fields is None:
        self.__extra_fields = []
        return
    try:
        self.__extra_fields = [x.strip() for x in self.extra_fields.split(',')]
    except Exception:
        raise InvalidArgument('extra_fields', expected='comma-separated list')
def init(self):
    """Validate the ``repository``, ``regex`` and ``extra_fields`` parameters.

    Runs the parent init first. Stores the GitHub contents-API URL built
    from the ``repository`` parameter and the stripped, comma-split
    ``extra_fields`` parameter (an empty list when it is absent) in private
    attributes.

    Raises:
        InvalidArgument: if ``regex`` is absent or does not compile, if
            ``repository`` is absent, or if ``extra_fields`` cannot be split
            on commas. A ``regex`` problem is reported before a missing
            ``repository``.
    """
    super().init()
    params = self.parameters

    has_repository = hasattr(params, 'repository')
    if has_repository:
        self.__base_api_url = 'https://api.github.com/repos/{}/contents'.format(
            getattr(params, 'repository'))

    if not hasattr(params, 'regex'):
        raise InvalidArgument('regex', expected='string', got=None)
    try:
        # Compiled only to validate the pattern; the result is discarded.
        re.compile(getattr(params, 'regex'))
    except Exception:
        raise InvalidArgument('regex', expected='string', got=getattr(params, 'regex'))

    if not has_repository:
        raise InvalidArgument('repository', expected='string')

    if not hasattr(params, 'extra_fields'):
        self.__extra_fields = []
        return
    try:
        self.__extra_fields = [x.strip() for x in getattr(params, 'extra_fields').split(',')]
    except Exception:
        raise InvalidArgument('extra_fields', expected='comma-separated list')
def init(self):
    """Read ``fallback_to_url``, ``gaierrors_to_ignore`` and ``overwrite``.

    ``gaierrors_to_ignore`` may be empty/``None``, a single integer, a
    comma-separated string, or a list/tuple whose items are integers or
    strings holding integers. The parsed codes are appended to the built-in
    tuple ``(-2, -4, -5, -8, -11)`` and stored as ``self.ignore``.

    Raises:
        InvalidArgument: if any entry of ``gaierrors_to_ignore`` cannot be
            converted to an integer.
    """
    # although True is the default value, we leave False here for backwards compatibility
    self.fallback_to_url = getattr(self.parameters, 'fallback_to_url', False)

    raw = getattr(self.parameters, 'gaierrors_to_ignore', ())
    if not raw:  # for null/None/empty lists or strings
        raw = ()
    elif not isinstance(raw, (list, tuple)):
        # str() so that a single integer value is accepted as well
        raw = str(raw).split(',')

    extra = []
    for item in raw:
        # str() first: list items may already be integers, which have no strip()
        text = str(item).strip()
        try:
            extra.append(int(text))
        except ValueError as exc:
            # int() signals unparsable text with ValueError, not TypeError
            raise InvalidArgument(argument='gaierrors_to_ignore', got=text,
                                  expected='int',
                                  docs='the bot documentation.') from exc

    self.ignore = (-2, -4, -5, -8, -11) + tuple(extra)
    self.overwrite = getattr(self.parameters, 'overwrite', False)
def init(self):
    """Check optional dependencies and prepare whitelist, substitutions and URL options.

    * Requires ``url_normalize`` and ``get_tld`` to be importable.
    * Rejects ``default_scheme`` when the installed ``url-normalize`` is
      older than 1.4.1.
    * Tries to refresh the TLD names; a ``TldIOError`` is only logged.
    * Extends ``self._domain_whitelist`` with the comma-separated
      ``domain_whitelist`` and ``self._substitutions`` with ``[old, new]``
      pairs taken from the ``;``-separated ``substitutions``.
    * Falls back to classification type ``'unknown'`` when the configured
      one is not valid.
    * Sets ``self.url_kwargs`` to carry ``default_scheme`` when it is given.

    NOTE(review): ``_domain_whitelist`` and ``_substitutions`` are extended in
    place and are not created here -- confirm they are per-instance lists.

    Raises:
        MissingDependencyError: if ``url_normalize`` or ``get_tld`` is ``None``.
        ValueError: if ``default_scheme`` is set but ``url-normalize`` is
            older than 1.4.1.
        InvalidArgument: if ``substitutions`` does not split into an even
            number of parts.
    """
    if url_normalize is None:
        raise MissingDependencyError("url-normalize")
    url_version = pkg_resources.get_distribution("url-normalize").version
    # Compare parsed versions rather than tuples of int(): a release string
    # with a non-numeric component (e.g. "1.4.3.post1") would otherwise
    # crash the check with a ValueError.
    too_old = (pkg_resources.parse_version(url_version)
               < pkg_resources.parse_version('1.4.1'))
    if too_old and self.default_scheme is not None:
        raise ValueError(
            "Parameter 'default_scheme' given but 'url-normalize' version %r does not support it. "
            "Get at least version '1.4.1'." % url_version)

    if get_tld is None:
        raise MissingDependencyError("tld")
    try:
        update_tld_names()
    except tld.exceptions.TldIOError:
        self.logger.info("Could not update TLD names cache.")

    # Truthiness instead of `!= ''` so that None is treated like "not set".
    if self.domain_whitelist:
        self._domain_whitelist.extend(self.domain_whitelist.split(','))

    if self.substitutions:
        parts = self.substitutions.split(';')
        if len(parts) % 2 != 0:
            raise InvalidArgument(
                'substitutions',
                got=self.substitutions,
                expected="even number of ; separated strings")
        # Consecutive items form one [search, replacement] pair.
        self._substitutions.extend(
            [old, new] for old, new in zip(parts[::2], parts[1::2]))

    if not ClassificationType.is_valid(self.classification_type):
        self.classification_type = 'unknown'

    if self.default_scheme is not None:
        self.url_kwargs = {'default_scheme': self.default_scheme}
    else:
        self.url_kwargs = {}
def init(self):
    """Load and validate the HTML table parser settings from ``self.parameters``.

    Sets ``columns``, ``ignore_values``, ``table_index``, ``attr_name``,
    ``attr_value``, ``skip_head``, ``skip_row``, ``split_column``,
    ``split_separator``, ``split_index``, ``time_format`` and
    ``default_url_protocol`` on the instance.

    ``columns`` and ``ignore_values`` may be lists or comma-separated
    strings. ``ignore_values`` defaults to one empty string per column when
    it is missing or ``None``.

    Raises:
        ValueError: if ``bs`` is ``None`` (beautifulsoup4 not importable) or
            if ``columns`` and ``ignore_values`` differ in length.
        InvalidArgument: if ``time_format`` is not a key of
            ``TIME_CONVERSIONS``.
    """
    if bs is None:
        raise ValueError("Could not import 'beautifulsoup4'. Please install it.")

    params = self.parameters

    columns = params.columns
    # convert columns to an array
    if isinstance(columns, str):
        columns = [column.strip() for column in columns.split(",")]
    self.columns = columns

    # An explicit null is treated like a missing parameter instead of
    # failing in len() below.
    ignore_values = getattr(params, "ignore_values", None)
    if ignore_values is None:
        ignore_values = len(columns) * ['']
    elif isinstance(ignore_values, str):
        ignore_values = [value.strip() for value in ignore_values.split(",")]
    self.ignore_values = ignore_values

    if len(self.columns) != len(self.ignore_values):
        raise ValueError("Length of parameters 'columns' and 'ignore_values' is not equal.")

    self.table_index = getattr(params, "table_index", 0)
    self.attr_name = getattr(params, "attribute_name", None)
    self.attr_value = getattr(params, "attribute_value", None)
    self.skip_head = getattr(params, "skip_table_head", True)
    # Number of leading rows to skip: one when the table head is skipped.
    self.skip_row = 1 if self.skip_head else 0
    self.split_column = getattr(params, "split_column", None)
    self.split_separator = getattr(params, "split_separator", None)
    self.split_index = getattr(params, "split_index", 0)

    self.time_format = getattr(params, "time_format", None)
    if self.time_format not in TIME_CONVERSIONS:
        raise InvalidArgument('time_format', got=self.time_format,
                              expected=list(TIME_CONVERSIONS.keys()),
                              docs='docs/Bots.md')

    self.default_url_protocol = getattr(params, 'default_url_protocol', 'http://')
def init(self):
    """Validate ``self.field`` and load the public suffix list.

    Opens ``self.suffix_file`` as UTF-8 and builds ``self.psl`` from it with
    ``only_icann=True``.

    Raises:
        InvalidArgument: if ``self.field`` is not one of ``ALLOWED_FIELDS``.
    """
    if self.field not in ALLOWED_FIELDS:
        # Report the attribute that was actually checked; the message used
        # to name a non-matching argument ('key').
        raise InvalidArgument('field', got=self.field, expected=ALLOWED_FIELDS)

    with codecs.open(self.suffix_file, encoding='UTF-8') as file_handle:
        self.psl = PublicSuffixList(source=file_handle, only_icann=True)
def init(self):
    """Normalise and validate the parser settings stored on the instance.

    * ``columns``: a comma-separated string is split into a stripped list.
    * ``type_translation`` / ``data_type``: JSON text is decoded, an already
      decoded object is kept, an empty value becomes ``{}``.
    * ``column_regex_search``: falsy values become ``{}``.
    * ``time_format``: falsy values become ``None`` before validation.
    * ``columns_required``: ``None`` becomes ``True`` for every column.
    * ``compose``: set from ``compose_fields``, ``{}`` when falsy.

    Raises:
        InvalidArgument: if ``time_format`` is not a key of
            ``TIME_CONVERSIONS``, or if ``filter_type`` is set to anything
            other than ``'blacklist'`` or ``'whitelist'``.
        ValueError: if ``columns`` and ``columns_required`` differ in length.
    """
    # convert columns to an array
    if isinstance(self.columns, str):
        self.columns = [column.strip() for column in self.columns.split(",")]

    if not self.type_translation:  # empty string
        self.type_translation = {}
    elif isinstance(self.type_translation, str):  # not-empty string
        self.type_translation = json.loads(self.type_translation)

    # Same leniency as type_translation: an already decoded object is used
    # as is instead of being handed to json.loads (which would raise).
    if not self.data_type:
        self.data_type = {}
    elif isinstance(self.data_type, (str, bytes, bytearray)):
        self.data_type = json.loads(self.data_type)

    # prevents empty strings:
    self.column_regex_search = self.column_regex_search or {}

    # handle empty strings, false etc.
    if not self.time_format:
        self.time_format = None
    if self.time_format not in TIME_CONVERSIONS:
        raise InvalidArgument('time_format', got=self.time_format,
                              expected=list(TIME_CONVERSIONS.keys()),
                              docs=DOCS)

    if self.filter_type and self.filter_type not in ('blacklist', 'whitelist'):
        raise InvalidArgument('filter_type', got=self.filter_type,
                              expected=("blacklist", "whitelist"),
                              docs=DOCS)

    if self.columns_required is None:
        self.columns_required = [True] * len(self.columns)
    if len(self.columns) != len(self.columns_required):
        raise ValueError(
            "Length of parameters 'columns' (%d) and 'columns_required' (%d) "
            "needs to be equal." % (len(self.columns), len(self.columns_required)))

    self.compose = self.compose_fields or {}
def init(self):
    """Load and validate the HTML table parser settings from ``self.parameters``.

    Sets ``columns``, ``ignore_values``, ``table_index``, ``attr_name``,
    ``attr_value``, ``skip_head``, ``skip_row``, ``split_column``,
    ``split_separator``, ``split_index``, ``time_format``,
    ``default_url_protocol`` and ``parser`` on the instance.

    ``columns`` and ``ignore_values`` may be lists or comma-separated
    strings. ``ignore_values`` defaults to one empty string per column when
    it is missing or ``None``.

    Raises:
        MissingDependencyError: if ``bs`` is ``None``.
        ValueError: if ``columns`` and ``ignore_values`` differ in length.
        InvalidArgument: if ``time_format`` is set and the part before the
            first ``|`` is not a key of ``DateTime.TIME_CONVERSIONS``.
    """
    if bs is None:
        raise MissingDependencyError("beautifulsoup4")

    params = self.parameters

    columns = params.columns
    # convert columns to an array
    if isinstance(columns, str):
        columns = [column.strip() for column in columns.split(",")]
    self.columns = columns

    # An explicit null is treated like a missing parameter instead of
    # failing in len() below.
    ignore_values = getattr(params, "ignore_values", None)
    if ignore_values is None:
        ignore_values = len(columns) * ['']
    elif isinstance(ignore_values, str):
        ignore_values = [value.strip() for value in ignore_values.split(",")]
    self.ignore_values = ignore_values

    if len(self.columns) != len(self.ignore_values):
        raise ValueError(
            "Length of parameters 'columns' and 'ignore_values' is not equal."
        )

    self.table_index = getattr(params, "table_index", 0)
    self.attr_name = getattr(params, "attribute_name", None)
    self.attr_value = getattr(params, "attribute_value", None)
    self.skip_head = getattr(params, "skip_table_head", True)
    # Number of leading rows to skip: one when the table head is skipped.
    self.skip_row = 1 if self.skip_head else 0
    self.split_column = getattr(params, "split_column", None)
    self.split_separator = getattr(params, "split_separator", None)
    self.split_index = getattr(params, "split_index", 0)

    self.time_format = getattr(params, "time_format", None)
    # Only the part before the first '|' names the conversion.
    if self.time_format and self.time_format.split('|')[0] not in DateTime.TIME_CONVERSIONS:
        raise InvalidArgument(
            'time_format',
            got=self.time_format,
            expected=list(DateTime.TIME_CONVERSIONS.keys()),
            docs='https://intelmq.readthedocs.io/en/latest/guides/Bots.html#html-table-parser'
        )

    self.default_url_protocol = getattr(params, 'default_url_protocol', 'http://')
    self.parser = getattr(params, 'html_parser', 'html.parser')
def init(self):
    """Build ``self.ignore`` from ``self.gaierrors_to_ignore``.

    ``gaierrors_to_ignore`` may be empty/``None``, a single integer, a
    comma-separated string, or a list/tuple whose items are integers or
    strings holding integers. The parsed codes are appended to the built-in
    tuple ``(-2, -4, -5, -8, -11)``.

    Raises:
        InvalidArgument: if any entry cannot be converted to an integer.
    """
    raw = self.gaierrors_to_ignore
    if not raw:  # for null/None/empty lists or strings
        raw = ()
    elif not isinstance(raw, (list, tuple)):
        # convert to str to support int-input, e.g. a single value
        raw = str(raw).split(',')

    extra = []
    for item in raw:
        # str() first: list items may already be integers, which have no strip()
        text = str(item).strip()
        try:
            extra.append(int(text))
        except ValueError as exc:
            # int() signals unparsable text with ValueError, not TypeError
            raise InvalidArgument(argument='gaierrors_to_ignore', got=text,
                                  expected='int',
                                  docs='the bot documentation.') from exc

    self.ignore = (-2, -4, -5, -8, -11) + tuple(extra)
def init(self):
    """Check optional dependencies and read whitelist, substitutions and type.

    * Requires ``url_normalize`` and ``get_tld`` to be importable, then
      calls ``update_tld_names()``.
    * ``self.domain_whitelist``: list built from the comma-separated
      ``domain_whitelist`` parameter (empty when unset).
    * ``self.substitutions``: list of ``[old, new]`` pairs taken from the
      ``;``-separated ``substitutions`` parameter (empty when unset).
    * ``self.classification_type``: the configured value, or ``'unknown'``
      when missing or not valid.

    Raises:
        ValueError: if ``url_normalize`` or ``get_tld`` is ``None``.
        InvalidArgument: if ``substitutions`` does not split into an even
            number of parts.
    """
    if url_normalize is None:
        raise ValueError("Could not import 'url-normalize'. Please install it.")
    if get_tld is None:
        raise ValueError("Could not import 'tld'. Please install it.")
    update_tld_names()

    self.domain_whitelist = []
    # Truthiness instead of `!= ''` so that None is treated like "not set".
    whitelist = getattr(self.parameters, "domain_whitelist", '')
    if whitelist:
        self.domain_whitelist.extend(whitelist.split(','))

    self.substitutions = []
    raw_substitutions = getattr(self.parameters, "substitutions", '')
    if raw_substitutions:
        parts = raw_substitutions.split(';')
        if len(parts) % 2 != 0:
            raise InvalidArgument(
                'substitutions',
                got=raw_substitutions,
                expected="even number of ; separated strings")
        # Consecutive items form one [search, replacement] pair.
        self.substitutions.extend(
            [old, new] for old, new in zip(parts[::2], parts[1::2]))

    self.classification_type = getattr(self.parameters, "classification_type", "unknown")
    if not ClassificationType.is_valid(self.classification_type):
        self.classification_type = 'unknown'
def init(self):
    """Normalise and validate the HTML table parser settings on the instance.

    ``columns`` and ``ignore_values`` may be lists or comma-separated
    strings. ``ignore_values`` defaults to one empty string per column when
    it is ``None``. Also sets ``attr_name``, ``attr_value``, ``skip_head``
    and ``skip_row`` from ``attribute_name``, ``attribute_value`` and
    ``skip_table_head``.

    Raises:
        MissingDependencyError: if ``bs`` is ``None``.
        ValueError: if ``columns`` and ``ignore_values`` differ in length.
        InvalidArgument: if ``time_format`` is set and the part before the
            first ``|`` is not a key of ``DateTime.TIME_CONVERSIONS``.
    """
    if bs is None:
        raise MissingDependencyError("beautifulsoup4")

    # convert columns to an array
    if isinstance(self.columns, str):
        self.columns = [column.strip() for column in self.columns.split(",")]

    if self.ignore_values is None:
        self.ignore_values = len(self.columns) * ['']
    elif isinstance(self.ignore_values, str):
        self.ignore_values = [value.strip() for value in self.ignore_values.split(",")]

    if len(self.columns) != len(self.ignore_values):
        raise ValueError("Length of parameters 'columns' and 'ignore_values' is not equal.")

    self.attr_name = self.attribute_name
    self.attr_value = self.attribute_value
    self.skip_head = self.skip_table_head
    # Number of leading rows to skip: one when the table head is skipped.
    self.skip_row = 1 if self.skip_head else 0

    # Only the part before the first '|' names the conversion.
    if self.time_format and self.time_format.split('|')[0] not in DateTime.TIME_CONVERSIONS:
        raise InvalidArgument('time_format', got=self.time_format,
                              expected=list(DateTime.TIME_CONVERSIONS.keys()),
                              docs='https://intelmq.readthedocs.io/en/latest/guides/Bots.html#html-table-parser')
def init(self):
    """Run the parent init, then require a non-empty ``attach_regex`` parameter.

    Raises:
        InvalidArgument: if ``self.parameters`` has no ``attach_regex``
            attribute or its value is falsy.
    """
    super().init()
    attach_regex = getattr(self.parameters, 'attach_regex', None)
    if attach_regex:
        return
    raise InvalidArgument('attach_regex', expected='string')
def init(self):
    """Run the parent init, then require ``self.attach_regex`` to be set.

    Raises:
        InvalidArgument: if ``self.attach_regex`` is ``None``.
    """
    super().init()
    if self.attach_regex is not None:
        return
    raise InvalidArgument('attach_regex', expected='string')