Example #1
0
    def init(self):
        """Load the parser settings from ``self.parameters`` and validate them.

        Sets ``columns``, ``type_translation``, ``data_type``,
        ``column_regex_search``, ``time_format``, ``filter_text`` and
        ``filter_type`` on the instance.

        Raises:
            InvalidArgument: if ``time_format`` is not a key of
                ``TIME_CONVERSIONS``, or if ``filter_type`` is truthy and
                neither ``blacklist`` nor ``whitelist``.
        """
        params = self.parameters

        # A plain string is interpreted as a comma-separated list of names.
        columns = params.columns
        if type(columns) is str:
            columns = [name.strip() for name in columns.split(",")]
        self.columns = columns

        # Both settings are JSON documents; missing or empty values parse as {}.
        raw_translation = getattr(params, 'type_translation', None)
        self.type_translation = json.loads(raw_translation or '{}')
        raw_data_type = getattr(params, 'data_type', None)
        self.data_type = json.loads(raw_data_type or '{}')

        # Falsy values such as an empty string collapse to an empty dict.
        regex_search = getattr(params, 'column_regex_search', None)
        self.column_regex_search = regex_search if regex_search else {}

        self.time_format = getattr(params, "time_format", None)
        if self.time_format not in TIME_CONVERSIONS.keys():
            raise InvalidArgument(
                'time_format',
                got=self.time_format,
                expected=list(TIME_CONVERSIONS.keys()),
                docs='docs/Bots.md',
            )

        self.filter_text = getattr(params, 'filter_text', None)
        self.filter_type = getattr(params, 'filter_type', None)
        filter_modes = ('blacklist', 'whitelist')
        if self.filter_type and self.filter_type not in filter_modes:
            raise InvalidArgument(
                'filter_type',
                got=self.filter_type,
                expected=("blacklist", "whitelist"),
                docs='docs/Bots.md',
            )
Example #2
0
    def init(self):
        """Load the parser settings from ``self.parameters`` and validate them.

        Sets ``columns``, ``type_translation``, ``data_type``,
        ``column_regex_search``, ``time_format``, ``filter_text``,
        ``filter_type`` and ``columns_required`` on the instance.

        Raises:
            InvalidArgument: if ``time_format`` is not a key of
                ``TIME_CONVERSIONS``, or if ``filter_type`` is truthy and
                neither ``blacklist`` nor ``whitelist``.
            ValueError: if ``columns`` and ``columns_required`` differ in
                length.
        """
        params = self.parameters

        # A plain string is interpreted as a comma-separated list of names.
        columns = params.columns
        if type(columns) is str:
            columns = [name.strip() for name in columns.split(",")]
        self.columns = columns

        # type_translation may already be a mapping or still be a JSON string.
        translation = getattr(params, 'type_translation', {})
        if not translation:  # empty string, None, empty mapping
            translation = {}
        elif isinstance(translation, str):  # non-empty JSON string
            translation = json.loads(translation)
        self.type_translation = translation

        raw_data_type = getattr(params, 'data_type', None)
        self.data_type = json.loads(raw_data_type or '{}')

        # Falsy values such as an empty string collapse to an empty dict.
        regex_search = getattr(params, 'column_regex_search', None)
        self.column_regex_search = regex_search if regex_search else {}

        self.time_format = getattr(params, "time_format", None)
        if self.time_format not in TIME_CONVERSIONS.keys():
            raise InvalidArgument(
                'time_format',
                got=self.time_format,
                expected=list(TIME_CONVERSIONS.keys()),
                docs='docs/Bots.md',
            )

        self.filter_text = getattr(params, 'filter_text', None)
        self.filter_type = getattr(params, 'filter_type', None)
        filter_modes = ('blacklist', 'whitelist')
        if self.filter_type and self.filter_type not in filter_modes:
            raise InvalidArgument(
                'filter_type',
                got=self.filter_type,
                expected=("blacklist", "whitelist"),
                docs='docs/Bots.md',
            )

        # Without an explicit setting every column counts as required.
        all_required = [True] * len(self.columns)
        self.columns_required = getattr(params, 'columns_required', all_required)
        column_count = len(self.columns)
        required_count = len(self.columns_required)
        if column_count != required_count:
            raise ValueError("Length of parameters 'columns' (%d) and 'columns_required' (%d) "
                             "needs to be equal." % (column_count, required_count))
    def init(self):
        """Validate ``repository``, ``regex`` and ``extra_fields``.

        Runs the parent ``init`` first, then stores the GitHub contents API
        URL for the configured repository and the list of extra field names.

        Raises:
            InvalidArgument: if ``repository`` or ``regex`` is ``None``, if
                ``regex`` cannot be compiled, or if ``extra_fields`` cannot
                be split into a comma-separated list.
        """
        super().init()

        if self.repository is None:
            raise InvalidArgument('repository', expected='string')
        self.__base_api_url = 'https://api.github.com/repos/{}/contents'.format(
            self.repository)

        if self.regex is None:
            raise InvalidArgument('regex', expected='string', got=None)
        try:
            # Compiled only to verify the pattern; the result is discarded.
            re.compile(self.regex)
        except Exception:
            raise InvalidArgument('regex',
                                  expected='string',
                                  got=self.regex)

        if self.extra_fields is None:
            self.__extra_fields = []
        else:
            try:
                self.__extra_fields = [
                    field.strip() for field in self.extra_fields.split(',')
                ]
            except Exception:
                raise InvalidArgument('extra_fields',
                                      expected='comma-separated list')
Example #4
0
 def init(self):
     """Validate the ``repository``, ``regex`` and ``extra_fields`` parameters.

     Runs the parent ``init`` first, then stores the GitHub contents API URL
     for the configured repository and the list of extra field names.

     Raises:
         InvalidArgument: if ``regex`` is missing or cannot be compiled, if
             ``repository`` is missing, or if ``extra_fields`` cannot be
             split into a comma-separated list. The ``regex`` checks run
             before the missing-``repository`` check.
     """
     super().init()
     params = self.parameters

     has_repository = hasattr(params, 'repository')
     if has_repository:
         self.__base_api_url = 'https://api.github.com/repos/{}/contents'.format(
             params.repository)

     if not hasattr(params, 'regex'):
         raise InvalidArgument('regex', expected='string', got=None)
     try:
         # Compiled only to verify the pattern; the result is discarded.
         re.compile(params.regex)
     except Exception:
         raise InvalidArgument('regex',
                               expected='string',
                               got=params.regex)

     if not has_repository:
         raise InvalidArgument('repository', expected='string')

     if not hasattr(params, 'extra_fields'):
         self.__extra_fields = []
     else:
         try:
             self.__extra_fields = [
                 field.strip() for field in params.extra_fields.split(',')
             ]
         except Exception:
             raise InvalidArgument('extra_fields',
                                   expected='comma-separated list')
Example #5
0
    def init(self):
        """Read ``fallback_to_url``, ``gaierrors_to_ignore`` and ``overwrite``.

        ``gaierrors_to_ignore`` may be missing/empty, a single integer, a
        comma-separated string, or a list/tuple whose entries are integers
        or numeric strings. The parsed integers are appended to a fixed set
        of codes and stored in ``self.ignore``.

        Raises:
            InvalidArgument: if an entry of ``gaierrors_to_ignore`` cannot be
                interpreted as an integer.
        """
        params = self.parameters
        # although True is the default value, we leave False here for backwards compatibility
        self.fallback_to_url = getattr(params, 'fallback_to_url', False)

        ignore = getattr(params, 'gaierrors_to_ignore', ())
        if not ignore:  # for null/None/empty lists or strings
            ignore = ()
        elif not isinstance(ignore, (list, tuple)):
            # str() also covers a single integer given instead of a string
            ignore = str(ignore).split(',')

        extra_codes = []
        for entry in ignore:
            # Entries may be ints (from a list) or strings with padding.
            text = str(entry).strip()
            try:
                extra_codes.append(int(text))
            except (TypeError, ValueError) as exc:
                # int() reports malformed literals as ValueError, not TypeError
                raise InvalidArgument(argument='gaierrors_to_ignore',
                                      got=text,
                                      expected='int',
                                      docs='the bot documentation.') from exc

        self.ignore = (-2, -4, -5, -8, -11) + tuple(extra_codes)
        self.overwrite = getattr(params, 'overwrite', False)
Example #6
0
    def init(self):
        """Check the optional dependencies and prepare the bot's settings.

        Verifies that ``url_normalize`` and ``get_tld`` were imported, tries
        to refresh the TLD names, fills ``_domain_whitelist`` and
        ``_substitutions`` from their string parameters, falls back to the
        ``unknown`` classification type for invalid values and builds
        ``url_kwargs``.

        Raises:
            MissingDependencyError: if ``url_normalize`` or ``get_tld`` is
                ``None``.
            ValueError: if ``default_scheme`` is set but the installed
                ``url-normalize`` version is older than 1.4.1.
            InvalidArgument: if ``substitutions`` does not contain an even
                number of ``;``-separated strings.
        """
        if url_normalize is None:
            raise MissingDependencyError("url-normalize")
        url_version = pkg_resources.get_distribution("url-normalize").version
        installed = tuple(int(part) for part in url_version.split('.'))
        if installed < (1, 4, 1) and self.default_scheme is not None:
            raise ValueError(
                "Parameter 'default_scheme' given but 'url-normalize' version %r does not support it. "
                "Get at least version '1.4.1'." % url_version)

        if get_tld is None:
            raise MissingDependencyError("tld")
        try:
            update_tld_names()
        except tld.exceptions.TldIOError:
            # A failed refresh is only logged; initialisation continues.
            self.logger.info("Could not update TLD names cache.")

        if self.domain_whitelist != '':
            self._domain_whitelist.extend(self.domain_whitelist.split(','))

        if self.substitutions != '':
            parts = self.substitutions.split(';')
            if len(parts) % 2 != 0:
                raise InvalidArgument(
                    'substitutions',
                    got=self.substitutions,
                    expected="even number of ; separated strings")
            # Consecutive entries are stored together as two-element lists.
            for first, second in zip(parts[0::2], parts[1::2]):
                self._substitutions.append([first, second])

        if not ClassificationType.is_valid(self.classification_type):
            self.classification_type = 'unknown'

        scheme = self.default_scheme
        self.url_kwargs = {} if scheme is None else {'default_scheme': scheme}
Example #7
0
    def init(self):
        """Load the table-parser settings from ``self.parameters``.

        Raises:
            ValueError: if ``bs`` is ``None`` (beautifulsoup4 could not be
                imported) or if ``columns`` and ``ignore_values`` differ in
                length.
            InvalidArgument: if ``time_format`` is not a key of
                ``TIME_CONVERSIONS``.
        """
        if bs is None:
            raise ValueError("Could not import 'beautifulsoup4'. Please install it.")

        params = self.parameters

        # A plain string is interpreted as a comma-separated list of names.
        columns = params.columns
        if type(columns) is str:
            columns = [name.strip() for name in columns.split(",")]
        self.columns = columns

        # Default: one empty string per column.
        ignore_values = getattr(params, "ignore_values", len(self.columns) * [''])
        if type(ignore_values) is str:
            ignore_values = [entry.strip() for entry in ignore_values.split(",")]
        self.ignore_values = ignore_values

        if len(self.columns) != len(self.ignore_values):
            raise ValueError("Length of parameters 'columns' and 'ignore_values' is not equal.")

        self.table_index = getattr(params, "table_index", 0)
        self.attr_name = getattr(params, "attribute_name", None)
        self.attr_value = getattr(params, "attribute_value", None)
        self.skip_head = getattr(params, "skip_table_head", True)
        if self.skip_head:
            self.skip_row = 1
        else:
            self.skip_row = 0
        self.split_column = getattr(params, "split_column", None)
        self.split_separator = getattr(params, "split_separator", None)
        self.split_index = getattr(params, "split_index", 0)

        self.time_format = getattr(params, "time_format", None)
        if self.time_format not in TIME_CONVERSIONS.keys():
            raise InvalidArgument(
                'time_format',
                got=self.time_format,
                expected=list(TIME_CONVERSIONS.keys()),
                docs='docs/Bots.md',
            )
        self.default_url_protocol = getattr(params, 'default_url_protocol', 'http://')
Example #8
0
 def init(self):
     """Validate ``field`` and load the public suffix list into ``self.psl``.

     Raises:
         InvalidArgument: if ``self.field`` is not in ``ALLOWED_FIELDS``.
     """
     field_is_allowed = self.field in ALLOWED_FIELDS
     if not field_is_allowed:
         raise InvalidArgument(
             'key',
             got=self.field,
             expected=ALLOWED_FIELDS,
         )

     # The suffix file is read as UTF-8 while the list object is built.
     with codecs.open(self.suffix_file, encoding='UTF-8') as suffix_source:
         self.psl = PublicSuffixList(source=suffix_source, only_icann=True)
Example #9
0
    def init(self):
        """Normalise and validate the parser settings stored on the instance.

        Converts string-valued settings into their structured form, replaces
        empty values with defaults and checks ``time_format``,
        ``filter_type`` and ``columns_required``.

        Raises:
            InvalidArgument: if ``time_format`` is not a key of
                ``TIME_CONVERSIONS``, or if ``filter_type`` is truthy and
                neither ``blacklist`` nor ``whitelist``.
            ValueError: if ``columns`` and ``columns_required`` differ in
                length.
        """
        # A plain string is interpreted as a comma-separated list of names.
        if type(self.columns) is str:
            self.columns = [name.strip() for name in self.columns.split(",")]

        # type_translation may already be a mapping or still be a JSON string.
        if not self.type_translation:  # empty string, None, empty mapping
            self.type_translation = {}
        elif isinstance(self.type_translation, str):  # non-empty JSON string
            self.type_translation = json.loads(self.type_translation)
        self.data_type = json.loads(self.data_type or '{}')

        # Falsy values such as an empty string collapse to an empty dict.
        regex_search = self.column_regex_search
        self.column_regex_search = regex_search if regex_search else {}

        # Empty strings, False etc. are treated like "not configured".
        if not self.time_format:
            self.time_format = None
        if self.time_format not in TIME_CONVERSIONS.keys():
            raise InvalidArgument(
                'time_format',
                got=self.time_format,
                expected=list(TIME_CONVERSIONS.keys()),
                docs=DOCS,
            )

        filter_modes = ('blacklist', 'whitelist')
        if self.filter_type and self.filter_type not in filter_modes:
            raise InvalidArgument(
                'filter_type',
                got=self.filter_type,
                expected=("blacklist", "whitelist"),
                docs=DOCS,
            )

        # Without an explicit setting every column counts as required.
        if self.columns_required is None:
            self.columns_required = [True] * len(self.columns)
        column_count = len(self.columns)
        required_count = len(self.columns_required)
        if column_count != required_count:
            raise ValueError(
                "Length of parameters 'columns' (%d) and 'columns_required' (%d) "
                "needs to be equal." % (column_count, required_count))

        compose_fields = self.compose_fields
        self.compose = compose_fields if compose_fields else {}
Example #10
0
    def init(self):
        """Load the table-parser settings from ``self.parameters``.

        Raises:
            MissingDependencyError: if ``bs`` is ``None``.
            ValueError: if ``columns`` and ``ignore_values`` differ in
                length.
            InvalidArgument: if ``time_format`` is set and the part before
                the first ``|`` is not a key of
                ``DateTime.TIME_CONVERSIONS``.
        """
        if bs is None:
            raise MissingDependencyError("beautifulsoup4")

        params = self.parameters

        # A plain string is interpreted as a comma-separated list of names.
        columns = params.columns
        if type(columns) is str:
            columns = [name.strip() for name in columns.split(",")]
        self.columns = columns

        # Default: one empty string per column.
        ignore_values = getattr(params, "ignore_values", len(self.columns) * [''])
        if type(ignore_values) is str:
            ignore_values = [entry.strip() for entry in ignore_values.split(",")]
        self.ignore_values = ignore_values

        if len(self.columns) != len(self.ignore_values):
            raise ValueError(
                "Length of parameters 'columns' and 'ignore_values' is not equal."
            )

        self.table_index = getattr(params, "table_index", 0)
        self.attr_name = getattr(params, "attribute_name", None)
        self.attr_value = getattr(params, "attribute_value", None)
        self.skip_head = getattr(params, "skip_table_head", True)
        if self.skip_head:
            self.skip_row = 1
        else:
            self.skip_row = 0
        self.split_column = getattr(params, "split_column", None)
        self.split_separator = getattr(params, "split_separator", None)
        self.split_index = getattr(params, "split_index", 0)

        self.time_format = getattr(params, "time_format", None)
        if self.time_format:
            # Only the part before the first '|' is checked.
            conversion_name = self.time_format.split('|')[0]
            if conversion_name not in DateTime.TIME_CONVERSIONS.keys():
                raise InvalidArgument(
                    'time_format',
                    got=self.time_format,
                    expected=list(DateTime.TIME_CONVERSIONS.keys()),
                    docs='https://intelmq.readthedocs.io/en/latest/guides/Bots.html#html-table-parser',
                )
        self.default_url_protocol = getattr(params, 'default_url_protocol', 'http://')

        self.parser = getattr(params, 'html_parser', 'html.parser')
Example #11
0
    def init(self):
        """Parse ``gaierrors_to_ignore`` into the ``self.ignore`` tuple.

        ``gaierrors_to_ignore`` may be empty/``None``, a single integer, a
        comma-separated string, or a list/tuple whose entries are integers
        or numeric strings. The parsed integers are appended to a fixed set
        of codes.

        Raises:
            InvalidArgument: if an entry cannot be interpreted as an integer.
        """
        ignore = self.gaierrors_to_ignore
        if not ignore:  # for null/None/empty lists or strings
            ignore = ()
        elif not isinstance(ignore, (list, tuple)):
            # convert to str to support int-input, e.g. a single value
            ignore = str(ignore).split(',')

        extra_codes = []
        for entry in ignore:
            # Entries may be ints (from a list) or strings with padding.
            text = str(entry).strip()
            try:
                extra_codes.append(int(text))
            except (TypeError, ValueError) as exc:
                # int() reports malformed literals as ValueError, not TypeError
                raise InvalidArgument(argument='gaierrors_to_ignore',
                                      got=text,
                                      expected='int',
                                      docs='the bot documentation.') from exc

        self.ignore = (-2, -4, -5, -8, -11) + tuple(extra_codes)
Example #12
0
 def init(self):
     """Check the optional dependencies and read the bot's parameters.

     Verifies that ``url_normalize`` and ``get_tld`` were imported, refreshes
     the TLD names, and fills ``domain_whitelist``, ``substitutions`` and
     ``classification_type`` from ``self.parameters``.

     Raises:
         ValueError: if ``url_normalize`` or ``get_tld`` is ``None``.
         InvalidArgument: if ``substitutions`` does not contain an even
             number of ``;``-separated strings.
     """
     if url_normalize is None:
         raise ValueError("Could not import 'url-normalize'. Please install it.")
     if get_tld is None:
         raise ValueError("Could not import 'tld'. Please install it.")
     update_tld_names()

     self.domain_whitelist = []
     if getattr(self.parameters, "domain_whitelist", '') != '':
         self.domain_whitelist.extend(self.parameters.domain_whitelist.split(','))

     self.substitutions = []
     if getattr(self.parameters, "substitutions", '') != '':
         parts = self.parameters.substitutions.split(';')
         if len(parts) % 2 != 0:
             raise InvalidArgument(
                 'substitutions',
                 got=self.parameters.substitutions,
                 expected="even number of ; separated strings")
         # Consecutive entries are stored together as two-element lists.
         for first, second in zip(parts[0::2], parts[1::2]):
             self.substitutions.append([first, second])

     self.classification_type = getattr(self.parameters, "classification_type", "unknown")
     # Invalid classification types silently fall back to 'unknown'.
     if not ClassificationType.is_valid(self.classification_type):
         self.classification_type = 'unknown'
Example #13
0
    def init(self):
        """Normalise and validate the table-parser settings on the instance.

        Raises:
            MissingDependencyError: if ``bs`` is ``None``.
            ValueError: if ``columns`` and ``ignore_values`` differ in
                length.
            InvalidArgument: if ``time_format`` is set and the part before
                the first ``|`` is not a key of
                ``DateTime.TIME_CONVERSIONS``.
        """
        if bs is None:
            raise MissingDependencyError("beautifulsoup4")

        # A plain string is interpreted as a comma-separated list of names.
        if type(self.columns) is str:
            self.columns = [name.strip() for name in self.columns.split(",")]

        # Default: one empty string per column.
        if self.ignore_values is None:
            self.ignore_values = len(self.columns) * ['']
        if type(self.ignore_values) is str:
            self.ignore_values = [entry.strip() for entry in self.ignore_values.split(",")]

        if len(self.columns) != len(self.ignore_values):
            raise ValueError("Length of parameters 'columns' and 'ignore_values' is not equal.")

        # Shorter internal aliases for the configured parameter names.
        self.attr_name = self.attribute_name
        self.attr_value = self.attribute_value
        self.skip_head = self.skip_table_head
        if self.skip_head:
            self.skip_row = 1
        else:
            self.skip_row = 0

        if self.time_format:
            # Only the part before the first '|' is checked.
            conversion_name = self.time_format.split('|')[0]
            if conversion_name not in DateTime.TIME_CONVERSIONS.keys():
                raise InvalidArgument(
                    'time_format',
                    got=self.time_format,
                    expected=list(DateTime.TIME_CONVERSIONS.keys()),
                    docs='https://intelmq.readthedocs.io/en/latest/guides/Bots.html#html-table-parser',
                )
 def init(self):
     """Run the parent ``init`` and require the ``attach_regex`` parameter.

     Raises:
         InvalidArgument: if ``attach_regex`` is missing from
             ``self.parameters`` or has a falsy value.
     """
     super().init()
     attach_regex = getattr(self.parameters, 'attach_regex', None)
     if attach_regex:
         return
     raise InvalidArgument('attach_regex', expected='string')
Example #15
0
 def init(self):
     """Run the parent ``init`` and require ``attach_regex`` to be set.

     Raises:
         InvalidArgument: if ``self.attach_regex`` is ``None``.
     """
     super().init()
     if self.attach_regex is not None:
         return
     raise InvalidArgument('attach_regex', expected='string')