Example #1
0
    def __init__(self, model: dict):
        """Build the model from an inline declaration or an external one.

        :param model: either an inline declaration (its "name", "fields" and
            "mappers" entries are read through ``gt``) or a string that is
            handed to ``detect_actions`` to be resolved. A falsy value
            leaves the instance untouched.
        :raises CrawlinoValueError: when the external reference resolves to
            nothing, or when an inline declaration has no 'name'.
        """

        # Load external
        if model:
            if isinstance(model, str):
                _model = detect_actions(model)

                # Validate the *resolved* value. 'model' itself is known to
                # be truthy here, so testing it could never detect a failed
                # resolution.
                if not _model:
                    raise CrawlinoValueError("Invalid model input values",
                                             exc_info=True,
                                             extra={"input_model": model})
                else:
                    # NOTE(review): the resolved model is only bound to the
                    # local name; no attribute is populated from it in this
                    # branch -- confirm whether it should fall through to
                    # the inline parsing below.
                    model = _model

            # Inline declaration
            else:

                self.name = gt(model, "name", None)

                if not self.name:
                    # The property being checked is 'name', so the message
                    # must point the user at 'name'.
                    raise CrawlinoValueError("Error in Models: Models must "
                                             "have 'name' property.")

                self.fields = CMModelsFields(gt(model, "fields", None))
                self.mappers = CMModelsMappers(gt(model, "mappers", None))
Example #2
0
    def __init__(self, type: str, config: Dict or None, name: str = None):
        """Store the hook definition and check that its type is registered.

        :param type: module type, looked up in the "hooks" group of
            ``CrawlinoModulesStore``.
        :param config: optional configuration; a falsy value is replaced by
            an empty dict.
        :param name: optional name; a falsy value is stored as "".
        :raises CrawlinoValueError: when no "hooks" module is found for
            ``type``.
        """
        self.type = type
        self.name = name or ""
        # A missing config is normalised to {}, so 'self.config' can never be
        # None afterwards; the former "is None" check was unreachable and has
        # been dropped.
        self.config = config or {}

        if CrawlinoModulesStore.find_module("hooks", self.type) is None:
            raise CrawlinoValueError("Invalid 'type' property value",
                                     exc_info=True,
                                     extra={
                                         "given_source_type": self.type
                                     })
Example #3
0
    def __init__(self, type: str, config: Dict or None, name: str = None):
        """Store the step definition and check that its type is registered.

        :param type: module type; must be truthy and must be found in the
            ``STEP_EXTRACTOR`` group of ``CrawlinoModulesStore``.
        :param config: optional configuration; a falsy value is replaced by
            an empty dict.
        :param name: optional name; a falsy value is stored as "".
        :raises CrawlinoValueError: when ``type`` is empty or no module is
            registered for it.
        """
        self.type = type
        self.name = name or ""
        # A missing config is normalised to {}, so 'self.config' can never be
        # None afterwards; the former "is None" check was unreachable and has
        # been dropped.
        self.config = config or {}

        if not self.type:
            raise CrawlinoValueError("Config must has the 'type' property")

        if CrawlinoModulesStore.find_module(STEP_EXTRACTOR, self.type) is None:
            raise CrawlinoValueError(
                f"Invalid 'type' property value: "
                f"'{self.type}'",
                exc_info=True,
                extra={"input_type": self.type})
Example #4
0
    def __init__(self, fields: List[dict]):
        """Build one mapper object per entry of ``fields``.

        Each entry must be a dict with exactly one key. That key selects the
        class in ``self.MAPPERS`` and its value is expanded as the keyword
        arguments of that class. Built objects are stored in
        ``self.mappers`` under their ``name`` attribute.

        :param fields: list of single-key dicts; ``None`` or an empty list
            yields no mappers.
        :raises CrawlinoFormatError: when an entry does not have exactly one
            key.
        :raises CrawlinoValueError: when the key is not a known mapper or the
            mapper rejects the given properties.
        """
        self._raw_data = fields

        self.mappers = {}

        # A declaration without mappers may reach this point as None; treat
        # it as "no mappers" instead of failing on iteration.
        for m in fields or ():
            # Get the key
            ks = list(m.keys())

            if len(ks) != 1:
                raise CrawlinoFormatError(
                    "Invalid mapper format. Each map, "
                    "only can have one dictionary value",
                    exc_info=True,
                    extra={"map_value": str(m)})

            key_action = ks[0]

            # Determine which sub-class to build
            try:
                map_obj = self.MAPPERS[key_action](**m[key_action])
            except KeyError:
                raise CrawlinoValueError("Invalid mapper",
                                         exc_info=True,
                                         extra={"mapper_name": key_action})
            except TypeError as e:
                # Report whatever follows the last "argument" word of the
                # TypeError text. If the word is absent, report the whole
                # text rather than an arbitrary slice of it.
                message = str(e)
                marker = "argument"
                marker_at = message.rfind(marker)
                if marker_at == -1:
                    invalid_arg = message
                else:
                    invalid_arg = message[marker_at + len(marker):]

                raise CrawlinoValueError(
                    "Invalid mapper. Mapper destination "
                    "doesn't required property",
                    exc_info=True,
                    extra={
                        "invalid_property": invalid_arg,
                        "mapper_name": key_action
                    })

            # Store the object
            self.mappers[map_obj.name] = map_obj
Example #5
0
def generator_numeric(*args, **kwargs):
    """Yield the integers of ``range(args[0], args[1])``.

    Only the first two positional arguments are used: the start of the
    sequence and its (exclusive) end. Any further positional argument is
    ignored.

    Raises CrawlinoValueError when the start is greater than the end.
    """
    log.debug("Numeric generator plugin")

    lower, upper, *_ignored = args

    if lower > upper:
        message = (
            f"Start range in higher than lower, no data could be generated - "
            f"start: {lower} - end: {upper}")
        raise CrawlinoValueError(message)

    for number in range(lower, upper):
        yield number
Example #6
0
    def __init__(self, config: dict):
        """Build a ruleSet from its configuration mapping.

        :param config: ruleSet definition. "name", "mapTo", "inputVar" and
            "rules" are mandatory; "description", "exitOnMatch" and "report"
            are optional.
        :raises CrawlinoValueError: when a mandatory keyword is missing or
            two rules resolve to the same position.
        """
        #
        # Mandatory args
        #
        # Each keyword is stored as an attribute whose name is the result of
        # un_camel() on the keyword.
        for x in ("name", "mapTo", "inputVar"):
            try:
                setattr(self, un_camel(x), config[x])
            except KeyError:
                raise CrawlinoValueError(
                    f"Keyword '{x}' is necessary in the ruleSet definition")

        try:
            raw_rules = config["rules"]
        except KeyError:
            raise CrawlinoValueError(
                "you must define at least one rule in a 'ruleSet' entry")

        self.rules = OrderedDict()
        for i, rule in enumerate(raw_rules):
            # A rule may carry an explicit "config: null"; fall back to an
            # empty mapping so that the order lookup does not fail on None.
            rule_config = rule.get("config") or {}

            # Position: explicit "order" from the rule config, otherwise the
            # index of the rule in the list. Always stored as a string.
            position = str(rule_config.get("order", i))

            if position in self.rules:
                raise CrawlinoValueError(
                    f"conflict in order parameter for rules in ruleSet "
                    f"'{self.name}': already is an element with "
                    f"position '{position}'")

            # The rule receives its config exactly as declared
            self.rules[position] = CMRule(type=rule.get("type"),
                                          config=rule.get("config"),
                                          name=rule.get("name"))

        #
        # Optional
        #
        self.description = config.get("description", "")
        self.exit_on_match = config.get("exitOnMatch", True)
        self.report = config.get("report", "group")
Example #7
0
def generator_random(*args, **kwargs):
    """
    Yield random fixed-length strings.

    Input parameters (exactly three positional arguments):

    arg[0]: generated type. "string" draws characters from digits plus
            ASCII letters; any other value draws from digits only
    arg[1]: length of every generated value
    arg[2]: how many values are generated; must be bigger than 0
    """
    generated_type, maximum, total = args

    # Pick the alphabet the characters are drawn from
    if generated_type == "string":
        alphabet = string.digits + string.ascii_letters
    else:
        alphabet = string.digits

    if total <= 0:
        raise CrawlinoValueError(
            f"Total generated values must be bigger than 0")

    for _ in range(total):
        picked = [random.choice(alphabet) for _ in range(maximum)]
        yield "".join(picked)
Example #8
0
def input_raw_socket(prev_step: PluginReturnedData, **kwargs) \
        -> PluginReturnedData:
    """Connect to a port of the previous step's target and grab its answer.

    The plugin reads its settings from the ``config`` keyword argument:

    - ``port``: port to connect to (mandatory, integer-like)
    - ``data``: text sent after connecting; ``\\r\\n\\r\\n`` when missing
    - ``timeout``: socket timeout in seconds, "0.5" by default
    - ``proto``: "tcp" (default) or "udp"

    :param prev_step: data returned by the previous step; its
        ``source_type`` and ``target`` values are used.
    :return: a ``PluginReturnedData`` built with ``host``, ``status``
        ("open" or "closed/filtered"), ``data`` (decoded answer, or None
        when nothing was read) and ``port``.
    :raises CrawlinoValueError: for an unsupported source type or protocol,
        or an invalid timeout or port value.
    """

    log.debug("Starting plugin - input::raw-socket")
    allowed_inputs = ("ip", "web", "domain", "url")
    allowed_proto = ("tcp", "udp")

    # Load data
    prev_config = dict_to_object(prev_step.to_dict)

    if prev_config.source_type not in allowed_inputs:
        raise CrawlinoValueError(f"This plugin only works with: "
                                 f"{'|'.join(allowed_inputs)}")

    # -------------------------------------------------------------------------
    # Extract config
    # -------------------------------------------------------------------------
    # An explicit 'config=None' is handled like a missing config
    config = kwargs.get("config") or {}
    port_to_test = config.get("port", None)
    data_to_send = config.get("data", None)
    connection_timeout = config.get("timeout", "0.5")

    #
    # Check proto
    #
    port_proto = config.get("proto") or "tcp"
    if port_proto not in allowed_proto:
        raise CrawlinoValueError(f"This plugin only works with: "
                                 f"{'|'.join(allowed_proto)}")

    if port_proto == "tcp":
        proto = socket.SOCK_STREAM
    else:
        proto = socket.SOCK_DGRAM

    #
    # Checking timeout
    #
    try:
        timeout = float(connection_timeout)
    except (TypeError, ValueError):
        # TypeError covers values such as an explicit 'timeout: null'
        raise CrawlinoValueError(
            "Invalid timeout value. It must be a float falue")

    #
    # Checking port
    #
    try:
        port = int(port_to_test)
    except (TypeError, ValueError):
        # Without this check a missing port surfaced as a bare TypeError
        # only after the socket had already been created.
        raise CrawlinoValueError(
            "Invalid port value. It must be an integer value")

    #
    # Extract target
    #
    if prev_config.source_type == "ip":
        ip = prev_config.target
    else:
        # NOTE(review): urlparse() only fills 'netloc' when the target
        # contains '//'; a bare domain would give an empty host here --
        # confirm the format of 'target' for non-ip source types.
        ip, *_ = urllib.parse.urlparse(prev_config.target).netloc.split(":")

    #
    # Do connection
    #
    if not data_to_send:
        data_to_send = b"\r\n\r\n"
    else:
        data_to_send = data_to_send.encode()

    # Default for every path where nothing could be read from the socket
    received_data = None

    log.debug(f"Connecting to {ip}:{port_to_test}...")
    with socket.socket(socket.AF_INET, proto) as s:
        s.settimeout(timeout)
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        code = s.connect_ex((ip, port))

        if code == 0:  # 0 = Open
            # The connection succeeded, so the port is open even when the
            # read below times out. Setting the status first keeps both
            # result variables bound on the timeout path.
            status = "open"
            try:
                s.sendall(data_to_send)
                payload, _, _, _ = s.recvmsg(100000)
                received_data = payload.decode(errors="ignore")
            except socket.timeout:
                log.error(f"Port {port_to_test} is open but it got a "
                          f"timeout when try to get data from socket")
        else:
            status = "closed/filtered"

    return PluginReturnedData(host=ip,
                              status=status,
                              data=received_data,
                              port=port_to_test)
Example #9
0
    def __init__(self,
                 paths: str or List[str],
                 default_crawler_extension: str = "yaml",
                 concurrency: int = 1,
                 concurrency_type: str = "threads",
                 environment_vars: List[str] = None,
                 environment_file: str = None,
                 crawlers_templates_path: List[str] or None = None):
        """Build the running configuration and export the environment vars.

        :param paths: one path or a list of paths; glob patterns are
            expanded and every match is stored as an absolute path.
        :param default_crawler_extension: falls back to "yaml" when falsy.
        :param concurrency: number of workers; values that cannot be
            converted to int, or lower than 1, become 1.
        :param concurrency_type: must be one of ``self.CONCURRENCY_MODES``.
        :param environment_vars: list of "VAR=VALUE" strings.
        :param environment_file: file with one "VAR=VALUE" per line.
        :param crawlers_templates_path: extra template path(s), appended
            after the built-in "crawlers_templates" directory.
        :raises CrawlinoValueError: for an unknown concurrency type.
        :raises CrawlinoFormatError: for a malformed environment variable.
        """
        self.default_crawler_extension: str = \
            default_crawler_extension or "yaml"

        # ---------------------------------------------------------------------
        # Paths
        # ---------------------------------------------------------------------
        if isinstance(paths, list):
            tmp_paths = paths
        else:
            tmp_paths = [paths if paths else ""]

        # Expand any Glob in paths: *.py -> 1.py, 2.py...
        self.paths = [op.abspath(e) for x in tmp_paths for e in glob.glob(x)]

        try:
            con = int(concurrency)
        except (TypeError, ValueError):
            # TypeError covers None and other non-numeric objects
            con = 1
        self.concurrency: int = 1 if con < 1 else con

        if concurrency_type not in self.CONCURRENCY_MODES:
            raise CrawlinoValueError(f"Invalid concurrency type. Allowed types"
                                     f" are: "
                                     f"{'|'.join(self.CONCURRENCY_MODES)}")

        self.concurrency_type = concurrency_type

        self.crawlers_templates_path = [
            op.abspath(op.join(op.dirname(__file__),
                               "..",
                               "crawlers_templates"))
        ]

        if crawlers_templates_path:
            if not isinstance(crawlers_templates_path, list):
                crawlers_templates_path = [crawlers_templates_path]

            self.crawlers_templates_path.extend(crawlers_templates_path)

        # ---------------------------------------------------------------------
        # Set environment vars
        # ---------------------------------------------------------------------
        self.environment_vars = []

        self.environment_file = environment_file
        if self.environment_file:
            self.environment_file = op.abspath(environment_file)

            with open(self.environment_file, "r") as f:
                # Blank lines carry no definition; skip them instead of
                # rejecting the whole file.
                self.environment_vars.extend(
                    line for line in f.read().splitlines() if line.strip())

        if environment_vars:
            self.environment_vars.extend(environment_vars)

        # Remove duplicates while keeping the declaration order, so that a
        # variable defined more than once is resolved deterministically
        # (the last definition wins: file entries first, then explicit
        # ones).
        self.environment_vars = list(dict.fromkeys(self.environment_vars))

        for v in self.environment_vars:
            # Split on the first '=' only: the value itself may contain '='
            var_name, separator, var_value = v.partition("=")

            if not separator or not var_name:
                raise CrawlinoFormatError(
                    f"Environment vars must be set as format: VAR=VALUE. "
                    f"Got: '{v}'")

            log.debug(f"Setting environment var '{var_name}' with value "
                      f"'{var_value}'")

            os.environ[var_name] = var_value

        log.info(f"Working mode '{self.concurrency_type}' with "
                 f"concurrency '{self.concurrency}'")

        log.info(f"Selected {len(self.crawlers_templates_path)} "
                 f"crawlers paths")
        log.info(f"Default crawler extension selected: "
                 f"'{self.default_crawler_extension}'")