    def run(self):
        """
        Runs the clean algorythm.
        """
        self.cc_log("INFO", "Data Processing Clean: Started")

        if self.format.lower() == "json":
            if self.drop and isinstance(self.drop, str):
                self.drop = [self.drop]
            if self.keep and isinstance(self.keep, str):
                self.keep = [self.keep]
            json_fr = json_file_reader(self.src)
            json_fw = json_file_writer(self.target)

            self.cc_log("INFO", "Started to clean line for line, please wait!")

            while not json_fr.isEOF():
                data = json_fr.readRecord()
                keepLine, cleaned_line = self.clean_json(data)
                self.cc_log("DEBUG", cleaned_line)
                if keepLine:
                    json_fw.writeRecord(cleaned_line)

            json_fr.close()
            json_fw.close()
        else:
            raise NotImplementedError(
                "The defined format is not implemented yet. Please add it!")

        self.cc_log("INFO", "Data Processing Clean: Finished")
        return True
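
# The json_file_reader / json_file_writer helpers are not part of this
# listing. Below is a minimal, hypothetical sketch of the interface the
# modules above rely on (isEOF / readRecord / writeRecord / close), assuming
# records are stored as newline-delimited JSON; the real helpers may differ.
import json


class json_file_reader:
    """Reads one JSON record per line from a newline-delimited JSON file."""

    def __init__(self, path):
        self._fh = open(path, "r", encoding="utf-8")
        self._next_line = self._fh.readline()

    def isEOF(self):
        return self._next_line == ""

    def readRecord(self):
        record = json.loads(self._next_line)
        self._next_line = self._fh.readline()
        return record

    def close(self):
        self._fh.close()


class json_file_writer:
    """Writes one JSON record per line to a newline-delimited JSON file."""

    def __init__(self, path):
        self._fh = open(path, "w", encoding="utf-8")

    def writeRecord(self, record):
        self._fh.write(json.dumps(record) + "\n")

    def close(self):
        self._fh.close()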
    def run(self):
        """
        Runs the join algorythm.
        """
        self.cc_log("INFO", "Data Processing Join: Started")
        if self.left_joinon and isinstance(self.left_joinon, str):
            self.left_joinon = [self.left_joinon]
        if self.right_joinon and isinstance(self.right_joinon, str):
            self.right_joinon = [self.right_joinon]

        # Create the B-Tree for quick and easy search
        b_tree = genBTree(self.joinwith, self.left_joinon)

        json_fr = json_file_reader(self.src)
        json_fw = json_file_writer(self.target)

        # Loop through the whole left table
        failed_counter = 0
        while not json_fr.isEOF():
            data = json_fr.readRecord()
            key = keyGen(self.right_joinon, data)
            (data, b_tree, failed_counter) = self.join(b_tree, key, data,
                                                       failed_counter)
            json_fw.writeRecord(data)

        json_fr.close()
        json_fw.close()
        self.cc_log(
            "INFO", "%i (btree) & %i (keyerror) records could not be matched" %
            (len(b_tree), failed_counter))
        self.cc_log("INFO", "Data Processing Join: Finished")
        return True
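
# keyGen and genBTree are not shown in this listing either. A hypothetical
# sketch of what the join above assumes: keyGen builds a composite key from
# the configured join attributes, and genBTree indexes a newline-delimited
# JSON file by that key. A plain dict stands in for the real B-tree here
# (which apparently also offers ordered access such as minKey()); treat both
# definitions as assumptions.
import json


def keyGen(attributes, record):
    # Composite, hashable key over the configured join attributes.
    return tuple(record.get(attribute) for attribute in attributes)


def genBTree(path, attributes):
    # Index every record of the file by its join key for fast lookups.
    tree = {}
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            record = json.loads(line)
            tree[keyGen(attributes, record)] = record
    return tree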
    def process_ip_lookup_data(self, lookup_data):
        """
		Writes an ip lookup dataset to the target file if not already processed and sets it as processed after.

		**Parameters**:
			lookup_data : dict
				single ip lookup dataset received via shodan api.

		**Returns**:
			``True`` if the ip lookup data was written.
			``False`` if the lookup was already processed and not written.
		"""
        data_ts = lookup_data["timestamp"]

        # Check if we already processed this timestamp before, possibly under
        # another target name
        processed_ts = self.kv_store.get("processed_ts",
                                         section=self.moduleName)
        if not processed_ts:
            processed_ts = []

        if data_ts not in processed_ts:
            self.cc_log(
                "INFO",
                'Banner data for TS %s has not been processed yet' % (data_ts))

            json_fw = json_file_writer(self.target)
            json_fw.writeRecord(lookup_data)
            json_fw.close()

            processed_ts.append(data_ts)
            self.kv_store.put("processed_ts",
                              processed_ts,
                              section=self.moduleName,
                              force=True)

            # Save the newest processed timestamp so we can tell the time span
            # between the last run and the current run
            newest_processed_ts = self.kv_store.get("newest_processed_ts",
                                                    section=self.moduleName)
            if not newest_processed_ts or self.shodan_ts_is_newer(
                    data_ts, newest_processed_ts):
                self.kv_store.put("newest_processed_ts",
                                  data_ts,
                                  section=self.moduleName,
                                  force=True)

            # Save the target file for an explicit timestamp so we can tell
            # which file a timestamp was already processed into
            self.kv_store.put(data_ts,
                              self.target,
                              section=self.moduleName,
                              force=True)

            return True
        else:
            original_target = self.kv_store.get(data_ts,
                                                section=self.moduleName)
            self.cc_log(
                "WARNING",
                "Dataset with the TS %s has already been processed with the target %s - skipping the rest of the path!"
                % (data_ts, original_target))
            return False
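
    # shodan_ts_is_newer is referenced above but not shown. A minimal sketch,
    # assuming Shodan's ISO-8601 style timestamps (e.g.
    # '2021-01-01T12:00:00.000000'); the real comparison may differ.
    def shodan_ts_is_newer(self, ts_a, ts_b):
        from datetime import datetime
        fmt = "%Y-%m-%dT%H:%M:%S.%f"
        # True if ts_a is strictly more recent than ts_b.
        return datetime.strptime(ts_a, fmt) > datetime.strptime(ts_b, fmt)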
    def run(self):
        """
        Runs the classing algorythm.
        """
        self.cc_log("INFO", "Data Processing Classing: Started")
        json_fr = json_file_reader(self.src)
        json_fw = json_file_writer(self.target)
        while not json_fr.isEOF():
            record = json_fr.readRecord()
            classes = self.getClasses(record)
            record['classes'] = classes
            json_fw.writeRecord(record)

        json_fr.close()
        json_fw.close()

        self.cc_log("INFO", "Data Processing Classing: Finished")
        return True
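
    # getClasses is not shown in this listing. A purely hypothetical sketch of
    # a classing step: match configured regex rules (self.class_rules, a
    # name -> pattern dict assumed here) against the serialized record and
    # collect the matching class labels. The real logic may look different.
    def getClasses(self, record):
        import json
        import re
        serialized = json.dumps(record)
        return [name for name, pattern in self.class_rules.items()
                if re.search(pattern, serialized)]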
    def run(self):
        """
        Runs the group algorythm.
        """
        self.cc_log("INFO", "Data Processing Group: Started")
        data_dict = {}
        json_fr = json_file_reader(self.src)
        json_fw = json_file_writer(self.target)
        # load data
        self.cc_log("DEBUG", "Started to group, please wait...!")
        while not json_fr.isEOF():
            data = json_fr.readRecord()
            for attribute in self.groupBy.split('.'):
                data = data.get(attribute, {})

            if not data:
                self.cc_log("DEBUG", "Skipping a line, attribute was not found!")
                continue  # Skip, as the attribute was not found

            # if a groupRegex is set, use the whole regex match (group 0) as the group value
            if self.groupRegex:
                data = re.search(self.groupRegex, data)
                if not data or not data.group(0):
                    data = "others"
                else:
                    data = data.group(0)
                self.cc_log("DEBUG", "Regex grouped %s" % data)

            if data in data_dict:
                data_dict[data] += 1
            else:
                data_dict[data] = 1

        for entry in self.dictToList(data_dict):
            json_fw.writeRecord(entry)

        json_fr.close()
        json_fw.close()
        self.cc_log(
            "INFO", "Data Processing Group: Aggregated the data set into " +
            str(len(data_dict.keys())) + " data entries")
        self.cc_log("INFO", "Data Processing Group: Finished")
        return True
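
    # dictToList is not shown in this listing. A minimal sketch, assuming the
    # writer expects one JSON-serializable record per group; the field names
    # "group" and "count" are placeholders, not the module's real schema.
    def dictToList(self, data_dict):
        return [{"group": key, "count": count}
                for key, count in sorted(data_dict.items())]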
    def run_shodan_search_query(self):
        """
		Runs the shodan api search query lookup.

		**Returns**:
			``True`` if the search query lookup was successfull and the data written.
			``False`` if the lookup failed.
		"""
        self.cc_log(
            "INFO",
            "Data Store Shodan: Started Search Query Lookup With Query '%s'" %
            self.query)

        s_api = shodan.Shodan(self.apiKey)
        json_fw = json_file_writer(self.target)

        counter = 0
        for banner in s_api.search_cursor(self.query,
                                          minify=self.minify,
                                          retries=self.retries):
            json_fw.writeRecord(banner)

            counter += 1
            self.cc_log("DEBUG", "Data amount: %s!" % (counter))
            if counter >= self.limit:
                break

        json_fw.close()
        if counter > 0:
            self.cc_log(
                "INFO",
                "A total of %s banner records were downloaded!" % (counter))
            return True

        self.cc_log("WARNING",
                    "No data was downloaded via search cursor lookup!")
        return False
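
# For reference, a hypothetical, self-contained demo of the same Shodan
# search_cursor API used above; the API key and the query are placeholders.
def demo_shodan_search_cursor():
    import shodan

    api = shodan.Shodan("YOUR_API_KEY")  # placeholder key
    for banner in api.search_cursor("port:22", minify=True, retries=5):
        print(banner.get("ip_str"), banner.get("timestamp"))
        break  # one banner is enough for the demo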
    def run(self):
        """
        Runs the filter algorythm.
        """
        self.cc_log("INFO", "Data Processing Filter: Started")
        count = 0
        json_fr = json_file_reader(self.src)
        json_fw = json_file_writer(self.target)
        # load data
        self.cc_log("DEBUG", "Started to filter, please wait...!")
        while not json_fr.isEOF():
            data = json_fr.readRecord()
            if self.filter(data):
                json_fw.writeRecord(data)
            else:
                count += 1

        json_fr.close()
        json_fw.close()
        self.cc_log(
            "INFO",
            "Data Processing Filter: Filtered " + str(count) + " data sets")
        self.cc_log("INFO", "Data Processing Filter: Finished")
        return True
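
    # The filter predicate itself is not shown. A minimal, hypothetical
    # sketch: keep a record only if a configured attribute equals a configured
    # value (self.filter_attribute and self.filter_value are assumed names).
    def filter(self, data):
        return data.get(self.filter_attribute) == self.filter_value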
    def run(self):
        """
        Runs the diff algorythm.

        **Returns**:
            ``True`` if the run works fine.
        """
        self.cc_log("INFO", "Data Processing Diff: Started")
        if self.attributes_diff and isinstance(self.attributes_diff, str):
            self.attributes_diff = [self.attributes_diff]
        if self.key_attributes and isinstance(self.key_attributes, str):
            self.key_attributes = [self.key_attributes]

        # if the target does not exist create the file and add all the data
        if not path.isfile(self.target):
            json_fr = json_file_reader(self.src)
            self.cc_log("DEBUG", "Opened source file")
            json_fw = json_file_writer(self.target)
            self.cc_log("DEBUG", "Opened target file - please have patience")
            while not json_fr.isEOF():
                data = json_fr.readRecord()
                data = self.genDataSet(keyGen(self.key_attributes, data), data,
                                       self.attributes_diff)
                json_fw.writeRecord(data)
            json_fr.close()
            json_fw.close()
        # else create a B-Tree out of the src file with the necessary data
        else:
            self.cc_log(
                "DEBUG",
                "Generating B-Tree for the diff - please have patience")
            b_tree = genBTree(self.src, self.key_attributes)
            # move the old target so it can be read from and does not collide with the writer
            old_target = self.target + '.old'
            move(self.target, old_target)
            json_fr = json_file_reader(old_target)
            json_fw = json_file_writer(self.target)
            self.cc_log("INFO",
                        "Started to generate the diff - please have patience")
            while not json_fr.isEOF():
                old_data = json_fr.readRecord()
                try:  # update all the data
                    new_data = b_tree.pop(old_data["cc_id"])
                    diff_data = self.getDataByAttributes(
                        self.attributes_diff, new_data)
                    old_data = self.compareData(old_data, diff_data)
                except KeyError:  # if the id cannot be found, it must have been deleted
                    old_data["cc_status"] = "delete"
                    old_data["cc_time_id"] = self.time_id
                json_fw.writeRecord(old_data)
            # add the leftover data
            self.cc_log("INFO", "Adding leftover data...")
            while b_tree:
                key = b_tree.minKey()
                data = self.genDataSet(key, b_tree.pop(key),
                                       self.attributes_diff)
                json_fw.writeRecord(data)

            remove(old_target)
            json_fr.close()
            json_fw.close()

        self.kv_store.put(key="diff_last_src",
                          value=self.time_id,
                          section=self.moduleName,
                          force=True)
        self.cc_log("INFO", "Data Processing Diff: Finished")
        return True
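
    # genDataSet is referenced above but not shown. A hypothetical sketch of
    # what the diff seems to assume: reduce the record to the diff attributes
    # (via the getDataByAttributes helper used above) and attach the
    # bookkeeping fields cc_id, cc_status and cc_time_id.
    def genDataSet(self, key, data, attributes_diff):
        record = self.getDataByAttributes(attributes_diff, data)
        record["cc_id"] = key
        record["cc_status"] = "add"  # assumed initial status for new records
        record["cc_time_id"] = self.time_id
        return record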
    def run(self):
        """
        Runs the clean algorythm.

        **Returns**:
            ``True`` if this run succeeded.
            ``False`` if this run did not succeed.
        """
        self.cc_log("INFO", "Data Processing Country: Started")

        self.cc_log(
            "DEBUG",
            "Trying to open the MaxMind GeoLite2-Country DB, please wait!")
        try:
            db = geoip2.database.Reader(self.max_mind_db_path)
        except Exception as e:
            self.logger.exception(e)
            self.cc_log(
                "ERROR",
                "Failed to open the MaxMind GeoLite2-Country DB at %s - please check the file!"
                % (self.max_mind_db_path))
            return False
        self.cc_log("DEBUG", "Opened the MaxMindGeoLite2-Country DB!")

        json_fr = json_file_reader(self.src)
        json_fw = json_file_writer(self.target)

        self.cc_log(
            "INFO",
            "Started to look up IPs and write them into the target, please wait!")

        while not json_fr.isEOF():
            data = json_fr.readRecord()

            country_code = "-99"
            found_ip = data
            for attribute in self.ip_input_attribute.split('.'):
                found_ip = found_ip[attribute]

            if not found_ip or found_ip == data:
                self.cc_log(
                    "WARNING",
                    "No IP found at the give ipInputAttribute place - Add country code -99 to this dataset!"
                )
            else:
                # Lookup ip for country
                try:
                    ip_info = db.country(found_ip)
                    if ip_info.country.iso_code:
                        country_code = ip_info.country.iso_code
                    self.cc_log(
                        "DEBUG", "Found country code %s for ip %s" %
                        (ip_info.country.iso_code, found_ip))
                except Exception:
                    self.cc_log(
                        "WARNING",
                        "No country code found for ip %s - adding -99 as country code"
                        % (found_ip))

            data[self.output_attribute] = country_code
            json_fw.writeRecord(data)

        json_fr.close()
        json_fw.close()
        db.close()

        self.cc_log("INFO", "Data Processing Country: Finished")
        return True
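
# For reference, a hypothetical, self-contained demo of the same geoip2
# country lookup used above; the .mmdb path and the IP are placeholders.
def demo_geoip2_country_lookup():
    import geoip2.database

    with geoip2.database.Reader("GeoLite2-Country.mmdb") as db:
        response = db.country("8.8.8.8")
        print(response.country.iso_code)  # e.g. 'US'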
    def setUp(self):
        if not os.path.exists(TESTDATA_GEN_OUTPUT_FOLDER):
            os.makedirs(TESTDATA_GEN_OUTPUT_FOLDER)
        self.fw = json_file_writer(TESTDATA_TARGET_FILENAME)