Beispiel #1
0
    def getAttributes(self, src):
        """
        Evaluates the given configuration and parses the necessary attributes.

        **Parameters**:
            src : str
                The src location and name for opening the file.

        **Returns**:
            Returns a list of all wished attributes.

        **Raises**:
            ConfigurationError if the configured attributes are neither a
            list, a line number (int) nor the keyword 'all'.
        """
        # An explicit list of attribute names is used as-is.
        if isinstance(self.exported_attributes, list):
            return self.exported_attributes

        json_fr = json_file_reader(src)
        try:
            # An int selects a single line whose keys become the attributes.
            if isinstance(self.exported_attributes, int):
                line = json_fr.readLineRecord(self.exported_attributes)
                return self.getKeysFromDict(line)
            # 'all' collects the attributes of the whole file.
            if self.exported_attributes == 'all':
                return self.getAllKeysFromFile(json_fr)
            raise ConfigurationError("Unknown attributes to export")
        finally:
            # previously the reader leaked on the error path; finally also
            # closes it on every return
            json_fr.close()
Beispiel #2
0
    def run(self):
        """
        Runs the join algorythm.
        """
        self.cc_log("INFO", "Data Processing Join: Started")
        # Normalize single attribute names into one-element lists
        if isinstance(self.left_joinon, str) and self.left_joinon:
            self.left_joinon = [self.left_joinon]
        if isinstance(self.right_joinon, str) and self.right_joinon:
            self.right_joinon = [self.right_joinon]

        # B-Tree over the join partner enables fast key lookups
        b_tree = genBTree(self.joinwith, self.left_joinon)

        reader = json_file_reader(self.src)
        writer = json_file_writer(self.target)

        failed_counter = 0
        # Stream each record of the left table and join it against the tree
        while not reader.isEOF():
            record = reader.readRecord()
            join_key = keyGen(self.right_joinon, record)
            (record, b_tree, failed_counter) = self.join(
                b_tree, join_key, record, failed_counter)
            writer.writeRecord(record)

        reader.close()
        writer.close()
        self.cc_log(
            "INFO", "%i (btree) & %i (keyerror) records could not be mached" %
            (len(b_tree), failed_counter))
        self.cc_log("INFO", "Data Processing Join: Finished")
        return True
    def run(self):
        """
        Runs the clean algorythm.
        """
        self.cc_log("INFO", "Data Processing Clean: Started")

        # Only the json format is supported so far
        if self.format.lower() != "json":
            raise NotImplementedError(
                "The defined format is not implement yet. Please add!")

        # Normalize single attribute names into one-element lists
        if self.drop and isinstance(self.drop, str):
            self.drop = [self.drop]
        if self.keep and isinstance(self.keep, str):
            self.keep = [self.keep]
        reader = json_file_reader(self.src)
        writer = json_file_writer(self.target)

        self.cc_log("INFO", "Started to clean line for line, please wait!")

        # Clean each record; only records flagged to keep are written out
        while not reader.isEOF():
            keep_line, cleaned = self.clean_json(reader.readRecord())
            self.cc_log("DEBUG", cleaned)
            if keep_line:
                writer.writeRecord(cleaned)

        reader.close()
        writer.close()

        self.cc_log("INFO", "Data Processing Clean: Finished")
        return True
Beispiel #4
0
    def get_data_from_files(self, files):
        """
        Gets and extracts the data from the given files list.

        **Parameters**:
            files : list
                list of filepaths to process.

        **Returns**:
            ``file_count, names_list, data_dict`` amount of files, names list of the files, grouped data dict with the values scaled in case of missing data
        """
        data_dict = {}
        names_list = []
        file_count = 0

        for filepath in files:
            reader = json_file_reader(filepath)
            while not reader.isEOF():
                record = reader.readRecord()

                # Walk the dotted attribute path down to the value
                value = record
                for part in self.data_attribute.split('.'):
                    value = value[part]

                # Skip records below the configured threshold
                if self.threshold and int(value) < int(self.threshold):
                    continue

                group = record
                for part in self.group_name_attribute.split('.'):
                    group = group[part]

                # Groups seen for the first time are back-filled with zeros
                # for all previously processed files
                if group not in data_dict:
                    data_dict[group] = [0] * file_count
                data_dict[group].append(value)

            # Groups absent from this file get a 0 appended for it
            # (file_count is incremented only below, hence the +1)
            for group in data_dict:
                if len(data_dict[group]) < file_count + 1:
                    data_dict[group].append(0)

            reader.close()

            # Use the file's basename, optionally reduced via the configured
            # regex extraction
            name = None
            if self.filenames_regex_extract:
                match = re.search(self.filenames_regex_extract,
                                  os.path.basename(filepath))
                if match:
                    name = match.group(0)
            if not name:
                name = os.path.basename(filepath)
            names_list.append(name)

            file_count += 1
        return file_count, names_list, data_dict
Beispiel #5
0
    def test_get_all_keys_from_flie(self):
        """
        Tests if all keys from the file are found.
        """
        jfr = json_file_reader(TESTDATA_VALID_PATH)
        exp_val = ["Ship", "Ship.Captain", "Ship.Captain.Name"]
        act_val = self.ec.getAllKeysFromFile(jfr)

        # list.sort() sorts in place and returns None, so the old
        # assertEqual(act_val.sort(), exp_val.sort()) compared None with None
        # and always passed; sorted() returns the lists for a real comparison.
        self.assertEqual(
            sorted(act_val), sorted(exp_val)
        )  # only the content must be the same, how it is arranged does not matter
Beispiel #6
0
    def plot_histogram(self, files):
        """
        Plots a histogram.

        **Parameters**:
            files : list
                list of file paths.

        **Returns**:
            ``True`` if the plot was successfully saved.
            ``False`` in case something failed.
        """
        _, ax = plt.subplots()
        values_list = []
        names_list = []
        for filepath in files:
            reader = json_file_reader(filepath)

            file_values = []
            while not reader.isEOF():
                record = reader.readRecord()

                # Walk the dotted attribute path down to the value
                value = record
                for part in self.data_attribute.split('.'):
                    value = value[part]

                # Skip values below the configured threshold
                if self.threshold and int(value) < int(self.threshold):
                    continue

                file_values.append(value)

            reader.close()
            values_list.append(file_values)
            names_list.append(os.path.basename(filepath))

        self.set_color_cycle(len(names_list), ax)
        ax.hist(values_list, label=names_list, bins=10, edgecolor='white')
        ax.set_ylabel(self.y_label, fontweight='bold')
        ax.set_xlabel(self.x_label, fontweight='bold')
        ax.set_title(self.title, fontweight='bold')
        if self.show_legend:
            ax.legend(loc='best')
        if self.show_grid:
            plt.grid(linestyle='dotted')
        plt.savefig(self.target, bbox_inches='tight')
        plt.close('all')

        return True
Beispiel #7
0
    def run(self):
        """
        Runs the classing algorythm.
        """
        self.cc_log("INFO", "Data Processing Classing: Started")
        reader = json_file_reader(self.src)
        writer = json_file_writer(self.target)
        # Attach the computed classes to every record and pass it through
        while not reader.isEOF():
            entry = reader.readRecord()
            entry['classes'] = self.getClasses(entry)
            writer.writeRecord(entry)

        reader.close()
        writer.close()

        self.cc_log("INFO", "Data Processing Classing: Finished")
        return True
    def run(self):
        """
        Runs the group algorythm.
        """
        self.cc_log("INFO", "Data Processing Group: Started")
        counts = {}
        reader = json_file_reader(self.src)
        writer = json_file_writer(self.target)
        self.cc_log("DEBUG", "Started to group, please wait...!")
        # Reduce every record to its grouping value and count occurrences
        while not reader.isEOF():
            value = reader.readRecord()
            for part in self.groupBy.split('.'):
                value = value.get(part, {})

            if not value:
                self.cc_log("DEBUG", "Skip a line, attribute was not found!")
                continue  # attribute path not present in this record

            # Optionally collapse the value to the first regex match;
            # non-matches fall into the "others" bucket
            if self.groupRegex:
                match = re.search(self.groupRegex, value)
                if match and match.group(0):
                    value = match.group(0)
                else:
                    value = "others"
                self.cc_log("DEBUG", "Regex grouped %s" % value)

            counts[value] = counts.get(value, 0) + 1

        for entry in self.dictToList(counts):
            writer.writeRecord(entry)

        reader.close()
        writer.close()
        self.cc_log(
            "INFO", "Data Processing Group: Aggregated the data set into " +
            str(len(counts.keys())) + " data entries")
        self.cc_log("INFO", "Data Processing Group: Finished")
        return True
def genBTree(src, attributes):
    """
    Generates a B-Tree from the given source file and uses the attributes to generate a key.

    **Parameters**:
        src : str
            the path and file name to the file.
        attributes : list
            the list of attributes which define the key

    **Returns**:
        A complete B-Tree.
    """
    json_fr = json_file_reader(src)
    b_tree = OOBTree()
    try:
        while not json_fr.isEOF():
            data = json_fr.readRecord()
            key = keyGen(attributes, data)
            if not key:
                continue  # Key was not generated, go to next
            b_tree.insert(key, data)
    finally:
        # previously the reader was never closed (resource leak)
        json_fr.close()

    return b_tree
Beispiel #10
0
    def run(self):
        """
        Runs the csv export algorythm.

        **Returns**:
            ``True`` if the run works fine.
        """
        attributes = self.getAttributes(self.src)
        json_fr = json_file_reader(self.src)
        csv_fw = csv_file_writer(self.target, attributes)

        while not json_fr.isEOF():
            line = json_fr.readRecord()
            csv_row = {}
            # flatten the dict so that it can be written into the CSV format
            for key in attributes:
                val = self.getValueFromDict(line, key)
                # NOTE(review): every falsy value (0, "", []) is replaced by
                # the fill value, not only missing ones - confirm intended
                if not val:
                    val = self.attribute_fill
                csv_row[key] = val
            csv_fw.writeCSVRow(csv_row)

        json_fr.close()
        csv_fw.close()
        # return True like every other run() implementation in this project
        return True
Beispiel #11
0
    def run(self):
        """
        Runs the filter algorythm.
        """
        self.cc_log("INFO", "Data Processing Filter: Started")
        filtered_out = 0
        reader = json_file_reader(self.src)
        writer = json_file_writer(self.target)
        self.cc_log("DEBUG", "Started to filter, please wait...!")
        # Keep records accepted by the filter, count the discarded ones
        while not reader.isEOF():
            record = reader.readRecord()
            if not self.filter(record):
                filtered_out += 1
            else:
                writer.writeRecord(record)

        reader.close()
        writer.close()
        self.cc_log(
            "INFO",
            "Data Processing Filter: Filtered " + str(filtered_out) + " data sets")
        self.cc_log("INFO", "Data Processing Filter: Finished")
        return True
Beispiel #12
0
    def run(self):
        """
        Runs the diff algorythm.

        On the first run (target file absent) every source record is written
        to the target as a fresh data set.  On later runs the old target is
        compared against the new source via a B-Tree keyed on the configured
        key attributes; records no longer present in the source are flagged
        with ``cc_status = "delete"``.

        **Returns**:
            ``True`` if the run works fine.
        """
        self.cc_log("INFO", "Data Processing Diff: Started")
        # Normalize single attribute names into one-element lists
        if self.attributes_diff and isinstance(self.attributes_diff, str):
            self.attributes_diff = [self.attributes_diff]
        if self.key_attributes and isinstance(self.key_attributes, str):
            self.key_attributes = [self.key_attributes]

        # if the target does not exist create the file and add all the data
        if not path.isfile(self.target):
            json_fr = json_file_reader(self.src)
            self.cc_log("DEBUG", "Opened source file")
            json_fw = json_file_writer(self.target)
            self.cc_log("DEBUG", "Opened target file - please have patience")
            while not json_fr.isEOF():
                data = json_fr.readRecord()
                data = self.genDataSet(keyGen(self.key_attributes, data), data,
                                       self.attributes_diff)
                json_fw.writeRecord(data)
            json_fr.close()
            json_fw.close()
        # else create a B-Tree out of the src file with the necessary data
        else:
            self.cc_log(
                "DEBUG",
                "Generating B-Tree for the diff - please have patience")
            b_tree = genBTree(self.src, self.key_attributes)
            # move the old target so it can be read from and does not collide with the writer
            old_target = self.target + '.old'
            move(self.target, old_target)
            json_fr = json_file_reader(old_target)
            json_fw = json_file_writer(self.target)
            self.cc_log("INFO",
                        "Started to generate the diff - please have patience")
            while not json_fr.isEOF():
                old_data = json_fr.readRecord()
                try:  # update all the data
                    # pop() removes matched records, so afterwards the tree
                    # holds only records that are new in this source
                    new_data = b_tree.pop(old_data["cc_id"])
                    diff_data = self.getDataByAttributes(
                        self.attributes_diff, new_data)
                    old_data = self.compareData(old_data, diff_data)
                except KeyError:  # if the id cannot be found it must be deleted
                    old_data["cc_status"] = "delete"
                    old_data["cc_time_id"] = self.time_id
                json_fw.writeRecord(old_data)
            # add the left over data (records only present in the new source)
            self.cc_log("INFO", "Adding leftover data...")
            while b_tree:
                key = b_tree.minKey()
                data = self.genDataSet(key, b_tree.pop(key),
                                       self.attributes_diff)
                json_fw.writeRecord(data)

            remove(old_target)
            json_fr.close()
            json_fw.close()

        # remember the time id of this run for later diff runs
        self.kv_store.put(key="diff_last_src",
                          value=(self.time_id),
                          section=self.moduleName,
                          force=True)
        self.cc_log("INFO", "Data Processing Diff: Finished")
        return True
    def run(self):
        """
        Runs the country algorythm: resolves the configured IP attribute of
        every record to an ISO country code via the MaxMind GeoLite2-Country
        DB and stores it on the record ("-99" when no country was found).

        **Returns**:
            ``True`` if this run succeeded.
            ``False`` if this run did not succeed.
        """
        self.cc_log("INFO", "Data Processing Country: Started")

        self.cc_log(
            "DEBUG",
            "Trying to open the MaxMind GeoLite2-Country DB, please wait!")
        try:
            db = geoip2.database.Reader(self.max_mind_db_path)
        except Exception as e:
            self.logger.exception(e)
            self.cc_log(
                "ERROR",
                "Failed to open the MaxMind GeoLite2-Country DB at %s - please check the file!"
                % (self.max_mind_db_path))
            return False
        self.cc_log("DEBUG", "Opened the MaxMindGeoLite2-Country DB!")

        json_fr = json_file_reader(self.src)
        json_fw = json_file_writer(self.target)

        self.cc_log(
            "INFO",
            "Started to lookup ips and write into the target, please wait!")

        while not json_fr.isEOF():
            data = json_fr.readRecord()

            # "-99" is the sentinel for "no country could be determined"
            country_code = "-99"
            # Walk the dotted attribute path down to the IP value
            found_ip = data
            for attribute in self.ip_input_attribute.split('.'):
                found_ip = found_ip[attribute]

            # found_ip == data means the attribute path did not descend at all
            if not found_ip or found_ip == data:
                self.cc_log(
                    "WARNING",
                    "No IP found at the give ipInputAttribute place - Add country code -99 to this dataset!"
                )
            else:
                # Lookup ip for country
                try:
                    ip_info = db.country(found_ip)
                    if ip_info.country.iso_code:
                        country_code = ip_info.country.iso_code
                    self.cc_log(
                        "DEBUG", "Found country code %s for ip %s" %
                        (ip_info.country.iso_code, found_ip))
                except Exception as e:
                    # lookup failures keep the "-99" sentinel value
                    self.cc_log(
                        "WARNING",
                        "No country code found for ip %s - add -99 to country code"
                        % (found_ip))

            data[self.output_attribute] = country_code
            json_fw.writeRecord(data)

        json_fr.close()
        json_fw.close()
        db.close()

        self.cc_log("INFO", "Data Processing Country: Finished")
        return True
Beispiel #14
0
    def plot_heatmap(self):
        """
        Plot a heatmap with the given defined country and grouped value attribute.

        Reads the source file record by record, maps each country code
        (ISO 3166 alpha-2 codes are converted to alpha-3) to its shape in the
        configured geojson map and colors the shapes by the grouped value.

        **Returns**:
            ``True`` if the target with the plot was successfully written.
            ``False`` if the plot was not written to the target and it failed.
        """
        plt.rcParams['figure.figsize'] = (20, 10)

        self.cc_log("DEBUG", "Trying to read %s gejson file" % self.geojson_map)
        gp_map = gpd.read_file(self.geojson_map)

        gp_map['grouped_value'] = 0 # Init all shapes on the map with a grouped_value of 0
        gp_map['centroid'] = gp_map['geometry'].centroid # Set the center value on all shapes for labels

        # Loop through source file and read the given country code & grouped value attributes and set them on the map
        self.cc_log("DEBUG", "Trying to read %s src file" % self.src)
        json_fr = json_file_reader(self.src)

        self.cc_log("DEBUG", "Creating the heatmap...")
        while not json_fr.isEOF():
            data = json_fr.readRecord()

            # Walk the dotted attribute paths down to the record values
            country_code = data
            for a in self.country_code_attribute.split('.'):
                country_code = country_code[a]

            grouped_value = data
            for a in self.grouped_value_attribute.split('.'):
                grouped_value = grouped_value[a]

            # Geopandas method to set the grouped value to where the given country code matches

            # Check country code if ISO_A2 or ISO_A3 or undefined (-99) or something else
            country_code = str(country_code)
            if country_code == "-99" or country_code == "null" or country_code == "None":
                self.cc_log("WARNING", "There is an undefined country code, we skip this dataset - Please recheck to have an accurate plot!")
                continue

            if len(country_code) > 3 or len(country_code) < 2:
                self.cc_log("ERROR", "The given country code (%s) has a length of %s which is not a valid iso3166_A3 or iso3166_A2 code - Please recheck!" % (country_code, len(country_code)))
                return False

            if len(country_code) == 2:
                # ISO_A2 Code - Try to convert
                country_code_old = country_code
                if country_code not in self.alpha2_country_codes:
                    self.cc_log("WARNING", "There given iso3166 alpha2 code (%s) does not match any alpha3 code, we skip this dataset - Please recheck to have an accurate plot!" % (country_code))
                    continue

                country_code = self.alpha2_country_codes[country_code].alpha3
                self.cc_log("DEBUG", "Converted alpha2 country code '%s' to alpha3 code '%s'" % (country_code_old, country_code))

            gp_map.loc[gp_map['ISO_A3'] == country_code, 'grouped_value'] = int(grouped_value)

        json_fr.close()

        # Plot the map
        fig, ax = plt.subplots(1)
        gp_map.plot(ax=ax, column='grouped_value', cmap=self.colormap, edgecolor='black', linewidth=0.2, legend=self.display_legend)

        self.cc_log("DEBUG", "Heatmap created!")

        # Display labels if configured
        if self.display_labels:
            props = dict(boxstyle='round', facecolor='linen', alpha=0.7)

            # Only label shapes whose value exceeds the optional threshold
            threshold = 0
            if self.labels_threshold: threshold = int(self.labels_threshold)
            for point in gp_map.iterrows():
                if point[1]['grouped_value'] > threshold: # Check threshold if configured
                    ax.text(point[1]['centroid'].x,point[1]['centroid'].y,point[1]['grouped_value'],horizontalalignment='center',fontsize=7,bbox=props)
            ax.axis('off')

        if self.title:
            plt.title(self.title)

        # Save the plot to the given target
        fig.savefig(self.target, bbox_inches='tight')
        plt.close('all')

        self.cc_log("INFO", 'Data Visualization Map: Finished Run Heatmap Success')
        return True
 def setUp(self):
     """Opens a json_file_reader over the test data source file."""
     self.fr = json_file_reader(TESTDATA_SRC_FILENAME)