def getAttributes(self, src):
    """
    Evaluates the given configuration and parses the necessary attributes.

    **Parameters**:
        src : str
            The src location and name for opening the file.

    **Returns**:
        Returns a list of all desired attributes.
    """
    if isinstance(self.exported_attributes, list):
        # just return the given list of attributes
        return self.exported_attributes
    json_fr = json_file_reader(src)
    if isinstance(self.exported_attributes, int):
        # find the line and its attributes
        line = json_fr.readLineRecord(self.exported_attributes)
        attributes = self.getKeysFromDict(line)
        json_fr.close()
        return attributes
    if self.exported_attributes == 'all':
        # find all the attributes of the given file
        attributes = self.getAllKeysFromFile(json_fr)
        json_fr.close()
        return attributes
    else:
        json_fr.close()
        raise ConfigurationError("Unknown attributes to export")
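# Illustrative sketch of the three accepted shapes of exported_attributes that
# the branches above handle (the example values below are invented, not project
# defaults):
#
#   exported_attributes = ["Ship.Name", "Ship.Captain.Name"]  # explicit list, returned unchanged
#   exported_attributes = 0      # line index: export the keys found in that record
#   exported_attributes = "all"  # scan the whole file and export every key found
#
# Any other value raises ConfigurationError("Unknown attributes to export").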
def run(self): """ Runs the join algorythm. """ self.cc_log("INFO", "Data Processing Join: Started") if self.left_joinon and isinstance(self.left_joinon, str): self.left_joinon = [self.left_joinon] if self.right_joinon and isinstance(self.right_joinon, str): self.right_joinon = [self.right_joinon] # Create the B-Tree for quick and easy search b_tree = genBTree(self.joinwith, self.left_joinon) json_fr = json_file_reader(self.src) json_fw = json_file_writer(self.target) # Loop through all the left table failed_counter = 0 while not json_fr.isEOF(): data = json_fr.readRecord() key = keyGen(self.right_joinon, data) (data, b_tree, failed_counter) = self.join(b_tree, key, data, failed_counter) json_fw.writeRecord(data) json_fr.close() json_fw.close() self.cc_log( "INFO", "%i (btree) & %i (keyerror) records could not be mached" % (len(b_tree), failed_counter)) self.cc_log("INFO", "Data Processing Join: Finished") return True
def run(self): """ Runs the clean algorythm. """ self.cc_log("INFO", "Data Processing Clean: Started") if self.format.lower() == "json": if self.drop and isinstance(self.drop, str): self.drop = [self.drop] if self.keep and isinstance(self.keep, str): self.keep = [self.keep] json_fr = json_file_reader(self.src) json_fw = json_file_writer(self.target) self.cc_log("INFO", "Started to clean line for line, please wait!") while not json_fr.isEOF(): data = json_fr.readRecord() keepLine, cleaned_line = self.clean_json(data) self.cc_log("DEBUG", cleaned_line) if keepLine: json_fw.writeRecord(cleaned_line) json_fr.close() json_fw.close() else: raise NotImplementedError( "The defined format is not implement yet. Please add!") self.cc_log("INFO", "Data Processing Clean: Finished") return True
def get_data_from_files(self, files):
    """
    Gets and extracts the data from the given list of files.

    **Parameters**:
        files : list
            List of file paths to process.

    **Returns**:
        ``file_count, names_list, data_dict``
        Amount of files, names list of the files, and a grouped data dict whose
        value lists are zero-padded in case of missing data.
    """
    data_dict = {}
    names_list = []
    file_count = 0
    for file in files:
        json_fr = json_file_reader(file)
        while not json_fr.isEOF():
            json_data = json_fr.readRecord()
            value = json_data
            for a in self.data_attribute.split('.'):
                value = value[a]
            # Threshold
            if self.threshold and int(value) < int(self.threshold):
                continue  # Skip this line as it is below the threshold
            group_name = json_data
            for a in self.group_name_attribute.split('.'):
                group_name = group_name[a]
            if group_name in data_dict:
                data_dict[group_name].append(value)
            else:
                data_dict[group_name] = [0] * file_count
                data_dict[group_name].append(value)
        for gn in data_dict:
            if len(data_dict[gn]) < file_count + 1:
                # Will not be appended as file_count is not incremented yet, hence the +1
                data_dict[gn].append(0)
        json_fr.close()
        # Add the file name to the names list, or extract it via regex if defined
        name = None
        if self.filenames_regex_extract:
            name = re.search(self.filenames_regex_extract,
                             os.path.basename(file))
            if name:
                name = name.group(0)
        if not name:
            name = os.path.basename(file)
        names_list.append(name)
        file_count += 1
    return file_count, names_list, data_dict
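# Hedged sketch of the padding rule implemented above, with invented group names
# and values: every group keeps exactly one value per processed file, so groups
# that do not appear in a file are padded with 0 at that file's position.
def _pad_groups_demo():
    data_dict = {}
    files = [
        [("DE", 10), ("FR", 3)],   # records of file 0 as (group, value)
        [("FR", 7)],               # file 1 has no "DE" records
    ]
    for file_count, records in enumerate(files):
        for group, value in records:
            data_dict.setdefault(group, [0] * file_count).append(value)
        for group in data_dict:            # pad groups missing from this file
            if len(data_dict[group]) < file_count + 1:
                data_dict[group].append(0)
    assert data_dict == {"DE": [10, 0], "FR": [3, 7]}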
def test_get_all_keys_from_file(self):
    """ Tests if all keys from the file are found. """
    jfr = json_file_reader(TESTDATA_VALID_PATH)
    exp_val = ["Ship", "Ship.Captain", "Ship.Captain.Name"]
    act_val = self.ec.getAllKeysFromFile(jfr)
    # only the content must match, the order does not matter
    self.assertEqual(sorted(act_val), sorted(exp_val))
def plot_histogram(self, files):
    """
    Plots a histogram.

    **Parameters**:
        files : list
            List of file paths.

    **Returns**:
        ``True`` if the plot was successfully saved.
        ``False`` in case something failed.
    """
    _, ax = plt.subplots()
    values_list = []
    names_list = []
    for file in files:
        json_fr = json_file_reader(file)
        values = []
        while not json_fr.isEOF():
            data = json_fr.readRecord()
            value = data
            for a in self.data_attribute.split('.'):
                value = value[a]
            # Threshold
            if self.threshold and int(value) < int(self.threshold):
                continue  # Skip this line as it is below the threshold
            values.append(value)
        json_fr.close()
        values_list.append(values)
        names_list.append(os.path.basename(file))
    self.set_color_cycle(len(names_list), ax)
    ax.hist(values_list, label=names_list, bins=10, edgecolor='white')
    ax.set_ylabel(self.y_label, fontweight='bold')
    ax.set_xlabel(self.x_label, fontweight='bold')
    ax.set_title(self.title, fontweight='bold')
    if self.show_legend:
        ax.legend(loc='best')
    if self.show_grid:
        plt.grid(linestyle='dotted')
    plt.savefig(self.target, bbox_inches='tight')
    plt.close('all')
    return True
def run(self): """ Runs the classing algorythm. """ self.cc_log("INFO", "Data Processing Classing: Started") json_fr = json_file_reader(self.src) json_fw = json_file_writer(self.target) while not json_fr.isEOF(): record = json_fr.readRecord() classes = self.getClasses(record) record['classes'] = classes json_fw.writeRecord(record) json_fr.close() json_fw.close() self.cc_log("INFO", "Data Processing Classing: Finished") return True
def run(self): """ Runs the group algorythm. """ self.cc_log("INFO", "Data Processing Group: Started") data_dict = {} json_fr = json_file_reader(self.src) json_fw = json_file_writer(self.target) # load data self.cc_log("DEBUG", "Started to group, please wait...!") while not json_fr.isEOF(): data = json_fr.readRecord() for attribute in self.groupBy.split('.'): data = data.get(attribute, {}) if not data: self.cc_log("DEBUG", "Skip a line, attribute was not found!") continue # Skip as attribute seems to not be found # check if the groupRegex is set and get the first group of it if self.groupRegex: data = re.search(self.groupRegex, data) if not data or not data.group(0): data = "others" else: data = data.group(0) self.cc_log("DEBUG", "Regex grouped %s" % data) if data in data_dict: data_dict[data] += 1 else: data_dict[data] = 1 for entry in self.dictToList(data_dict): json_fw.writeRecord(entry) json_fr.close() json_fw.close() self.cc_log( "INFO", "Data Processing Group: Aggregated the data set into " + str(len(data_dict.keys())) + " data entries") self.cc_log("INFO", "Data Processing Group: Finished") return True
def genBTree(src, attributes):
    """
    Generates a B-Tree from the given source file and uses the attributes
    to generate a key.

    **Parameters**:
        src : str
            The path and file name of the file.
        attributes : list
            The list of attributes which define the key.

    **Returns**:
        A complete B-Tree.
    """
    json_fr = json_file_reader(src)
    b_tree = OOBTree()
    while not json_fr.isEOF():
        data = json_fr.readRecord()
        key = keyGen(attributes, data)
        if not key:
            continue  # Key was not generated, go to the next record
        b_tree.insert(key, data)
    json_fr.close()
    return b_tree
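# Hedged sketch: the project's keyGen is not shown here, so an invented
# _make_key_demo helper stands in for it to illustrate how a composite key could
# be built from selected attributes and stored in an OOBTree.
from BTrees.OOBTree import OOBTree

def _make_key_demo(attributes, record):
    parts = [str(record[a]) for a in attributes if a in record]
    return "|".join(parts) if parts else None

def _btree_key_demo():
    tree = OOBTree()
    record = {"ip": "198.51.100.7", "port": 443}
    key = _make_key_demo(["ip", "port"], record)
    if key:                               # skip records without a usable key
        tree.insert(key, record)
    assert tree["198.51.100.7|443"]["port"] == 443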
def run(self): """ Runs the csv export algorythm. """ attributes = self.getAttributes(self.src) json_fr = json_file_reader(self.src) csv_fw = csv_file_writer(self.target, attributes) while not json_fr.isEOF(): line = json_fr.readRecord() csv_row = {} # flatten the dict so that it can be written into the CSV format for key in attributes: val = self.getValueFromDict(line, key) if not val: val = self.attribute_fill csv_row[key] = val csv_fw.writeCSVRow(csv_row) json_fr.close() csv_fw.close()
def run(self): """ Runs the filter algorythm. """ self.cc_log("INFO", "Data Processing Filter: Started") count = 0 json_fr = json_file_reader(self.src) json_fw = json_file_writer(self.target) # load data self.cc_log("DEBUG", "Started to filter, please wait...!") while not json_fr.isEOF(): data = json_fr.readRecord() if self.filter(data): json_fw.writeRecord(data) else: count += 1 json_fr.close() json_fw.close() self.cc_log( "INFO", "Data Processing Filter: Filtered " + str(count) + " data sets") self.cc_log("INFO", "Data Processing Filter: Finished") return True
def run(self): """ Runs the diff algorythm. **Returns**: ``True`` if the run works fine. """ self.cc_log("INFO", "Data Processing Diff: Started") if self.attributes_diff and isinstance(self.attributes_diff, str): self.attributes_diff = [self.attributes_diff] if self.key_attributes and isinstance(self.key_attributes, str): self.key_attributes = [self.key_attributes] # if the target does not exist create the file and add all the data if not path.isfile(self.target): json_fr = json_file_reader(self.src) self.cc_log("DEBUG", "Opened source file") json_fw = json_file_writer(self.target) self.cc_log("DEBUG", "Opened target file - please have patience") while not json_fr.isEOF(): data = json_fr.readRecord() data = self.genDataSet(keyGen(self.key_attributes, data), data, self.attributes_diff) json_fw.writeRecord(data) json_fr.close() json_fw.close() # else create a B-Tree out of the src file with the nessecary data else: self.cc_log( "DEBUG", "Generating B-Tree for the diff - please have patience") b_tree = genBTree(self.src, self.key_attributes) # move the old target so it can be read from and does not collide with the writer old_target = self.target + '.old' move(self.target, old_target) json_fr = json_file_reader(old_target) json_fw = json_file_writer(self.target) self.cc_log("INFO", "Started to generate the diff - please have patience") while not json_fr.isEOF(): old_data = json_fr.readRecord() try: # update all the data new_data = b_tree.pop(old_data["cc_id"]) diff_data = self.getDataByAttributes( self.attributes_diff, new_data) old_data = self.compareData(old_data, diff_data) except KeyError: # if the id cannot be found it must be delete old_data["cc_status"] = "delete" old_data["cc_time_id"] = self.time_id json_fw.writeRecord(old_data) # add the left over data self.cc_log("INFO", "Adding leftover data...") while b_tree: key = b_tree.minKey() data = self.genDataSet(key, b_tree.pop(key), self.attributes_diff) json_fw.writeRecord(data) remove(old_target) json_fr.close() json_fw.close() self.kv_store.put(key="diff_last_src", value=(self.time_id), section=self.moduleName, force=True) self.cc_log("INFO", "Data Processing Diff: Finished") return True
def run(self): """ Runs the clean algorythm. **Returns**: ``True`` if this run succeeded. ``False`` if this run did not succeed. """ self.cc_log("INFO", "Data Processing Country: Started") self.cc_log( "DEBUG", "Trying to open the MaxMind GeoLite2-Country DB, please wait!") try: db = geoip2.database.Reader(self.max_mind_db_path) except Exception as e: self.logger.exception(e) self.cc_log( "ERROR", "Failed to open the MaxMind GeoLite2-Country DB at %s - please check the file!" % (self.max_mind_db_path)) return False self.cc_log("DEBUG", "Opened the MaxMindGeoLite2-Country DB!") json_fr = json_file_reader(self.src) json_fw = json_file_writer(self.target) self.cc_log( "INFO", "Started to lookup ips and write into the target, please wait!") while not json_fr.isEOF(): data = json_fr.readRecord() country_code = "-99" found_ip = data for attribute in self.ip_input_attribute.split('.'): found_ip = found_ip[attribute] if not found_ip or found_ip == data: self.cc_log( "WARNING", "No IP found at the give ipInputAttribute place - Add country code -99 to this dataset!" ) else: # Lookup ip for country try: ip_info = db.country(found_ip) if ip_info.country.iso_code: country_code = ip_info.country.iso_code self.cc_log( "DEBUG", "Found country code %s for ip %s" % (ip_info.country.iso_code, found_ip)) except Exception as e: self.cc_log( "WARNING", "No country code found for ip %s - add -99 to country code" % (found_ip)) data[self.output_attribute] = country_code json_fw.writeRecord(data) json_fr.close() json_fw.close() db.close() self.cc_log("INFO", "Data Processing Country: Finished") return True
def plot_heatmap(self):
    """
    Plots a heatmap with the configured country and grouped value attributes.

    **Returns**:
        ``True`` if the target with the plot was successfully written.
        ``False`` if the plot was not written to the target and it failed.
    """
    plt.rcParams['figure.figsize'] = (20, 10)
    self.cc_log("DEBUG", "Trying to read the %s geojson file" % self.geojson_map)
    gp_map = gpd.read_file(self.geojson_map)
    gp_map['grouped_value'] = 0  # Init all shapes on the map with a grouped_value of 0
    gp_map['centroid'] = gp_map['geometry'].centroid  # Set the center on all shapes for labels
    # Loop through the source file, read the given country code & grouped value attributes and set them on the map
    self.cc_log("DEBUG", "Trying to read the %s src file" % self.src)
    json_fr = json_file_reader(self.src)
    self.cc_log("DEBUG", "Creating the heatmap...")
    while not json_fr.isEOF():
        data = json_fr.readRecord()
        country_code = data
        for a in self.country_code_attribute.split('.'):
            country_code = country_code[a]
        grouped_value = data
        for a in self.grouped_value_attribute.split('.'):
            grouped_value = grouped_value[a]
        # Check whether the country code is ISO_A2, ISO_A3, undefined (-99) or something else
        country_code = str(country_code)
        if country_code == "-99" or country_code == "null" or country_code == "None":
            self.cc_log("WARNING", "There is an undefined country code, we skip this dataset - Please recheck to have an accurate plot!")
            continue
        if len(country_code) > 3 or len(country_code) < 2:
            self.cc_log("ERROR", "The given country code (%s) has a length of %s which is not a valid iso3166_A3 or iso3166_A2 code - Please recheck!" % (country_code, len(country_code)))
            return False
        if len(country_code) == 2:
            # ISO_A2 code - try to convert it to ISO_A3
            country_code_old = country_code
            if country_code not in self.alpha2_country_codes:
                self.cc_log("WARNING", "The given iso3166 alpha2 code (%s) does not match any alpha3 code, we skip this dataset - Please recheck to have an accurate plot!" % (country_code))
                continue
            country_code = self.alpha2_country_codes[country_code].alpha3
            self.cc_log("DEBUG", "Converted alpha2 country code '%s' to alpha3 code '%s'" % (country_code_old, country_code))
        # Geopandas way to set the grouped value where the given country code matches
        gp_map.loc[gp_map['ISO_A3'] == country_code, 'grouped_value'] = int(grouped_value)
    json_fr.close()
    # Plot the map
    fig, ax = plt.subplots(1)
    gp_map.plot(ax=ax, column='grouped_value', cmap=self.colormap, edgecolor='black', linewidth=0.2, legend=self.display_legend)
    self.cc_log("DEBUG", "Heatmap created!")
    # Display labels if configured
    if self.display_labels:
        props = dict(boxstyle='round', facecolor='linen', alpha=0.7)
        threshold = 0
        if self.labels_threshold:
            threshold = int(self.labels_threshold)
        for point in gp_map.iterrows():
            if point[1]['grouped_value'] > threshold:  # Check threshold if configured
                ax.text(point[1]['centroid'].x, point[1]['centroid'].y, point[1]['grouped_value'],
                        horizontalalignment='center', fontsize=7, bbox=props)
    ax.axis('off')
    if self.title:
        plt.title(self.title)
    # Save the plot to the given target
    fig.savefig(self.target, bbox_inches='tight')
    plt.close('all')
    self.cc_log("INFO", 'Data Visualization Map: Finished Run Heatmap Success')
    return True
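# Hedged sketch of the alpha-2 -> alpha-3 conversion above. The module's
# alpha2_country_codes mapping is not shown; one assumption is that it comes from
# the iso3166 package, whose countries_by_alpha2 dict maps "DE" to a record with
# an .alpha3 attribute.
from iso3166 import countries_by_alpha2

def _to_alpha3_demo(country_code):
    if len(country_code) == 2 and country_code in countries_by_alpha2:
        return countries_by_alpha2[country_code].alpha3
    return country_code    # already alpha-3 (or unknown, left unchanged)

assert _to_alpha3_demo("DE") == "DEU"
assert _to_alpha3_demo("USA") == "USA"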
def setUp(self):
    self.fr = json_file_reader(TESTDATA_SRC_FILENAME)