def main():
    # Basic setup
    print("Starting download module from command line")
    config_data = ut.get_config_data(sys.argv[1])
    print("Configuration file loaded")
    ut.sanity_cahecks(config_data)
    _read_config(config_data)
    _download_files()
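# Usage sketch (the file name "download.py" is hypothetical): the module takes
# the path to its configuration file as the first command-line argument, e.g.
#
#   python download.py config.json
#
# and would typically be wired up with the standard entry-point guard:
#
# if __name__ == "__main__":
#     main()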
def launch_transformations():
    log.info('Checking transformation crontabs')
    transformations = session.query(Transformation).all()
    for t in transformations:
        if must_run(t.minute, t.hour, t.day_of_week):
            mainclass, _ = get_config_data(t.output_dir)
            celery.send_task("extractor.perform_extraction",
                             args=[t.package_id, mainclass],
                             task_id=str(uuid.uuid4()))
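# `must_run` is referenced above but not defined in this section. The sketch
# below shows one plausible crontab-style check, assuming each field is either
# "*" (match anything) or a numeric string evaluated against the current local
# time; the name `_must_run_sketch` is hypothetical and only illustrates the idea.
from datetime import datetime

def _must_run_sketch(minute, hour, day_of_week):
    now = datetime.now()

    def _matches(field, value):
        # "*" acts as a wildcard; anything else is compared numerically.
        return field == "*" or int(field) == value

    return (_matches(minute, now.minute)
            and _matches(hour, now.hour)
            and _matches(day_of_week, now.weekday()))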
def main(): print("Loading LU-BW sensorts data") config_data = ut.get_config_data(sys.argv[1]) print("Configuration file loaded") ut.sanity_cahecks(config_data) print("Sanity checks performed. Everything is ok.") _read_config(config_data) print("Relavent Configuration data has been loaded") global data_files_dir df_end = pd.DataFrame() date_range = pd.date_range(start=period[1], end=period[1] , freq=freq) date_range = date_range[0:date_range.shape[0] - 1] df_end = df_end.reindex(date_range) ignored_sens = np.array([]) for f in ignored_sensors_files: with open(os.path.expanduser(f)) as sens_file: sens = sens_file.read().split('\n') ignored_sens = np.append(ignored_sens,sens) data_files = [os.path.join(folder, f) for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))] count = 0 for f in data_files: res = re.search('end_data_frame_(\S+)\.csv', f, re.IGNORECASE) id = res.group(1) if str(id) not in ignored_sens: df = pd.read_csv(f,sep=';',parse_dates=True, index_col="timestamp") df_end = pd.concat([df_end , df], axis=1) lu_bw_folder = os.path.join(folder, "lu_bw") print(lu_bw_folder) data_files = [os.path.join(lu_bw_folder, f) for f in os.listdir(lu_bw_folder) if os.path.isfile(os.path.join(lu_bw_folder, f))] for f in data_files: print(f) res = re.search('(\S+)\.csv', f, re.IGNORECASE) id = res.group(1) print("Loading " + id + " from lu bw") df = pd.read_csv(f,sep=';',parse_dates=True, index_col="timestamp") df_end = pd.concat([df_end, df], axis=1) print("Final shape of the DF: " + str(df_end.shape)) df_end.to_csv(folder + "/" + final_df_name, sep=";",index_label="timestamp")
def setup_logger(log_file="apitest.log"):
    logger = logging.getLogger("apitest")
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    log_level = utils.get_config_data('logging', 'level').upper()
    if log_file:
        handler = logging.FileHandler(log_file, encoding="utf-8")
    else:
        handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(log_level)
    return logger
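# Usage sketch: create the logger once at startup and fetch it elsewhere via
# logging.getLogger("apitest"), e.g.
#
#   logger = setup_logger()
#   logger.info("API test run started")
#
# Note that every call attaches another handler to the same "apitest" logger,
# so calling setup_logger() repeatedly duplicates output. Passing a falsy
# log_file logs to the console instead of a file.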
def deploy_transformation(self, transformation):
    mainclass, required = get_config_data(transformation.output_dir)
    transformation_instance = get_instance(transformation.output_dir,
                                           mainclass)
    transformation_instance.deploy()

    # Install dependencies using celery.
    celery.send_task("extractor.install_dependencies",
                     args=[required],
                     task_id=str(uuid.uuid4()))

    # Remove the extraction log.
    transformation.extractions = []
    model.Session.merge(transformation)
    model.Session.commit()
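# `get_config_data` and `get_instance` are not defined in this section. From
# their use above, `get_config_data(output_dir)` returns the transformation's
# main class name plus its required dependencies, and `get_instance` loads that
# class from the output directory. A hypothetical sketch of such a loader,
# assuming the metadata lives in a "config.json" inside output_dir (the file
# name, keys, and module naming are all assumptions):
import importlib.util
import json
import os

def _get_config_data_sketch(output_dir):
    with open(os.path.join(output_dir, "config.json")) as f:
        cfg = json.load(f)
    return cfg["mainclass"], cfg.get("required", [])

def _get_instance_sketch(output_dir, mainclass):
    # Load the module that defines the main class and instantiate it.
    path = os.path.join(output_dir, mainclass.lower() + ".py")
    spec = importlib.util.spec_from_file_location(mainclass, path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, mainclass)()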
def launch_transformation(self, id):
    log.info('Launching transformation for package name: %s' % id)
    # Using default functionality.
    self.read(id)
    t = model.Session.query(Transformation).filter_by(
        package_id=c.pkg.id).first()
    mainclass, _ = get_config_data(t.output_dir)
    celery.send_task("extractor.perform_extraction",
                     args=[t.package_id, mainclass],
                     task_id=str(uuid.uuid4()))
    self.get_transformation_data(c.pkg.id, c)
    c.error = False
    # Render using the template.
    return render('extractor/read.html')
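# `celery.send_task` dispatches by task name, so the controller never imports
# the worker code. A sketch of the worker-side counterpart (the decorator style
# is the standard Celery pattern; the body and helper names are hypothetical):
#
# @celery.task(name="extractor.perform_extraction")
# def perform_extraction(package_id, mainclass):
#     instance = get_instance(output_dir_for(package_id), mainclass)
#     instance.extract()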
def main(): print("Starting the proprocess module from the command line") config_data = ut.get_config_data(sys.argv[1]) print("Configuration file loaded") ut.sanity_cahecks(config_data) print("Sanity checks performed. Everything is ok.") _read_config(config_data) print("Relavent Configuration data has been loaded") df_before_p1 = pd.DataFrame(columns=[ "id", "min", "max", "mean", "var", "std", "skew", "kurt", "count" ]) df_before_p2 = pd.DataFrame(columns=[ "id", "min", "max", "mean", "var", "std", "skew", "kurt", "count" ]) df_after_p1 = pd.DataFrame(columns=[ "id", "min", "max", "mean", "var", "std", "skew", "kurt", "count" ]) df_after_p2 = pd.DataFrame(columns=[ "id", "min", "max", "mean", "var", "std", "skew", "kurt", "count" ]) missig_data_sensors = np.loadtxt(bad_missing_data_sensors, dtype='str', ndmin=1) print("Sensors with missing data " + str(len(missig_data_sensors)) + ": " + str(missig_data_sensors)) data_files = [ os.path.join(data_files_dir, f) for f in os.listdir(data_files_dir) if os.path.isfile(os.path.join(data_files_dir, f)) ] size = len(data_files) reindexed_frames = np.array([]) too_far_sens = np.array([]) for indx, f in enumerate(data_files): id = str( re.search('end_data_frame_(\d+)\.csv', f, re.IGNORECASE).group(1)) if id in missig_data_sensors: print("Skipping" + id + "; " + str(indx) + "/" + str(size)) continue print("Processing " + id + "; " + str(indx) + "/" + str(size)) df = pd.read_csv(f, sep=';', parse_dates=True, index_col="timestamp") desc_file = open(description_files_dir + "/" + id + ".txt", "w") # Save those to file desc_file.write( "Description before missing data correction\n-----------------------\n" ) desc_file.write(str(df.describe()) + "\n\n") desc_file.write(str(df.corr()) + "\n\n") df_before_p1 = df_before_p1.append(generate_info_dict( id, "P1_" + id, df), ignore_index=True) df_before_p2 = df_before_p2.append(generate_info_dict( id, "P2_" + id, df), ignore_index=True) date_range = pd.date_range(start=reindex_period[0], end=reindex_period[1], freq=reindex_freq) date_range = date_range[0:date_range.shape[0] - 1] if df.shape[0] < date_range.shape[0]: if date_range.shape[0] - df.shape[0] > missing_entries_threshold: reindexed_frames = np.append(reindexed_frames, id) print("Too bad sensor - " + id) df = df[~df.index.duplicated()] df = df.reindex(date_range) if df.shape[0] > date_range.shape[0]: dup_group = df.groupby(df.index, as_index=True, sort=True) df = dup_group.mean() df = df.reindex(date_range) df.fillna(df.mean(), inplace=True) desc_file.write( "Description after missing data correction\n-----------------------\n" ) desc_file.write(str(df.describe()) + "\n\n") desc_file.write(str(df.corr()) + "\n\n") df_after_p1 = df_before_p1.append(generate_info_dict( id, "P1_" + id, df), ignore_index=True) df_after_p2 = df_before_p2.append(generate_info_dict( id, "P2_" + id, df), ignore_index=True) if generate_plots: plt.cla() plt.close() plt.clf() df.plot(linewidth=1.0, style=['r-', 'b--'], grid=True, figsize=(13, 11), title="P1 and P2 values of " + id) plt.savefig(os.path.join(description_files_dir, "plots/" + id + "_plot.png"), bbox_inches='tight') plt.cla() plt.close() plt.clf() df.rolling(100).mean().plot( linewidth=1.0, style=['r-', 'b--'], grid=True, figsize=(13, 11), title="Rolling averages over P1 and P2 of " + id) plt.savefig(os.path.join(description_files_dir, "plots/" + id + "_rolling_plot_.png"), bbox_inches='tight') df.to_csv(f, sep=";", index_label=time_column) desc_file.close() np.savetxt(str(reindexed_frames_file), reindexed_frames, 
    df_before_p1.to_csv(env_dir + "/description_frame_p1_before.csv", sep=";")
    df_before_p2.to_csv(env_dir + "/description_frame_p2_before.csv", sep=";")
    df_after_p1.to_csv(env_dir + "/description_frame_p1_after.csv", sep=";")
    df_after_p2.to_csv(env_dir + "/description_frame_p2_after.csv", sep=";")
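# `generate_info_dict` is referenced above but not defined in this section.
# Given the columns of the summary frames ("id", "min", "max", "mean", "var",
# "std", "skew", "kurt", "count"), a plausible sketch (the exact signature is
# an assumption) summarises one column of the sensor frame:
def _generate_info_dict_sketch(id, column, df):
    series = df[column]
    return {
        "id": id,
        "min": series.min(),
        "max": series.max(),
        "mean": series.mean(),
        "var": series.var(),
        "std": series.std(),
        "skew": series.skew(),
        "kurt": series.kurt(),
        "count": series.count(),
    }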
def _main(): print("Loading LU-BW sensorts data") config_data = ut.get_config_data(sys.argv[1]) print("Configuration file loaded") ut.sanity_cahecks(config_data) print("Sanity checks performed. Everything is ok.") _read_config(config_data) print("Relavent Configuration data has been loaded") file_name = sys.argv[2] print("Loading from " + file_name) frames = pd.ExcelFile(file_name, index_col="timestamp", parse_dates=True) print(frames.sheet_names) for name in frames.sheet_names: df = frames.parse(name, index_col="timestamp", parse_dates=True) df["P1"].replace(to_replace=-999.0, value=np.NaN, inplace=True) df["P2"].replace(to_replace=-999.0, value=np.NaN, inplace=True) df = df.sort_index() day = df.groupby(pd.Grouper(freq=integration_freq)) if integration_type == "MEAN": df = day.mean() elif integration_type == "MEADIAN": df = day.median() elif integration_type == "MIN": df = day.min() elif integration_type == "MAX": df = day.max() date_range = pd.date_range(start=reindex_period[0], end=reindex_period[1], freq=integration_freq) date_range = date_range[0:date_range.shape[0] - 1] df = df.reindex(date_range) df["P1"] = pd.to_numeric(df["P1"]) df["P2"] = pd.to_numeric(df["P2"]) id = name.split(" ")[0] rename_dict = {} rename_dict["P1"] = "P1_" + str(id) rename_dict["P2"] = "P2_" + str(id) df.rename(index=str, columns=rename_dict, inplace=True) desc_file = open(description_files_dir + "/" + id + ".txt", "w") desc_file.write( "Description before missing data correction\n-----------------------\n" ) desc_file.write(str(df.describe()) + "\n\n") desc_file.write(str(df.corr()) + "\n\n") df.interpolate(method="linear", inplace=True) df.fillna(df.mean(), inplace=True) plt.cla() plt.close() plt.clf() df.plot(linewidth=1.0, style=['r-', 'b--'], grid=True, figsize=(13, 11), title="P1 and P2 values of " + id) plt.xlabel('Time') plt.ylabel('Value') plt.savefig(os.path.join(description_files_dir, "plots/" + id + "_plot_P1_P2.png"), bbox_inches='tight') plt.cla() plt.close() plt.clf() df.rolling(100).mean().plot( linewidth=1.0, style=['r-', 'b--'], grid=True, figsize=(13, 11), title="Rolling averages over P1 and P2 of " + id) plt.savefig(os.path.join(description_files_dir, "plots/" + id + "_rolling_avg.png"), bbox_inches='tight') desc_file.write( "Description after missing data correction\n-----------------------\n" ) desc_file.write(str(df.describe()) + "\n\n") desc_file.write(str(df.corr()) + "\n\n") df.to_csv(os.path.join(lu_bw_data_files_dir, name.split(" ")[0] + ".csv"), sep=";", index_label="timestamp")
def _main(): print("Starting the proprocess module from the command line") config_data = ut.get_config_data(sys.argv[1]) print("Configuration file loaded") ut.sanity_cahecks(config_data) print("Sanity checks performed. Everything is ok.") _read_config(config_data) print("Relavent Configuration data has been loaded") raw_files = None if "--read-good-files-from-list" in sys.argv: print("Files to be processed are read from: " + str(good_sensors_data_files_list)) raw_files_np = np.loadtxt(good_sensors_data_files_list, dtype='str', ndmin=1) raw_files = raw_files_np.tolist() else: print("Reading all raw files.") raw_files = [ os.path.join(raw_files_dir, f) for f in os.listdir(raw_files_dir) if os.path.isfile(os.path.join(raw_files_dir, f)) ] print( str(len(raw_files)) + " raw files found in the raw_files directory.(" + raw_files_dir + ")") if "--filter-raw-files" in sys.argv: good_sensors = None if "--check-day-for-sensors" in sys.argv: print("Processing the files and finding the good sensors") good_sensors = _check_days_for_sensors(raw_files) else: print("Reading good sensors from: " + str(good_sensors_list_file)) good_sensors = open(good_sensors_list_file, "r").read().split('\n') #perform location check def location_check(s_id): s_id = str( re.search('sensor_(\d+)\.csv', s_id, re.IGNORECASE).group(1)) (lat, lon) = ut.sensor_coord(s_id) return ut.distanceInKmBetweenEarthCoordinates( center[0], center[1], lat, lon) < radius good_sensors = list(filter(location_check, good_sensors)) print("Sensors that pass all the checks: ") print(good_sensors) raw_files = list( filter( lambda f: str( re.search('sensor_(\d+)\.csv', f, re.IGNORECASE).group( 0)) in good_sensors, raw_files)) if "--save-raw-files-list" in sys.argv: raw_files_np = np.array(raw_files) np.savetxt(str(good_sensors_data_files_list), raw_files_np, fmt='%s') if "--preprocess-files" in sys.argv: size = len(raw_files) print("Processing " + str(size) + " files") size = len(raw_files) for indx, f in enumerate(raw_files): _process_file(f) if indx % 25 == 0: print(str(indx) + "/" + str(size)) print("Done processing files") print("Sensors with missing data above the threshold: " + str(bad_data_sensors)) np.savetxt(bad_missing_data_sensors, bad_data_sensors, fmt='%s') if "--sort-end-frames" in sys.argv: print("Sorting data frames according to date") data_files = [ os.path.join(files_dir, f) for f in os.listdir(files_dir) if os.path.isfile(os.path.join(files_dir, f)) ] size = len(data_files) for ind, f in enumerate(data_files): df = pd.read_csv(f, sep=';', parse_dates=True, index_col="timestamp") df.sort_index(ascending=True, inplace=True) df.to_csv( f, sep=";", ) del (data_files)