Example #1
def main():
    # Basic setup
    print("Starting the download module from the command line")
    config_data = ut.get_config_data(sys.argv[1]) 
    print("Configuration file loaded")
    ut.sanity_cahecks(config_data)
    _read_config(config_data)
    _download_files()
Example #2
def launch_transformations():    
    log.info('Checking transformation crontabs')
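    # Load every registered transformation and check its crontab fields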
    transformations = session.query(Transformation).all()

    for t in transformations:
        if must_run(t.minute, t.hour, t.day_of_week):
            mainclass, _ = get_config_data(t.output_dir)
            celery.send_task("extractor.perform_extraction",
                args=[t.package_id, mainclass], task_id=str(uuid.uuid4()))
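
The must_run helper is not shown on this page. A minimal sketch of such a crontab-field check, assuming each field holds either "*" or a numeric string, could look like this:

# Hypothetical sketch only; not part of the original module.
from datetime import datetime

def must_run(minute, hour, day_of_week, now=None):
    # Each crontab field is assumed to be "*" (always matches) or a numeric
    # string; day_of_week is assumed to follow Python's weekday() numbering.
    now = now or datetime.now()

    def field_matches(field, value):
        return field == "*" or int(field) == value

    return (field_matches(minute, now.minute)
            and field_matches(hour, now.hour)
            and field_matches(day_of_week, now.weekday()))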
Example #3
def main():
    print("Loading LU-BW sensorts data")
    config_data = ut.get_config_data(sys.argv[1]) 
    print("Configuration file loaded")
    ut.sanity_cahecks(config_data)
    print("Sanity checks performed. Everything is ok.")
    _read_config(config_data)
    print("Relavent Configuration data has been loaded")

    global data_files_dir

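    # Build an empty frame over the configured period; sensor frames are joined column-wise below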
    df_end = pd.DataFrame()
    date_range = pd.date_range(start=period[0], end=period[1], freq=freq)
    date_range = date_range[0:date_range.shape[0] - 1]
    df_end = df_end.reindex(date_range)

    ignored_sens = np.array([])
    
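    # Collect the sensor ids that should be excluded from the final frame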
    for f in ignored_sensors_files:
        with open(os.path.expanduser(f)) as sens_file:
            sens = sens_file.read().split('\n')
            ignored_sens = np.append(ignored_sens, sens)

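    # Gather every per-sensor CSV that sits directly in the data folder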
    data_files = [os.path.join(folder, f) for f in os.listdir(folder)
                  if os.path.isfile(os.path.join(folder, f))]

    for f in data_files:
        res = re.search(r'end_data_frame_(\S+)\.csv', f, re.IGNORECASE)
        sensor_id = res.group(1)
        if str(sensor_id) not in ignored_sens:
            df = pd.read_csv(f, sep=';', parse_dates=True, index_col="timestamp")
            df_end = pd.concat([df_end, df], axis=1)

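    # The LU-BW reference station data lives in its own subfolder and is joined last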
    lu_bw_folder = os.path.join(folder, "lu_bw")
    print(lu_bw_folder)
    data_files = [os.path.join(lu_bw_folder, f) for f in os.listdir(lu_bw_folder)
                  if os.path.isfile(os.path.join(lu_bw_folder, f))]

    for f in data_files:
        print(f)
        res = re.search(r'(\S+)\.csv', f, re.IGNORECASE)
        sensor_id = res.group(1)
        print("Loading " + sensor_id + " from LU-BW")
        df = pd.read_csv(f, sep=';', parse_dates=True, index_col="timestamp")
        df_end = pd.concat([df_end, df], axis=1)

    print("Final shape of the DF: " + str(df_end.shape))
    df_end.to_csv(os.path.join(folder, final_df_name), sep=";", index_label="timestamp")
Example #4
def setup_logger(log_file="apitest.log"):
    logger = logging.getLogger("apitest")
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    log_level = utils.get_config_data('logging', 'level').upper()
    if log_file:
        handler = logging.FileHandler(log_file, encoding="utf-8")
    else:
        handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(log_level)
    return logger
Example #5
    def deploy_transformation(self, transformation):
        mainclass, required = get_config_data(transformation.output_dir)
        transformation_instance = get_instance(transformation.output_dir, mainclass)
        transformation_instance.deploy()

        # install dependencies using celery
        celery.send_task("extractor.install_dependencies",
            args=[required], task_id=str(uuid.uuid4()))

        # remove the extraction log
        transformation.extractions = []
        model.Session.merge(transformation)
        model.Session.commit()
Example #6
    def launch_transformation(self, id):
        log.info('Launching transformation for package name: %s', id)

        # using default functionality
        self.read(id)

        t = model.Session.query(Transformation).filter_by(package_id=c.pkg.id).first()

        mainclass, _ = get_config_data(t.output_dir)
        celery.send_task("extractor.perform_extraction",
            args=[t.package_id, mainclass], task_id=str(uuid.uuid4()))

        self.get_transformation_data(c.pkg.id, c)
        c.error = False

        # rendering using template
        return render('extractor/read.html')
Example #7
def main():
    print("Starting the proprocess module from the command line")
    config_data = ut.get_config_data(sys.argv[1])
    print("Configuration file loaded")
    ut.sanity_cahecks(config_data)
    print("Sanity checks performed. Everything is ok.")
    _read_config(config_data)
    print("Relavent Configuration data has been loaded")

    df_before_p1 = pd.DataFrame(columns=[
        "id", "min", "max", "mean", "var", "std", "skew", "kurt", "count"
    ])
    df_before_p2 = pd.DataFrame(columns=[
        "id", "min", "max", "mean", "var", "std", "skew", "kurt", "count"
    ])
    df_after_p1 = pd.DataFrame(columns=[
        "id", "min", "max", "mean", "var", "std", "skew", "kurt", "count"
    ])
    df_after_p2 = pd.DataFrame(columns=[
        "id", "min", "max", "mean", "var", "std", "skew", "kurt", "count"
    ])

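    # Sensors previously flagged for excessive missing data are skipped during processing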
    missing_data_sensors = np.loadtxt(bad_missing_data_sensors,
                                      dtype='str',
                                      ndmin=1)
    print("Sensors with missing data (" + str(len(missing_data_sensors)) +
          "): " + str(missing_data_sensors))

    data_files = [
        os.path.join(data_files_dir, f) for f in os.listdir(data_files_dir)
        if os.path.isfile(os.path.join(data_files_dir, f))
    ]
    size = len(data_files)
    reindexed_frames = np.array([])
    too_far_sens = np.array([])

    for indx, f in enumerate(data_files):
        id = str(
            re.search(r'end_data_frame_(\d+)\.csv', f, re.IGNORECASE).group(1))
        if id in missing_data_sensors:
            print("Skipping " + id + "; " + str(indx) + "/" + str(size))
            continue
        print("Processing " + id + "; " + str(indx) + "/" + str(size))
        df = pd.read_csv(f, sep=';', parse_dates=True, index_col="timestamp")

        desc_file = open(os.path.join(description_files_dir, id + ".txt"), "w")
        # Save those to file
        desc_file.write(
            "Description before missing data correction\n-----------------------\n"
        )
        desc_file.write(str(df.describe()) + "\n\n")
        desc_file.write(str(df.corr()) + "\n\n")

        df_before_p1 = df_before_p1.append(generate_info_dict(
            id, "P1_" + id, df),
                                           ignore_index=True)
        df_before_p2 = df_before_p2.append(generate_info_dict(
            id, "P2_" + id, df),
                                           ignore_index=True)

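        # Reindex every sensor frame onto one common, evenly spaced date range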
        date_range = pd.date_range(start=reindex_period[0],
                                   end=reindex_period[1],
                                   freq=reindex_freq)
        date_range = date_range[0:date_range.shape[0] - 1]

        if df.shape[0] < date_range.shape[0]:
            if date_range.shape[0] - df.shape[0] > missing_entries_threshold:
                reindexed_frames = np.append(reindexed_frames, id)
                print("Too bad sensor - " + id)
            df = df[~df.index.duplicated()]
            df = df.reindex(date_range)

        if df.shape[0] > date_range.shape[0]:
            dup_group = df.groupby(df.index, as_index=True, sort=True)
            df = dup_group.mean()
            df = df.reindex(date_range)

        df.fillna(df.mean(), inplace=True)

        desc_file.write(
            "Description after missing data correction\n-----------------------\n"
        )
        desc_file.write(str(df.describe()) + "\n\n")
        desc_file.write(str(df.corr()) + "\n\n")

        df_after_p1 = df_after_p1.append(generate_info_dict(
            id, "P1_" + id, df),
                                         ignore_index=True)
        df_after_p2 = df_after_p2.append(generate_info_dict(
            id, "P2_" + id, df),
                                         ignore_index=True)

        if generate_plots:
            plt.cla()
            plt.close()
            plt.clf()
            df.plot(linewidth=1.0,
                    style=['r-', 'b--'],
                    grid=True,
                    figsize=(13, 11),
                    title="P1 and P2 values of " + id)
            plt.savefig(os.path.join(description_files_dir,
                                     "plots/" + id + "_plot.png"),
                        bbox_inches='tight')

            plt.cla()
            plt.close()
            plt.clf()
            df.rolling(100).mean().plot(
                linewidth=1.0,
                style=['r-', 'b--'],
                grid=True,
                figsize=(13, 11),
                title="Rolling averages over P1 and P2 of " + id)
            plt.savefig(os.path.join(description_files_dir,
                                     "plots/" + id + "_rolling_plot_.png"),
                        bbox_inches='tight')

        df.to_csv(f, sep=";", index_label=time_column)
        desc_file.close()
    np.savetxt(str(reindexed_frames_file), reindexed_frames, fmt='%s')

    df_before_p1.to_csv(env_dir + "/description_frame_p1_before.csv", sep=";")
    df_before_p2.to_csv(env_dir + "/description_frame_p2_before.csv", sep=";")

    df_after_p1.to_csv(env_dir + "/description_frame_p1_after.csv", sep=";")
    df_after_p2.to_csv(env_dir + "/description_frame_p2_after.csv", sep=";")
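
generate_info_dict is called throughout the example above but not shown on this page. Given the column layout of the description frames ("id", "min", "max", "mean", "var", "std", "skew", "kurt", "count"), a plausible sketch with the signature used above is:

# Hypothetical sketch only; the real implementation is not part of this page.
def generate_info_dict(sensor_id, column, df):
    s = df[column]
    return {
        "id": sensor_id,
        "min": s.min(),
        "max": s.max(),
        "mean": s.mean(),
        "var": s.var(),
        "std": s.std(),
        "skew": s.skew(),
        "kurt": s.kurt(),
        "count": s.count(),
    }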
Example #8
def _main():
    print("Loading LU-BW sensorts data")
    config_data = ut.get_config_data(sys.argv[1])
    print("Configuration file loaded")
    ut.sanity_cahecks(config_data)
    print("Sanity checks performed. Everything is ok.")
    _read_config(config_data)
    print("Relavent Configuration data has been loaded")

    file_name = sys.argv[2]
    print("Loading from " + file_name)
    # index_col/parse_dates are options of parse() below, not of ExcelFile
    frames = pd.ExcelFile(file_name)
    print(frames.sheet_names)
    for name in frames.sheet_names:
        df = frames.parse(name, index_col="timestamp", parse_dates=True)

        df["P1"].replace(to_replace=-999.0, value=np.NaN, inplace=True)
        df["P2"].replace(to_replace=-999.0, value=np.NaN, inplace=True)

        df = df.sort_index()
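        # Aggregate the raw measurements into buckets of the configured integration frequency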
        day = df.groupby(pd.Grouper(freq=integration_freq))
        if integration_type == "MEAN":
            df = day.mean()
        elif integration_type == "MEADIAN":
            df = day.median()
        elif integration_type == "MIN":
            df = day.min()
        elif integration_type == "MAX":
            df = day.max()

        date_range = pd.date_range(start=reindex_period[0],
                                   end=reindex_period[1],
                                   freq=integration_freq)
        date_range = date_range[0:date_range.shape[0] - 1]
        df = df.reindex(date_range)

        df["P1"] = pd.to_numeric(df["P1"])
        df["P2"] = pd.to_numeric(df["P2"])
        id = name.split(" ")[0]
        rename_dict = {}
        rename_dict["P1"] = "P1_" + str(id)
        rename_dict["P2"] = "P2_" + str(id)
        df.rename(columns=rename_dict, inplace=True)

        desc_file = open(os.path.join(description_files_dir, id + ".txt"), "w")
        desc_file.write(
            "Description before missing data correction\n-----------------------\n"
        )
        desc_file.write(str(df.describe()) + "\n\n")
        desc_file.write(str(df.corr()) + "\n\n")

        df.interpolate(method="linear", inplace=True)
        df.fillna(df.mean(), inplace=True)

        plt.cla()
        plt.close()
        plt.clf()
        df.plot(linewidth=1.0,
                style=['r-', 'b--'],
                grid=True,
                figsize=(13, 11),
                title="P1 and P2 values of " + id)
        plt.xlabel('Time')
        plt.ylabel('Value')
        plt.savefig(os.path.join(description_files_dir,
                                 "plots/" + id + "_plot_P1_P2.png"),
                    bbox_inches='tight')

        plt.cla()
        plt.close()
        plt.clf()
        df.rolling(100).mean().plot(
            linewidth=1.0,
            style=['r-', 'b--'],
            grid=True,
            figsize=(13, 11),
            title="Rolling averages over P1 and P2 of " + id)
        plt.savefig(os.path.join(description_files_dir,
                                 "plots/" + id + "_rolling_avg.png"),
                    bbox_inches='tight')

        desc_file.write(
            "Description after missing data correction\n-----------------------\n"
        )
        desc_file.write(str(df.describe()) + "\n\n")
        desc_file.write(str(df.corr()) + "\n\n")

        desc_file.close()

        df.to_csv(os.path.join(lu_bw_data_files_dir, id + ".csv"),
                  sep=";",
                  index_label="timestamp")
def _main():

    print("Starting the proprocess module from the command line")
    config_data = ut.get_config_data(sys.argv[1])
    print("Configuration file loaded")
    ut.sanity_cahecks(config_data)
    print("Sanity checks performed. Everything is ok.")
    _read_config(config_data)
    print("Relavent Configuration data has been loaded")

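    # The behaviour below is driven by optional command line flags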
    raw_files = None
    if "--read-good-files-from-list" in sys.argv:
        print("Files to be processed are read from: " +
              str(good_sensors_data_files_list))
        raw_files_np = np.loadtxt(good_sensors_data_files_list,
                                  dtype='str',
                                  ndmin=1)
        raw_files = raw_files_np.tolist()
    else:
        print("Reading all raw files.")
        raw_files = [
            os.path.join(raw_files_dir, f) for f in os.listdir(raw_files_dir)
            if os.path.isfile(os.path.join(raw_files_dir, f))
        ]
        print(
            str(len(raw_files)) +
            " raw files found in the raw files directory (" + raw_files_dir +
            ")")

        if "--filter-raw-files" in sys.argv:
            good_sensors = None
            if "--check-day-for-sensors" in sys.argv:
                print("Processing the files and finding the good sensors")
                good_sensors = _check_days_for_sensors(raw_files)
            else:
                print("Reading good sensors from: " +
                      str(good_sensors_list_file))
                with open(good_sensors_list_file, "r") as sensors_file:
                    good_sensors = sensors_file.read().split('\n')

            # perform location check
            def location_check(s_id):
                s_id = str(
                    re.search(r'sensor_(\d+)\.csv', s_id,
                              re.IGNORECASE).group(1))
                (lat, lon) = ut.sensor_coord(s_id)
                return ut.distanceInKmBetweenEarthCoordinates(
                    center[0], center[1], lat, lon) < radius

            good_sensors = list(filter(location_check, good_sensors))
            print("Sensors that pass all the checks: ")
            print(good_sensors)

            raw_files = [
                f for f in raw_files
                if re.search(r'sensor_(\d+)\.csv', f,
                             re.IGNORECASE).group(0) in good_sensors
            ]

        if "--save-raw-files-list" in sys.argv:
            raw_files_np = np.array(raw_files)
            np.savetxt(str(good_sensors_data_files_list),
                       raw_files_np,
                       fmt='%s')

    if "--preprocess-files" in sys.argv:
        size = len(raw_files)
        print("Processing " + str(size) + " files")
        size = len(raw_files)
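        # Preprocess each raw file, reporting progress every 25 files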
        for indx, f in enumerate(raw_files):
            _process_file(f)
            if indx % 25 == 0:
                print(str(indx) + "/" + str(size))
        print("Done processing files")
        print("Sensors with missing data above the threshold: " +
              str(bad_data_sensors))
        np.savetxt(bad_missing_data_sensors, bad_data_sensors, fmt='%s')

    if "--sort-end-frames" in sys.argv:
        print("Sorting data frames according to date")
        data_files = [
            os.path.join(files_dir, f) for f in os.listdir(files_dir)
            if os.path.isfile(os.path.join(files_dir, f))
        ]
        for f in data_files:
            df = pd.read_csv(f,
                             sep=';',
                             parse_dates=True,
                             index_col="timestamp")
            df.sort_index(ascending=True, inplace=True)
            df.to_csv(f, sep=";")
    del data_files