Beispiel #1
0
def read_data_sets(dir, fake_data=False):
    """Load the colon train/test simulations from ``dir`` as DataSet objects.

    Args:
        dir: directory that contains the two CSV files named in the body.
        fake_data: if true, no file is read and every split is a
            ``DataSet([], [], fake_data=True)`` placeholder.

    Returns:
        An object with the attributes ``train``, ``validation`` and ``test``,
        each holding a DataSet. The validation split is the first entry of
        the preprocessed training data; the remainder is the training split.
    """

    class DataSets(object):
        # bare attribute holder for the three splits
        pass

    splits = DataSets()
    if fake_data:
        for split_name in ("train", "validation", "test"):
            setattr(splits, split_name, DataSet([], [], fake_data=True))
        return splits

    train_file = "ipcai_revision_colon_mean_scattering_train_all_spectrocam.txt"
    test_file = "ipcai_revision_colon_mean_scattering_test_all_spectrocam.txt"

    # both CSV files carry a two-row column header
    train_frame = pd.read_csv(os.path.join(dir, train_file), header=[0, 1])
    test_frame = pd.read_csv(os.path.join(dir, test_file), header=[0, 1])

    train_x, train_y = preprocess(train_frame, snr=10.0)
    test_x, test_y = preprocess(test_frame, snr=10.0)

    # continue with the underlying arrays of the labels
    train_y = train_y.values
    test_y = test_y.values

    # number of leading training entries held back for validation
    n_validation = 1

    splits.train = DataSet(train_x[n_validation:], train_y[n_validation:])
    splits.validation = DataSet(train_x[:n_validation],
                                train_y[:n_validation])
    splits.test = DataSet(test_x, test_y)
    return splits
def read_data_sets(dir, fake_data=False):
    """Build the train, validation and test DataSets from the files in ``dir``.

    Args:
        dir: directory holding the training and test CSV files whose names
            are fixed in the body.
        fake_data: if true, return placeholder DataSets created with
            ``fake_data=True`` without touching the file system.

    Returns:
        A plain object exposing ``train``, ``validation`` and ``test``
        DataSet attributes. Validation is the first preprocessed training
        entry; training is everything after it.
    """

    class DataSets(object):
        # simple namespace for the resulting splits
        pass

    result = DataSets()

    if fake_data:
        for attribute in ("train", "validation", "test"):
            setattr(result, attribute, DataSet([], [], fake_data=True))
        return result

    train_path = os.path.join(
        dir, "ipcai_revision_colon_mean_scattering_train_all_spectrocam.txt")
    test_path = os.path.join(
        dir, "ipcai_revision_colon_mean_scattering_test_all_spectrocam.txt")

    # the CSV files use the first two rows as a column header
    training_frame = pd.read_csv(train_path, header=[0, 1])
    testing_frame = pd.read_csv(test_path, header=[0, 1])

    training_images, training_labels = preprocess(training_frame, snr=10.)
    testing_images, testing_labels = preprocess(testing_frame, snr=10.)

    training_labels = training_labels.values
    testing_labels = testing_labels.values

    # size of the validation split taken from the front of the training data
    held_out = 1
    head = slice(None, held_out)
    tail = slice(held_out, None)

    result.train = DataSet(training_images[tail], training_labels[tail])
    result.validation = DataSet(training_images[head], training_labels[head])
    result.test = DataSet(testing_images, testing_labels)
    return result
    def run(self):
        """Plot the median absolute test error against the training-set size.

        Reads the training and test simulations from the first two inputs of
        this task, refits the module-level regressor ``rf`` on a growing
        number of training samples and saves a plot of the median ("50%")
        absolute error per training-set size to the path of this task's
        output.
        """
        # get data
        df_train = pd.read_csv(self.input()[0].path, header=[0, 1])
        df_test = pd.read_csv(self.input()[1].path, header=[0, 1])

        # for this plot we write a custom evaluation function as it is built
        # a little different

        # one error frame per training-set size; they are concatenated once
        # after the loop instead of re-copying a growing frame every iteration
        error_frames = []

        nr_training_samples = np.arange(10, 15010, 50).astype(int)
        for n in nr_training_samples:
            # NOTE(review): the test set is preprocessed anew for every n;
            # presumably preprocess draws fresh noise -- confirm before
            # hoisting this call out of the loop
            X_test, y_test = preprocess(df_test, snr=w_standard)
            # only take n samples for training
            X_train, y_train = preprocess(df_train,
                                          nr_samples=n,
                                          snr=w_standard)

            regressor = rf
            regressor.fit(X_train, y_train)
            y_pred = regressor.predict(X_test)
            # save results to a dataframe, errors scaled by 100
            errors = np.abs(y_pred - y_test)
            errors = errors.reshape(len(errors), 1)
            current_df = pd.DataFrame(errors * 100, columns=["Errors"])
            current_df["Method"] = "Proposed"
            # training-set size in thousands, matching the x axis label
            current_df["Number Samples"] = n / 10**3.
            error_frames.append(current_df)
            logging.info("Finished training classifier with %s samples", n)

        df = pd.concat(error_frames, ignore_index=True)

        # NOTE(review): the unstack/droplevel steps below depend on the
        # layout that groupby(...).describe() returns in the pandas version
        # in use -- confirm when upgrading pandas
        df = df.groupby("Number Samples").describe()
        # get the error description in the rows:
        df = df.unstack(-1)
        # get rid of multiindex by dropping "Error" level
        df.columns = df.columns.droplevel(0)

        plt.figure()
        plt.plot(df.index, df["50%"], color="green")

        # tidy up the plot
        plt.xlabel("number of training samples / 1000")
        plt.ylabel("absolute error [%]")
        plt.ylim((0, 20))
        plt.xlim((0, 15))
        plt.grid()

        # finally save the figure; savefig has no `mode` option, so the file
        # format is left to be inferred from the extension of the output path
        plt.savefig(self.output().path, dpi=500, bbox_inches='tight')
    def run(self):
        """Evaluate the regressor for growing training sets and plot the result.

        The training and test simulations are read from the first two inputs
        of this task. For every training-set size the module-level regressor
        ``rf`` is refit and its absolute test errors are collected; the
        median ("50%") error per size is plotted and the figure is written to
        the path of this task's output.
        """
        # get data
        df_train = pd.read_csv(self.input()[0].path, header=[0, 1])
        df_test = pd.read_csv(self.input()[1].path, header=[0, 1])

        # for this plot we write a custom evaluation function as it is built
        # a little different

        # collect the per-size error frames and concatenate them once at the
        # end; appending to one growing frame copies all rows every iteration
        error_frames = []

        nr_training_samples = np.arange(10, 15010, 50).astype(int)
        for n in nr_training_samples:
            # NOTE(review): preprocessing of the test set is repeated for
            # every n; presumably it is randomised -- confirm before moving
            # it out of the loop
            X_test, y_test = preprocess(df_test, snr=w_standard)
            # only take n samples for training
            X_train, y_train = preprocess(df_train, nr_samples=n,
                                          snr=w_standard)

            regressor = rf
            regressor.fit(X_train, y_train)
            y_pred = regressor.predict(X_test)
            # save results to a dataframe, errors scaled by 100
            errors = np.abs(y_pred - y_test)
            errors = errors.reshape(len(errors), 1)
            current_df = pd.DataFrame(errors * 100, columns=["Errors"])
            current_df["Method"] = "Proposed"
            # training-set size in thousands, matching the x axis label
            current_df["Number Samples"] = n / 10**3.
            error_frames.append(current_df)
            logging.info("Finished training classifier with %s samples", n)

        df = pd.concat(error_frames, ignore_index=True)

        # NOTE(review): the reshaping below relies on the output layout of
        # groupby(...).describe() in the pandas version in use -- confirm
        # when upgrading pandas
        df = df.groupby("Number Samples").describe()
        # get the error description in the rows:
        df = df.unstack(-1)
        # get rid of multiindex by dropping "Error" level
        df.columns = df.columns.droplevel(0)

        plt.figure()
        plt.plot(df.index, df["50%"], color="green")

        # tidy up the plot
        plt.xlabel("number of training samples / 1000")
        plt.ylabel("absolute error [%]")
        plt.ylim((0, 20))
        plt.xlim((0, 15))
        plt.grid()

        # finally save the figure; `mode` is not a savefig option, the file
        # format is inferred from the extension of the output path
        plt.savefig(self.output().path, dpi=500, bbox_inches='tight')
Beispiel #5
0
def create_dataset(path_to_simulation_results):
    """Read a simulation CSV file and return its preprocessed data.

    Args:
        path_to_simulation_results: path of a CSV file with a two-row
            column header.

    Returns:
        A tuple ``(X, y)``: the first value returned by ``preprocess`` and
        the ``.values`` array of the second.
    """
    simulations = pd.read_csv(path_to_simulation_results, header=[0, 1])
    features, labels = preprocess(simulations, snr=10.0)
    return features, labels.values
def create_lmdb(path_to_simulation_results, lmdb_name):
    """Store the preprocessed simulation results as Caffe Datums in an LMDB.

    Every sample becomes one Datum with ``X.shape[1]`` channels, height 1 and
    width 1. The key is the zero-padded, eight-digit sample index.

    Args:
        path_to_simulation_results: path of a CSV file with a two-row
            column header.
        lmdb_name: path of the LMDB environment to open and write to.
    """
    df = pd.read_csv(path_to_simulation_results, header=[0, 1])

    X, y = preprocess(df, snr=10.)
    # Datum labels are integers: scale by 1000 first so that the int()
    # truncation below keeps three decimal places of the original value
    y = y.values * 1000

    # We need to prepare the database for the size. We'll set it 10 times
    # greater than what we theoretically need. There is little drawback to
    # setting this too big. If you still run into problem after raising
    # this, you might want to try saving fewer entries in a single
    # transaction.
    map_size = X.nbytes * 10

    env = lmdb.open(lmdb_name, map_size=map_size)
    try:
        nr_channels = X.shape[1]
        # all samples are written in a single write transaction
        with env.begin(write=True) as txn:
            for i, sample in enumerate(X):
                datum = caffe.proto.caffe_pb2.Datum()
                datum.channels = nr_channels
                datum.height = 1
                datum.width = 1
                datum.data = sample.tobytes()  # or .tostring() if numpy < 1.9
                datum.label = int(y[i])
                str_id = '{:08}'.format(i)

                # The encode is only essential in Python 3
                txn.put(str_id.encode('ascii'), datum.SerializeToString())
    finally:
        # release the environment even if writing fails part-way
        env.close()
Beispiel #7
0
def create_dataset(path_to_simulation_results):
    """Load simulation results from CSV and preprocess them.

    Args:
        path_to_simulation_results: path of a CSV file whose first two rows
            form the column header.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the first result of ``preprocess``
        and ``y`` is the ``.values`` array of its second result.
    """
    frame = pd.read_csv(path_to_simulation_results, header=[0, 1])
    samples, targets = preprocess(frame, snr=10.)
    targets = targets.values
    return samples, targets
    def run(self):
        """Add a ``weights`` column to the source data and save it as CSV.

        The source and target data are read from the first and second input
        of this task. The weights come from
        ``estimate_weights_random_forests`` and the extended source frame is
        written to the path of this task's output.
        """
        # load source and target data (two-row column header)
        source_frame = pd.read_csv(self.input()[0].path, header=[0, 1])
        target_frame = pd.read_csv(self.input()[1].path, header=[0, 1])

        # preprocess both at the standard noise level; only the first
        # returned value of each call is used afterwards
        source_features, _ = preprocess(source_frame, w_percent=w_standard)
        target_features, _ = preprocess(target_frame, w_percent=w_standard)

        # the weights are evaluated on the source samples themselves
        source_frame["weights"] = estimate_weights_random_forests(
            source_features, target_features, source_features)

        # store the source data together with its weights
        source_frame.to_csv(self.output().path, index=False)
    def run(self):
        """Compute weights for the source samples and write them out.

        Reads the source data from the first input and the target data from
        the second input of this task, obtains one weight array from
        ``estimate_weights_random_forests``, stores it in a new ``weights``
        column of the source frame and saves that frame to this task's
        output path.
        """
        # read both inputs; the CSV files have a two-row column header
        df_src = pd.read_csv(self.input()[0].path, header=[0, 1])
        df_tgt = pd.read_csv(self.input()[1].path, header=[0, 1])

        # preprocess at the standard noise level; the second return value of
        # preprocess is not needed here
        features_src, _ = preprocess(df_src, w_percent=w_standard)
        features_tgt, _ = preprocess(df_tgt, w_percent=w_standard)

        # weights are requested for the source samples
        sample_weights = estimate_weights_random_forests(
            features_src, features_tgt, features_src)
        df_src["weights"] = sample_weights

        # write the source frame including the new column
        df_src.to_csv(self.output().path, index=False)
def create_hdf5(path_to_simulation_results, hdf5_name):
    """Write the preprocessed simulation results to an HDF5 file.

    The file gets two datasets, ``data`` and ``label``. In addition a text
    file ``<hdf5_name>_list.txt`` is written that contains ``hdf5_name``.

    Args:
        path_to_simulation_results: path of a CSV file with a two-row
            column header.
        hdf5_name: path of the HDF5 file to create.
    """
    simulations = pd.read_csv(path_to_simulation_results, header=[0, 1])

    features, labels = preprocess(simulations, snr=10.)
    labels = labels.values

    with h5py.File(hdf5_name, 'w') as hdf5_file:
        # the dataset names are 'data' and 'label'
        hdf5_file.create_dataset('data', data=features)
        hdf5_file.create_dataset('label', data=labels)

    # list file naming the h5 file(s) that are going to be used
    list_path = hdf5_name + '_list.txt'
    with open(list_path, 'w') as list_file:
        list_file.write(hdf5_name)
Beispiel #11
0
def read_data_set(dataframe_filename, fake_data=False):
    """Load one simulation CSV file and wrap it in a DataSet.

    Args:
        dataframe_filename: path of a CSV file with a two-row column header.
        fake_data: if true, no file is read and
            ``DataSet([], [], fake_data=True)`` is returned.

    Returns:
        A DataSet built from the two values returned by ``preprocess``, the
        second one converted with ``.values``.
    """
    if fake_data:
        return DataSet([], [], fake_data=True)

    # Read the file at the given path directly. This function has no
    # directory parameter, so joining the filename onto `dir` would pass the
    # builtin `dir` function to os.path.join and raise a TypeError.
    df_data_set = pd.read_csv(dataframe_filename, header=[0, 1])

    data_set_images, data_set_labels = preprocess(df_data_set, snr=10.)
    data_set_labels = data_set_labels.values
    return DataSet(data_set_images, data_set_labels)
Beispiel #12
0
def read_data_set(dataframe_filename, fake_data=False):
    """Read a simulation CSV file and return it as a DataSet.

    Args:
        dataframe_filename: path of a CSV file whose first two rows form the
            column header.
        fake_data: if true, skip reading and return
            ``DataSet([], [], fake_data=True)``.

    Returns:
        A DataSet created from the first value returned by ``preprocess``
        and the ``.values`` array of the second.
    """
    if fake_data:
        return DataSet([], [], fake_data=True)

    # The path is used as given: there is no directory argument here, and
    # the name `dir` would refer to the builtin function, which
    # os.path.join rejects with a TypeError.
    df_data_set = pd.read_csv(dataframe_filename, header=[0, 1])

    data_set_images, data_set_labels = preprocess(df_data_set, snr=10.)
    data_set_labels = data_set_labels.values
    return DataSet(data_set_images, data_set_labels)
Beispiel #13
0
def create_hdf5(path_to_simulation_results, hdf5_name):
    """Convert a simulation CSV file into an HDF5 file plus a list file.

    The HDF5 file contains the datasets ``data`` and ``label``. The text
    file ``<hdf5_name>_list.txt`` contains ``hdf5_name``.

    Args:
        path_to_simulation_results: path of a CSV file whose first two rows
            form the column header.
        hdf5_name: path of the HDF5 file to create.
    """
    frame = pd.read_csv(path_to_simulation_results, header=[0, 1])

    samples, targets = preprocess(frame, snr=10.)
    targets = targets.values

    with h5py.File(hdf5_name, 'w') as h5_out:
        # stored under the dataset names 'data' and 'label'
        h5_out.create_dataset('data', data=samples)
        h5_out.create_dataset('label', data=targets)

    # list all h5 files that are going to be used
    with open(hdf5_name + '_list.txt', 'w') as listing:
        listing.write(hdf5_name)