Example #1
    def run_ALSO(self):

        # self.feature_weights, self.df_raw_residuals = self.compute_weights_and_residuals_by_features(self.full_data_set)
        _, self.df_raw_residuals = self.compute_weights_and_residuals_by_features(
            self.full_data_set)
        self.feature_weights = compute_weights_RRSE(self.df_raw_residuals,
                                                    self.full_data_set)
        self.df_also_residuals = self.df_raw_residuals.pow(2)
        self.df_also_residuals, _ = standardize(self.df_also_residuals)
        self.df_also_residuals = self.apply_feature_weights(
            self.df_also_residuals)

        # Adding a measure of dispersion should give some indication of a
        # point's volatility. The standard deviation is fine for now, but it
        # does not properly capture changes of sign, which are quite relevant.
        self.df_also_residuals['also_std'] = self.df_raw_residuals.std(axis=1)
        self.df_also_residuals['also_residual'] = self.df_also_residuals.sum(
            axis=1)
        self.df_also_residuals['also_residual'] = self.df_also_residuals[
            'also_residual'].pow(0.5)
        self.df_also_residuals['also_outlier_score'] = outlier_scorer(
            self.df_also_residuals['also_residual'])

        return self.df_also_residuals

    def outlier_score_all_datapoints(self):

        self.feature_weights, self.df_residuals = self.compute_weights_and_residuals_by_features(
            self.full_data_set)
        df_residuals_scaled, _ = standardize(self.df_residuals)
        df_residuals_scaled_wtd = self.apply_feature_weights(
            df_residuals_scaled)

        return self.compute_outlier_scores(df_residuals_scaled_wtd)
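# Not part of the original example: the snippets above assume a `standardize`
# helper that returns the scaled data together with the fitted scaler. A
# minimal sketch using sklearn's StandardScaler (an assumption, not the
# original implementation):
import pandas as pd
from sklearn.preprocessing import StandardScaler


def standardize(df):
    # Scale each column to zero mean and unit variance, keeping the
    # DataFrame's index and column labels intact.
    scaler = StandardScaler().fit(df)
    scaled = pd.DataFrame(scaler.transform(df),
                          index=df.index, columns=df.columns)
    return scaled, scaler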
Example #3
    def __init__(self, pandas_dataframe_dataset, wts, target_item=None):
        self.full_data_set = pandas_dataframe_dataset[:]
        self.full_data_set_scaled, self.scaler = standardize(
            self.full_data_set)

        self.wts = wts
        self.features_of_interest = list(self.wts.keys())

        if target_item is None:
            self.target_item = self.find_ideal_item(self.full_data_set_scaled)
        else:
            self.target_item = self.scaler.transform(
                [target_item[col] for col in self.features_of_interest])
Example #4
import numpy as np


def prepare(x):
    """
    Prepare the data by standardizing it and replacing the unused
    values (-999) with the mean of their columns so that they do not
    affect the subsequent computation.
    """
    # Replace the sentinel values (-999) with the column mean so that,
    # after standardization, they end up at 0. We also count the number
    # of -999 values per sample and append that count as an extra feature.
    N = x.shape[0]
    novalues_len = np.zeros((x.shape[0], 1))
    useless_features = []

    xt = np.copy(x.T)
    for i, xi in enumerate(xt):
        xi[xi == -999] = np.nan
        nanidx = np.where(np.isnan(xi))
        number_noval = nanidx[0].shape[0]
        # Drop features that are missing for at least half of the samples.
        if number_noval >= N / 2:
            useless_features.append(i)

    # Count the missing values per sample.
    for i, xi in enumerate(xt.T):
        nanidx = np.where(np.isnan(xi))
        novalues_len[i] = nanidx[0].shape[0]

    # Impute the remaining missing values with the column mean.
    for xi in xt:
        m = np.nanmean(xi)
        nanidx = np.where(np.isnan(xi))
        xi[nanidx] = m

    tx = xt.T
    tx = np.delete(tx, useless_features, axis=1)
    tx = np.hstack((tx, novalues_len))

    tx, mean, std = standardize(tx)

    return tx
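# Not part of the original example: prepare() relies on a standardize()
# helper that returns the scaled data plus the column means and standard
# deviations. A minimal NumPy sketch of such a helper (an assumption, not
# the original implementation):
def standardize(x):
    # Column-wise z-score normalisation.
    mean = np.mean(x, axis=0)
    std = np.std(x, axis=0)
    std[std == 0] = 1.0  # avoid division by zero for constant columns
    return (x - mean) / std, mean, std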
Example #5
def preprocess_data(x, y, augment=True, clean=True):
    """
    return in an array at postion 0, 1, and 2 the rows of x
    where the jet value is 0, 1 and (2 or 3).
    """

    jet_indices = get_jet_indices(x)
    xx = []
    yy = []

    xx.append(x[jet_indices[0]])
    yy.append(y[jet_indices[0]])

    xx.append(x[jet_indices[1]])
    yy.append(y[jet_indices[1]])

    #We put jet values of 2 and 3 together since
    #they have the same columns to keep
    jet_2_3 = np.logical_or(jet_indices[2], jet_indices[3])
    xx.append(x[jet_2_3])
    yy.append(y[jet_2_3])

    #clean each dataset
    if clean:
        xx = clean_data(xx)

    #standardize each dataset
    for i in range(3):
        x_stand, _, _ = standardize(xx[i])
        xx[i] = x_stand

    #augment each dataset
    if augment:
        xx = augment_data(xx)

    return xx, yy, jet_indices
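# Not part of the original example: preprocess_data() relies on a
# get_jet_indices() helper. A minimal sketch, assuming the jet number
# ("PRI_jet_num", column 22 of the Higgs dataset, values 0-3) and boolean
# masks as return values:
def get_jet_indices(x, jet_col=22):
    # One boolean mask per jet value 0, 1, 2 and 3.
    return [x[:, jet_col] == jet for jet in range(4)]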
import cv2  # computer vision library
import random  # used by random.shuffle in the __main__ block below
import helpers
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Image data directories
image_dir_training = "day_night_images/training/"
image_dir_test = "day_night_images/test/"

# Using the load_dataset function in helpers.py
# Load training data
IMAGE_LIST = helpers.load_dataset(image_dir_training)

# Standardize all training images
STANDARDIZED_LIST = helpers.standardize(IMAGE_LIST)

# Display a standardized image and its label

# Select an image by index
image_num = 0
selected_image = STANDARDIZED_LIST[image_num][0]
selected_label = STANDARDIZED_LIST[image_num][1]

# Display image and data about it
# plt.imshow(selected_image)
# print("Shape: "+str(selected_image.shape))
# print("Label [1 = day, 0 = night]: " + str(selected_label))


# Find the average Value or brightness of an image
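# A minimal sketch of such a helper (not from the original snippet), assuming
# RGB images as loaded by helpers.load_dataset:
def avg_brightness(rgb_image):
    # Convert to HSV and average the Value (brightness) channel over all pixels.
    hsv = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2HSV)
    return np.sum(hsv[:, :, 2]) / (hsv.shape[0] * hsv.shape[1])


# A sketch of the auto_judge() definition assumed by the loop body below and
# by the call in the __main__ block; the starting values are assumptions.
def auto_judge(epochs=10):
    thr = 120            # assumed starting threshold (matches get_accuracy's default)
    learning_rate = 8.0  # assumed initial step size
    post_flag = 1        # 1: increase the threshold, 0: decrease it
    max_accuracy = 0.0
    last_accuracy = 0.0
    best_threshold = thr
    for _ in range(epochs):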
        thr = thr + learning_rate if post_flag == 1 else thr - learning_rate
        temp_accuracy = get_accuracy(thr)
        print("when threshold is " + str(thr) + ", accuracy is " + str(temp_accuracy))
        if temp_accuracy > max_accuracy:
            max_accuracy = temp_accuracy
            best_threshold = thr
        else:
            if temp_accuracy <= last_accuracy:
                post_flag = 1 if post_flag == 0 else 0
            learning_rate /= 2.
        last_accuracy = temp_accuracy
    return best_threshold, max_accuracy


def get_accuracy(threshold=120):
    MISCLASSIFIED = get_misclassified_images(STANDARDIZED_TEST_LIST, threshold)
    total = len(STANDARDIZED_TEST_LIST)
    num_correct = total - len(MISCLASSIFIED)
    accuracy = num_correct / total
    return accuracy


if __name__ == '__main__':
    image_dir_training = "day_night_images/training/"
    image_dir_test = "day_night_images/test/"
    TEST_IMAGE_LIST = helpers.load_dataset(image_dir_test)
    STANDARDIZED_TEST_LIST = helpers.standardize(TEST_IMAGE_LIST)
    random.shuffle(STANDARDIZED_TEST_LIST)
    MISCLASSIFIED = []
    auto_judge(epochs=10)
import numpy as np

from classification import *
from helpers import standardize

# simple test case
data_path = 'binary.csv'
data = np.genfromtxt(data_path, delimiter=",", skip_header=1)
yd = data[:, 0].astype(int)
xd = data[:, 1:]

#make a linear feature matrix
feat, f_mean, f_std = standardize(xd)

#learn the model
w0 = np.zeros(4)
model = logistic_regression(yd, feat, w0, 0.01, 50)

#simple validation
test_path = 'binary_test.csv'
test = np.genfromtxt(test_path, delimiter=",", skip_header=1)
yt = test[:, 0].astype(int)
xt = test[:, 1:]

feat_test, f_mean, f_std = standardize(xt)
correct_predictions = 0

for i in range(0, len(yt)):
    x = feat_test[i, :]
    y_p = logistic_sigmoid(np.dot(x, model))
    if np.rint(y_p) == yt[i]:
        correct_predictions += 1
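# A small reporting sketch (an assumption, not part of the original script):
print("validation accuracy: %.3f" % (correct_predictions / len(yt)))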
# # use nearest neighbor (takes ~30 minutes for train data)
# (id, y, X) = clean_data.nearest_neighbour(id, y, X, invalid_field_value)

# (id, y, X) = clean_data.avg_incomplete_cols(id, y, X, invalid_field_value)

# # use only rows that have no incomplete values
# (id, y, X) = clean_data.full_rows(id, y, X, invalid_field_value)

# use only cols that have no incomplete values
# (id, y, X) = clean_data.full_cols(id, y, X, invalid_field_value)

################################################################################
#                      standardize & polynomial expansion                      #
################################################################################
X = polynomial_expansion.polynomial_expansion(X, degree)
(X, _, _) = helpers.standardize(X)
X = helpers.add_offset_parameter(X)

################################################################################
#                               search for lambda                              #
################################################################################
# Processes a specific lambda through the cross-validation pipeline and writes
# the results to a pickled file for further analysis.
def process_lambda_grid_search(id, y, X, fold_count, seed, gd_func, max_iters, gamma, lamb, degree):
    N, D = X.shape
    initial_w = np.ones(D)

    # k-fold cross-validation
    (w_stars, train_correct_ratios, test_correct_ratios) = \
        cross_validation.cross_validate(id, y, X, fold_count, seed, gd_func,
                                        initial_w, max_iters, gamma, lamb)
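    # A minimal sketch of the pickling step described in the comment above the
    # function (not part of the original); the result keys and the file name
    # are assumptions.
    import pickle
    results = {
        'lambda': lamb,
        'degree': degree,
        'w_stars': w_stars,
        'train_correct_ratios': train_correct_ratios,
        'test_correct_ratios': test_correct_ratios,
    }
    with open('grid_search_lambda_{}.pickle'.format(lamb), 'wb') as f:
        pickle.dump(results, f)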
Example #10
- Take one third of the dataset of observations; the regularisation is
  therefore performed over 26 320 observations (call to 'generate_data()').
- These observations are then standardized with the 'standardize()' method.
- The model is constructed by calling 'create_choice_model' from pylogit,
  which takes as input:
      the dataset,
      the definition of the utility, produced by 'create_specification()',
      the model type, here 'MNL' for multinomial logit.
"""

long_lpmc = gld.generate_data(train=True) # train=False for generating the test dataset
y = long_lpmc.copy()

# standardize what has to be standardized: custom_id, mode_id, etc. are ignored
y.iloc[:, 3::1] = helpers.standardize(long_lpmc.iloc[:, 3::1])


choice_column = "travel_mode"
obs_id_column = "custom_id"
custom_alt_id = "mode_id"
basic_specification = helpers.create_specification()

lpmc_mnltrain = pl.create_choice_model(data=y,
                                        alt_id_col=custom_alt_id,
                                        obs_id_col=obs_id_column,
                                        choice_col=choice_column,
                                        specification=basic_specification,
                                        model_type="MNL",
                                        names=None)
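# Not part of the original example: a minimal sketch of the estimation step,
# assuming pylogit's fit_mle interface; the zero starting values and the
# parameter count derived from the specification are assumptions.
import numpy as np

n_params = sum(len(spec) for spec in basic_specification.values())
lpmc_mnltrain.fit_mle(np.zeros(n_params))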
'''
selected_image = training_images[0][0]
plt.title(training_images[0][1])
plt.imshow(selected_image)
plt.show()
'''

# Importing the tests
import test_functions
tests = test_functions.Tests()

# Test for one_hot_encode function
tests.test_one_hot(helpers.one_hot_encode)

# Standardize all training and test images
standardized_training_images = helpers.standardize(training_images)
standardized_testing_images = helpers.standardize(testing_images)

import random as rd

## Display a random standardized image and its label
'''
random_std_image = rd.choice(standardized_training_images)
plt.title(random_std_image[1])
plt.imshow(random_std_image[0])
'''

red_entries, yellow_entries, green_entries = tuple(
    helpers.get_one_color(standardized_training_images, one_hot)
    for one_hot in [[1, 0, 0], [0, 1, 0], [0, 0, 1]])
red = np.array(red_entries)[:, 0]
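# Not part of the original example: a minimal sketch of the get_one_color
# helper assumed above, returning the (image, label) entries whose one-hot
# label matches `one_hot`:
def get_one_color(standardized_images, one_hot):
    return [entry for entry in standardized_images
            if list(entry[1]) == list(one_hot)]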
Example #12
################################################################################
#                                   clean data                                 #
################################################################################
# Clean the train and test data - remove -999 outliers.
X, XTest = clean_data.outliers_to_col_avg(X, XTest)

# one-hot encoding for "PRI_jet_num" (column 22)
(id, y, X) = clean_data.one_hot_PRI_jet_num(id, y, X)
(idTest, yTest, XTest) = clean_data.one_hot_PRI_jet_num(idTest, yTest, XTest)

################################################################################
#                      standardize & polynomial expansion                      #
################################################################################
X = polynomial_expansion.polynomial_expansion(X, degree)
(X, _, _) = helpers.standardize(X)
X = helpers.add_offset_parameter(X)

XTest = polynomial_expansion.polynomial_expansion(XTest, degree)
(XTest, _, _) = helpers.standardize(XTest)
XTest = helpers.add_offset_parameter(XTest)

################################################################################
#                                     train                                    #
################################################################################
N, D = X.shape
initial_w = np.random.uniform(initial_w_range[0], initial_w_range[1], D)

# # Train model & select "optimal" model parameters found
# (losses, ws) = gd_func(y, X, initial_w, max_iters, gamma, lamb)
# w_star = ws[-1]
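# A minimal sketch (not from the original script) of running the training step
# described by the commented-out lines above and of generating predictions;
# the sign rule used for the predictions is an assumption.
(losses, ws) = gd_func(y, X, initial_w, max_iters, gamma, lamb)
w_star = ws[-1]
yTestPred = np.sign(XTest.dot(w_star))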
Example #13
    # Ignore index column because we re-index to ensure integrity.
    dfs = [
        pd.read_csv(f, names=COLS, usecols=DATA_COLS + TARGET_COL)
        for f in CSV_FILES
    ]

    # Concatenate all files into one big dataframe
    master = pd.concat(dfs).reset_index()

    # Ensure targets are valid values (1-7). There are some 0s in there.
    master_valid = master[master['target'].isin(VALID_TARGETS)]

    feature_matrix = []
    # looping over groups helps make sure we don't
    # consider windows with more than one target
    for target, df in master_valid.groupby('target'):

        # 0-center the mean and normalize
        df[DATA_COLS] = standardize(df[DATA_COLS])

        print("Processing %d rows for target #%d..." % (len(df), target))

        grp = defaultdict(list)
        grp['target'] = target
        samples = window_df(df,
                            width=N_SECONDS * SAMPLING_RATE,
                            overlap=OVERLAP)
        for sample in samples:

            means = sample[DATA_COLS].mean()
            grp['x_mean'].append(means['x_accel'])
            grp['y_mean'].append(means['y_accel'])
            grp['z_mean'].append(means['z_accel'])
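# Not part of the original example: the loop above relies on a window_df()
# helper. A minimal sketch, assuming `overlap` is a fraction in [0, 1):
def window_df(df, width, overlap):
    # Yield successive windows of `width` rows, advancing by
    # width * (1 - overlap) rows each time.
    step = max(1, int(width * (1 - overlap)))
    for start in range(0, len(df) - width + 1, step):
        yield df.iloc[start:start + width]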