def run_ALSO(self):
    # self.feature_weights, self.df_raw_residuals = self.compute_weights_and_residuals_by_features(self.full_data_set)
    _, self.df_raw_residuals = self.compute_weights_and_residuals_by_features(
        self.full_data_set)
    self.feature_weights = compute_weights_RRSE(self.df_raw_residuals,
                                                self.full_data_set)
    self.df_also_residuals = self.df_raw_residuals ** 2
    self.df_also_residuals, _ = standardize(self.df_also_residuals)
    self.df_also_residuals = self.apply_feature_weights(self.df_also_residuals)
    # Adding some measure of dispersion should give us some inkling of the
    # volatility of a point. std is fine for now, but it doesn't properly
    # capture changes of sign, which are quite relevant.
    self.df_also_residuals['also_std'] = self.df_raw_residuals.std(axis=1)
    self.df_also_residuals['also_residual'] = self.df_also_residuals.sum(axis=1)
    self.df_also_residuals['also_residual'] = \
        self.df_also_residuals['also_residual'] ** 0.5
    self.df_also_residuals['also_outlier_score'] = outlier_scorer(
        self.df_also_residuals['also_residual'])
    return self.df_also_residuals
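# compute_weights_RRSE is called above but not shown. Below is a minimal
# sketch of the RRSE-based weighting from the ALSO scheme
# (w_j = 1 - min(1, RRSE_j)); the argument shapes and the dict return type
# are assumptions based on the call site, not the project's actual code.
def compute_weights_RRSE(df_raw_residuals, df_data):
    weights = {}
    for col in df_raw_residuals.columns:
        # RRSE: error of the per-feature model relative to a mean predictor
        sse = (df_raw_residuals[col] ** 2).sum()
        sst = ((df_data[col] - df_data[col].mean()) ** 2).sum()
        rrse = (sse / sst) ** 0.5
        # features predicted no better than their mean get weight 0
        weights[col] = 1.0 - min(1.0, rrse)
    return weights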
def outlier_score_all_datapoints(self):
    self.feature_weights, self.df_residuals = \
        self.compute_weights_and_residuals_by_features(self.full_data_set)
    df_residuals_scaled, _ = standardize(self.df_residuals)
    df_residuals_scaled_wtd = self.apply_feature_weights(df_residuals_scaled)
    return self.compute_outlier_scores(df_residuals_scaled_wtd)
def __init__(self, pandas_dataframe_dataset, wts, target_item=None):
    self.full_data_set = pandas_dataframe_dataset.copy()
    self.full_data_set_scaled, self.scaler = standardize(self.full_data_set)
    self.wts = wts
    self.features_of_interest = list(self.wts.keys())
    if not target_item:
        self.target_item = self.find_ideal_item(self.full_data_set_scaled)
    else:
        # scale the caller-supplied target with the same fitted scaler;
        # transform expects a 2-D array, hence the nested list
        self.target_item = self.scaler.transform(
            [[target_item[col] for col in self.features_of_interest]])
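# standardize() is imported from elsewhere; the two-value unpacking above
# suggests it returns the scaled frame plus the fitted scaler. A plausible
# sketch using sklearn's StandardScaler (an assumption, not the actual helper):
import pandas as pd
from sklearn.preprocessing import StandardScaler

def standardize(df):
    scaler = StandardScaler()
    scaled = pd.DataFrame(scaler.fit_transform(df),
                          index=df.index, columns=df.columns)
    return scaled, scaler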
def prepare(x):
    """
    Prepare the data: replace unused values (-999) with the mean of their
    column so that they don't affect later computation, then standardize.
    """
    # -999 entries are set to the column mean, so after standardization they
    # become 0. We also count the -999 values per row and append that count
    # as an extra feature.
    N = x.shape[0]
    novalues_len = np.zeros((x.shape[0], 1))
    useless_features = []
    xt = np.copy(x.T)

    # flag features where at least half of the values are missing
    for i, xi in enumerate(xt):
        xi[xi == -999] = np.nan
        number_noval = np.where(np.isnan(xi))[0].shape[0]
        if number_noval >= N / 2:
            useless_features.append(i)

    # count missing values per sample (one extra column per row)
    for i, xi in enumerate(xt.T):
        novalues_len[i] = np.where(np.isnan(xi))[0].shape[0]

    # replace the remaining missing values with the column mean
    for xi in xt:
        m = np.nanmean(xi)
        xi[np.isnan(xi)] = m

    tx = xt.T
    tx = np.delete(tx, useless_features, axis=1)
    tx = np.hstack((tx, novalues_len))
    tx, mean, std = standardize(tx)
    return tx
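# Tiny usage sketch (toy data, assumed): the -999 in column 1 is replaced by
# that column's mean, so after standardization it contributes exactly 0, and
# the per-row missing-value count is appended as an extra feature.
x_toy = np.array([[1.0, -999.0],
                  [2.0, 4.0],
                  [3.0, 6.0]])
tx_toy = prepare(x_toy)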
def preprocess_data(x, y, augment=True, clean=True):
    """
    Return, at positions 0, 1 and 2 of an array, the rows of x where
    the jet value is 0, 1 and (2 or 3) respectively.
    """
    jet_indices = get_jet_indices(x)
    xx = []
    yy = []
    xx.append(x[jet_indices[0]])
    yy.append(y[jet_indices[0]])
    xx.append(x[jet_indices[1]])
    yy.append(y[jet_indices[1]])
    # we put jet values of 2 and 3 together since
    # they have the same columns to keep
    jet_2_3 = np.logical_or(jet_indices[2], jet_indices[3])
    xx.append(x[jet_2_3])
    yy.append(y[jet_2_3])

    # clean each dataset
    if clean:
        xx = clean_data(xx)

    # standardize each dataset
    for i in range(3):
        x_stand, _, _ = standardize(xx[i])
        xx[i] = x_stand

    # augment each dataset
    if augment:
        xx = augment_data(xx)

    return xx, yy, jet_indices
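# get_jet_indices is not shown; a minimal sketch assuming PRI_jet_num sits in
# column 22 of the Higgs feature matrix (the one-hot step elsewhere in this
# project refers to that column): one boolean mask per jet value 0..3.
JET_COL = 22  # assumed index of PRI_jet_num

def get_jet_indices(x):
    return [x[:, JET_COL] == jet for jet in range(4)]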
import cv2  # computer vision library
import helpers

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Image data directories
image_dir_training = "day_night_images/training/"
image_dir_test = "day_night_images/test/"

# Load training data using the load_dataset function in helpers.py
IMAGE_LIST = helpers.load_dataset(image_dir_training)

# Standardize all training images
STANDARDIZED_LIST = helpers.standardize(IMAGE_LIST)

# Display a standardized image and its label
# Select an image by index
image_num = 0
selected_image = STANDARDIZED_LIST[image_num][0]
selected_label = STANDARDIZED_LIST[image_num][1]

# Display image and data about it
# plt.imshow(selected_image)
# print("Shape: " + str(selected_image.shape))
# print("Label [1 = day, 0 = night]: " + str(selected_label))

# Find the average Value (brightness) of an image
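# A standard way to measure average brightness is to convert to HSV and
# average the V channel; this is a sketch of what such a helper could look
# like, not necessarily the implementation in helpers.py.
def avg_brightness(rgb_image):
    hsv = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2HSV)
    # sum the Value (brightness) channel and divide by the pixel count
    return np.sum(hsv[:, :, 2]) / (hsv.shape[0] * hsv.shape[1])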
import random

import helpers


def auto_judge(epochs=10):
    # NOTE: the original snippet begins mid-function; the imports, the header
    # above, and these initial values are reconstructed assumptions
    # (120 matches the default threshold of get_accuracy below).
    thr = 120
    learning_rate = 10.
    max_accuracy = 0.
    last_accuracy = 0.
    best_threshold = thr
    post_flag = 1
    for _ in range(epochs):
        # step the threshold up or down depending on the current direction
        thr = thr + learning_rate if post_flag == 1 else thr - learning_rate
        temp_accuracy = get_accuracy(thr)
        print("when threshold is " + str(thr) +
              ", accuracy is " + str(temp_accuracy))
        if temp_accuracy > max_accuracy:
            max_accuracy = temp_accuracy
            best_threshold = thr
        else:
            if temp_accuracy <= last_accuracy:
                # no improvement over the previous step:
                # reverse direction and halve the step size
                post_flag = 1 if post_flag == 0 else 0
                learning_rate /= 2.
        last_accuracy = temp_accuracy
    return best_threshold, max_accuracy


def get_accuracy(threshold=120):
    MISCLASSIFIED = get_misclassified_images(STANDARDIZED_TEST_LIST, threshold)
    total = len(STANDARDIZED_TEST_LIST)
    num_correct = total - len(MISCLASSIFIED)
    accuracy = num_correct / total
    return accuracy


if __name__ == '__main__':
    image_dir_training = "day_night_images/training/"
    image_dir_test = "day_night_images/test/"

    TEST_IMAGE_LIST = helpers.load_dataset(image_dir_test)
    STANDARDIZED_TEST_LIST = helpers.standardize(TEST_IMAGE_LIST)
    random.shuffle(STANDARDIZED_TEST_LIST)

    MISCLASSIFIED = []
    auto_judge(epochs=10)
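# get_misclassified_images is referenced but not shown; a plausible version
# (an assumption) labels an image as day (1) when its average brightness
# exceeds the threshold (avg_brightness as sketched earlier) and collects
# the mismatches.
def get_misclassified_images(test_images, threshold):
    misclassified = []
    for image, true_label in test_images:
        predicted = 1 if avg_brightness(image) > threshold else 0
        if predicted != true_label:
            misclassified.append((image, predicted, true_label))
    return misclassified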
import numpy as np

from classification import *
from helpers import standardize

# simple test case
data_path = 'binary.csv'
data = np.genfromtxt(data_path, delimiter=",", skip_header=1)
yd = data[:, 0].astype(int)
xd = data[:, 1:]

# make a linear feature matrix
feat, f_mean, f_std = standardize(xd)

# learn the model
w0 = np.zeros(4)
model = logistic_regression(yd, feat, w0, 0.01, 50)

# simple validation (note: ideally the test set would be scaled with the
# training mean/std rather than standardized independently)
test_path = 'binary_test.csv'
test = np.genfromtxt(test_path, delimiter=",", skip_header=1)
yt = test[:, 0].astype(int)
xt = test[:, 1:]
feat_test, ft_mean, ft_std = standardize(xt)

correct_predictions = 0
for i in range(len(yt)):  # validate on the test set, not the training data
    x = feat_test[i, :]
    y_p = logistic_sigmoid(np.dot(x, model))
    if np.rint(y_p) == yt[i]:
        correct_predictions += 1
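# logistic_regression and logistic_sigmoid come from classification.py, which
# is not shown. A minimal gradient-descent sketch matching the call signature
# logistic_regression(y, tx, initial_w, gamma, max_iters) - an assumption
# about the module's contract, not its actual code:
def logistic_sigmoid(t):
    return 1.0 / (1.0 + np.exp(-t))

def logistic_regression(y, tx, initial_w, gamma, max_iters):
    w = initial_w
    for _ in range(max_iters):
        # gradient of the negative log-likelihood, averaged over samples
        grad = tx.T @ (logistic_sigmoid(tx @ w) - y) / len(y)
        w = w - gamma * grad
    return w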
# # use nearest neighbour (takes ~30 minutes for train data)
# (id, y, X) = clean_data.nearest_neighbour(id, y, X, invalid_field_value)

# (id, y, X) = clean_data.avg_incomplete_cols(id, y, X, invalid_field_value)

# # use only rows that have no incomplete values
# (id, y, X) = clean_data.full_rows(id, y, X, invalid_field_value)

# use only cols that have no incomplete values
# (id, y, X) = clean_data.full_cols(id, y, X, invalid_field_value)

################################################################################
# standardize & polynomial expansion                                           #
################################################################################

X = polynomial_expansion.polynomial_expansion(X, degree)
(X, _, _) = helpers.standardize(X)
X = helpers.add_offset_parameter(X)

################################################################################
# search for lambda                                                            #
################################################################################

# processes a specific lambda through the cross-validation pipeline and writes
# results to a pickled file for further analysis.
def process_lambda_grid_search(id, y, X, fold_count, seed, gd_func, max_iters,
                               gamma, lamb, degree):
    N, D = X.shape
    initial_w = np.ones(D)

    # k-fold cross-validation
    (w_stars, train_correct_ratios, test_correct_ratios) = \
        cross_validation.cross_validate(id, y, X, fold_count, seed, gd_func,
                                        initial_w, max_iters, gamma, lamb)
- Take 1/3 of the dataset of observations; the regularisation is therefore
  performed over 26 320 observations (call to 'generate_data()').
- These observations are then standardized via the 'standardize()' method.
- The model is constructed by calling 'create_choice_model' of pylogit,
  which takes as input:
      the dataset,
      the definition of the utility built by 'create_specification()',
      the model type, here 'MNL' for multinomial logit.
"""
long_lpmc = gld.generate_data(train=True)  # train=False for generating the test dataset
y = long_lpmc.copy()

# standardize what has to be standardized: custom_id, mode_id etc. are ignored
y.iloc[:, 3:] = helpers.standardize(long_lpmc.iloc[:, 3:])

choice_column = "travel_mode"
obs_id_column = "custom_id"
custom_alt_id = "mode_id"

basic_specification = helpers.create_specification()

lpmc_mnltrain = pl.create_choice_model(data=y,
                                       alt_id_col=custom_alt_id,
                                       obs_id_col=obs_id_column,
                                       choice_col=choice_column,
                                       specification=basic_specification,
                                       model_type="MNL",
                                       names=None)
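# The constructed model is typically estimated with pylogit's fit_mle, which
# needs a vector of initial values whose length matches the number of
# estimated coefficients; num_params below is an assumed way to derive that
# count from the specification.
import numpy as np

num_params = sum(len(spec) for spec in basic_specification.values())
lpmc_mnltrain.fit_mle(np.zeros(num_params))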
'''
selected_image = training_images[0][0]
plt.title(training_images[0][1])
plt.imshow(selected_image)
plt.show()
'''

# Importing the tests
import test_functions
tests = test_functions.Tests()

# Test for one_hot_encode function
tests.test_one_hot(helpers.one_hot_encode)

# Standardize all training and test images
standardized_training_images = helpers.standardize(training_images)
standardized_testing_images = helpers.standardize(testing_images)

import random as rd

## Display a random standardized image and its label
'''
random_std_image = rd.choice(standardized_training_images)
plt.title(random_std_image[1])
plt.imshow(random_std_image[0])
'''

red_entries, yellow_entries, green_entries = tuple(
    helpers.get_one_color(standardized_training_images, one_hot)
    for one_hot in [[1, 0, 0], [0, 1, 0], [0, 0, 1]])

red = np.array(red_entries)[:, 0]
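# helpers.get_one_color is not shown; a plausible sketch (an assumption about
# its contract) that keeps only the entries whose one-hot label matches:
def get_one_color(image_list, one_hot_label):
    return [entry for entry in image_list if entry[1] == one_hot_label]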
################################################################################
# clean data                                                                   #
################################################################################

# Clean the train and test data - replace -999 outliers with column averages.
X, XTest = clean_data.outliers_to_col_avg(X, XTest)

# one-hot coding for "PRI_jet_num" (column 22)
(id, y, X) = clean_data.one_hot_PRI_jet_num(id, y, X)
(idTest, yTest, XTest) = clean_data.one_hot_PRI_jet_num(idTest, yTest, XTest)

################################################################################
# standardize & polynomial expansion                                           #
################################################################################

X = polynomial_expansion.polynomial_expansion(X, degree)
(X, _, _) = helpers.standardize(X)
X = helpers.add_offset_parameter(X)

XTest = polynomial_expansion.polynomial_expansion(XTest, degree)
(XTest, _, _) = helpers.standardize(XTest)
XTest = helpers.add_offset_parameter(XTest)

################################################################################
# train                                                                        #
################################################################################

N, D = X.shape
initial_w = np.random.uniform(initial_w_range[0], initial_w_range[1], D)

# # Train model & select "optimal" model parameters found
# (losses, ws) = gd_func(y, X, initial_w, max_iters, gamma, lamb)
# w_star = ws[-1]
# Ignore index column because we re-index to ensure integrity.
dfs = [pd.read_csv(f, names=COLS, usecols=DATA_COLS + TARGET_COL)
       for f in CSV_FILES]

# Concatenate all files into one big dataframe
master = pd.concat(dfs).reset_index()

# Ensure targets are valid values (1-7). There are some 0s in there.
master_valid = master[master['target'].isin(VALID_TARGETS)]

feature_matrix = []

# looping over groups helps make sure we don't
# consider windows with more than one target
for target, df in master_valid.groupby('target'):
    # 0-center the mean and normalize
    df[DATA_COLS] = standardize(df[DATA_COLS])
    print("Processing %d rows for target #%d..." % (len(df), target))

    grp = defaultdict(list)
    grp['target'] = target

    samples = window_df(df, width=N_SECONDS * SAMPLING_RATE, overlap=OVERLAP)
    for sample in samples:
        means = sample[DATA_COLS].mean()
        grp['x_mean'].append(means['x_accel'])
        grp['y_mean'].append(means['y_accel'])
        grp['z_mean'].append(means['z_accel'])
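# window_df is referenced but not shown; a plausible sliding-window splitter
# (width in samples, fractional overlap in [0, 1)) - an assumption about its
# contract based on the call above.
def window_df(df, width, overlap=0.5):
    width = int(width)
    step = max(1, int(width * (1 - overlap)))
    return [df.iloc[start:start + width]
            for start in range(0, len(df) - width + 1, step)]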