def process_pca():
    if not os.path.exists(get_pca_dir()):
        os.makedirs(get_pca_dir())
    # n_components=0.95 keeps the smallest number of principal components that
    # together explain at least 95% of the variance in the data
    pca = PCA(0.95)
    data, target, headers = get_data(get_transformed_dir() + 'dataset.csv')
    data = StandardScaler().fit_transform(data)
    pca.fit(data)
    pca_data = pca.transform(data)
    # plot the explained variance ratio of each retained component
    plt.semilogy(pca.explained_variance_ratio_, '--o')
    plt.show()
    pca_headers = ['target']
    for i in range(pca.n_components_):
        pca_headers.append('PCA_component_' + str(i + 1))
    write_to_csv(get_pca_dir() + 'dataset.csv', pca_data, target, pca_headers)
    print pca

def get_data(location=get_transformed_dir()):
    t1 = time()
    data = []
    target = []
    header = []
    with open(location, 'rb') as f:
        reader = csv.reader(f)
        for i, row in enumerate(reader):
            if i == 0:
                header = row
            else:
                try:
                    data.append(map(int, row))
                except ValueError:
                    data.append(map(float, row))
    # the first column holds the target (accident severity); split it off
    for i in range(len(data)):
        target.append(data[i][0])
        # del data[i][30]
        del data[i][0]
    data = np.array(data)
    t2 = time()
    print 'Getting data from ' + location + '\ntook: ' + str(t2 - t1) + ' sec\n'
    return data, target, header

def evaluate_sequential_nn(method, dataset_location=get_transformed_dir()):
    data, target = get_test_data(dataset_location)
    if data.dtype != float:
        data = StandardScaler().fit_transform(data)
        print 'Dataset scaled'
    prediction = list(
        np.rint(method.predict(data)).astype(int).astype(str).flatten('F'))
    target = map(np.str, target)
    print('Sequential neural network: ')
    print_f_measure(target, prediction)

def select_k_best():
    if not os.path.exists(get_k_best_dir()):
        os.makedirs(get_k_best_dir())
    data, target, headers = get_data(get_transformed_dir() + 'dataset.csv')
    # chi2 scoring requires non-negative feature values
    # data = normalize(data, axis=0, norm='max')
    # keep the 40 features with the highest chi-squared scores against the target
    data = SelectKBest(chi2, k=40).fit_transform(data, target)
    headers = ['target']
    for i in range(data.shape[1]):
        headers.append('k_best_feature_' + str(i + 1))
    write_to_csv(get_k_best_dir() + 'dataset.csv', data, target, headers)

def evaluate(method,
             method_name,
             scaled=False,
             dataset_location=get_transformed_dir()):
    data, target = get_test_data(dataset_location)
    if scaled:
        data = StandardScaler().fit_transform(data)
        print 'Dataset scaled'
    prediction = method.predict(data).astype(float)
    target = np.array(target, dtype=float)
    print(method_name + ':')
    print_f_measure(target, prediction)

def evaluate_neural_network(network, dataset_location=get_transformed_dir()):
    data, target = get_test_data(dataset_location)
    if data.dtype != float:
        data = StandardScaler().fit_transform(data)
        print 'Dataset scaled'
    # wrap each row in an Instance so the network implementation can consume it
    test = []
    for i in range(len(data)):
        temp = np.array(data[i], dtype='float64')
        test.append(Instance(temp))
    prediction = list(
        np.rint(network.predict(test)).astype(int).astype(str).flatten('F'))
    target = map(np.str, target)
    print('Scaled conjugate network: ')
    print_f_measure(target, prediction)

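# print_f_measure is defined elsewhere in the project and not shown in this
# section. A minimal sketch of what such a helper could look like, assuming
# scikit-learn's metrics (the implementation below is an assumption, not the
# project's actual code):
from sklearn.metrics import classification_report, f1_score


def print_f_measure(target, prediction):
    # per-class precision/recall/F-measure plus a single weighted F-measure
    print classification_report(target, prediction)
    print 'Weighted F-measure: ' + str(
        f1_score(target, prediction, average='weighted'))
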
from os.path import isfile

from keras import Sequential, optimizers
from keras.layers import Dense
from keras.models import load_model
from sklearn.preprocessing import StandardScaler

from dataset_process.data_io import get_train_data, get_validation_data
from evaluation.evaluation import evaluate_sequential_nn
from global_variables import get_transformed_dir

dataset_location = get_transformed_dir()


def learn_nn():
    data, target = get_train_data(dataset_location)
    if data.dtype != float:
        print 'Scaling dataset'
        data = StandardScaler().fit_transform(data)

    model = Sequential()
    model.add(Dense(80, input_dim=data.shape[1], activation='tanh'))
    model.add(Dense(70, activation='relu'))
    model.add(Dense(60, activation='tanh'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(40, activation='tanh'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(20, activation='tanh'))
    model.add(Dense(10, activation='tanh'))
    model.add(Dense(1, activation='relu'))
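    # The snippet above stops after defining the layers. A minimal sketch of how
    # learn_nn() might continue (loss, optimizer, epochs, batch size and the
    # model file name below are assumptions, not taken from the original code):
    model.compile(loss='mean_squared_error',
                  optimizer=optimizers.SGD(lr=0.01),
                  metrics=['accuracy'])

    # validate on the held-out validation split while training
    val_data, val_target = get_validation_data(dataset_location)
    if val_data.dtype != float:
        val_data = StandardScaler().fit_transform(val_data)

    model.fit(data, target,
              validation_data=(val_data, val_target),
              epochs=50, batch_size=128)
    model.save('sequential_nn.h5')
    return model


# possible driver: reuse a previously saved model if one exists, otherwise retrain
if isfile('sequential_nn.h5'):
    model = load_model('sequential_nn.h5')
else:
    model = learn_nn()
evaluate_sequential_nn(model, dataset_location)
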
from dataset_process.data_io import get_data
from global_variables import get_transformed_dir


# this script checks the distribution of accident severity in train, validate and test
def view_severity_distribution(location='../dataset/transformed/dataset.csv'):
    data, target, header = get_data(location)
    values = {}
    for value in target:
        if value in values:
            values[value] += 1
        else:
            values[value] = 1
    for key, value in values.iteritems():
        print str(key) + ': ' + str(
            round(float(value) / len(target) * 100, 2)) + '%'


def print_distribution_for_train_validate_test(root_dir):
    view_severity_distribution(location=root_dir + 'dataset.csv')
    view_severity_distribution(location=root_dir + 'train.csv')
    view_severity_distribution(location=root_dir + 'validate.csv')
    view_severity_distribution(location=root_dir + 'test.csv')


print_distribution_for_train_validate_test(get_transformed_dir())

    elif float(len(non_null)) / data.shape[0] < 0.5:
        print 'This index should be altered: ' + str(i)
    else:
        # replace missing entries in this column with the column median
        temp[temp == 'NaN'] = np.median(non_null)

print '4:6 - Removed missing values and replaced with median values'

# vehicle_type = data[:, 2].reshape(-1, 1)
#
# one_hot_encoder = OneHotEncoder(sparse=False)
# one_hot_encoded = one_hot_encoder.fit_transform(vehicle_type)
# data = np.concatenate((data, one_hot_encoded), axis=1)
# data = np.delete(data, 2, 1)
# header = np.delete(header, 2)
#
# print "4:6 - Transformed categorical data"

# cast float values down to integers and then to strings for writing to CSV
data = data.astype(np.float)
data = data.astype(np.int64)
data = data.astype(np.str)
print '5:6 - Conversion done'

with open(get_transformed_dir() + 'dataset.csv', 'w') as trans_file:
    trans_file.write(",".join(header))
    trans_file.write('\n')
    for index in range(len(data)):
        trans_file.write(",".join(data[index]))
        trans_file.write('\n')
print '6:6 - Data written in transformed/dataset.csv file'

def get_test_data(root_dir=get_transformed_dir()):
    data, target, header = get_data(root_dir + 'test.csv')
    return data, target

def get_validation_data(root_dir=get_transformed_dir()):
    data, target, header = get_data(root_dir + 'validate.csv')
    return data, target

    train_data, validate_data, train_target, validate_target = train_test_split(
        train_data, train_target, test_size=len(target) / 10)

    with open(root_folder + 'train.csv', 'w') as train_file:
        train_file.write(",".join(header))
        train_file.write('\n')
        for i, row in enumerate(train_data):
            train_file.write(
                str(train_target[i]) + "," + str(",".join(map(str, row))))
            train_file.write('\n')

    with open(root_folder + 'validate.csv', 'w') as validate_file:
        validate_file.write(",".join(header))
        validate_file.write('\n')
        for i, row in enumerate(validate_data):
            validate_file.write(
                str(validate_target[i]) + "," + str(",".join(map(str, row))))
            validate_file.write('\n')

    with open(root_folder + 'test.csv', 'w') as test_file:
        test_file.write(",".join(header))
        test_file.write('\n')
        for i, row in enumerate(test_data):
            test_file.write(
                str(test_target[i]) + "," + str(",".join(map(str, row))))
            test_file.write('\n')


divide_dataset(get_transformed_dir())