def setUpClass(self):
    """Build the fixtures shared by the tests: flags, data access and collaborators."""
    # debug output and metrics reporting both start disabled
    self.DEBUG = self.METRICS = False
    # data access object, given the path prefix of the data directory
    self.data_api_impl = DataApi('../../../data/')
    # collaborators exercised by the tests
    self.cross_validator_impl = CrossValidator()
    self.preprocessor_impl = Preprocessor()
def setUpClass(self):
    """Build the fixtures shared by the tests: flags, data access and distance functions."""
    # debug output and metrics reporting both start disabled
    self.DEBUG = self.METRICS = False
    # construct DataApi instance with path prefix to data directory (relative from here)
    self.data_api_impl = DataApi('../../../data/')
    # object under test
    self.distance_functions_impl = DistanceFunctions()
def __init__(self):
    """Create the collaborators used while running an experiment."""
    # logger instance - VERBOSE level is highest (most verbose) level for logging
    self.logger = Logger('DEMO')  # configure log level here

    # datalayer instance - read csv data files and convert into raw data frames
    self.datalayer = DataApi('../../data/')

    # preprocessor instance - everything for preprocessing data frames
    self.preprocessor = Preprocessor()

    # cross_validator instance - setup cross validation partitions
    self.cross_validator = CrossValidator()

    # utils instance - miscellaneous helpers
    self.utils = Utils()
def __init__(self):
    """Initialise the KNN base state, then the clustering collaborators and limits."""
    KNN.__init__(self)

    # output switches
    self.DEBUG, self.VERBOSE = True, False

    # collaborators
    self.data_api_impl = DataApi('../../data/')
    self.utilities_impl = Utilities()
    self.distance_functions_impl = DistanceFunctions()

    # threshold for clustering convergence:
    # stop iterating when differences between consecutive centroids is smaller than this
    self.CONVERGENCE_THRESHOLD = 0.25

    # maximum clustering iterations allowed before returning answer
    self.MAX_ITERATIONS = 5
class ViewData():
    """Console viewer that prints the iris data obtained from DataApi."""

    def __init__(self):
        self.DEBUG = False
        # construct DataApi instance with path prefix to data directory (relative from here)
        self.data_api_impl = DataApi('../../data/')

    def view_iris_data(self):
        """Print a heading followed by whatever DataApi.get_iris_data() returns."""
        heading = '\nIRIS DATA:\n'
        iris_data = self.data_api_impl.get_iris_data()
        print(heading)
        print(iris_data)
def __init__(self):
    """Set the output switches, the data access object and the default experiment state."""
    # output switches
    self.DEBUG, self.VERBOSE = True, False

    # data access object, given the path prefix of the data directory
    self.data_api_impl = DataApi('../../../data/')

    # nothing selected yet: both are filled in later by the caller
    self.data_set = None
    self.algorithm_name = None

    # problem type flags - classification is the default
    self.CLASSIFICATION = True
    self.REGRESSION = False
def __init__(self):
    """Instantiate every class needed to run an experiment."""
    self.DEBUG = False

    # data pipeline: retrieval, preprocessing, cross validation, parameter selection
    self.data_api_impl = DataApi('../../data/')
    self.preprocessor_impl = Preprocessor()
    self.cross_validator_impl = CrossValidator()
    self.parameter_tuner_impl = ParameterTuner()

    # algorithm implementations
    self.knn_impl = KNN()
    self.enn_impl = EditedKNN()
    self.cnn_impl = CondensedKNN()
    self.kmeans_knn_impl = KMeansClustering()
    self.k_medoids_clustering_impl = KMedoidsClustering()

    # results post-processing
    self.results_processor_impl = Results()

    # problem type flags - neither is selected until an experiment is configured
    self.CLASSIFICATION = self.REGRESSION = False
class ViewData():
    """Console viewer that prints the raw data frame of each supported data set."""

    def __init__(self):
        self.DEBUG = False
        # construct DataApi instance with path prefix to data directory (relative from here)
        self.data_api_impl = DataApi('../../data/')

    def _print_raw_data(self, heading, data_set_name):
        """Print `heading`, then the raw data frame DataApi returns for `data_set_name`."""
        print(heading)
        print(self.data_api_impl.get_raw_data_frame(data_set_name))

    def view_abalone_data(self):
        self._print_raw_data('\nABALONE DATA:\n', 'abalone')

    def view_car_data(self):
        self._print_raw_data('\nCAR DATA:\n', 'car')

    def view_forestfires_data(self):
        self._print_raw_data('\nFORESTFIRES DATA:\n', 'forestfires')

    def view_machine_data(self):
        self._print_raw_data('\nMACHINE DATA:\n', 'machine')

    def view_segmentation_data(self):
        self._print_raw_data('\nSEGMENTATION DATA:\n', 'segmentation')

    def view_wine_data(self):
        self._print_raw_data('\nWINE DATA:\n', 'wine')
class DistanceFunctionsTests(unittest.TestCase):
    """Tests for DistanceFunctions, driven by rows of the raw abalone data."""

    # SETUP

    @classmethod
    def setUpClass(cls):
        """Create the fixtures shared by every test in this class."""
        cls.DEBUG = False
        cls.METRICS = False
        # construct DataApi instance with path prefix to data directory (relative from here)
        cls.data_api_impl = DataApi('../../../data/')
        cls.distance_functions_impl = DistanceFunctions()

    @classmethod
    def tearDownClass(cls):
        pass

    # TESTS

    # test get manhattan distance
    def test_get_manhattan_distance(self):
        pass

    # test get euclidean distance
    def test_get_euclidean_distance(self):
        abalone_data = self.data_api_impl.get_raw_data_frame('abalone')
        self.assertIsNotNone(abalone_data)
        # Rows must be selected by position with .iloc: on a pandas DataFrame,
        # `frame[0, :]` is treated as a column-key lookup and raises instead of
        # returning the first row.
        # NOTE(review): assumes get_raw_data_frame returns a pandas DataFrame,
        # as its name says - confirm against DataApi.
        first_row = abalone_data.iloc[0, :]
        second_row = abalone_data.iloc[1, :]
        distance_1_2 = self.distance_functions_impl.get_euclidean_distance(first_row, second_row)
        print('distance_1_2: ' + str(distance_1_2))
self.weights = [ np.random.randn(self.layer_sizes[2], self.layer_sizes[1]) ] # the first node in every list correpsonds to the the 1 output from top to bottom. Print-wise, this means each row #,as it is printed, correpsonds to the to all weights cominging in to output node i. #ensure that we only have layers 1 and 2 b/c RBF will always be of size 3 self.biases = [np.random.randn(self.layer_sizes[2], 1)]


# EXECUTE SCRIPT

# Demo: build an RBF network for the segmentation data, seed its neurons from
# the first two rows, initialise weights/biases and print them.
if __name__ == '__main__':

    print('\nrunning RBFNetwork...\n')

    data_set_name = 'segmentation'
    layer_sizes = [19, 2, 7]

    data_api_impl = DataApi('../../data/')
    data = data_api_impl.get_raw_data_frame(data_set_name)

    rbf_network_impl = RBFNetwork(data_set_name, layer_sizes)

    # initializing neuron values from the first two rows of the data
    seed_rows = data[0:2]
    rbf_network_impl.init_RBF_Neurons(seed_rows)
    #print(rbf_network_impl.rbf_neurons)

    rbf_network_impl.init_weights_biases()

    weights_text = str(rbf_network_impl.weights)
    print('weights: %s' % weights_text)
    biases_text = str(rbf_network_impl.biases)
    print('biases: %s' % biases_text)

    #test_vector = [121.0,60.0,9,0.0,0.0,2.277778,2.329629,2.888889,2.8740742,26.74074,24.666666,35.22222,20.333334,-6.2222223,25.444445,-19.222221,35.22222,0.4223002,-1.776113]
    #rbf_network_impl.get_forward_output(test_vector)
    #print(rbf_network_impl.rbf_neurons)
    #print(data.loc[0,:])
    #print(rbf_network_impl.calculate_std(data))
class ExperimentRunner:
    ''' CONSTRUCTOR '''

    def __init__(self):
        # logger instance - VERBOSE level is highest (most verbose) level for logging
        self.logger = Logger('DEMO')  # configure log level here

        # datalayer instance - read csv data files and convert into raw data frames
        self.datalayer = DataApi('../../data/')

        # preprocessor instance - everything for preprocessing data frames
        self.preprocessor = Preprocessor()

        # cross_validator instance - setup cross validation partitions
        self.cross_validator = CrossValidator()

        # utils instance - miscellaneous helpers
        self.utils = Utils()

    def get_avg_result(self, cv_results):
        '''
        get average result given cross validation results dictionary

        INPUT:
            - cv_results: dictionary mapping each test data key to its result value

        OUTPUT:
            - sum of all result values divided by the number of entries
        '''
        # one result value per cross validation partition
        total = sum(cv_results[test_data_key] for test_data_key in cv_results)

        # should always equal the value of the 'folds' variable in cross validator
        partition_count = len(cv_results)

        return total / partition_count

    def get_experiment_data(self, data_set_name):
        '''
        get preprocessed data ready for consumption by experiment running logic

        INPUT:
            - data_set_name: name of data set to fetch data for

        OUTPUT:
            - preprocessed data frame - fully ready for experiment consumption
        '''
        separator = '----------------------------------------------------' \
            + '-----------------------------------------------\n'

        raw_data = self.datalayer.get_raw_data_frame(data_set_name)

        self.logger.log('DEMO', 'data_set_name: \t%s\n' % str(data_set_name))
        raw_summary = (str(raw_data), str(raw_data.shape))
        self.logger.log('DEMO', 'raw data: \n\n%s, shape: %s\n' % raw_summary)
        self.logger.log('DEMO', separator)

        prepared_data = self.preprocessor.preprocess_raw_data_frame(raw_data, data_set_name)

        prepared_summary = (str(prepared_data), str(prepared_data.shape))
        self.logger.log('DEMO', 'preprocessed data: \n\n%s, shape: %s\n' % prepared_summary)
        self.logger.log('DEMO', separator)

        return prepared_data

    def run_experiment(self, data_set_name, neural_network, hyperparams):
        '''
        run experiment

        INPUT:
            - data_set_name: name of data set to run experiment on
            - neural_network: instance of neural network to train/test with data
            - hyperparams: hyperparameters and corresponding values to use in experiment

        OUTPUT:
            - <void> - logs all the important stuff at DEMO level
        '''
        log = self.logger.log

        # LAYER ACTIVATION FUNCTION SPECIFICATION
        activation_funcs = hyperparams["layer_activation_funcs"]
        log('DEMO', 'layer_activation_funcs: %s\n' % str(activation_funcs))

        # DATA RETRIEVAL AND PREPROCESSING
        data = self.get_experiment_data(data_set_name)
        log('DEMO', 'data_set_name: %s\n' % str(data_set_name))

        # CROSS VALIDATION PARTITIONING
        cv_partitions = self.cross_validator.get_cv_partitions(data)

        # final accuracy/error value per test partition
        cv_results = {}
        # number of rows in each test set
        test_data_sizes = []

        # NEURAL NETWORK TRAINING AND TESTING
        for partition in cv_partitions:
            # initialize key and placeholder value in results dictionary
            test_data_key = 'test_data_' + str(partition)
            cv_results[test_data_key] = {}

            # training set and test set for this cross validation partition
            train_data, test_data = cv_partitions[partition]
            test_data_sizes.append(test_data.shape[0])

            # HANDLE RBF NETWORK P2 RESULTS
            if neural_network.network_name == 'RBF':
                # configure RBF network shape based on training data
                neural_network.configure_rbf_network(
                    train_data, data, data_set_name, hyperparams["k"])

            # GRADIENT DESCENT
            test_result_vals = neural_network.train_gradient_descent(
                train_data, hyperparams, partition, test_data)

            if neural_network.CLASSIFICATION:
                vals_label = 'accuracy_vals'
            else:
                vals_label = 'error_vals'
            partition_text = ' for partition %s: %s\n' % (str(partition+1), str(test_result_vals))
            log('DEMO', vals_label + partition_text, True)

            # keep the accuracy/error result of the final gradient descent iteration
            cv_results[test_data_key] = test_result_vals[-1]

        # FINAL RESULTS (THE MODEL)
        log('DEMO', '------------------------------------------------------------' \
            + ' TRAINING DONE ------------------------------------------------------------')

        weights = neural_network.weights
        log('DEMO', 'trained network: weights --> \n\n%s, shapes: %s\n' \
            % (str(weights), str(self.utils.get_shapes(weights))), True)

        biases = neural_network.biases
        log('DEMO', 'trained network: biases --> \n\n%s, shapes: %s\n' \
            % (str(biases), str(self.utils.get_shapes(biases))), True)

        log('DEMO', 'data_set_name: %s\n' % str(data_set_name), True)

        if neural_network.CLASSIFICATION:
            metric_label = 'ACCURACY'
        else:
            metric_label = 'ERROR'
        average_text = ' --> %s\n' % str(self.get_avg_result(cv_results))
        log('DEMO', 'trained network: AVERAGE ' + metric_label + average_text, True)
def __init__(self):
    """Disable debug output and create the data access object."""
    self.DEBUG = False
    # data access object, given the path prefix of the data directory
    self.data_api_impl = DataApi('../../data/')
# settings for the "looper" pattern, keyed per contract symbol where applicable
looper_settings = {
    "initial_capital": 100000,
    "margin_ratio": {
        "rb2010.CTP": 0.00003,
    },
    "commission_ratio": {
        "rb2010.CTP": {
            "close": 0.00001
        },
    },
    "size_map": {
        "rb2010.CTP": 10
    }
}

# configuration mapping loaded into the app
info = {
    "PATTERN": "looper",
    "LOOPER": looper_settings
}
app.config.from_mapping(info)

strategy = DoubleMaStrategy("ma")

# fetch tick data for the contract over the requested date range
data_api = DataApi()
data = data_api.get_tick(
    "rb2010",
    start_date="2020-04-10",
    end_date="2020-07-21",
    today=False,
)
# data = data_support.get_future_min("rb2010.SHFE", frq="1min", start="2019-10-01", end="2020-07-15")

# register data and strategy, run, then collect the result
app.add_data(data)
app.add_extension(strategy)
app.start()
result = app.get_result(report=True, auto_open=True)
class ExperimentRunner():
    """Runs one algorithm over a data set for every tuned parameter value, using cross validation."""

    # algorithm name --> (attribute holding the implementation,
    #                     name of the method to call on it,
    #                     whether that method takes data_set_name before the parameter)
    _ALGORITHM_HANDLERS = {
        'knn': ('knn_impl', 'do_knn', False),
        'enn': ('enn_impl', 'do_enn', False),
        'cnn': ('cnn_impl', 'do_cnn', False),
        'kmeans_knn': ('kmeans_knn_impl', 'cluster_do_knn', True),
        'kmedoids_knn': ('k_medoids_clustering_impl', 'cluster', True),
    }

    def __init__(self):
        self.DEBUG = False

        # get instances of all the classes needed to run an experiment
        self.data_api_impl = DataApi('../../data/')
        self.preprocessor_impl = Preprocessor()
        self.cross_validator_impl = CrossValidator()
        self.parameter_tuner_impl = ParameterTuner()

        # algorithm implementations
        self.knn_impl = KNN()
        self.enn_impl = EditedKNN()
        self.cnn_impl = CondensedKNN()
        self.kmeans_knn_impl = KMeansClustering()
        self.k_medoids_clustering_impl = KMedoidsClustering()

        self.results_processor_impl = Results()

        self.CLASSIFICATION = False
        self.REGRESSION = False

    def run_experiment(self, data_frame_name, algorithm):
        '''
        run algorithm on data set with various parameters

        INPUT:
            - data_frame_name: name of data set to run the experiment against
            - algorithm: name of algorithm to run (see run_algorithm)

        OUTPUT:
            - dictionary mapping str(parameter) to the tuple
              (average accuracy, average mean squared error) over all partitions
        '''
        self.set_experiment_type(data_frame_name)

        # get raw data frame to run experiment against
        raw_data_frame = self.data_api_impl.get_raw_data_frame(data_frame_name)
        print(raw_data_frame)

        # preprocess data
        preprocessed_data_frame = self.preprocessor_impl.preprocess_raw_data_frame(
            raw_data_frame, data_frame_name)
        print(preprocessed_data_frame)

        # get indexes list for data frame cross validation - a list of row numbers used to partition the data
        data_frame_indexes_list = self.cross_validator_impl.get_indexes_list(preprocessed_data_frame)

        if self.DEBUG:
            print('\ndata_frame_name --> ' + data_frame_name)
            print('\nraw_data_frame:\n')
            print(raw_data_frame)
            print('\npreprocessed_data_frame:\n')
            print(preprocessed_data_frame)
            print('\ndata_frame_indexes_list for cross validation:\n')
            print(data_frame_indexes_list)

        # nested dictionary to hold algorithm performance results for each test set
        # key pattern --> 'test_set_<partition>'
        # each value is a dictionary with keys 'accuracy' and 'mean_squared_error'
        # (entries are overwritten on every parameter pass, since the keys repeat)
        cross_validation_results = {}

        # list of sizes of test sets used for getting average test set size
        test_set_sizes = []

        algorithm_parameters = self.parameter_tuner_impl.get_params(data_frame_name, algorithm)

        # dictionary where key is parameter and value is tuple of average loss function results
        results_by_parameter = {}

        # get all cross validation partitions for given data frame
        cv_partitions = self.cross_validator_impl.get_cv_partitions(preprocessed_data_frame)

        # for each parameter value in the list of algorithm parameter values (see ParameterTuner)
        for parameter in algorithm_parameters:

            if self.DEBUG:
                print('\n' + str(self.parameter_tuner_impl.get_parameter_key(algorithm)) + ': ' + str(parameter) + '\n')

            # for each test set used in cross validation (number of folds)
            for partition in cv_partitions:

                # initialize key and corresponding nested dictionary in results dictionary
                test_set_key = 'test_set_' + str(partition)
                cross_validation_results[test_set_key] = {}

                # get training set and test set for given cross validation partition
                training_set, test_set = cv_partitions[partition]

                # add number of rows in test set to test_set_sizes list
                test_set_sizes.append(test_set.shape[0])

                if self.DEBUG:
                    print('preprocessed dataframe before running algorithm:')
                    print(preprocessed_data_frame)

                # run algorithm on training set / test set combination
                prediction_results = self.run_algorithm(
                    data_frame_name, algorithm, training_set, test_set,
                    preprocessed_data_frame, parameter)

                # calculate loss function results given prediction results - measure prediction accuracy
                accuracy, mean_squared_error = \
                    self.results_processor_impl.loss_function_analysis(test_set, prediction_results)

                cross_validation_results[test_set_key]['accuracy'] = accuracy
                cross_validation_results[test_set_key]['mean_squared_error'] = mean_squared_error

            # calculate average loss function results over all cross validation folds
            avg_accuracy, avg_mean_squared_error = \
                self.results_processor_impl.get_avg_loss_vals(cross_validation_results)

            # get average test set size for reference
            avg_test_set_size = sum(test_set_sizes) / len(test_set_sizes)

            results_by_parameter[str(parameter)] = (avg_accuracy, avg_mean_squared_error)

            print('\n\nRESULTS: average test set size: ' + str(avg_test_set_size) + \
                ((' --> accuracy: ' + str(avg_accuracy)) if self.CLASSIFICATION \
                else (' --> mean_squared_error: ' + str(avg_mean_squared_error))))

            print('\n---------------------------------------------------------------------------------------------------------------------')

        # return dictionary of results by parameter
        return results_by_parameter

    def set_experiment_type(self, data_frame_name):
        '''
        set the CLASSIFICATION / REGRESSION flags for the given data set

        INPUT:
            - data_frame_name: name of the data set

        RAISES:
            - Exception if the data set name is not one of the known names
        '''
        if data_frame_name in ['abalone', 'car', 'segmentation']:
            self.CLASSIFICATION = True
            self.REGRESSION = False
        elif data_frame_name in ['machine', 'forestfires', 'wine']:
            self.REGRESSION = True
            self.CLASSIFICATION = False
        else:
            raise Exception('ERROR: unknown data_set_name --> ' + str(data_frame_name))

    def run_algorithm(self, data_set_name, algorithm_name, training_set, \
                        test_set, preprocessed_data_frame, parameter):
        '''
        run algorithm execution handler given algorithm name

        INPUT:
            - data_set_name: name of data set the partitions were taken from
            - algorithm_name: name of algorithm to run handler for
            - training_set: training partition
            - test_set: test partition
            - preprocessed_data_frame: full preprocessed data frame
            - parameter: parameter value to run the algorithm with

        OUTPUT:
            - whatever the selected implementation returns (its prediction results)

        RAISES:
            - Exception if algorithm_name is not a known algorithm
        '''
        # The dispatch previously compared an undefined name (`algorithm`) instead
        # of the `algorithm_name` parameter, so every call raised NameError.
        if algorithm_name not in self._ALGORITHM_HANDLERS:
            raise Exception('ERROR: unknown algorithm_name --> ' + str(algorithm_name))

        impl_attr, method_name, takes_data_set_name = self._ALGORITHM_HANDLERS[algorithm_name]

        # look the implementation up lazily so only the requested one is touched
        impl = getattr(self, impl_attr)
        impl.set_data_set(data_set_name)
        impl.set_algorithm_name(algorithm_name)

        handler = getattr(impl, method_name)
        if takes_data_set_name:
            # the clustering implementations also take the data set name
            return handler(training_set, test_set, preprocessed_data_frame, data_set_name, parameter)
        return handler(training_set, test_set, preprocessed_data_frame, parameter)
def __init__(self):
    """Disable debug output and create the data access and utility objects."""
    self.DEBUG = False
    # data access object, given the path prefix of the data directory
    self.data_api_impl = DataApi('../../data/')
    # shared helper routines
    self.utilities_impl = Utilities()
print("Number of Previous Edits: ") print(number_of_edits_previous) loopcounter += 1 print("Number of While Loops: ") return edited_train_set.reset_index(drop=True) # EXECUTE SCRIPT if __name__ == '__main__': print('running edited knn...') edited_knn = EditedKNN() data_api_impl = DataApi('../../data/') cross_validator_impl = CrossValidator() preprocessor_impl = Preprocessor() wine_data = data_api_impl.get_raw_data_frame('segmentation') prep_wine_data = preprocessor_impl.preprocess_raw_data_frame( wine_data, 'segmentation') wine_data_train_set = cross_validator_impl.get_training_set( prep_wine_data, test_set_number=3) print('wine_data_train_set.shape: ' + str(wine_data_train_set.shape)) wine_data_test_set = cross_validator_impl.get_test_set( prep_wine_data, test_set_number, indexes_list) edited_knn.enn(wine_data_train_set, wine_data_test_set, prep_wine_data, k)
def __init__(self):
    """Initialise the KNN base state, then the data access object and the ENN/CNN helpers."""
    KNN.__init__(self)
    self.DEBUG = False
    # data access object, given the path prefix of the data directory
    self.data_api_impl = DataApi('../../data/')
    # edited / condensed nearest neighbour implementations
    self.enn_impl = EditedKNN()
    self.cnn_impl = CondensedKNN()
import base64
from data_api import DataApi
# make_response and request are used by any_response below; they were missing
# from this import, so the first call raised NameError
from flask import Flask, url_for, make_response, request
from flask_cors import CORS
from flask_restful import Api, Resource
from model_api import ExplanationModel
from PIL import Image
from flask_restful import reqparse
from attribute_chunker import CounterFactualGenerator

app = Flask(__name__)
api = Api(app)
CORS(app)

data_api = DataApi()
explanation_model = ExplanationModel()
cf_gen = CounterFactualGenerator()


def any_response(data):
    """Wrap `data` in a response, echoing the request origin when it is allowed.

    Args:
        data: payload handed to flask.make_response.

    Returns:
        The response object; its Access-Control-Allow-Origin header is set to
        the request's Origin only when that origin is in the allow-list.
    """
    ALLOWED = ["http://localhost:8888"]
    response = make_response(data)
    # Requests without an Origin header (same-origin, curl, ...) must not fail:
    # .get() yields None, which is never in the allow-list.
    origin = request.headers.get("Origin")
    if origin in ALLOWED:
        response.headers["Access-Control-Allow-Origin"] = origin
    return response


def npimg2base64(img):
    pil_img = Image.fromarray(img)
class DataApiTests(unittest.TestCase):
    """Checks that DataApi returns every raw data frame with the expected shape."""

    # SETUP

    @classmethod
    def setUpClass(cls):
        cls.DEBUG = False
        cls.METRICS = False
        # construct DataApi instance with path prefix to data directory (relative from here)
        cls.data_api_impl = DataApi('../../../data/')

    @classmethod
    def tearDownClass(cls):
        pass

    # HELPERS

    def _check_raw_data_shape(self, data_set_name, expected_rows, expected_columns):
        """Fetch the raw frame for `data_set_name` and check it exists with the given shape."""
        raw_data = self.data_api_impl.get_raw_data_frame(data_set_name)
        self.assertTrue(raw_data is not None)
        row_count, column_count = raw_data.shape[0], raw_data.shape[1]
        self.assertTrue(row_count == expected_rows)
        self.assertTrue(column_count == expected_columns)

    # TESTS

    # test abalone data retrieval: 4177 rows, 9 attribute columns
    def test_get_abalone_data(self):
        self._check_raw_data_shape('abalone', 4177, 9)

    # test car data retrieval: 1728 rows, 7 attribute columns
    def test_get_car_data(self):
        self._check_raw_data_shape('car', 1728, 7)

    # test forestfires data retrieval: 518 rows, 13 attribute columns
    def test_get_forestfires_data(self):
        self._check_raw_data_shape('forestfires', 518, 13)

    # test machine data retrieval: 209 rows, 10 attribute columns
    def test_get_machine_data(self):
        self._check_raw_data_shape('machine', 209, 10)

    # test segmentation data retrieval: 213 rows, 20 attribute columns
    def test_get_segmentation_data(self):
        self._check_raw_data_shape('segmentation', 213, 20)

    # test wine data retrieval: 6497 rows, 12 attribute columns
    def test_get_wine_data(self):
        self._check_raw_data_shape('wine', 6497, 12)
class CrossValidatorTests(unittest.TestCase):
    """Tests for CrossValidator; only the partition test is active, the older ones are disabled."""

    # SETUP

    @classmethod
    def setUpClass(cls):
        cls.DEBUG = False
        cls.METRICS = False
        cls.data_api_impl = DataApi('../../../data/')
        cls.cross_validator_impl = CrossValidator()
        cls.preprocessor_impl = Preprocessor()

    @classmethod
    def tearDownClass(cls):
        pass

    # TESTS

    # The tests in the block below are disabled (kept inside a string literal).
    '''

    # test get indexes list for abalone data
    def test_get_indexes_list_abalone_data(self):
        abalone_data = self.data_api_impl.get_raw_data_frame('abalone')
        self.assertTrue(abalone_data is not None)
        abalone_indexes = self.cross_validator_impl.get_indexes_list(abalone_data)
        self.assertTrue(len(abalone_indexes) == 4177) # 4177 rows in abalone data frame
        for i in range(1, 10):
            self.assertTrue(abalone_indexes.count(i) == 417) # each subset has 417 rows
        self.assertTrue(abalone_indexes.count(10) == 424) # last subset has 417 + remaining...

    # test get indexes list for car data
    def test_get_indexes_list_car_data(self):
        car_data = self.data_api_impl.get_raw_data_frame('car')
        self.assertTrue(car_data is not None)
        car_indexes = self.cross_validator_impl.get_indexes_list(car_data)
        self.assertTrue(len(car_indexes) == 1728) # 1728 rows in car data frame
        for i in range(1, 10):
            self.assertTrue(car_indexes.count(i) == 172) # each subset has 172 rows
        self.assertTrue(car_indexes.count(10) == 180) # last subset has 172 + remaining...

    # test get indexes list for forest fires data
    def test_get_indexes_list_ff_data(self):
        ff_data = self.data_api_impl.get_raw_data_frame('forestfires')
        self.assertTrue(ff_data is not None)
        ff_indexes = self.cross_validator_impl.get_indexes_list(ff_data)
        self.assertTrue(len(ff_indexes) == 518) # 518 rows in forest fires data frame
        for i in range(1, 10):
            self.assertTrue(ff_indexes.count(i) == 51) # each subset has 51 rows
        self.assertTrue(ff_indexes.count(10) == 59) # last subset has 51 + remaining...

    # test get indexes list for machine data
    def test_get_indexes_list_machine_data(self):
        machine_data = self.data_api_impl.get_raw_data_frame('machine')
        self.assertTrue(machine_data is not None)
        machine_indexes = self.cross_validator_impl.get_indexes_list(machine_data)
        self.assertTrue(len(machine_indexes) == 209) # 209 rows in machine data frame
        for i in range(1, 10):
            self.assertTrue(machine_indexes.count(i) == 20) # each subset has 20 rows
        self.assertTrue(machine_indexes.count(10) == 29) # last subset has 20 + remaining...

    # test get indexes list for segmentation data
    def test_get_indexes_list_segmentation_data(self):
        segmentation_data = self.data_api_impl.get_raw_data_frame('segmentation')
        self.assertTrue(segmentation_data is not None)
        segmentation_indexes = self.cross_validator_impl.get_indexes_list(segmentation_data)
        self.assertTrue(len(segmentation_indexes) == 213) # 213 rows in segmentation data frame
        for i in range(1, 10):
            self.assertTrue(segmentation_indexes.count(i) == 21) # each subset has 21 rows
        self.assertTrue(segmentation_indexes.count(10) == 24) # last subset has 21 + remaining...

    # test get indexes list for wine data
    def test_get_indexes_list_wine_data(self):
        wine_data = self.data_api_impl.get_raw_data_frame('wine')
        self.assertTrue(wine_data is not None)
        wine_indexes = self.cross_validator_impl.get_indexes_list(wine_data)
        self.assertTrue(len(wine_indexes) == 6497) # 6497 rows in wine data frame
        for i in range(1, 10):
            self.assertTrue(wine_indexes.count(i) == 649) # each subset has 649 rows
        self.assertTrue(wine_indexes.count(10) == 656) # last subset has 649 + remaining...

    # TRAINING SET

    # test get training set 2 with wine data
    def test_get_training_set(self):
        wine_data = self.data_api_impl.get_raw_data_frame('wine')
        wine_data_training_set = self.cross_validator_impl.get_training_set(wine_data, 2)
        self.assertTrue(wine_data_training_set.shape[0] == 5848) # 6497 - 649 rows in test set 2 means 5848 rows in training set
        self.assertTrue(wine_data_training_set.shape[1] == 12) # number of columns does not change

    # TEST SET

    # test get test set (-2) with wine data
    def test_get_test_set(self):
        wine_data = self.data_api_impl.get_raw_data_frame('wine')
        wine_data_test_set = self.cross_validator_impl.get_test_set(wine_data, 2)
        self.assertTrue(wine_data_test_set.shape[0] == 649) # 649 rows in test set 2
        self.assertTrue(wine_data_test_set.shape[1] == 12) # number of columns does not change

    '''

    # every partition's test rows must be absent from that partition's training rows
    def test_cv_partitions(self):
        abalone_data = self.data_api_impl.get_raw_data_frame('abalone')
        prep_abalone_data = self.preprocessor_impl.preprocess_raw_data_frame(abalone_data, 'abalone')
        cv_partitions = self.cross_validator_impl.get_cv_partitions(prep_abalone_data)
        self.assertIsNotNone(cv_partitions)
        for partition in cv_partitions:
            train_data, test_data = cv_partitions[partition][0], cv_partitions[partition][1]
            # Set intersection replaces a per-row `in list` scan, which was
            # quadratic in the number of rows.
            train_data_indexes = set(train_data.index.values)
            test_data_indexes = set(test_data.index.values)
            shared_indexes = train_data_indexes & test_data_indexes
            self.assertFalse(
                shared_indexes,
                'partition %s has rows in both train and test sets: %s'
                % (str(partition), str(sorted(shared_indexes))))
centroids_data = self.get_cluster_centroids(cluster_assignments, centroids_data, train_data) # return a dictionary where key is the instance index and value is a tuple: (prediction, actual) print('K MEANS CLUSTERING CONVERGED. iterations: ' + str(iteration_count)) return centroids_data


# EXECUTE SCRIPT

# Demo: load and preprocess the abalone data, list its class values and take
# the first cross validation partition.
if __name__ == '__main__':

    print('k means clustering...')

    k_means_clustering_impl = KMeansClustering()

    data_api_impl = DataApi('../../data/')
    preprocessor_impl = Preprocessor()
    cross_validator_impl = CrossValidator()

    # disabled alternative: the same preparation for the wine data
    # wine_data = data_api_impl.get_raw_data_frame('wine')
    # prep_wine_data = preprocessor_impl.preprocess_raw_data_frame(wine_data, 'wine')

    abalone_data = data_api_impl.get_raw_data_frame('abalone')
    prep_abalone_data = preprocessor_impl.preprocess_raw_data_frame(abalone_data, 'abalone')

    # distinct values found in the CLASS column of the raw data
    class_values = abalone_data.loc[:, 'CLASS'].values
    possible_classes = list(set(class_values))
    print('\npossible classes: ' + str(possible_classes) + '\n')

    # first cross validation partition as a (training set, test set) pair
    first_partition = cross_validator_impl.get_cv_partitions(prep_abalone_data)[0]
    training_set, test_set = first_partition
trading_record: 格式为 [(1,datetime1)] 1: 开多 2: 开空 -1: 平多 -2: 平空 """ self.data.setdefault(local_symbol, {})["record"] = trading_record self.data.setdefault(local_symbol, {})["kline"] = [[ str(kline.datetime), kline.open_price, kline.high_price, kline.low_price, kline.close_price, kline.volume ] for kline in klines] def render(self, path): for local_symbol, obj in self.data.items(): with open(path, "w") as f: print(obj) kline_string = kline_template.render(draw_klines=obj["kline"], bs=obj["record"]) f.write(kline_string) if __name__ == '__main__': plot = Plot("some") from data_api import DataApi code = "rb2105.SHFE" data_api = DataApi(uri="http://192.168.1.239:8124") kline = data_api.get_n_min_bar(code, 1, "2021-04-15", "2021-04-16") plot.add_kline(code, klines=kline, trading_record=[]) plot.render("x.html")
class KMeansClustering(KNN): def __init__(self): KNN.__init__(self) self.DEBUG = True self.VERBOSE = False self.data_api_impl = DataApi('../../data/') self.utilities_impl = Utilities() self.distance_functions_impl = DistanceFunctions() # threshold for clustering convergence # stop iterating when differences between consecutive centroids is smaller than this self.CONVERGENCE_THRESHOLD = 0.25 # maximum clustering iterations allowed before returning answer self.MAX_ITERATIONS = 5 #self.MAX_CLUSTER_TIME = 5 # minutes ''' perform k-means-clustering against full_data_frame using k value as parameter INPUT: - data_set_name: name of data set to cluster - train_data: training data set to cluster - k: value for parameter k, i.e. the number of clusters to partition the data set into OUTPUT: - tuple: - index 1: list of cluster keys representing the cluster each data point belongs to - index 2: centroids dataframe for k centroids (without class values) ''' def cluster(self, data_set_name, train_data, k): print('\nk means clustering with k: ' + str(k)) # get list of column labels for data set, not including the CLASS column label data_column_labels = self.data_api_impl.get_column_labels(data_set_name, include_class=False) # get training data without class column train_data = train_data.loc[:, train_data.columns != 'CLASS'] # randomly generate k initial centroids from training data centroids = self.generate_initial_centroids(train_data, k) #print('generated initial centroids') if self.DEBUG: print('centroids:') print(centroids) if not isinstance(centroids, pd.DataFrame): # convert list of centroids to data frame using same column labels centroids_df = pd.DataFrame.from_records(centroids, columns=data_column_labels) #print('created centroids_df from records from centroids') else: centroids_df = centroids # combine the centroids_df and train_data dataframes into one frame with centroids first centroids_and_data_df = centroids_df.append(train_data, ignore_index=True) #print('created 
centroids_and_data_df') # get distance matrix - distance from every training point to every centroid distance_from_centroids = self.get_distance_matrix(centroids_and_data_df) #print('calculated distance matrix for centroids_and_data_df') if self.DEBUG and self.VERBOSE: print('cluster: train_data.shape: ' + str(train_data.shape)) print('cluster: k: ' + str(k)) print('cluster: number of centroids: ' + str(len(centroids))) print('centroids_and_data_df:') print(centroids_and_data_df) print('cluster: centroids_and_data_df.shape: ' + str(centroids_and_data_df.shape)) print('cluster: distance_from_centroids.shape: ' + str(distance_from_centroids.shape)) cluster_assignments = [] iteration_count = 1 # initial centroids previous_centroids = centroids_and_data_df.iloc[:k, :] # set initial centroids diffs to maximum number sizes centroids_diff = [sys.maxsize for i in range(k)] cluster_start_time = time.time() #print('right before clustering...') while not self.threshold_reached(centroids_diff) and iteration_count < self.MAX_ITERATIONS: print('clustering... 
iteration: ' + str(iteration_count)) new_cluster_assignments = [] # for each training point in the training data (start indexing at k) for instance_idx in range(k, centroids_and_data_df.shape[0]): data_point = centroids_and_data_df.iloc[instance_idx, :] # get list of distances from instance to each centroid idx_distances = np.array(distance_from_centroids[instance_idx][:k]) # get index of centroid with least distance from instance closest_centroid_idx = np.argmin(idx_distances) # map instance index to centroid index (0 - k) new_cluster_assignments.append(closest_centroid_idx) #print('calculated new cluster assignments') cluster_assignments = new_cluster_assignments updated_centroids = [] # update each centroid to mean of all points assigned to that centroid for centroid_idx in range(k): #print('updating mean for centroid: ' + str(centroid_idx)) cluster_points = pd.DataFrame(columns=data_column_labels) np_cluster_assignments = np.array(cluster_assignments) # get list of indexes for all instances assigned to given centroid index idxs_for_cluster_val = np.where(np_cluster_assignments == centroid_idx)[0] for idx in idxs_for_cluster_val: # add point to cluster points dataframe, add k to row index to skip centroid points # NOT EFFICIENT - CHANGE THIS SO IT IS FASTER cluster_points = cluster_points.append(centroids_and_data_df.iloc[idx+k, :]) #print('built cluster points list') if self.DEBUG and self.VERBOSE: print('centroid_idx: ' + str(centroid_idx)) print('idxs_for_cluster_val:' + str(idxs_for_cluster_val)) print('points in cluster ' + str(centroid_idx) + ': ' + str(cluster_points.shape)) print('cluster_points:') print(cluster_points) avg_centroid = None # if there are points that were assigned to the cluster (centroid index) if cluster_points.shape[0] > 0: avg_centroid = self.get_avg_centroid(cluster_points) #print('got average centroid for idx: ' + str(centroid_idx)) if self.DEBUG and self.VERBOSE: print('avg_centroid:') print(avg_centroid) # do not update the 
centroid if any of the values in the centroid are nan if avg_centroid is not None and not np.isnan(np.array(avg_centroid)).any(): updated_centroids.append(avg_centroid) #print('appended avg_centroid to updated_centroids list') else: if self.DEBUG and self.VERBOSE: print('ERROR: avg_centroid is none!') print('bad cluster points:') print(cluster_points) print('avg_centroid:') print(avg_centroid) # keep previous centroid if the average centroid is still null (no points in cluster) updated_centroids.append(list(previous_centroids.iloc[centroid_idx, :].values)) #print('appended previous centroid to updated_centroids list') # update centroids dataframe using list of updated centroids representing new average centroids updated_centroids_df = pd.DataFrame.from_records(updated_centroids, columns=data_column_labels) #print('created updated_centroids_df from records of updated_centroids') # update centroids in reference data frame (the one that contains the points too) for row_num in range(updated_centroids_df.shape[0]): centroids_and_data_df.iloc[row_num, :] = updated_centroids_df.iloc[row_num, :] #print('updated centroids in reference data frame centroids_and_data_df') # update distance matrix using new centroids for distance calculations distance_from_centroids = self.get_distance_matrix(centroids_and_data_df) #print('updated distance matrix using new centroids') # calculate distance between pairs of previous/new centroids to see if we've satisfied the threshold centroids_diff = self.get_centroids_diff(previous_centroids, updated_centroids_df) #print('calculated centroids diff') # BUG: fix the issue where the first centroid diff is always zero if centroids_diff[0] == 0: #iteration_count = iteration_count - 1 # remove this line? 
def generate_initial_centroids(self, data_frame, k):
    '''
    pick k distinct rows of the data at random to use as the initial cluster centroids

    INPUT:
        - data_frame: data to pick centroids from
        - k: k param value, i.e. number of centroids to pick

    OUTPUT:
        - data frame with k rows taken from data_frame (original row labels are kept)

    RAISES:
        - ValueError (from random.sample) if k is negative or larger than the number of rows
    '''
    row_positions = random.sample(range(data_frame.shape[0]), k)
    # select by POSITION: reindex() looked the sampled positions up as index LABELS,
    # which yields all-NaN centroid rows whenever the labels are not exactly 0..n-1
    return data_frame.iloc[row_positions]


def threshold_reached(self, centroids_diff):
    '''
    return boolean indicating whether the centroid diff threshold has been reached

    INPUT:
        - centroids_diff: list of distances between previous/updated centroid pairs

    OUTPUT:
        - True when no diff is greater than self.CONVERGENCE_THRESHOLD
        - False when there are no diffs (None or empty) or when the first diff is zero
    '''
    # nothing to compare yet - cannot claim convergence
    if centroids_diff is None:
        return False
    np_diffs = np.atleast_1d(np.asarray(centroids_diff))
    # workaround for bug causing centroids_diff to be all zeros (first diff is zero),
    # the size check keeps an empty list from raising IndexError
    if np_diffs.size == 0 or np_diffs[0] == 0:
        return False
    # converged only if no centroid moved further than the threshold
    return not bool(np.any(np_diffs > self.CONVERGENCE_THRESHOLD))


def get_avg_centroid(self, cluster_points):
    '''
    get average centroid from all points assigned to given cluster

    INPUT:
        - cluster_points: dataframe consisting of all points assigned to cluster

    OUTPUT:
        - centroid (list) where each value is the average of the respective column in cluster_points

    RAISES:
        - whatever stats.mean raises for a column with no values (callers check for rows first)
    '''
    verbose = self.DEBUG and self.VERBOSE
    avg_col_vals = []
    # for each column in dataframe representing all points assigned to cluster
    for column_label, column in cluster_points.items():
        column_vals = [float(val) for val in column.values]
        # calculate the average once and reuse it for the debug output
        column_avg = stats.mean(column_vals)
        avg_col_vals.append(column_avg)
        if verbose:
            print('column_label: ' + str(column_label))
            print('len(column_vals): ' + str(len(column_vals)))
            print('column_vals: ')
            print(column_vals)
            print('avg_column_vals: ' + str(column_avg))
    # return average centroid as list of average values for each column
    return avg_col_vals


def get_centroids_diff(self, previous_centroids, updated_centroids_df):
    '''
    get diff between centroids from iteration n and iteration n+1

    INPUT:
        - previous_centroids: dataframe of centroids from the previous iteration
        - updated_centroids_df: dataframe of centroids from the current iteration

    OUTPUT:
        - list with one distance per row of previous_centroids, as returned by
          self.distance_functions_impl.get_euclidean_distance for the matching rows
    '''
    if self.DEBUG and self.VERBOSE:
        print('previous_centroids:')
        print(previous_centroids)
        print('updated_centroids_df:')
        print(updated_centroids_df)
    get_distance = self.distance_functions_impl.get_euclidean_distance
    # rows are paired by position, so row i of each frame is the same centroid
    return [
        get_distance(previous_centroids.iloc[row_num, :], updated_centroids_df.iloc[row_num, :])
        for row_num in range(previous_centroids.shape[0])
    ]


def evaluate_clustering(self, data, clustering_assignments, k):
    '''
    evaluate clustering - show counts of actual class values per cluster

    INPUT:
        - data: dataframe that was clustered, must contain a 'CLASS' column
        - clustering_assignments: list mapping row position in data to centroid index
        - k: number of centroids

    OUTPUT:
        - dictionary where key is the centroid index and value is a dictionary of
          class value (as string) to count (the same counts are printed)
    '''
    print('\nCLUSTERING EVALUATION:')
    # use the parameter - the old body read an undefined name and always raised NameError
    np_cluster_assignments = np.asarray(clustering_assignments)
    class_column = data.loc[:, 'CLASS']
    num_rows = data.shape[0]
    freqs_by_centroid = {}
    for centroid_idx in range(k):
        print('centroid_idx: ' + str(centroid_idx))
        freqs = {}
        for idx in np.where(np_cluster_assignments == centroid_idx)[0]:
            # assignments are positional (same convention as get_idx_class_vals),
            # so look the row up by position instead of by index label
            if idx < num_rows:
                actual_class = str(class_column.iloc[idx])
                freqs[actual_class] = freqs.get(actual_class, 0) + 1
            else:
                print('ERROR: ' + str(idx) + ' not in data.index!')
        print('freqs: ' + str(freqs))
        freqs_by_centroid[centroid_idx] = freqs
    return freqs_by_centroid


def get_distance_matrix(self, data_frame):
    '''
    get distance matrix using pdist and squareform methods from scipy.spatial.distance

    INPUT:
        - data_frame: data frame we're working with

    OUTPUT:
        - distance matrix containing euclidean distances between every pair of points in data frame
    '''
    # pdist gives the condensed (upper triangle) distances, squareform mirrors them
    # into the full symmetric matrix
    return squareform(pdist(data_frame.values, metric='euclidean'))


def get_cluster_centroids(self, cluster_assignments, centroids_data, dataframe):
    '''
    append a class value to each of the centroids produced by the cluster() method

    each centroid gets the highest frequency class value of the points assigned to it,
    a centroid with no assigned points gets a randomly chosen class value

    INPUT:
        - cluster_assignments: list of clustering assignments (row position -> centroid index)
        - centroids_data: centroids rows without class values (one row per centroid)
        - dataframe: data the assignments refer to, must contain a 'CLASS' column

    OUTPUT:
        - centroids_data with a 'CLASS' column added (the frame is modified in place)
    '''
    verbose = self.DEBUG and self.VERBOSE
    if verbose:
        print('get_cluster_centroids: unique cluster assignments: ' + str(set(cluster_assignments)))
        print('get_cluster_centroids: centroids_data.shape - BEFORE: ' + str(centroids_data.shape))
        print('get_cluster_centroids: dataframe.shape: ' + str(dataframe.shape))
        print('get_cluster_centroids: dataframe:')
        print(dataframe)
    # convert list of cluster assignments to np array to utilize np methods
    np_cluster_assignments = np.asarray(cluster_assignments)
    poss_class_vals = None
    centroid_class_vals = []
    # walk the centroids in row order so class value i always belongs to centroid row i,
    # iterating over set(cluster_assignments) shifted the values whenever a cluster was empty
    for centroid_idx in range(centroids_data.shape[0]):
        # get dataframe row positions that were assigned to the cluster
        val_idxs = np.where(np_cluster_assignments == centroid_idx)[0]
        if len(val_idxs) > 0:
            idx_class_vals = self.get_idx_class_vals(dataframe, val_idxs)
            highest_freq_class = self.utilities_impl.get_mode(idx_class_vals)
            print('highest_freq_class: ' + str(highest_freq_class))
        else:
            # BUG: this shouldn't be necessary, handle missing clusters in a better way
            # for now an empty cluster gets a random class value
            if poss_class_vals is None:
                poss_class_vals = list(dataframe.loc[:, 'CLASS'].unique())
            highest_freq_class = random.choice(poss_class_vals)
        centroid_class_vals.append(highest_freq_class)
    # append generated class values column to centroids dataframe (assigning class to each centroid)
    centroids_data['CLASS'] = centroid_class_vals
    if verbose:
        print('get_cluster_centroids: centroids_data.shape - AFTER: ' + str(centroids_data.shape))
        print('get_cluster_centroids: centroids_data - AFTER:')
        print(centroids_data)
    # return complete centroids dataframe (now containing the class value for each centroid)
    return centroids_data


def get_idx_class_vals(self, dataframe, idxs):
    '''
    get class values from dataframe for all row positions in idxs arg

    INPUT:
        - dataframe: data containing a 'CLASS' column
        - idxs: iterable of row positions (not index labels)

    OUTPUT:
        - list of class values, one per entry in idxs, in the same order
    '''
    # read the column once instead of materializing a full row for every position
    class_column = dataframe.loc[:, 'CLASS']
    return [class_column.iloc[idx] for idx in idxs]


def handle_cluster_count_mismatch(self, centroid_class_vals, expected_centroid_count, poss_class_vals):
    '''
    pad the list of centroid class values with random class values until it has the expected length

    this shouldn't be necessary, it is a workaround for clusters with no points assigned

    INPUT:
        - centroid_class_vals: list of class values found so far (extended in place)
        - expected_centroid_count: number of class values required
        - poss_class_vals: class values to randomly choose from

    OUTPUT:
        - centroid_class_vals with exactly expected_centroid_count entries

    RAISES:
        - AssertionError if the list is already longer than expected_centroid_count
    '''
    while len(centroid_class_vals) < expected_centroid_count:
        # BUG: this shouldn't be necessary, for now return random class value for missing clusters
        centroid_class_vals.append(random.choice(poss_class_vals))
    # explicit raise instead of an assert statement so the check survives python -O
    if len(centroid_class_vals) != expected_centroid_count:
        raise AssertionError('expected ' + str(expected_centroid_count) + ' centroid class values, got '
                             + str(len(centroid_class_vals)))
    # return list of centroid class values with expected length
    return centroid_class_vals


def cluster_do_knn(self, train_data, test_data, dataframe, data_name, k):
    '''
    do full knn run through using k means clustering output as reduced data set for knn

    NOTE: the clustering always uses the number of possible class values of the given
    data set as its number of centroids, k is only passed on to the knn step

    INPUT:
        - train_data: training data that will be clustered
        - test_data: test data
        - dataframe: full dataframe
        - data_name: name of data set we're using
        - k: value for k parameter

    OUTPUT:
        - result of self.do_knn on the centroids, per the original notes a dictionary
          where key is the instance index and value is a tuple: (prediction, actual)
    '''
    # same clustering pipeline as the RBF setup, so reuse it instead of repeating it
    centroids_data = self.get_centroids_for_rbf_network(train_data, dataframe, data_name, k)
    return self.do_knn(centroids_data, test_data, dataframe, k)


def get_centroids_for_rbf_network(self, train_data, dataframe, data_name, k):
    '''
    get cluster centroids for use in RBF network setup

    INPUT:
        - train_data: training data that will be clustered
        - dataframe: full dataframe
        - data_name: name of data set we're using
        - k: value of k parameter in base knn (not used by the clustering itself)

    OUTPUT:
        - centroids dataframe for the given training data, including a 'CLASS' column
    '''
    # get number of possible class values for given data frame
    num_poss_class_vals = len(set(dataframe.loc[:, 'CLASS'].values))
    # k means cluster the training data to get list of final cluster assignments and resulting centroids
    cluster_assignments, centroids_data, iteration_count = \
        self.cluster(data_name, train_data, k=num_poss_class_vals)
    # get resulting centroids using highest frequency class value for each set of points in clusters
    centroids_data = self.get_cluster_centroids(cluster_assignments, centroids_data, train_data)
    print('K MEANS CLUSTERING CONVERGED. iterations: ' + str(iteration_count))
    return centroids_data
def filter_vals(self, vals):
    '''
    drop the missing entries from a list of values

    INPUT:
        - vals: iterable of values, missing entries are None

    OUTPUT:
        - new list with every entry of vals that is not None, in the original order
          (falsy values such as 0 are kept)
    '''
    return [val for val in vals if val is not None]


# EXECUTE SCRIPT

if __name__ == "__main__":

    print('running results...')

    results_impl = Results()
    data_api_impl = DataApi('../../data/')

    wine_data = data_api_impl.get_raw_data_frame('wine')
def __init__(self):
    '''
    set up the debug flag and the data access layer used by this object
    '''
    # debug output is off by default
    self.DEBUG = False
    # DataApi instance, the argument is the path prefix to the data directory (relative from here)
    self.data_api_impl = DataApi('../../data/')
def __init__(self):
    '''
    initialize the KNN base state first, then set the debug flag and data access layer
    '''
    # run the base KNN constructor before setting the attributes below
    KNN.__init__(self)
    # debug output is on by default for this class
    self.DEBUG = True
    # DataApi instance, the argument is the path prefix passed to its constructor
    self.data_api_impl = DataApi('../../data/')
#Then scales to a 0-1 value (For comparison) for i, valuetype in enumerate(uniquevals): uniquevals[i] = i / (len(uniquevals) - 1) normalized = attribute.replace(attribute.unique(), uniquevals) return normalized # EXECUTE SCRIPT if __name__ == '__main__': print('running preprocessor...') preprocessor_impl = Preprocessor() data_api_impl = DataApi('../../data/') ''' raw_abalone_data = data_api_impl.get_raw_data_frame('abalone') print('raw_abalone_data:') print(raw_abalone_data) prep_abalone_data = preprocessor_impl.preprocess_raw_data_frame(raw_abalone_data, 'abalone') print('prep_abalone_data:') print(prep_abalone_data) ''' ''' raw_car_data = data_api_impl.get_raw_data_frame('car') print('raw_car_data:') print(raw_car_data) prep_car_data = preprocessor_impl.preprocess_raw_data_frame(raw_car_data, 'car') print('prep_car_data:') print(prep_car_data)