Example no. 1
	def setUpClass(self):
		self.DEBUG = False
		self.METRICS = False

		self.data_api_impl = DataApi('../../../data/')
		self.cross_validator_impl = CrossValidator()
		self.preprocessor_impl = Preprocessor()
	def setUpClass(self):
		self.DEBUG = False
		self.METRICS = False

		# construct DataApi instance with path prefix to data directory (relative from here)
		self.data_api_impl = DataApi('../../../data/')

		self.distance_functions_impl = DistanceFunctions()
    def __init__(self):
        # logger instance - VERBOSE level is highest (most verbose) level for logging
        self.logger = Logger('DEMO')  # configure log level here

        # datalayer instance - read csv data files and convert into raw data frames
        self.datalayer = DataApi('../../data/')
        # preprocessor instance - everything for preprocessing data frames
        self.preprocessor = Preprocessor()
        # cross_validator instance - setup cross validation partitions
        self.cross_validator = CrossValidator()
        # utils instance - random things
        self.utils = Utils()
    def __init__(self):
        KNN.__init__(self)
        self.DEBUG = True
        self.VERBOSE = False
        self.data_api_impl = DataApi('../../data/')
        self.utilities_impl = Utilities()
        self.distance_functions_impl = DistanceFunctions()

        # threshold for clustering convergence
        # stop iterating when differences between consecutive centroids is smaller than this
        self.CONVERGENCE_THRESHOLD = 0.25
        # maximum clustering iterations allowed before returning answer
        self.MAX_ITERATIONS = 5
class ViewData():
    def __init__(self):
        self.DEBUG = False
        # construct DataApi instance with path prefix to data directory (relative from here)
        self.data_api_impl = DataApi('../../data/')

    def view_iris_data(self):
        print('\nIRIS DATA:\n')
        print(self.data_api_impl.get_iris_data())
    def __init__(self):
        self.DEBUG = True
        self.VERBOSE = False

        self.data_api_impl = DataApi('../../../data/')
        self.data_set = None

        self.CLASSIFICATION = True
        self.REGRESSION = False

        self.algorithm_name = None
Example no. 7
    def __init__(self):
        self.DEBUG = False

        # get instances of all the classes needed to run an experiment
        self.data_api_impl = DataApi('../../data/')
        self.preprocessor_impl = Preprocessor()
        self.cross_validator_impl = CrossValidator()
        self.parameter_tuner_impl = ParameterTuner()

        # algorithm implementations
        self.knn_impl = KNN()
        self.enn_impl = EditedKNN()
        self.cnn_impl = CondensedKNN()
        self.kmeans_knn_impl = KMeansClustering()
        self.k_medoids_clustering_impl = KMedoidsClustering()

        self.results_processor_impl = Results()

        self.CLASSIFICATION = False
        self.REGRESSION = False
Example no. 8
class ViewData():
    def __init__(self):
        self.DEBUG = False
        # construct DataApi instance with path prefix to data directory (relative from here)
        self.data_api_impl = DataApi('../../data/')

    def view_abalone_data(self):
        print('\nABALONE DATA:\n')
        print(self.data_api_impl.get_raw_data_frame('abalone'))

    def view_car_data(self):
        print('\nCAR DATA:\n')
        print(self.data_api_impl.get_raw_data_frame('car'))

    def view_forestfires_data(self):
        print('\nFORESTFIRES DATA:\n')
        print(self.data_api_impl.get_raw_data_frame('forestfires'))

    def view_machine_data(self):
        print('\nMACHINE DATA:\n')
        print(self.data_api_impl.get_raw_data_frame('machine'))

    def view_segmentation_data(self):
        print('\nSEGMENTATION DATA:\n')
        print(self.data_api_impl.get_raw_data_frame('segmentation'))

    def view_wine_data(self):
        print('\nWINE DATA:\n')
        print(self.data_api_impl.get_raw_data_frame('wine'))
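# Usage sketch for ViewData (hypothetical, assuming the relative '../../data/' path above is
# valid from the calling directory): construct the viewer and print one of the raw data frames.
#   viewer = ViewData()
#   viewer.view_wine_data()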
class DistanceFunctionsTests(unittest.TestCase):


	# SETUP
	

	@classmethod
	def setUpClass(self):
		self.DEBUG = False
		self.METRICS = False

		# construct DataApi instance with path prefix to data directory (relative from here)
		self.data_api_impl = DataApi('../../../data/')

		self.distance_functions_impl = DistanceFunctions()


	@classmethod
	def tearDownClass(self):
		pass
		

	# TESTS


	# test get manhattan distance
	def test_get_manhattan_distance(self):
		pass


	# test get euclidean distance
	def test_get_euclidean_distance(self):
		abalone_data = self.data_api_impl.get_raw_data_frame('abalone')
		self.assertTrue(abalone_data is not None)

		distance_1_2 = self.distance_functions_impl.get_euclidean_distance(abalone_data.iloc[0, :], abalone_data.iloc[1, :])
		print('distance_1_2: ' + str(distance_1_2))
Example no. 10
        self.weights = [
            np.random.randn(self.layer_sizes[2], self.layer_sizes[1])
        ]
        # the first node in every list corresponds to output node 1, from top to bottom; print-wise,
        # each printed row corresponds to all of the weights coming into output node i
        # ensure that we only use layers 1 and 2, because an RBF network always has 3 layers
        self.biases = [np.random.randn(self.layer_sizes[2], 1)]


# EXECUTE SCRIPT

if __name__ == '__main__':

    print('\nrunning RBFNetwork...\n')
    data_set_name = 'segmentation'
    data_api_impl = DataApi('../../data/')
    data = data_api_impl.get_raw_data_frame(data_set_name)
    rbf_network_impl = RBFNetwork(data_set_name, [19, 2, 7])
    #initializing neuron values
    rbf_network_impl.init_RBF_Neurons(data[0:2])
    #print(rbf_network_impl.rbf_neurons)
    rbf_network_impl.init_weights_biases()

    print('weights: %s' % str(rbf_network_impl.weights))
    print('biases: %s' % str(rbf_network_impl.biases))
    #test_vector = [121.0,60.0,9,0.0,0.0,2.277778,2.329629,2.888889,2.8740742,26.74074,24.666666,35.22222,20.333334,-6.2222223,25.444445,-19.222221,35.22222,0.4223002,-1.776113]
    #rbf_network_impl.get_forward_output(test_vector)

    #print(rbf_network_impl.rbf_neurons)
    #print(data.loc[0,:])
    #print(rbf_network_impl.calculate_std(data))
class ExperimentRunner:
    '''
    CONSTRUCTOR
    '''
    def __init__(self):
        # logger instance - VERBOSE level is highest (most verbose) level for logging
        self.logger = Logger('DEMO')  # configure log level here

        # datalayer instance - read csv data files and convert into raw data frames
        self.datalayer = DataApi('../../data/')
        # preprocessor instance - everything for preprocessing data frames
        self.preprocessor = Preprocessor()
        # cross_validator instance - setup cross validation partitions
        self.cross_validator = CrossValidator()
        # utils instance - random things
        self.utils = Utils()

    # get average result given cross validation results dictionary
    def get_avg_result(self, cv_results):
        result_vals = []
        # for each cross validation partition, append result value to corresponding list
        for test_data_key in cv_results:
            test_result = cv_results[test_data_key]
            result_vals.append(test_result)

        # should always equal the value of the 'folds' variable in cross validator
        test_data_count = len(cv_results)
        # calculate average values
        avg_result = sum(result_vals) / test_data_count
        # return average result
        return avg_result
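    # Usage sketch for get_avg_result (hypothetical values), assuming each fold key maps to a
    # single scalar accuracy/error value as assigned in run_experiment below:
    #   cv_results = {'test_data_0': 0.90, 'test_data_1': 0.88, 'test_data_2': 0.86}
    #   self.get_avg_result(cv_results)  # -> 0.88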

    '''
    get preprocessed data ready for consumption by experiment running logic

    INPUT:
        - data_set_name: name of data set to fetch data for

    OUTPUT:
        - preprocessed data frame - fully ready for experiment consumption
    '''

    def get_experiment_data(self, data_set_name):
        data = self.datalayer.get_raw_data_frame(data_set_name)
        self.logger.log('DEMO', 'data_set_name: \t%s\n' % str(data_set_name))
        self.logger.log(
            'DEMO',
            'raw data: \n\n%s, shape: %s\n' % (str(data), str(data.shape)))
        self.logger.log('DEMO', '----------------------------------------------------' \
                                    + '-----------------------------------------------\n')
        data = self.preprocessor.preprocess_raw_data_frame(data, data_set_name)
        self.logger.log(
            'DEMO', 'preprocessed data: \n\n%s, shape: %s\n' %
            (str(data), str(data.shape)))
        self.logger.log('DEMO', '----------------------------------------------------' \
                                    + '-----------------------------------------------\n')
        return data

    '''
    run experiment

    INPUT:
        - data_set_name: name of data set to run experiment on
        - neural_network: instance of neural network to train/test with data
        - hyperparams: hyperparameters and corresponding values to use in experiment

    OUTPUT:
        - <void> - logs all the important stuff at DEMO level
    '''

    def run_experiment(self, data_set_name, neural_network, hyperparams):

        # LAYER ACTIVATION FUNCTION SPECIFICATION

        self.logger.log(
            'DEMO', 'layer_activation_funcs: %s\n' %
            str(hyperparams["layer_activation_funcs"]))

        # DATA RETRIEVAL AND PREPROCESSING

        data = self.get_experiment_data(data_set_name)

        self.logger.log('DEMO', 'data_set_name: %s\n' % str(data_set_name))

        # CROSS VALIDATION PARTITIONING

        # get cross validation partitions for data
        cv_partitions = self.cross_validator.get_cv_partitions(data)

        # dictionary for storing accuracy results
        cv_results = {}
        # list of sizes of test sets used for getting average test set size
        test_data_sizes = []

        # NEURAL NETWORK TRAINING AND TESTING

        for partition in cv_partitions:
            # initialize key and corresponding nested dictionary in results dictionary
            test_data_key = 'test_data_' + str(partition)
            cv_results[test_data_key] = {}
            # get training set and test set for given cross validation partition
            train_data, test_data = cv_partitions[partition]
            test_data_sizes.append(
                test_data.shape[0]
            )  # add number of rows in test set to test_set_sizes list

            # HANDLE RBF NETWORK P2 RESULTS

            if neural_network.network_name == 'RBF':
                # configure RBF network shape based on training data
                neural_network.configure_rbf_network(train_data, data,
                                                     data_set_name,
                                                     hyperparams["k"])

            # GRADIENT DESCENT

            # run gradient descent for given neural network instance
            test_result_vals = neural_network.train_gradient_descent(
                train_data, hyperparams, partition, test_data)

            self.logger.log('DEMO', ('accuracy_vals' if neural_network.CLASSIFICATION else 'error_vals') \
                + ' for partition %s: %s\n' % (str(partition+1), str(test_result_vals)), True)

            # append accuracy/error result of final gradient descent iteration to results dictionary
            cv_results[test_data_key] = test_result_vals[-1]

        # FINAL RESULTS (THE MODEL)

        self.logger.log('DEMO', '------------------------------------------------------------' \
                + ' TRAINING DONE ------------------------------------------------------------')

        self.logger.log('DEMO', 'trained network: weights --> \n\n%s, shapes: %s\n' \
            % (str(neural_network.weights), str(self.utils.get_shapes(neural_network.weights))), True)

        self.logger.log('DEMO', 'trained network: biases --> \n\n%s, shapes: %s\n' \
            % (str(neural_network.biases), str(self.utils.get_shapes(neural_network.biases))), True)

        self.logger.log('DEMO', 'data_set_name: %s\n' % str(data_set_name),
                        True)

        self.logger.log('DEMO', 'trained network: AVERAGE ' \
            + ('ACCURACY' if neural_network.CLASSIFICATION else 'ERROR') + ' --> %s\n' \
            % str(self.get_avg_result(cv_results)), True)
 def __init__(self):
     self.DEBUG = False
     self.data_api_impl = DataApi('../../data/')
Example no. 13
    info = {
        "PATTERN": "looper",
        "LOOPER": {
            "initial_capital": 100000,
            "margin_ratio": {
                "rb2010.CTP": 0.00003,
            },
            "commission_ratio": {
                "rb2010.CTP": {
                    "close": 0.00001
                },
            },
            "size_map": {
                "rb2010.CTP": 10
            }
        }
    }
    app.config.from_mapping(info)
    strategy = DoubleMaStrategy("ma")

    data_api = DataApi()
    data = data_api.get_tick("rb2010",
                             start_date="2020-04-10",
                             end_date="2020-07-21",
                             today=False)
    # data = data_support.get_future_min("rb2010.SHFE", frq="1min", start="2019-10-01", end="2020-07-15")
    app.add_data(data)
    app.add_extension(strategy)
    app.start()
    result = app.get_result(report=True, auto_open=True)
Example no. 14
class ExperimentRunner():


    def __init__(self):
        self.DEBUG = False

        # get instances of all the classes needed to run an experiment
        self.data_api_impl = DataApi('../../data/')
        self.preprocessor_impl = Preprocessor()
        self.cross_validator_impl = CrossValidator()
        self.parameter_tuner_impl = ParameterTuner()

        # algorithm implementations
        self.knn_impl = KNN()
        self.enn_impl = EditedKNN()
        self.cnn_impl = CondensedKNN()
        self.kmeans_knn_impl = KMeansClustering()
        self.k_medoids_clustering_impl = KMedoidsClustering()

        self.results_processor_impl = Results()

        self.CLASSIFICATION = False
        self.REGRESSION = False


    # run algorithm on data set with various parameters
    def run_experiment(self, data_frame_name, algorithm):

        self.set_experiment_type(data_frame_name)

        # get raw data frame to run experiment against
        raw_data_frame = self.data_api_impl.get_raw_data_frame(data_frame_name)
        print(raw_data_frame)

        # preprocess data
        preprocessed_data_frame = self.preprocessor_impl.preprocess_raw_data_frame(raw_data_frame, data_frame_name)
        print(preprocessed_data_frame)

        # get indexes list for data frame cross validation - a list of row numbers used to partition the data
        data_frame_indexes_list = self.cross_validator_impl.get_indexes_list(preprocessed_data_frame)

        if self.DEBUG:
            print('\ndata_frame_name --> ' + data_frame_name)
            print('\nraw_data_frame:\n')
            print(raw_data_frame)
            print('\npreprocessed_data_frame:\n')
            print(preprocessed_data_frame)
            print('\ndata_frame_indexes_list for cross validation:\n')
            print(data_frame_indexes_list)

        # nested dictionary to hold algorithm performance results for each combination of training/test sets
        # key pattern --> key = test_set_1 , where the number at the end of the key is the test set index
        # each value is another dictionary with keys = { 'accuracy', 'mean_squared_error' }
        # the nested dictionary values are the corresponding loss function metrics for predictions using the test set
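        # example entry (hypothetical values): cross_validation_results['test_set_1'] = {'accuracy': 0.92, 'mean_squared_error': 0.35}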
        cross_validation_results = {}

        # list of sizes of test sets used for getting average test set size
        test_set_sizes = []

        algorithm_parameters = self.parameter_tuner_impl.get_params(data_frame_name, algorithm)
        # dictionary where key is parameter and value is tuple of average loss function results
        results_by_parameter = {}
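        # example shape after the parameter loop (hypothetical values): {'3': (0.91, 0.35), '5': (0.93, 0.31)}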

        # get all cross validation partitions for given data frame
        cv_partitions = self.cross_validator_impl.get_cv_partitions(preprocessed_data_frame)

        # for each parameter value in the list of algorithm parameter values (see ParameterTuner)
        for parameter in algorithm_parameters:

            if self.DEBUG:
                print('\n' + str(self.parameter_tuner_impl.get_parameter_key(algorithm)) + ': ' + str(parameter) + '\n')

            # for each test set used in cross validation (number of folds)
            for partition in cv_partitions:

                # initialize key and corresponding nested dictionary in results dictionary
                test_set_key = 'test_set_' + str(partition)
                cross_validation_results[test_set_key] = {}

                # get training set and test set for given cross validation partition
                training_set, test_set = cv_partitions[partition]

                test_set_sizes.append(test_set.shape[0]) # add number of rows in test set to test_set_sizes list

                if self.DEBUG:
                    print('preprocessed dataframe before running algorithm:')
                    print(preprocessed_data_frame)

                # run algorithms on training set / test set combination
                # returns dictionary where key is the row index (as string) and value is the predicted class for that row
                prediction_results = self.run_algorithm(data_frame_name, algorithm, training_set, test_set, \
                                                            preprocessed_data_frame, parameter)

                # calculate loss function results given prediction results - measure prediction accuracy
                accuracy, mean_squared_error = self.results_processor_impl.loss_function_analysis(test_set, prediction_results)

                cross_validation_results[test_set_key]['accuracy'] = accuracy
                cross_validation_results[test_set_key]['mean_squared_error'] = mean_squared_error

            # calculate average loss function results over all cross validation folds
            avg_accuracy, avg_mean_squared_error = self.results_processor_impl.get_avg_loss_vals(cross_validation_results)
            avg_test_set_size = sum(test_set_sizes) / len(test_set_sizes) # get average test set size for reference

            results_by_parameter[str(parameter)] = (avg_accuracy, avg_mean_squared_error)

            print('\n\nRESULTS: average test set size: ' + str(avg_test_set_size) + \
                ((' --> accuracy: ' + str(avg_accuracy)) if self.CLASSIFICATION \
                else (' --> mean_squared_error: ' + str(avg_mean_squared_error))))

            print('\n---------------------------------------------------------------------------------------------------------------------')

        # return dictionary of results by parameter
        return results_by_parameter


    def set_experiment_type(self, data_frame_name):
        if data_frame_name in ['abalone', 'car', 'segmentation']:
            self.CLASSIFICATION = True
            self.REGRESSION = False
        elif data_frame_name in ['machine', 'forestfires', 'wine']:
            self.REGRESSION = True
            self.CLASSIFICATION = False
        else:
            raise Exception('ERROR: unknown data_set_name --> ' + str(data_frame_name))


    '''
    run algorithm execution handler given algorithm name

    INPUT:
        - algorithm_name: name of algorithm to run handler for

    OUTPUT:
        - prediction results dictionary, maps instance index to tuple: (prediction, actual)
    '''
    def run_algorithm(self, data_set_name, algorithm_name, training_set, \
                        test_set, preprocessed_data_frame, parameter):
        if algorithm_name == 'knn':
            self.knn_impl.set_data_set(data_set_name)
            self.knn_impl.set_algorithm_name(algorithm_name)
            return self.knn_impl.do_knn(training_set, test_set, preprocessed_data_frame, parameter)
        elif algorithm_name == 'enn':
            self.enn_impl.set_data_set(data_set_name)
            self.enn_impl.set_algorithm_name(algorithm_name)
            return self.enn_impl.do_enn(training_set, test_set, preprocessed_data_frame, parameter)
        elif algorithm_name == 'cnn':
            self.cnn_impl.set_data_set(data_set_name)
            self.cnn_impl.set_algorithm_name(algorithm_name)
            return self.cnn_impl.do_cnn(training_set, test_set, preprocessed_data_frame, parameter)
        elif algorithm_name == 'kmeans_knn':
            self.kmeans_knn_impl.set_data_set(data_set_name)
            self.kmeans_knn_impl.set_algorithm_name(algorithm_name)
            return self.kmeans_knn_impl.cluster_do_knn(training_set, test_set, preprocessed_data_frame, data_set_name, parameter)
        elif algorithm_name == 'kmedoids_knn':
            self.k_medoids_clustering_impl.set_data_set(data_set_name)
            self.k_medoids_clustering_impl.set_algorithm_name(algorithm_name)
            return self.k_medoids_clustering_impl.cluster(training_set, test_set, preprocessed_data_frame, data_set_name, parameter)
Example no. 15
 def __init__(self):
     self.DEBUG = False
     self.data_api_impl = DataApi('../../data/')
     self.utilities_impl = Utilities()
Example no. 16
                print("Number of Previous Edits: ")
                print(number_of_edits_previous)
            loopcounter += 1
            print("Number of While Loops: ")

        return edited_train_set.reset_index(drop=True)


# EXECUTE SCRIPT

if __name__ == '__main__':

    print('running edited knn...')
    edited_knn = EditedKNN()

    data_api_impl = DataApi('../../data/')
    cross_validator_impl = CrossValidator()
    preprocessor_impl = Preprocessor()

    wine_data = data_api_impl.get_raw_data_frame('segmentation')
    prep_wine_data = preprocessor_impl.preprocess_raw_data_frame(
        wine_data, 'segmentation')

    wine_data_train_set = cross_validator_impl.get_training_set(
        prep_wine_data, test_set_number=3)
    print('wine_data_train_set.shape: ' + str(wine_data_train_set.shape))

    test_set_number = 3  # same partition number used for the training set above
    indexes_list = cross_validator_impl.get_indexes_list(prep_wine_data)
    wine_data_test_set = cross_validator_impl.get_test_set(
        prep_wine_data, test_set_number, indexes_list)

    k = 5  # example value for the k parameter
    edited_knn.enn(wine_data_train_set, wine_data_test_set, prep_wine_data, k)
Example no. 17
 def __init__(self):
     KNN.__init__(self)
     self.DEBUG = False
     self.data_api_impl = DataApi('../../data/')
     self.enn_impl = EditedKNN()
     self.cnn_impl = CondensedKNN()
Example no. 18
import base64

from data_api import DataApi
from flask import Flask, make_response, request, url_for
from flask_cors import CORS
from flask_restful import Api, Resource, reqparse
from model_api import ExplanationModel
from PIL import Image
from attribute_chunker import CounterFactualGenerator

app = Flask(__name__)
api = Api(app)
CORS(app)

data_api = DataApi()
explanation_model = ExplanationModel()
cf_gen = CounterFactualGenerator()


def any_response(data):
    ALLOWED = ["http://localhost:8888"]
    response = make_response(data)
    origin = request.headers["Origin"]
    if origin in ALLOWED:
        response.headers["Access-Control-Allow-Origin"] = origin
    return response


def npimg2base64(img):
    pil_img = Image.fromarray(img)
Example no. 19
class DataApiTests(unittest.TestCase):

    # SETUP

    @classmethod
    def setUpClass(self):
        self.DEBUG = False
        self.METRICS = False

        # construct DataApi instance with path prefix to data directory (relative from here)
        self.data_api_impl = DataApi('../../../data/')

    @classmethod
    def tearDownClass(self):
        pass

    # TESTS

    # test abalone data retrieval, number of rows/columns
    def test_get_abalone_data(self):
        abalone_data = self.data_api_impl.get_raw_data_frame('abalone')
        self.assertTrue(abalone_data is not None)
        self.assertTrue(
            abalone_data.shape[0] == 4177)  # 4177 rows in abalone data matrix
        self.assertTrue(abalone_data.shape[1] ==
                        9)  # 9 attribute columns in abalone data matrix

    # test car data retrieval, number of rows/columns
    def test_get_car_data(self):
        car_data = self.data_api_impl.get_raw_data_frame('car')
        self.assertTrue(car_data is not None)
        self.assertTrue(
            car_data.shape[0] == 1728)  # 1728 rows in car data matrix
        self.assertTrue(
            car_data.shape[1] == 7)  # 7 attribute columns in car data matrix

    # test forestfires data retrieval, number of rows/columns
    def test_get_forestfires_data(self):
        forestfires_data = self.data_api_impl.get_raw_data_frame('forestfires')
        self.assertTrue(forestfires_data is not None)
        self.assertTrue(forestfires_data.shape[0] ==
                        518)  # 518 rows in forestfires data matrix
        self.assertTrue(forestfires_data.shape[1] ==
                        13)  # 13 attribute columns in forestfires data matrix

    # test machine data retrieval, number of rows/columns
    def test_get_machine_data(self):
        machine_data = self.data_api_impl.get_raw_data_frame('machine')
        self.assertTrue(machine_data is not None)
        self.assertTrue(
            machine_data.shape[0] == 209)  # 209 rows in machine data matrix
        self.assertTrue(machine_data.shape[1] ==
                        10)  # 10 attribute columns in machine data matrix

    # test segmentation data retrieval, number of rows/columns
    def test_get_segmentation_data(self):
        segmentation_data = self.data_api_impl.get_raw_data_frame(
            'segmentation')
        self.assertTrue(segmentation_data is not None)
        self.assertTrue(segmentation_data.shape[0] ==
                        213)  # 213 rows in segmentation data matrix
        self.assertTrue(segmentation_data.shape[1] ==
                        20)  # 20 attribute columns in segmentation data matrix

    # test wine data retrieval, number of rows/columns
    def test_get_wine_data(self):
        wine_data = self.data_api_impl.get_raw_data_frame('wine')
        self.assertTrue(wine_data is not None)
        self.assertTrue(
            wine_data.shape[0] == 6497)  # 6497 rows in wine data matrix
        self.assertTrue(wine_data.shape[1] ==
                        12)  # 12 attribute columns in wine data matrix
Example no. 20
class CrossValidatorTests(unittest.TestCase):


	# SETUP
	

	@classmethod
	def setUpClass(self):
		self.DEBUG = False
		self.METRICS = False

		self.data_api_impl = DataApi('../../../data/')
		self.cross_validator_impl = CrossValidator()
		self.preprocessor_impl = Preprocessor()


	@classmethod
	def tearDownClass(self):
		pass
		

	# TESTS

	'''
	# test get indexes list for abalone data
	def test_get_indexes_list_abalone_data(self):
		abalone_data = self.data_api_impl.get_raw_data_frame('abalone')
		self.assertTrue(abalone_data is not None)
		abalone_indexes = self.cross_validator_impl.get_indexes_list(abalone_data)
		self.assertTrue(len(abalone_indexes) == 4177) # 4177 rows in abalone data frame
		for i in range(1, 10):
			self.assertTrue(abalone_indexes.count(i) == 417) # each subset has 417 rows
		self.assertTrue(abalone_indexes.count(10) == 424) # last subset has 417 + remaining...


	# test get indexes list for car data
	def test_get_indexes_list_car_data(self):
		car_data = self.data_api_impl.get_raw_data_frame('car')
		self.assertTrue(car_data is not None)
		car_indexes = self.cross_validator_impl.get_indexes_list(car_data)
		self.assertTrue(len(car_indexes) == 1728) # 1728 rows in car data frame
		for i in range(1, 10):
			self.assertTrue(car_indexes.count(i) == 172) # each subset has 172 rows
		self.assertTrue(car_indexes.count(10) == 180) # last subset has 172 + remaining...


	# test get indexes list for forest fires data
	def test_get_indexes_list_ff_data(self):
		ff_data = self.data_api_impl.get_raw_data_frame('forestfires')
		self.assertTrue(ff_data is not None)
		ff_indexes = self.cross_validator_impl.get_indexes_list(ff_data)
		self.assertTrue(len(ff_indexes) == 518) # 518 rows in forest fires data frame
		for i in range(1, 10):
			self.assertTrue(ff_indexes.count(i) == 51) # each subset has 51 rows
		self.assertTrue(ff_indexes.count(10) == 59) # last subset has 51 + remaining...


	# test get indexes list for machine data
	def test_get_indexes_list_machine_data(self):
		machine_data = self.data_api_impl.get_raw_data_frame('machine')
		self.assertTrue(machine_data is not None)
		machine_indexes = self.cross_validator_impl.get_indexes_list(machine_data)
		self.assertTrue(len(machine_indexes) == 209) # 209 rows in machine data frame
		for i in range(1, 10):
			self.assertTrue(machine_indexes.count(i) == 20) # each subset has 20 rows
		self.assertTrue(machine_indexes.count(10) == 29) # last subset has 20 + remaining...


	# test get indexes list for segmentation data
	def test_get_indexes_list_segmentation_data(self):
		segmentation_data = self.data_api_impl.get_raw_data_frame('segmentation')
		self.assertTrue(segmentation_data is not None)
		segmentation_indexes = self.cross_validator_impl.get_indexes_list(segmentation_data)
		self.assertTrue(len(segmentation_indexes) == 213) # 213 rows in segmentation data frame
		for i in range(1, 10):
			self.assertTrue(segmentation_indexes.count(i) == 21) # each subset has 21 rows
		self.assertTrue(segmentation_indexes.count(10) == 24) # last subset has 21 + remaining...


	# test get indexes list for wine data
	def test_get_indexes_list_wine_data(self):
		wine_data = self.data_api_impl.get_raw_data_frame('wine')
		self.assertTrue(wine_data is not None)
		wine_indexes = self.cross_validator_impl.get_indexes_list(wine_data)
		self.assertTrue(len(wine_indexes) == 6497) # 6497 rows in wine data frame
		for i in range(1, 10):
			self.assertTrue(wine_indexes.count(i) == 649) # each subset has 649 rows
		self.assertTrue(wine_indexes.count(10) == 656) # last subset has 649 + remaining...


	# TRAINING SET


	# test get training set 2 with wine data
	def test_get_training_set(self):
		wine_data = self.data_api_impl.get_raw_data_frame('wine')
		wine_data_training_set = self.cross_validator_impl.get_training_set(wine_data, 2)
		self.assertTrue(wine_data_training_set.shape[0] == 5848) # 6497 - 649 rows in test set 2 means 5484 rows in training set
		self.assertTrue(wine_data_training_set.shape[1] == 12) # number of columns does not change


	# TEST SET


	# test get test set (-2) with wine data
	def test_get_test_set(self):
		wine_data = self.data_api_impl.get_raw_data_frame('wine')
		wine_data_test_set = self.cross_validator_impl.get_test_set(wine_data, 2)
		self.assertTrue(wine_data_test_set.shape[0] == 649) # 649 rows in test set 2
		self.assertTrue(wine_data_test_set.shape[1] == 12) # number of columns does not change
	'''

	def test_cv_partitions(self):
		abalone_data = self.data_api_impl.get_raw_data_frame('abalone')
		prep_abalone_data = self.preprocessor_impl.preprocess_raw_data_frame(abalone_data, 'abalone')
		cv_partitions = self.cross_validator_impl.get_cv_partitions(prep_abalone_data)

		self.assertTrue(cv_partitions is not None)

		for partition in cv_partitions:
			train_data_indexes = list(cv_partitions[partition][0].index.values)
			test_data_indexes = list(cv_partitions[partition][1].index.values)
			for test_index in test_data_indexes:
				self.assertTrue(test_index not in train_data_indexes)
        centroids_data = self.get_cluster_centroids(cluster_assignments, centroids_data, train_data)
        # return a dictionary where key is the instance index and value is a tuple: (prediction, actual)
        print('K MEANS CLUSTERING CONVERGED. iterations: ' + str(iteration_count))
        return centroids_data



# EXECUTE SCRIPT


if __name__ == '__main__':

    print('k means clustering...')
    k_means_clustering_impl = KMeansClustering()

    data_api_impl = DataApi('../../data/')
    preprocessor_impl = Preprocessor()
    cross_validator_impl = CrossValidator()

    '''
    wine_data = data_api_impl.get_raw_data_frame('wine')
    prep_wine_data = preprocessor_impl.preprocess_raw_data_frame(wine_data, 'wine')
    '''

    abalone_data = data_api_impl.get_raw_data_frame('abalone')
    prep_abalone_data = preprocessor_impl.preprocess_raw_data_frame(abalone_data, 'abalone')

    print('\npossible classes: ' + str(list(set(abalone_data.loc[:, 'CLASS'].values))) + '\n')

    training_set, test_set = cross_validator_impl.get_cv_partitions(prep_abalone_data)[0]
Example no. 22
        trading_record: format is [(1, datetime1)]
            1: open long
            2: open short
            -1: close long
            -2: close short
        """
        self.data.setdefault(local_symbol, {})["record"] = trading_record
        self.data.setdefault(local_symbol, {})["kline"] = [[
            str(kline.datetime), kline.open_price, kline.high_price,
            kline.low_price, kline.close_price, kline.volume
        ] for kline in klines]

    def render(self, path):
        for local_symbol, obj in self.data.items():
            with open(path, "w") as f:
                print(obj)
                kline_string = kline_template.render(draw_klines=obj["kline"],
                                                     bs=obj["record"])
                f.write(kline_string)


if __name__ == '__main__':
    plot = Plot("some")
    from data_api import DataApi

    code = "rb2105.SHFE"
    data_api = DataApi(uri="http://192.168.1.239:8124")
    kline = data_api.get_n_min_bar(code, 1, "2021-04-15", "2021-04-16")
    plot.add_kline(code, klines=kline, trading_record=[])
    plot.render("x.html")
class KMeansClustering(KNN):


    def __init__(self):
        KNN.__init__(self)
        self.DEBUG = True
        self.VERBOSE = False
        self.data_api_impl = DataApi('../../data/')
        self.utilities_impl = Utilities()
        self.distance_functions_impl = DistanceFunctions()

        # threshold for clustering convergence
        # stop iterating when differences between consecutive centroids is smaller than this
        self.CONVERGENCE_THRESHOLD = 0.25
        # maximum clustering iterations allowed before returning answer
        self.MAX_ITERATIONS = 5
        #self.MAX_CLUSTER_TIME = 5 # minutes


    '''
    perform k-means-clustering against full_data_frame using k value as parameter

    INPUT:
        - data_set_name: name of data set to cluster
        - train_data: training data set to cluster
        - k: value for parameter k, i.e. the number of clusters to partition the data set into

    OUTPUT:
        - tuple:
            - index 0: list of cluster assignments indicating the cluster each data point belongs to
            - index 1: centroids dataframe for k centroids (without class values)
            - index 2: number of clustering iterations performed
    '''
    def cluster(self, data_set_name, train_data, k):

        print('\nk means clustering with k: ' + str(k))

        # get list of column labels for data set, not including the CLASS column label
        data_column_labels = self.data_api_impl.get_column_labels(data_set_name, include_class=False)
        # get training data without class column
        train_data = train_data.loc[:, train_data.columns != 'CLASS']

        # randomly generate k initial centroids from training data
        centroids = self.generate_initial_centroids(train_data, k)

        #print('generated initial centroids')

        if self.DEBUG:
            print('centroids:')
            print(centroids)

        if not isinstance(centroids, pd.DataFrame):
            # convert list of centroids to data frame using same column labels
            centroids_df = pd.DataFrame.from_records(centroids, columns=data_column_labels)
            #print('created centroids_df from records from centroids')
        else:
            centroids_df = centroids

        # combine the centroids_df and train_data dataframes into one frame with centroids first
        centroids_and_data_df = centroids_df.append(train_data, ignore_index=True)

        #print('created centroids_and_data_df')

        # get distance matrix - distance from every training point to every centroid
        distance_from_centroids = self.get_distance_matrix(centroids_and_data_df)

        #print('calculated distance matrix for centroids_and_data_df')

        if self.DEBUG and self.VERBOSE:
            print('cluster: train_data.shape: ' + str(train_data.shape))
            print('cluster: k: ' + str(k))
            print('cluster: number of centroids: ' + str(len(centroids)))
            print('centroids_and_data_df:')
            print(centroids_and_data_df)
            print('cluster: centroids_and_data_df.shape: ' + str(centroids_and_data_df.shape))
            print('cluster: distance_from_centroids.shape: ' + str(distance_from_centroids.shape))

        cluster_assignments = []
        iteration_count = 1

        # initial centroids
        previous_centroids = centroids_and_data_df.iloc[:k, :]
        # set initial centroids diffs to maximum number sizes
        centroids_diff = [sys.maxsize for i in range(k)]

        cluster_start_time = time.time()

        #print('right before clustering...')

        while not self.threshold_reached(centroids_diff) and iteration_count < self.MAX_ITERATIONS:

            print('clustering... iteration: ' + str(iteration_count))

            new_cluster_assignments = []

            # for each training point in the training data (start indexing at k)
            for instance_idx in range(k, centroids_and_data_df.shape[0]):
                data_point = centroids_and_data_df.iloc[instance_idx, :]
                # get list of distances from instance to each centroid
                idx_distances = np.array(distance_from_centroids[instance_idx][:k])
                # get index of centroid with least distance from instance
                closest_centroid_idx = np.argmin(idx_distances)
                # map instance index to centroid index (0 - k)
                new_cluster_assignments.append(closest_centroid_idx)

            #print('calculated new cluster assignments')

            cluster_assignments = new_cluster_assignments
            updated_centroids = []

            # update each centroid to mean of all points assigned to that centroid
            for centroid_idx in range(k):
                #print('updating mean for centroid: ' + str(centroid_idx))

                cluster_points = pd.DataFrame(columns=data_column_labels)
                np_cluster_assignments = np.array(cluster_assignments)
                # get list of indexes for all instances assigned to given centroid index
                idxs_for_cluster_val = np.where(np_cluster_assignments == centroid_idx)[0]

                for idx in idxs_for_cluster_val:
                    # add point to cluster points dataframe, add k to row index to skip centroid points
                    # NOT EFFICIENT - CHANGE THIS SO IT IS FASTER
                    cluster_points = cluster_points.append(centroids_and_data_df.iloc[idx+k, :])

                #print('built cluster points list')

                if self.DEBUG and self.VERBOSE:
                    print('centroid_idx: ' + str(centroid_idx))
                    print('idxs_for_cluster_val:' + str(idxs_for_cluster_val))
                    print('points in cluster ' + str(centroid_idx) + ': ' + str(cluster_points.shape))
                    print('cluster_points:')
                    print(cluster_points)

                avg_centroid = None

                # if there are points that were assigned to the cluster (centroid index)
                if cluster_points.shape[0] > 0:
                    avg_centroid = self.get_avg_centroid(cluster_points)
                    #print('got average centroid for idx: ' + str(centroid_idx))
                    if self.DEBUG and self.VERBOSE:
                        print('avg_centroid:')
                        print(avg_centroid)

                # do not update the centroid if any of the values in the centroid are nan
                if avg_centroid is not None and not np.isnan(np.array(avg_centroid)).any():
                    updated_centroids.append(avg_centroid)
                    #print('appended avg_centroid to updated_centroids list')
                else:
                    if self.DEBUG and self.VERBOSE:
                        print('ERROR: avg_centroid is none!')
                        print('bad cluster points:')
                        print(cluster_points)
                        print('avg_centroid:')
                        print(avg_centroid)
                    # keep previous centroid if the average centroid is still null (no points in cluster)
                    updated_centroids.append(list(previous_centroids.iloc[centroid_idx, :].values))
                    #print('appended previous centroid to updated_centroids list')

            # update centroids dataframe using list of updated centroids representing new average centroids
            updated_centroids_df = pd.DataFrame.from_records(updated_centroids, columns=data_column_labels)

            #print('created updated_centroids_df from records of updated_centroids')

            # update centroids in reference data frame (the one that contains the points too)
            for row_num in range(updated_centroids_df.shape[0]):
                centroids_and_data_df.iloc[row_num, :] = updated_centroids_df.iloc[row_num, :]

            #print('updated centroids in reference data frame centroids_and_data_df')

            # update distance matrix using new centroids for distance calculations
            distance_from_centroids = self.get_distance_matrix(centroids_and_data_df)

            #print('updated distance matrix using new centroids')

            # calculate distance between pairs of previous/new centroids to see if we've satisfied the threshold
            centroids_diff = self.get_centroids_diff(previous_centroids, updated_centroids_df)

            #print('calculated centroids diff')

            # BUG: fix the issue where the first centroid diff is always zero
            if centroids_diff[0] == 0:
                #iteration_count = iteration_count - 1 # remove this line?
                #print('decremented iteration_count to: ' + str(iteration_count) + ', because centroids_diff[0] == 0')
                #print('centroids_diff[0] == 0 !!!')
                iteration_count = iteration_count + 1
                continue # workaround for now
            else:
                print('\nclustering iteration: ' + str(iteration_count + 1))
                print('centroids_diff: ' + str(centroids_diff))

            # update previous centroids dataframe to updated centroids dataframe
            previous_centroids = updated_centroids_df
            #print('updated previous centroids using updated_centroids_df')
            iteration_count = iteration_count + 1

        print('cluster returning: %s, %s, %s' % (str(len(cluster_assignments)), \
                            str(updated_centroids_df.shape), str(iteration_count)))

        # return a tuple containing the final list of cluster assignments and the final centroids
        return (cluster_assignments, updated_centroids_df, iteration_count - 1)


    '''
    generate initial cluster centroids with random values in min/max range for each column

    INPUT:
        - data_frame: data to generate centroids for
        - k: k param value, i.e. number of centroids to generate

    OUTPUT:
        - list of centroid points with same dimensionality as regular data points
    '''
    def generate_initial_centroids(self, data_frame, k):
        '''
        # RANDOM GENERATION APPROACH
        centroids = []
        # get min/max values for each column (the bounds of the values for each column)
        column_bounds = self.utilities_impl.get_column_bounds(data_frame)
        num_cols = len(column_bounds)
        for centroid_index in range(k):
            centroid = []
            for col_index in range(num_cols):
                min_max_bounds = column_bounds[col_index]
                # randomly generate value in min/max range for each attribute
                centroid.append(random.uniform(min_max_bounds[0], min_max_bounds[1]))
            centroids.append(centroid)
        # return list of centroid points
        return centroids
        '''
        # RANDOM POINTS APPROACH
        indexes = random.sample(range(data_frame.shape[0]), k)
        # BUG: change this so it doesn't throw the pandas error in the log
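        # NOTE (assumption): positional selection such as data_frame.iloc[indexes] would likely avoid
        # the pandas warning, since reindex matches by index label rather than by row position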
        return data_frame.reindex(indexes)
        

    # return boolean indicating whether the centroid diff threshold has been reached
    def threshold_reached(self, centroids_diff):
        np_diffs = np.array(centroids_diff)
        # workaround for bug causing centroids_diff to be all zeros
        if np_diffs is None or np_diffs[0] == 0:
            return False
        # return True only when no centroid diff exceeds the convergence threshold
        return not list(np_diffs[np_diffs > self.CONVERGENCE_THRESHOLD])
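    # Example behavior of threshold_reached (hypothetical diffs, CONVERGENCE_THRESHOLD = 0.25):
    #   threshold_reached([0.10, 0.05, 0.20])  # -> True, every centroid moved less than the threshold
    #   threshold_reached([0.10, 0.40, 0.20])  # -> False, at least one centroid is still moving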


    '''
    get average centroid from all points assigned to given cluster

    INPUT:
        - cluster_points: dataframe consisting of all points assigned to cluster

    OUTPUT:
        - centroid where each column is average value of respective column in cluster_points
    '''
    def get_avg_centroid(self, cluster_points):
        avg_col_vals = []
        # for each column in dataframe representing all points assigned to cluster
        for column_label, _ in cluster_points.items():
            column_vals = cluster_points.loc[:, column_label].values
            column_vals = [float(val) for val in column_vals]
            # calculate average column value and append to list
            avg_col_vals.append(stats.mean(column_vals))
            if self.DEBUG and self.VERBOSE:
                print('column_label: ' + str(column_label))
                print('len(column_vals): ' + str(len(column_vals)))
                print('column_vals: ')
                print(column_vals)
                print('avg_column_vals: ' + str(stats.mean(column_vals)))
        # return average centroid as list of average values for each column
        return avg_col_vals
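    # Example for get_avg_centroid (hypothetical 2-point cluster): rows (1.0, 3.0) and (3.0, 5.0)
    # produce the average centroid [2.0, 4.0].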


    '''
    get diff between centroids from iteration n and iteration n+1
    '''
    def get_centroids_diff(self, previous_centroids, updated_centroids_df):
        if self.DEBUG and self.VERBOSE:
            print('previous_centroids:')
            print(previous_centroids)
            print('updated_centroids_df:')
            print(updated_centroids_df)
        centroid_diffs = []
        # for each centroid (instance) in the previous centroids dataframe
        for row_num in range(previous_centroids.shape[0]):
            prev_row = previous_centroids.iloc[row_num, :]
            updated_row = updated_centroids_df.iloc[row_num, :]
            # calculate euclidean distance between previous and updated centroid instance
            diff_dist = self.distance_functions_impl.get_euclidean_distance(prev_row, updated_row)
            centroid_diffs.append(diff_dist)
        # return list containing distances between each corresponding pair of centroids
        return centroid_diffs
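    # Example for get_centroids_diff (hypothetical pair): a previous centroid at (0.0, 0.0) and an
    # updated centroid at (3.0, 4.0) contribute a euclidean diff of 5.0 to the returned list.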


    '''
    evaluate clustering - show counts
    '''
    def evaluate_clustering(self, data, clustering_assignments, k):
        print('\nCLUSTERING EVALUATION:')
        np_cluster_assignments = np.array(clustering_assignments)
        for centroid_idx in range(k):
            print('centroid_idx: ' + str(centroid_idx))
            freqs = {}
            idxs_for_cluster_val = np.where(np_cluster_assignments == centroid_idx)[0]
            for idx in idxs_for_cluster_val:
                if idx in data.index:
                    actual_class = str(data.loc[idx, 'CLASS'])
                    if actual_class in freqs:
                        freqs[actual_class] = freqs[actual_class] + 1
                    else:
                        freqs[actual_class] = 1
                else:
                    print('ERROR: ' + str(idx) + ' not in data.index!')
            print('freqs: ' + str(freqs))


    '''
    get distance matrix using pdist and squareform methods from scipy.spatial.distance

    INPUT:
        - data_frame: data frame we're working with

    OUTPUT:
        - distance matrix containing distances between every pair of points in data frame
    '''
    def get_distance_matrix(self, data_frame):
        # get distance matrix (upper triangle) using distance metric
        distances = pdist(data_frame.values, metric='euclidean')
        # fill in lower triangle maintaining symmetry
        dist_matrix = squareform(distances)
        # return full distance matrix
        return dist_matrix
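    # Usage sketch for get_distance_matrix (hypothetical 3-point frame): pdist returns the condensed
    # upper-triangle distances and squareform expands them into a full symmetric matrix with a zero diagonal.
    #   df = pd.DataFrame([[0.0, 0.0], [3.0, 4.0], [6.0, 8.0]])
    #   self.get_distance_matrix(df)[0][1]  # -> 5.0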


    '''
    method for getting k cluster centroids using clustering output from cluster() method above
    the input 'centroids_data' is a dataframe that contains all the attribute values for the centroids
    this method is responsible for appending the corresponding class values to each centroid instance

    INPUT:
        - cluster_assignments - list of clustering assignments
        - centroids_data - centroids rows without class values

    OUTPUT:
        - data frame with k rows, representing k centroids
    '''
    def get_cluster_centroids(self, cluster_assignments, centroids_data, dataframe):
        if self.DEBUG and self.VERBOSE:
            print('get_cluster_centroids: unique cluster assignments: ' + str(set(cluster_assignments)))
            print('get_cluster_centroids: centroids_data.shape - BEFORE: ' + str(centroids_data.shape))
            print('get_cluster_centroids: dataframe.shape: ' + str(dataframe.shape))
            print('get_cluster_centroids: dataframe:')
            print(dataframe)

        # convert list of cluster assignments to np array to utilize np methods
        np_cluster_assignments = np.array(cluster_assignments)
        centroid_class_vals = []
        #print('len set cluster assignments: ' + str(len(set(cluster_assignments))))

        # for each unique cluster value (centroid index) that data points were assigned to
        for unique_cluster_val in set(cluster_assignments):
            # get dataframe row indexes that were assigned to the cluster
            val_idxs = np.where(np_cluster_assignments == unique_cluster_val)[0]
            #print('get_cluster_centroids: val_idxs assigned to unique_cluster_val: ' + str(unique_cluster_val) + ' --> ' + str(val_idxs))
            idx_class_vals = self.get_idx_class_vals(dataframe, val_idxs)
            #print('get_cluster_centroids: idx_class_vals for unique_cluster_val: ' + str(unique_cluster_val) + ' --> ' + str(idx_class_vals))
            highest_freq_class = self.utilities_impl.get_mode(idx_class_vals)
            print('highest_freq_class: ' + str(highest_freq_class))
            # append highest frequency class to list of centroid class values
            centroid_class_vals.append(highest_freq_class)

        #print('len centroid_class_vals: ' + str(len(centroid_class_vals)))
        #print('get_cluster_centroids: centroid_class_vals: ' + str(centroid_class_vals))

        # these values will not match if there were clusters that had no points assigned to them
        if len(centroid_class_vals) != centroids_data.shape[0]:
            # get list of all possible class values for given dataframe
            poss_class_vals = list(set(dataframe.loc[:, 'CLASS'].values))
            # randomly assign class values to missing clusters to make dimensions match
            # BUG: this shouldn't be necessary, handle missing clusters in a better way
            centroid_class_vals = self.handle_cluster_count_mismatch(\
                centroid_class_vals, centroids_data.shape[0], poss_class_vals)

        # append generated class values column to centroids dataframe (assigning class to each centroid)
        centroids_data['CLASS'] = centroid_class_vals

        if self.DEBUG and self.VERBOSE:
            print('get_cluster_centroids: centroids_data.shape - AFTER: ' + str(centroids_data.shape))
            print('get_cluster_centroids: centroids_data - AFTER:')
            print(centroids_data)

        # return complete centroids dataframe (now containing the corresponding class values for each centroid)
        return centroids_data


    # get class values from dataframe for all indexes in idxs arg
    def get_idx_class_vals(self, dataframe, idxs):
        class_vals = []
        for idx in idxs:
            row_data = dataframe.iloc[idx, :]
            class_vals.append(row_data['CLASS'])
        # return list of class values for idxs
        return class_vals


    # this shouldn't be necessary, but for now workaround by returning random class value
    def handle_cluster_count_mismatch(self, centroid_class_vals, expected_centroid_count, poss_class_vals):
        while len(centroid_class_vals) < expected_centroid_count:
            # BUG: this shouldn't be necessary, for now return random class value for missing clusters
            centroid_class_vals.append(random.choice(poss_class_vals))
        assert len(centroid_class_vals) == expected_centroid_count
        # return list of centroid class values with expected length
        return centroid_class_vals


    '''
    do full knn run through using k means clustering output as reduced data set for knn
    NOTE: this always sets k equal to the number of possible class values for the given data set

    INPUT:
        - train_data: training data that will be clustered
        - test_data: test data
        - dataframe: full dataframe
        - data_name: name of data set we're using
        - k: value for k parameter

    OUTPUT:
        - returns a dictionary where key is the instance index and value is a tuple: (prediction, actual)
    '''
    def cluster_do_knn(self, train_data, test_data, dataframe, data_name, k):
        # get number of possible class values for given data frame
        num_poss_class_vals = len(set(dataframe.loc[:, 'CLASS'].values))
        # k means cluster the training data to get list of final cluster assignments and resulting centroids
        cluster_assignments, centroids_data, iteration_count = self.cluster(data_name, train_data, k=num_poss_class_vals)

        '''
        would be cool if we did some cluster evaluation but this method isn't working for some reason...

        k_means_clustering_impl.evaluate_clustering(training_set, cluster_assignments, k=5)
        '''

        # get resulting centroids using highest frequency class value for each set of points in clusters
        centroids_data = self.get_cluster_centroids(cluster_assignments, centroids_data, train_data)
        # return a dictionary where key is the instance index and value is a tuple: (prediction, actual)

        print('K MEANS CLUSTERING CONVERGED. iterations: ' + str(iteration_count))

        return self.do_knn(centroids_data, test_data, dataframe, k)


    '''
    get cluster centroids for use in RBF network setup

    INPUT:
        - train_data: training data that will be clustered
        - dataframe: full dataframe
        - data_name: name of data set we're using
        - k: value of k parameter in base knn

    OUTPUT:
        - return cluster centroids for given training data and k value
    '''
    def get_centroids_for_rbf_network(self, train_data, dataframe, data_name, k):
        # get number of possible class values for given data frame
        num_poss_class_vals = len(set(dataframe.loc[:, 'CLASS'].values))
        # k means cluster the training data to get list of final cluster assignments and resulting centroids
        cluster_assignments, centroids_data, iteration_count = self.cluster(data_name, train_data, k=num_poss_class_vals)
        # get resulting centroids using highest frequency class value for each set of points in clusters
        centroids_data = self.get_cluster_centroids(cluster_assignments, centroids_data, train_data)
        # return a dictionary where key is the instance index and value is a tuple: (prediction, actual)
        print('K MEANS CLUSTERING CONVERGED. iterations: ' + str(iteration_count))
        return centroids_data
Example no. 24
        accuracy_vals = self.filter_vals(accuracy_vals)
        mean_squared_error_vals = self.filter_vals(mean_squared_error_vals)

        # calculate average values
        avg_accuracy = sum(accuracy_vals) / test_set_count
        avg_mean_squared_error = sum(mean_squared_error_vals) / test_set_count

        # return tuple with average values for zero_one_loss and mean_squared_error
        return (avg_accuracy, avg_mean_squared_error)

    def filter_vals(self, vals):
        filtered_vals = []
        for val in vals:
            if val is not None:
                filtered_vals.append(val)

        return filtered_vals


# EXECUTE SCRIPT

if __name__ == "__main__":

    print('running results...')

    results_impl = Results()
    data_api_impl = DataApi('../../data/')

    wine_data = data_api_impl.get_raw_data_frame('wine')
Example no. 25
 def __init__(self):
     self.DEBUG = False
     # construct DataApi instance with path prefix to data directory (relative from here)
     self.data_api_impl = DataApi('../../data/')
Example no. 26
 def __init__(self):
     KNN.__init__(self)
     self.DEBUG = True
     self.data_api_impl = DataApi('../../data/')
Example no. 27
        #Then scales to a 0-1 value (For comparison)
        for i, valuetype in enumerate(uniquevals):
            uniquevals[i] = i / (len(uniquevals) - 1)
        normalized = attribute.replace(attribute.unique(), uniquevals)

        return normalized
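    # Example (hypothetical ordinal column, assuming uniquevals mirrors attribute.unique()):
    # three unique values map to evenly spaced ranks 0.0, 0.5, 1.0 before being substituted
    # back into the attribute series.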


# EXECUTE SCRIPT

if __name__ == '__main__':

    print('running preprocessor...')
    preprocessor_impl = Preprocessor()

    data_api_impl = DataApi('../../data/')
    '''
    raw_abalone_data = data_api_impl.get_raw_data_frame('abalone')
    print('raw_abalone_data:')
    print(raw_abalone_data)
    prep_abalone_data = preprocessor_impl.preprocess_raw_data_frame(raw_abalone_data, 'abalone')
    print('prep_abalone_data:')
    print(prep_abalone_data)
    '''
    '''
    raw_car_data = data_api_impl.get_raw_data_frame('car')
    print('raw_car_data:')
    print(raw_car_data)
    prep_car_data = preprocessor_impl.preprocess_raw_data_frame(raw_car_data, 'car')
    print('prep_car_data:')
    print(prep_car_data)