Example #1
 def __init__(self, algorithm):
     ''' init data preprocessor and classifier
     '''
     self.data_preprocessor = DataPreprocessor()
     if str(algorithm).lower() == 'decisiontree':
         self.clf = DecisionTreeClassifier()
     elif str(algorithm).lower() == 'randomforest':
         self.clf = RandomForestClassifier()
 def _preprocess_image(self, input_image):
     input_image = cv2.resize(input_image,
                              (self._input_shape[0], self._input_shape[1]))
     input_image = np.expand_dims(input_image, axis=0)
     preprocessor_prediction = DataPreprocessor(input_image)
     preprocessor_prediction.restore_preprocessing_parameters(
         file_name=self._parameter_file)
     input_image = preprocessor_prediction.get_reprocessed_data()
     return input_image
def generate_training_data():
   dp = DataPreprocessor()

   features_filenames = []
   for i in range(10):
      features_filenames.append('data/raw/features/part-0000' + str(i) + '-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv')
   label_filename = 'data/raw/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv'

   dp.prepare_training_data(features_filenames, label_filename)
 def __init__(self, X, y):
     # remove zero covariance features and standardize
     self.data_preprocessor = DataPreprocessor(X)
     X = self.data_preprocessor.process_data(X)
     y = np.copy(y)
     self.number_features = X.shape[1]
     self.all_classes = np.unique(y)
     self.number_classes = self.all_classes.size
     self.prior, self.mean_array, self.std_array = self.calculate_Gauss_parameters(
         X, y)
 def __init__(self):
     self.embedding_size = 3
     self.epochs = 10
     self.hidden_state_size = 16
     self.data_sequence = DataPreprocessor(64, train=True)
     self.data_sequence.tokenizer.save_vocab()
     self.val_sequence = DataPreprocessor(64, train=False)
     self.history = None
     self.model_path: str = None
     self.model: KerasModel = None
Example #6
 def __init__(self, X, y):
     # remove zero covariance features and standardize
     self.data_preprocessor = DataPreprocessor(X)
     X = self.data_preprocessor.process_data(X)
     y = np.copy(y)
     self.number_features = X.shape[1]
     self.all_classes = np.unique(y)
     self.number_classes = self.all_classes.size
     self.W = self.calculate_weight_vector(X, y)
     self.prior, self.mean, self.covariance = \
         self.calculate_GaussGM_parameters(self.LDA_projection(X), y)
class NaiveBayes(Classifier):
    def __init__(self, X, y):
        # remove zero covariance features and standardize
        self.data_preprocessor = DataPreprocessor(X)
        X = self.data_preprocessor.process_data(X)
        y = np.copy(y)
        self.number_features = X.shape[1]
        self.all_classes = np.unique(y)
        self.number_classes = self.all_classes.size
        self.prior, self.mean_array, self.std_array = self.calculate_Gauss_parameters(
            X, y)

    def calculate_Gauss_parameters(self, X, y):
        prior = [np.sum(y == y_val) / y.size for y_val in self.all_classes]
        num_obs, _ = X.shape
        mean_array = np.zeros((self.number_classes, self.number_features))
        std_array = np.zeros((self.number_classes, self.number_features))
        for k in range(0, self.number_classes):
            index = y == self.all_classes[k]
            X_sub = X[index, :]
            mean_array[k, :] = np.mean(X_sub, axis=0)
            std_array[k, :] = np.std(X_sub, axis=0, ddof=1)
            std_array[std_array < 1e-03] = 1e-03
        return prior, mean_array, std_array

    def validate(self, X_test, y_test):
        X_test = self.data_preprocessor.process_data(X_test)
        assert X_test.shape[1] == self.number_features
        predicted_score = self.predict_score(X_test)
        predicted_class = self.predict_class(predicted_score)
        prediction_error = self.calculate_predict_error(
            predicted_class, y_test)
        return prediction_error

    def calculate_predict_error(self, predicted_class, y):
        predicted_indicator = np.array(
            [predicted_class[i] == y[i] for i in range(0, y.size)])
        return 1 - np.sum(predicted_indicator) / y.size

    def predict_class(self, predicted_score):
        max_indicator = np.argmax(predicted_score, axis=1)
        return np.array([self.all_classes[i] for i in max_indicator])

    def predict_score(self, X):
        N = X.shape[0]
        log_score = np.zeros((N, self.number_classes))
        for k in range(0, self.number_classes):
            for j in range(0, self.number_features):
                log_score[:, k] += norm.logpdf(X[:, j],
                                               loc=self.mean_array[k, j],
                                               scale=self.std_array[k, j])
        log_prior = [log(p) for p in self.prior]
        log_score += log_prior
        return log_score
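The class score computed in predict_score is the standard Gaussian naive Bayes log posterior up to an additive constant: the log prior plus a sum of per-feature normal log densities. A minimal self-contained sketch of that calculation, with made-up means, standard deviations and priors (not taken from any of the examples above):

import numpy as np
from math import log
from scipy.stats import norm

# Hypothetical two-class, two-feature model.
mean_array = np.array([[0.0, 0.0], [2.0, 2.0]])   # per-class feature means
std_array = np.array([[1.0, 1.0], [1.0, 1.0]])    # per-class feature stds
prior = [0.5, 0.5]

x = np.array([1.8, 2.1])                          # one observation
log_score = np.array([
    log(prior[k]) + norm.logpdf(x, loc=mean_array[k], scale=std_array[k]).sum()
    for k in range(2)
])
print(log_score.argmax())   # -> 1: the class whose Gaussians best explain x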
Example #8
def _create_exploration_df(example_test, example_train, is_numeric=False):
    example_cols = ["column1"]
    test_df, train_df = _create_testing_dataframes(example_cols, example_test,
                                                   example_train)
    prep = DataPreprocessor(train_df=train_df, test_df=test_df)
    if is_numeric:
        factor_exploration = prep.explore_numeric_columns()
    else:
        factor_exploration = prep.explore_factors()

    assert len(factor_exploration) == 1
    return factor_exploration["column1"]
Example #9
 def __init__(self):
     # load config file
     with open("./config/predictionconfig.yml", "r") as ymlfile:
         cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
     self.interval = cfg['interval']
     self.threshold = cfg['single_threshold']
     # init DataPreprocessor
     self.data_preprocessor = DataPreprocessor()
     # init PredictionMaker
     self.prediction_maker = PredictionMaker()
     self.registry = CollectorRegistry()
     self.pushgateway_url = os.getenv('PUSHGATEWAY_URL')
 def __init__(self, X, y):
     # Data_Preprocessor will copy X
     self.data_preprocessor = DataPreprocessor(X)
     X = self.data_preprocessor.process_data(X)
     y = np.copy(y)
     self.all_classes = np.unique(y)
     self.number_classes = self.all_classes.size
     self.number_observations, self.number_features = X.shape
     # row-wise concatenated weight vector
     W_init = np.random.normal(0, 0.001,
                               self.number_classes * self.number_features)
     self.W = self.IRLS(W_init, X, y)
Example #11
 def __init__(self):
     # Definition of hyper parameter, data sources and other class variables
     self.embedding_dim = 3
     self.lstm_hidden_dim = self.embedding_dim
     self.max_decoder_length = 25
     self.epochs = 10
     self.data_sequence = DataPreprocessor(64, train=True, enc_dec=True, pad_to=self.max_decoder_length)
     self.data_sequence.tokenizer.save_vocab()
     self.val_sequence = DataPreprocessor(64, train=False, enc_dec=True, pad_to=self.max_decoder_length)
     self.history = None
     self.model_path: str = None
     self.model: KerasModel = None
Example #12
class PredictionPipeline():
    def __init__(self):
        # load config file
        with open("./config/predictionconfig.yml", "r") as ymlfile:
            cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
        self.interval = cfg['interval']
        self.threshold = cfg['single_threshold']
        # init DataPreprocessor
        self.data_preprocessor = DataPreprocessor()
        # init PredictionMaker
        self.prediction_maker = PredictionMaker()
        self.registry = CollectorRegistry()
        self.pushgateway_url = os.getenv('PUSHGATEWAY_URL')

    def run(self):
        while True:
            start_millis = int(round(time.time() * 1000))
            print("Starting pipeline...")

            # get data
            df = self.data_preprocessor.get_data()
            df = self.data_preprocessor.preprocess_data(df)

            if not df.empty:

                # predict
                result = self.prediction_maker.make_prediction(df)
                end_millis = int(round(time.time() * 1000))
                prediction_millis = end_millis - start_millis
                prediction = Prediction(result)

                # apply changes to K8s Cluster
                prediction.apply(self.threshold)

                # push to prometheus gateway
                prediction.push_to_prometheus(self.registry,
                                              self.pushgateway_url)
                try:
                    g = Gauge('prediction_making_speed',
                              'Time in ms for making Prediction.',
                              registry=self.registry)
                except ValueError:
                    # gauge is already registered from a previous iteration
                    pass
                g.set(prediction_millis)
                push_to_gateway('{}:9091'.format(self.pushgateway_url),
                                job='prediction-maker',
                                registry=self.registry)
                # sleep until next interval
                print("Prediction took {} ms.".format(prediction_millis))

            print("Going back to sleep for {} sec...".format(self.interval))
            time.sleep(self.interval)
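The try/except around Gauge in run() works around the fact that registering the same metric name twice in one CollectorRegistry raises ValueError. A sketch of an alternative arrangement, assuming only the standard prometheus_client API (the report helper and the default host are illustrative, not taken from the example): create the gauge once and only set and push it per iteration.

from prometheus_client import CollectorRegistry, Gauge, push_to_gateway

registry = CollectorRegistry()
# Registered once; calling Gauge(...) again with the same name and registry
# would raise ValueError, which is what the try/except above guards against.
prediction_speed = Gauge('prediction_making_speed',
                         'Time in ms for making Prediction.',
                         registry=registry)

def report(prediction_millis, pushgateway_url='localhost'):
    prediction_speed.set(prediction_millis)
    # Needs a reachable Pushgateway, like the one the example targets on port 9091.
    push_to_gateway('{}:9091'.format(pushgateway_url),
                    job='prediction-maker',
                    registry=registry)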
Example #13
 def __init__(self, X, y, regulator):
     self.data_preprocessor = DataPreprocessor(X)
     X = self.data_preprocessor.process_data(X)
     y = np.copy(y).astype(int)
     self.all_classes = np.unique(y)
     assert self.all_classes.size == 2
     self.target_value = np.array([1, -1]).astype(int)
     y[y == self.all_classes[0]] = self.target_value[0]
     y[y == self.all_classes[1]] = self.target_value[1]
     self.number_features = X.shape[1]
     alpha = self.solve_dual_problem(X, y, regulator)
     zero_threshold = 1e-6
     self.number_support_vectors = np.sum(alpha > zero_threshold)
     self.margin = 1 / np.linalg.norm(alpha)
     self.svm_weight, self.svm_bias = SVMCVX.compute_svm_parameters(alpha, X, y, regulator)
Example #14
def test_several_classification_models_fitting(preprocessor_train_data):
    df = preprocessor_train_data.train_df.sample(0.1)
    preprocessor = DataPreprocessor(train_df=df, test_df=df)
    preprocessor.prepare_to_model(target_col='income', to_strip=' .')

    models = ModelsContainer()
    models.fit(preprocessor.train_encoded_df, kind=ModelTypes.CLASSIFICATION)
    expected_results = [
        {
            "model": models.logistic_class.fitted_model,
            "metrics": {
                "areaUnderROC": 0.770414,
                "areaUnderPR": 0.646093
            },
        },
        {
            "model": models.random_forest_class.fitted_model,
            "metrics": {
                "areaUnderROC": 0.674751,
                "areaUnderPR": 0.664931
            },
        },
        {
            "model": models.gbt_class.fitted_model,
            "metrics": {
                "areaUnderROC": 0.811643,
                "areaUnderPR": 0.746147
            },
        },
        {
            "model": models.svm_class.fitted_model,
            "metrics": {
                "areaUnderROC": 0.750627,
                "areaUnderPR": 0.645328
            },
        },
        {
            "model": models.naive_bayes_class.fitted_model,
            "metrics": {
                "areaUnderROC": 0.615000,
                "areaUnderPR": 0.504709
            },
        },
    ]
    for result in expected_results:
        _check_evaluation(preprocessor=preprocessor,
                          model=result["model"],
                          metrics=result["metrics"])
Example #15
 def load_data_from_folder(self):
     filenames, class_ids, classes = self.__load_audio_filenames_with_class__()
     dataset_size = len(filenames)
     X_train = []
     y_train = []
     X_test = []
     y_test = []
     X_validation = []
     y_validation = []
     pool = Pool(cpu_count() - 1)
     preprocessor = DataPreprocessor(dataset_path=self.dataset_path)
     for (results, filepath, class_id, random_roll) in tqdm(pool.imap_unordered(preprocessor.process_file, zip_longest(filenames, class_ids)),
                                                            total=dataset_size):
         filepath = normpath(filepath)
         is_testing = 1 <= random_roll <= 10
         is_validation = 11 <= random_roll <= 20
         for item in results:
             if is_testing:
                 X_test.append(item)
                 y_test.append(class_id)
             elif is_validation:
                 X_validation.append(item)
                 y_validation.append(class_id)
             else:
                 X_train.append(item)
                 y_train.append(class_id)
     X_train = np.array(X_train)
     y_train = np.array(y_train)
     X_test = np.array(X_test)
     y_test = np.array(y_test)
     X_validation = np.array(X_validation)
     y_validation = np.array(y_validation)
     return X_train, y_train, X_test, y_test, X_validation, y_validation, classes
Example #16
 def test_preprocess_empty_data(self):
     """ test preprocess_data with empty df
     """
     df = pd.DataFrame(
         columns=['traceid', 'sessionid', 'servicessequence', 'starttime'])
     df = DataPreprocessor().preprocess_data(df)
     assert_frame_equal(df, df)
 def __init__(self, X, y, sgd_batch_size):
     self.data_preprocessor = DataPreprocessor(X)
     X = self.data_preprocessor.process_data(X)
     y = np.copy(y).astype(int)
     self.all_classes = np.unique(y)
     assert self.all_classes.size == 2
     self.target_value = np.array([1, -1]).astype(int)
     y[y == self.all_classes[0]] = self.target_value[0]
     y[y == self.all_classes[1]] = self.target_value[1]
     self.number_features = X.shape[1]
     # prepare for optimization
     self.loss_record = []
     penalty_lambda = 1
     w_init = np.zeros(self.number_features)
     w_init.fill(np.sqrt(1 / (self.number_features * penalty_lambda)))
     self.svm_weight = self.pegas(X, y, penalty_lambda, w_init,
                                  sgd_batch_size)
Example #18
 def test_preprocess_data(self):
     """ test preprocess_data with normal df
     """
     df = pd.DataFrame(
         [['1234', '1234', 'front-end,carts,front-end', '1234']],
         columns=['traceid', 'sessionid', 'servicessequence', 'starttime'])
     df = DataPreprocessor().preprocess_data(df)
     df_test = pd.DataFrame(
         [['1234', '1234', 'front-end,carts', '1234', 1]],
         columns=[
             'traceid', 'sessionid', 'servicessequence', 'starttime',
             'currentclusternumber'
         ])
     assert_frame_equal(df.sort_index(axis=1),
                        df_test.sort_index(axis=1),
                        check_dtype=False,
                        check_index_type=False)
 def __init__(self, X, y, sgd_batch_size):
     self.softplus_a = 0.1
     self.data_preprocessor = DataPreprocessor(X)
     X = self.data_preprocessor.process_data(X)
     y = np.copy(y).astype(int)
     self.all_classes = np.unique(y)
     assert self.all_classes.size == 2
     self.target_value = np.array([1, -1]).astype(int)
     y[y == self.all_classes[0]] = self.target_value[0]
     y[y == self.all_classes[1]] = self.target_value[1]
     n, self.number_features = X.shape
     assert sgd_batch_size <= n
     # prepare for optimization
     self.loss_record = []
     penalty_lambda = 1
     w_init = np.random.normal(0, 0.001, self.number_features)
     self.svm_weight = self.optimization(X, y, penalty_lambda, w_init,
                                         sgd_batch_size)
Example #20
def main(_):
  """
  MAIN FUNCTION - define loops for experiments
  """
  # Preprocess data: convert to pkl data
  if FLAGS.preprocess:
    for raw_data_fname in RAW_DATA_FNAME_LIST:
      data_preprocessor = DataPreprocessor(to_dir=DATA_DIR)
      data_preprocessor.process_and_save(raw_data_fname)

  # Used for loading data and building graph
  n_hidden_node_list = [100]
  n_hidden_layer_list = [1]

  # PARAM GRIDS
  param_grid_targets = [n_hidden_node_list, # path only
                        n_hidden_layer_list, # for final dense layers
                        ]
  param_product = product(*param_grid_targets)
  print(param_grid_targets)
  param_product_size = np.prod([len(t) for t in param_grid_targets])

  for i, params in enumerate(param_product):
    n_hidden_node, n_hidden_layer = params

    FLAGS.num_units = n_hidden_node
    FLAGS.n_hidden_node = n_hidden_node
    FLAGS.n_hidden_layer = n_hidden_layer

    # Model id
    id_components = [
        '{model}_{edim}x{layer}_last'.format(
            model=('B' if FLAGS.bi_direction else FLAGS.model_type[0].upper()),
            edim=n_hidden_node,
            layer=n_hidden_layer),
        # some details
    ]
    model_id = '__'.join(id_components)

    log.infov('=' * 30 + '{} / {} ({:.1f}%)'.format(
        i + 1, param_product_size, (i + 1) / param_product_size * 100) + '=' * 30)
    log.infov('model_id: ' + model_id)

    train_eval_save(car_id_list, FLAGS.dest_type, model_id, n_save_viz=FLAGS.n_save_viz)
Example #21
def live(params):
    sr = params['sampling_rate']
    audio_length = params['audio_length']
    blocksize = params['blocksize']
    sd.default.samplerate = sr
    sd.default.channels = params['channels']
    sd.default.blocksize = blocksize
    sd.default.latency = params['latency']

    weights_path = PROJECT_PATH / params['weights_file']

    classes = list(params['all_classes'])
    num_classes = len(classes)

    # model = get_tc_resnet_14((321, 40), num_classes, 1.5)
    model = get_tc_resnet_8((321, 40), num_classes, 1.5)
    model.load_weights(weights_path)
    model.summary()

    recent_signal = []
    recording_id = 0

    try:
        while True:
            input("Press Enter to start recording:")
            stream = sd.InputStream()
            stream.start()
            print("Say the word:")
            while True:
                data, overflowed = stream.read(blocksize)
                data = data.flatten()
                recent_signal.extend(data.tolist())
                if len(recent_signal) >= sr * audio_length:
                    recent_signal = recent_signal[:sr * audio_length]
                    break
            stream.close()
            rec_path = PROJECT_PATH / f'recording_{recording_id}.wav'
            sf.write(rec_path, np.array(recent_signal), sr)
            recording_id += 1
            print("Recording finished! Result is:")
            mfcc = DataPreprocessor.get_mfcc(np.asarray(recent_signal), sr)
            y_pred = model.predict(np.array([mfcc]))[0]
            result_id = int(np.argmax(y_pred))
            result_prob = y_pred[result_id]
            print("result id: " + str(result_id) + " " + classes[result_id] +
                  " " + str(result_prob))
            recent_signal = []
    except KeyboardInterrupt:
        print('Record finished!')
Example #22
class TestDataPreprocessor(unittest.TestCase):
    """ unittests for DataPreprocessor
    """
    def setUp(self):
        """ init DataPreprocessor
        """
        self.data_preprocessor = DataPreprocessor()

    def test_preprocess_empty_data(self):
        """ test preprocess_data with empty df
        """
        df = pd.DataFrame(
            columns=['traceid', 'sessionid', 'servicessequence', 'starttime'])
        df = self.data_preprocessor.preprocess_data(df)
        full_df = pd.DataFrame(
            columns=['sessionid', 'nextcluster', 'starttime'])
        assert_frame_equal(df, full_df)

    def test_preprocess_data(self):
        """ test preprocess_data with normal df
        """
        df = pd.DataFrame(
            [['1234', '1234', 'service-1,service-2,service-1', '1234']],
            columns=['traceid', 'sessionid', 'servicessequence', 'starttime'])
        df = self.data_preprocessor.preprocess_data(df)
        full_df = pd.DataFrame(
            [[0.0, '1234', '1234', 'service-1,service-2', '1234', 0, 0, 0]],
            columns=[
                'index', 'traceid', 'sessionid', 'servicessequence',
                'starttime', 'currentclusternumber', 'clustersequence',
                'nextcluster'
            ])
        assert_frame_equal(df.sort_index(axis=1),
                           full_df.sort_index(axis=1),
                           check_dtype=False,
                           check_index_type=False)
Example #23
    def run(self):
        """Performs various stages in predictive modeling"""
        #Path to Data set.
        path = "../../neeraj/resource/pima-indians-diabetes.data"
        #Column names of Data set.
        column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
        #Loading Data set using class DatasetLoader.
        load_data = DatasetLoader(path, column_names)
        data = load_data.load()
        load_data.print_shape(data)

        #Understanding data using class DataExplorer.
        explore_data = DataExplorer()
        explore_data.print_data_statistics(data)
        explore_data.visualize(data)

        #Performing data preprocessing.
        process_data = DataPreprocessor()
        input_set, output_set = process_data.split_dataset(data,0,8,8)
        process_data.display_dataset()
        process_data.summarize(input_set, 0, 5, 3)

        #Model evaluation using class Evaluator.
        evaluator = Evaluator()
        evaluator.validate(LogisticRegression(), input_set, output_set, 10, 7)
        evaluator.evaluate(LogisticRegression(), input_set, output_set, 10, 7,'log_loss')

        #Selecting best model using class ModelSelector.
        model = ModelSelector()
        #A set of models for selection.
        models = []
        models.append(('LR', LogisticRegression()))
        models.append(('LDA', LinearDiscriminantAnalysis()))
        models.append(('RF', RandomForestClassifier(n_estimators=100, max_features=3)))
        selected_model = model.select_model(models, input_set, output_set, 10, 7)
        print("\nSelected Model:\n %s" % selected_model)

        #Improving Accuracy using class AccuracyImprover.
        improve_accuracy = AccuracyImprover()
        improve_accuracy.tuning(Ridge(),input_set, output_set)
        improve_accuracy.ensemble_prediction(RandomForestClassifier(n_estimators=100, max_features=3), input_set, output_set, 10, 7)

        #Finalizing the model and performing prediction.
        finalize_model = ModelFinalizer()
        input_train, input_test, output_train, output_test = finalize_model.split_train_test_sets(input_set, output_set, 0.33, 7)
        finalize_model.finalize_and_save(LogisticRegression(), "../../neeraj/resource/pima_model.sav", input_train, output_train)
        finalize_model.predict("../../neeraj/resource/pima_model.sav", input_test, output_test)
Example #24
    def __init__(self, args):
        """
        initial function of appPreprocessing
        :param args: parameter from CLI
        """
        self.args = args
        self.time_recoder = datetime.datetime.now()
        self.method_thread = None

        #self.interface = ClUI()
        self.interface = GraphUI("OmniPhotos Preprocessing")
        # register the callback function
        self.interface.before_exit = self.callback_exit
        AbsPreprocessor.abs_ui = self.interface  # register UI

        self.data_preproc = DataPreprocessor(args)
        self.traj_preproc = TrajPreprocessor(args)
        self.op_preproc = OpPreprocessor(args)
        self.of_preproc = OfPreprocessor(args)
Example #25
class LDA2dGaussGM(Classifier):
    def __init__(self, X, y):
        # remove zero covariance features and standardize
        self.data_preprocessor = DataPreprocessor(X)
        X = self.data_preprocessor.process_data(X)
        y = np.copy(y)
        self.number_features = X.shape[1]
        self.all_classes = np.unique(y)
        self.number_classes = self.all_classes.size
        self.W = self.calculate_weight_vector(X, y)
        self.prior, self.mean, self.covariance = \
            self.calculate_GaussGM_parameters(self.LDA_projection(X), y)

    def calculate_weight_vector(self, X, y):
        # hard coded for 2d projection
        k = 2
        X_kclass = {}
        for one_class in self.all_classes:
            X_kclass[one_class] = X[y == one_class]
        mean_all = np.mean(X, axis=0)
        S_T = np.matmul(np.transpose(X - mean_all), X - mean_all)
        S_W = np.zeros((self.number_features, self.number_features))
        for one_class in self.all_classes:
            mean_each = np.mean(X_kclass[one_class], axis=0)
            S_W += np.matmul(np.transpose(X_kclass[one_class] - mean_each),
                             X_kclass[one_class] - mean_each)
        S_B = S_T - S_W
        temp_mat = mat(np.linalg.inv(S_W)) * mat(S_B)
        _, eig_vecs = eigs(temp_mat, k=k)
        return eig_vecs.real

    def LDA_projection(self, X):
        assert X.shape[1] == self.W.shape[0]
        return X.dot(self.W)

    def calculate_GaussGM_parameters(self, X, y):
        number_features = X.shape[1]
        priors = [np.sum(y == one_class) / y.size for one_class in self.all_classes]
        means = np.zeros((self.number_classes, number_features))
        covariances = np.zeros((self.number_classes, number_features, number_features))
        for k in range(0, self.number_classes):
            index = y == self.all_classes[k]
            X_classk = X[index, :]
            means[k, :] = np.mean(X_classk, axis=0)
            covariances[k, :, :] = np.cov(X_classk, rowvar=False, bias=True)
        return priors, means, covariances

    def validate(self, X_test, y_test):
        X_test = self.data_preprocessor.process_data(X_test)
        assert X_test.shape[1] == self.number_features
        X_test = self.LDA_projection(X_test)
        predicted_scores = self.predict_score(X_test)
        predicted_class = self.predict_class(predicted_scores)
        test_error = self.calculate_predict_error(predicted_class, y_test)
        return test_error

    def calculate_predict_error(self, predicted_class, y):
        predicted_indicator = np.array([predicted_class[i] == y[i] for i in range(0, y.size)])
        return 1 - np.sum(predicted_indicator) / y.size

    def predict_class(self, score):
        max_indicator = np.argmax(score, axis=1)
        return np.array([self.all_classes[i] for i in max_indicator])

    def predict_score(self, X):
        N = X.shape[0]
        log_score = np.zeros((N, self.number_classes))
        for k in range(self.number_classes):
            mean_k = self.mean[k, :]
            cov_k = self.covariance[k, :, :]
            log_score[:, k] = multivariate_normal.logpdf(X, mean_k, cov_k)
        log_prior = [log(p) for p in self.prior]
        log_score += log_prior
        return log_score
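calculate_weight_vector above is classical multi-class LDA: it forms the within-class scatter S_W and the between-class scatter S_B = S_T - S_W, then keeps the two leading eigenvectors of S_W^-1 S_B as the projection. A self-contained sketch of the same computation on synthetic clusters, using a dense numpy eigendecomposition instead of scipy's sparse eigs:

import numpy as np

rng = np.random.default_rng(0)
# Three synthetic 5-dimensional classes with shifted means.
X = np.vstack([rng.normal(m, 1.0, size=(50, 5)) for m in (0.0, 3.0, 6.0)])
y = np.repeat([0, 1, 2], 50)

mean_all = X.mean(axis=0)
S_T = (X - mean_all).T @ (X - mean_all)
S_W = np.zeros((5, 5))
for c in np.unique(y):
    Xc = X[y == c]
    Xc_centered = Xc - Xc.mean(axis=0)
    S_W += Xc_centered.T @ Xc_centered
S_B = S_T - S_W

eigvals, eigvecs = np.linalg.eig(np.linalg.inv(S_W) @ S_B)
order = np.argsort(eigvals.real)[::-1]
W = eigvecs[:, order[:2]].real      # 2-D projection, as used by LDA_projection
print((X @ W).shape)                # (150, 2)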
Example #26
        modified_model = ModifiedReferenceCaffeNet(class_size)

        # copy W/b from the original model to the new one
        copy_model(original_model, modified_model)

        if args.gpu >= 0:
            chainer.cuda.get_device(args.gpu).use()  # make the GPU current
            modified_model.to_gpu()

        print("# _/_/_/ load dataset _/_/_/")

        in_size = ModifiedReferenceCaffeNet.IN_SIZE
        mean = np.load(mean_image_path)
        train = DataPreprocessor(training_data_path,
                                 root_dir_path,
                                 mean,
                                 in_size,
                                 random=True,
                                 is_scaled=True)
        test = DataPreprocessor(testing_data_path,
                                root_dir_path,
                                mean,
                                in_size,
                                random=False,
                                is_scaled=True)

        train_iter = chainer.iterators.MultiprocessIterator(
            train, args.batch_size, n_processes=args.loader_job)
        test_iter = chainer.iterators.MultiprocessIterator(
            test,
            args.test_batch_size,
            repeat=False,
class LogisticRegression(Classifier):
    def __init__(self, X, y):
        # Data_Preprocessor will copy X
        self.data_preprocessor = DataPreprocessor(X)
        X = self.data_preprocessor.process_data(X)
        y = np.copy(y)
        self.all_classes = np.unique(y)
        self.number_classes = self.all_classes.size
        self.number_observations, self.number_features = X.shape
        # row-wise concatenated weight vector
        W_init = np.random.normal(0, 0.001,
                                  self.number_classes * self.number_features)
        self.W = self.IRLS(W_init, X, y)

    def IRLS(self, W, X, y):
        # construct YT to compute gradients and hessian
        T = np.zeros((self.number_observations, self.number_classes))
        Y = np.zeros((self.number_observations, self.number_classes))
        # through iterations
        number_iterations = 30
        loss_record = np.zeros(number_iterations)
        for iter in range(number_iterations):
            W_mat = self.W_vector2matrix(W)
            for i in range(self.number_observations):
                T[i, y[i]] = 1
                Y[i, :] = LogisticRegression.softmax(W_mat, X[i, :])
            loss_record[iter] = LogisticRegression.cross_entropy_loss(Y, T)
            grad_W = self.compute_gradient(X, Y, T)
            hess_W = self.compute_hessian(X, Y)
            W += -0.01 * np.matmul(np.linalg.inv(hess_W), grad_W)
            # W += - 0.01 * grad_W
        return self.W_vector2matrix(W)

    def compute_gradient(self, X, Y, T):
        grad_mat = np.zeros((self.number_classes, self.number_features))
        for i in range(self.number_classes):
            grad_mat[i, :] = (Y[:, i] - T[:, i]).dot(X)
        return grad_mat.reshape(self.number_classes * self.number_features)

    def cross_entropy_loss(Y, T):
        loss = 0
        N, K = Y.shape
        for n in range(N):
            for k in range(K):
                loss += -T[n, k] * log(Y[n, k])
        return loss

    def compute_hessian(self, X, Y):
        hess_mat = np.zeros((self.number_classes * self.number_features,
                             self.number_classes * self.number_features))
        for j in range(self.number_classes):
            for k in range(self.number_classes):
                i_kj = 1 if (k == j) else 0
                dot_vec = Y[:, k] * (i_kj - Y[:, j])
                block_kj = np.matmul(np.matmul(X.T, np.diag(dot_vec)), X)
                hess_mat[j * self.number_features : (j + 1) * self.number_features, \
                k * self.number_features : (k + 1) * self.number_features] = block_kj
        # hessian may not be PSD due to numerical issue
        hess_mat = hess_mat + 0.1 * np.identity(
            self.number_classes * self.number_features)
        return hess_mat

    def W_vector2matrix(self, W_vec):
        assert (W_vec.size == self.number_classes * self.number_features)
        return W_vec.reshape((self.number_classes, self.number_features))

    def softmax(W, x):
        e = np.exp(W.dot(x))
        dist = e / np.sum(e)
        return dist

    def validate(self, X_test, y_test):
        X_test = self.data_preprocessor.process_data(X_test)
        assert X_test.shape[1] == self.number_features
        predicted_score = self.predict_score(X_test)
        predicted_class = self.predict_class(predicted_score)
        test_error = self.calculate_predict_error(predicted_class, y_test)
        return test_error

    def calculate_predict_error(self, predicted_class, y):
        predicted_indicator = np.array(
            [predicted_class[i] == y[i] for i in range(0, y.size)])
        return 1 - np.sum(predicted_indicator) / y.size

    def predict_class(self, score):
        max_indicator = np.argmax(score, axis=1)
        return np.array([self.all_classes[i] for i in max_indicator])

    def predict_score(self, X):
        N = X.shape[0]
        softmax_score = np.zeros((N, self.number_classes))
        for i in range(N):
            softmax_score[i, :] = LogisticRegression.softmax(self.W, X[i, :])
        return softmax_score
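IRLS above relies on the analytic softmax cross-entropy gradient (Y - T)^T X implemented in compute_gradient. A small self-contained check, with made-up toy data, that this formula matches a finite-difference estimate of the loss:

import numpy as np

rng = np.random.default_rng(0)
N, K, D = 5, 3, 4
X = rng.normal(size=(N, D))
T = np.eye(K)[rng.integers(0, K, size=N)]     # one-hot targets

def loss(W_vec):
    W = W_vec.reshape(K, D)
    scores = X @ W.T
    scores -= scores.max(axis=1, keepdims=True)          # numerical stability
    Y = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
    return -np.sum(T * np.log(Y))

def grad(W_vec):
    W = W_vec.reshape(K, D)
    scores = X @ W.T
    scores -= scores.max(axis=1, keepdims=True)
    Y = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
    return ((Y - T).T @ X).reshape(-1)        # same formula as compute_gradient

W0 = rng.normal(scale=0.001, size=K * D)
eps = 1e-6
fd = np.array([(loss(W0 + eps * e) - loss(W0 - eps * e)) / (2 * eps)
               for e in np.eye(K * D)])
print(np.max(np.abs(fd - grad(W0))))   # close to zero (around 1e-8 or smaller)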
Example #28
# Configurations
timesteps = 10
hidden_neurons = 50
epochs = 300
batchsize = 10

# Load data
nikkei_data_org, nasdaq_data_org, currency_data_org = data_loader.load_dataset()

# Data Preprocessing
dropping_features_for_nikkei = ['Open Price', 'High Price', 'Low Price']
dropping_features_for_nasdaq = ['High', 'Low', 'Total Market Value', 'Dividend Market Value']
dropping_features_for_currency = ['High (est)', 'Low (est)']

nikkei_data = DataPreprocessor(nikkei_data_org).preprocess_data(dropping_features_for_nikkei)
nasdaq_data = DataPreprocessor(nasdaq_data_org).preprocess_data(dropping_features_for_nasdaq)
currency_data = DataPreprocessor(currency_data_org).preprocess_data(dropping_features_for_currency)

merged_data = DataPreprocessor.merge(nikkei_data, nasdaq_data, currency_data)
data = merged_data.dropna()
data.to_csv("data/data.csv")

# Split the data
data_train, data_val, data_test = DataSplitter.split_to_train_val_test(data)
x_train, y_train = DataSplitter.split_to_x_and_y(data_train, timesteps=timesteps)
x_val, y_val = DataSplitter.split_to_x_and_y(data_val, timesteps=timesteps)
x_test, y_test = DataSplitter.split_to_x_and_y(data_test, timesteps=timesteps)

print("Train dataset has {} samples.".format(*x_train.shape))
# print(x_train[:3])
Example #29
import calendar
import os
import time

import pandas as pd

files = []
for r, d, f in os.walk('data/raw/features/'):
    for file in f:
        if '.csv' in file:
            files.append(os.path.join(r, file))
features = []
for f in files:
    df = pd.read_csv(f)
    features.append(df)
features = pd.concat(features)
features = features.sort_values(by=['bookingID', 'second'])

dp = DataPreprocessor()
features = dp.feature_engineering(features)
#features = pd.read_csv('data/processed/features_1560750238.csv')

files = []
for r, d, f in os.walk('data/raw/labels/'):
    for file in f:
        if '.csv' in file:
            files.append(os.path.join(r, file))
labels = []
for f in files:
    df = pd.read_csv(f)
    labels.append(df)
true_values_exist = True
if (len(labels) == 0):
    true_values_exist = False
Example #30
class SVMCVX(Classifier):
    def __init__(self, X, y, regulator):
        self.data_preprocessor = DataPreprocessor(X)
        X = self.data_preprocessor.process_data(X)
        y = np.copy(y).astype(int)
        self.all_classes = np.unique(y)
        assert self.all_classes.size == 2
        self.target_value = np.array([1, -1]).astype(int)
        y[y == self.all_classes[0]] = self.target_value[0]
        y[y == self.all_classes[1]] = self.target_value[1]
        self.number_features = X.shape[1]
        alpha = self.solve_dual_problem(X, y, regulator)
        zero_threshold = 1e-6
        self.number_support_vectors = np.sum(alpha > zero_threshold)
        self.margin = 1 / np.linalg.norm(alpha)
        self.svm_weight, self.svm_bias = SVMCVX.compute_svm_parameters(alpha, X, y, regulator)

    def solve_dual_problem(self, X, y, c):
        # QP problem
        # min 0.5 * xTPx + qTx
        # st Gx <= h, Ax = b
        number_observations = X.shape[0]
        yX = np.reshape(y, (number_observations, 1)) * X
        P = matrix(yX.dot(yX.T))
        q = matrix(-np.ones(number_observations))
        A = matrix(np.reshape(y.astype(float), (1, number_observations)))
        b = matrix([0.0])
        I = np.identity(number_observations)
        G = matrix(np.concatenate((I, -I), axis=0))
        vector_c = c * np.ones(number_observations)
        vector_0 = np.zeros(number_observations)
        h = matrix(np.concatenate((vector_c, vector_0)))
        solution = qp(P, q, G, h, A, b)
        alpha = np.array(solution['x'])
        return alpha.reshape((-1,))

    def compute_svm_parameters(alpha, X, y, c):
        w = (alpha * y).dot(X)
        b = 0
        count = 0
        for i in range(alpha.shape[0]):
            if 0 < alpha[i] < c:
                count += 1
                b += y[i] - w.dot(X[i, :])
        assert count > 0
        b /= count
        return w, b

    def predict(self, X_new):
        X = self.data_preprocessor.process_data(X_new)
        X = np.reshape(X, (-1, self.number_features))
        predicted_score = self.predict_score(X)
        predicted_class = self.predict_class(predicted_score)
        return predicted_class

    def validate(self, X_test, y_test):
        X_test = self.data_preprocessor.process_data(X_test)
        assert X_test.shape[1] == self.number_features
        predicted_score = self.predict_score(X_test)
        predicted_class = self.predict_class(predicted_score)
        test_error = self.calculate_predict_error(predicted_class, y_test)
        return test_error

    def calculate_predict_error(self, predicted_class, y):
        predicted_indicator = np.array([predicted_class[i] == y[i] for i in range(0, y.size)])
        return 1 - np.sum(predicted_indicator) / y.size

    def predict_class(self, score):
        max_indicator = np.argmax(score, axis=1)
        return np.array([self.all_classes[i] for i in max_indicator])

    def predict_score(self, X):
        N = X.shape[0]
        svm_score = np.zeros((N, 2))
        svm_score[:, 0] = X.dot(self.svm_weight) + self.svm_bias
        svm_score[:, 1] = -svm_score[:, 0]
        return svm_score
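solve_dual_problem maps the soft-margin SVM dual onto cvxopt's standard QP form, as its comment notes. A sketch of the same construction on a tiny hand-made problem (two points per class, C = 1; assumes cvxopt is installed), which recovers a weight vector along the expected separating direction:

import numpy as np
from cvxopt import matrix
from cvxopt.solvers import qp, options

options['show_progress'] = False

# Toy, linearly separable data: two points per class.
X = np.array([[2.0, 2.0], [3.0, 3.0], [-2.0, -2.0], [-3.0, -3.0]])
y = np.array([1.0, 1.0, -1.0, -1.0])
c = 1.0
n = X.shape[0]

# min 0.5 a^T P a + q^T a  s.t.  0 <= a <= c (G a <= h) and y^T a = 0 (A a = b)
yX = y.reshape(n, 1) * X
P = matrix(yX @ yX.T)
q = matrix(-np.ones(n))
G = matrix(np.vstack([np.identity(n), -np.identity(n)]))
h = matrix(np.concatenate([c * np.ones(n), np.zeros(n)]))
A = matrix(y.reshape(1, n))
b = matrix([0.0])

alpha = np.array(qp(P, q, G, h, A, b)['x']).reshape(-1)
w = (alpha * y) @ X            # same recovery of w as compute_svm_parameters
print(np.round(w, 3))          # approximately [0.25 0.25], i.e. the boundary x1 + x2 = 0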