def find_optimal_featureset(self, master_df, cv_df, features_list):
        """Given a list of feature names, return the dataframe of features with
            the strongest predictive power on the cross-validation set, together
            with its score.

            Every combination of the supplied features is generated, a model is
            fit on the training data for each combination, and the combination
            that scores best on the cross-validation set is returned.

            The returned score is the R^2 score (the default for
            sklearn LinearRegression.score).
        """
        meval = ModelEvaluator() 

        dataframe_list = meval.generate_feature_combinations(features_list, master_df)
        cv_dataframe_list = meval.generate_feature_combinations(features_list, cv_df) 

        both_lists = zip(dataframe_list, cv_dataframe_list)
        y = master_df['song_hotttnesss']
        y_cv = cv_df['song_hotttnesss']

        scores = []
        for df, cv in both_lists: 

            X = df 
            X_cv = cv 

            model = LinearRegression()   
            model.fit(X, y) 
            # score = meval.cross_validation_score(model, X_cv, y_cv, 5) 
            score = model.score(X_cv, y_cv)
            scores.append(score)


        max_idx = scores.index(max(scores))

        return dataframe_list[max_idx], scores[max_idx]
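The method above depends on its ModelEvaluator helper; below is a minimal, self-contained sketch of the same subset-search idea written against scikit-learn directly, on synthetic data (the column names and target here are made up, not from the original project).

import itertools

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
cols = ['tempo', 'loudness', 'duration']
train = pd.DataFrame(rng.normal(size=(200, 3)), columns=cols)
train['hotness'] = 2 * train['tempo'] - train['loudness'] + rng.normal(scale=0.1, size=200)
holdout = pd.DataFrame(rng.normal(size=(100, 3)), columns=cols)
holdout['hotness'] = 2 * holdout['tempo'] - holdout['loudness'] + rng.normal(scale=0.1, size=100)

best_score, best_subset = -np.inf, None
for r in range(1, len(cols) + 1):
    for subset in itertools.combinations(cols, r):
        model = LinearRegression().fit(train[list(subset)], train['hotness'])
        score = model.score(holdout[list(subset)], holdout['hotness'])  # R^2 on the held-out set
        if score > best_score:
            best_score, best_subset = score, subset
print(best_subset, round(best_score, 3))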
 def __reduce_overfitting(symbol, model_container):
     """ Recursive method to reduce Variance & get a better Validation score for the metric """
     print('-- Exploring Model Generalization --')
     print('On Training data...')
     model_container.train_score = ModelEvaluator.evaluate(
         model_container.model, model_container.data.train_X,
         model_container.data.train_y)
     print('On Validation data...')
     model_container.val_score = ModelEvaluator.evaluate(
         model_container.model, model_container.data.val_X,
         model_container.data.val_y)
     print(
         f'Train score: {model_container.train_score} & Validation score: {model_container.val_score}'
     )
     if (
             model_container.train_score - model_container.val_score
     ) / model_container.train_score > 0.15 and model_container.hyperparams.dropout < 0.55 and model_container.train_score > 0.65:
         # Raise dropout and retrain while the relative gap between training and
         # validation score exceeds 15%, dropout is still below 0.55, and the
         # training score itself is above the 0.65 threshold; otherwise accept the model
         model_container.hyperparams.dropout += 0.2
         model_container.model = Trainer.train_model(
             symbol, model_container.data_prep_params, model_container.data,
             model_container.hyperparams)
         return StockatronCore.__reduce_overfitting(symbol, model_container)
     else:
         return model_container
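For clarity, here is the recursion's stopping rule pulled out as a pure function; this is a sketch, with the thresholds copied from the code above.

def should_raise_dropout(train_score: float, val_score: float, dropout: float) -> bool:
    """Regularize further only while the relative train/validation gap exceeds 15%,
    dropout is below 0.55 and the training score itself is above 0.65."""
    relative_gap = (train_score - val_score) / train_score
    return relative_gap > 0.15 and dropout < 0.55 and train_score > 0.65

assert should_raise_dropout(0.90, 0.70, 0.2)       # large gap: add dropout and retrain
assert not should_raise_dropout(0.90, 0.85, 0.2)   # small gap: accept the model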
 def find_best_single_feature_parameters(self, dataset):
     for feature in dataset.suggested_discretize_features:
         permutations = self.generate_feature_parameters(feature)
         print(permutations)
         best_mean_fcs = self.best_fcs[dataset]
         best_perm = None
         for p, perm in enumerate(permutations):
             logging.error("[Parameters Tester][{}][{}][Perm {:03d}] Current permutation: {}".format(dataset, feature, p+1,  perm))
             dm = DataModel.generate_from_file(dataset, discretize_params=perm)
             classes_list = dm.get_classes_list()
             f_scores = []
             a = 1
             for _ in range(self.best_fold[dataset][1]):
                 for train_set, test_set in dm.generate_k_folds_stratified(self.best_fold[dataset][0]):
                     model_evaluator = ModelEvaluator(train_set, test_set, classes_list)
                     model_evaluator.evaluate()
                     f_scores.append(model_evaluator.get_f_score())
                     logging.error("[Parameters Tester][{}][{}][Perm {:03d}][{:03d}] FCS: {}".format(dataset, feature, p+1, a, f_scores[-1]))
                     a += 1
             f_score_mean = sum(f_scores) / len(f_scores)
             logging.error("[Parameters Tester][{}][{}][Perm {:03d}] Best FCS: {}, Mean FCS {}".format(dataset, feature, p+1, max(f_scores), f_score_mean))
             if f_score_mean > best_mean_fcs:
                 best_perm = perm[0]
                 best_mean_fcs = f_score_mean
         if best_perm is not None:
             self.best_discretize_feature_params[dataset].append(best_perm)
         logging.error("[Parameters Tester][{}][{}] Best mean FCS: {}, Best parameters: {}".format(dataset, feature, best_mean_fcs, best_perm))
Example #5
def run_model(model_name, data):
    dh = DataHandler()
    dh.import_data(data)
    dh.create_targets(-1)
    model = build_model(model_name, dh)
    m_eval = ModelEvaluator(model)
    acc, std = m_eval.n_time_k_cross_fold(10, 5)
    print('Accuracy: {}\nStandard Deviation: {}\n'.format(acc, std))
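The n_time_k_cross_fold(10, 5) call is project-specific; assuming the first argument is the number of repeats and the second the number of folds (the project's convention may differ), a comparable check with scikit-learn looks like this sketch on synthetic data with an arbitrary classifier.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

X, y = make_classification(n_samples=300, random_state=0)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=0)
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=cv)
print('Accuracy: {}\nStandard Deviation: {}\n'.format(scores.mean(), scores.std()))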
def hyperparameter(regressors,
                   datasets,
                   regualizer_values,
                   epochs=1000,
                   n_splits=5,
                   verbose=False):

    col_names = [''] * len(regressors)
    row_names = [''] * len(datasets)
    results = np.zeros(
        (len(datasets), len(regressors), len(regualizer_values), n_splits))

    for dataset_i, DatasetInitializer in enumerate(datasets):
        # initialize dataset
        dataset = DatasetInitializer()
        row_names[dataset_i] = dataset.name
        if verbose:
            print(dataset.name)

        for regressor_i, Regressor in enumerate(regressors):
            for regualizer_i, regualizer in enumerate(regualizer_values):
                # initialize model
                regression = Regressor(input_size=dataset.input_size,
                                       output_size=dataset.output_size,
                                       random_state=42,
                                       regualizer=regualizer,
                                       learning_rate=dataset.learning_rate)
                col_names[regressor_i] = regression.name
                if regualizer_i == 0 and verbose:
                    print('  ' + regression.name)

                with regression as model:
                    evaluator = ModelEvaluator(model,
                                               dataset.train,
                                               epochs=min(
                                                   epochs, dataset.epochs),
                                               random_state=42)
                    divergence = evaluator.all_folds(
                        n_splits=n_splits, stratified=dataset.stratified)
                    if verbose:
                        print('    %e: %f' % (regualizer, np.mean(divergence)))

                    results[dataset_i, regressor_i, regualizer_i, :] = \
                        divergence

    return (results, col_names, row_names)
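A hypothetical follow-up (dummy numbers, same array layout as the tensor returned above): pick the regularizer with the lowest mean divergence for every dataset/regressor pair.

import numpy as np

regualizer_values = [0.0, 0.01, 0.1, 1.0]                       # example grid
results = np.random.default_rng(0).random((2, 3, len(regualizer_values), 5))
mean_divergence = results.mean(axis=-1)                         # average over the CV folds
best_idx = mean_divergence.argmin(axis=-1)                      # shape (n_datasets, n_regressors)
best_regularizers = np.asarray(regualizer_values)[best_idx]
print(best_regularizers)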
Example #7
    def evaluate(self,
                 *,
                 images: np.ndarray = None,
                 folder_path: str = None) -> None:
        """
        Evaluate the model: calculate accuracy, show confusion matrix
        and print classification report. Works with either the images
        provided, or the path to these images. If none of them are
        provided, use the default test images path.
        """
        if not self._model_loaded:
            raise ModelNotLoadedError(
                'You have to load the model before evaluating it.')

        evaluator = ModelEvaluator(self._model,
                                   images=images,
                                   folder_path=folder_path)
        evaluator.evaluate()
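Illustrative call patterns for the keyword-only signature above; `classifier`, the folder path and the array name are assumptions for illustration, not part of the original code.

classifier.evaluate()                                  # fall back to the default test images path
classifier.evaluate(folder_path='data/test_images')    # hypothetical folder of test images
classifier.evaluate(images=test_images_array)          # an np.ndarray prepared elsewhere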
Example #8
def _check_evaluation(preprocessor, model, metrics: Dict[str, float]):
    evaluator = ModelEvaluator(metrics_class=BinaryClassificationMetrics)
    # The data-frame names below are deliberately arbitrary, to show that the compare method accepts any naming scheme
    dataframes_sets = [['train', 'test'], ['train1', 'test1']]
    for dataframes in dataframes_sets:
        comparison = evaluator.compare(data_frames={
            dataframe: preprocessor.train_encoded_df
            for dataframe in dataframes
        },
                                       models=[model])

        assert isinstance(comparison, pandas.DataFrame)

        for metric in metrics:
            assert metric in comparison
            for dataframe in dataframes:
                assert \
                    comparison[metric][evaluator.index_key(dataframe, model)] == \
                    pytest.approx(metrics[metric], abs=0.05)
 def find_best_fold(self, dataset):
     dm = DataModel.generate_from_file(dataset)
     classes_list = dm.get_classes_list()
     for fold in FOLDS:
         f_scores = []
         a = 1
         for _ in range(fold[1]):
             for train_set, test_set in dm.generate_k_folds_stratified(fold[0]):
                 model_evaluator = ModelEvaluator(train_set, test_set, classes_list)
                 model_evaluator.evaluate()
                 f_scores.append(model_evaluator.get_f_score())
                 logging.error("[Parameters Tester][{}][CV{:02d}][{:03d}] FCS: {}".format(dataset, fold[0], a, f_scores[-1]))
                 a += 1
         f_score_mean = sum(f_scores) / len(f_scores)
         logging.error("[Parameters Tester][{}][CV{:02d}] Best FCS: {}, Mean FCS {}".format(dataset, fold[0], max(f_scores), f_score_mean))
         self.append_result({'dataset':dataset.name, 'fold':fold[0], 'f_score':f_score_mean, 'permutation':-1})
         if f_score_mean > self.best_fcs[dataset]:
             self.best_fold[dataset] = fold
             self.best_fcs[dataset] = f_score_mean
     logging.error("[Parameters Tester][{}] Best mean FCS: {}, Best fold: {}".format(dataset, self.best_fcs[dataset], self.best_fold[dataset]))   
Example #10
def test_model(n_gram_mins):

    fe = FeatureExtractor("../dataset/slack_dialogue.txt",
                          n_grams=[1, 2, 3, 4],
                          n_gram_mins=n_gram_mins,
                          debug=False)
    fe.load()
    me = ModelEvaluator(fe.headers, fe.features)

    model_array, highest_rate = me.search_initial_best_fit_algorithm()
    chosen_model = model_array[random.randint(0, len(model_array) - 1)]
    mb = ModelBuilder(chosen_model)

    X_train, X_validation, Y_train, Y_validation = me.split_dataset()
    mb.fit_model(X_train, Y_train)
    accuracy_score = mb.accuracy_score(X_validation, Y_validation)

    print("Got score: " + str(accuracy_score) + " with model: " +
          str(model_array))
    print("Using : " + str(n_gram_mins))
    return accuracy_score, model_array
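A hypothetical driver for the function above, assuming n_gram_mins is a list of minimum counts aligned with n_grams=[1, 2, 3, 4] (the exact format comes from the project's FeatureExtractor).

candidates = ([1, 1, 1, 1], [2, 2, 2, 2], [3, 2, 2, 1])
best_accuracy, best_models = max((test_model(mins) for mins in candidates),
                                 key=lambda result: result[0])
print("Best accuracy: " + str(best_accuracy))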
 def find_best_parameters(self, dataset):
     permutations = self.generate_permutations(dataset)
     for p, perm in enumerate(permutations):
         logging.error("[Parameters Tester][{}][Perm {:08d}] Current permutation: {}".format(dataset, p+1, perm))
         dm = DataModel.generate_from_file(dataset, discretize_params=perm)
         classes_list = dm.get_classes_list()
         f_scores = []
         a = 1
         for _ in range(self.best_fold[dataset][1]):
             for train_set, test_set in dm.generate_k_folds_stratified(self.best_fold[dataset][0]):
                 model_evaluator = ModelEvaluator(train_set, test_set, classes_list)
                 model_evaluator.evaluate()
                 f_scores.append(model_evaluator.get_f_score())
                 logging.error("[Parameters Tester][{}][Perm {:08d}][{:03d}] FCS: {}".format(dataset, p+1, a, f_scores[-1]))
                 a += 1
         f_score_mean = sum(f_scores) / len(f_scores)
         logging.error("[Parameters Tester][{}][Perm {:08d}] Best FCS: {}, Mean FCS {}".format(dataset, p+1, max(f_scores), f_score_mean))
         for param in perm:
             self.append_result({'dataset':dataset.name, 'fold':self.best_fold[dataset][0], 'f_score':f_score_mean, 'permutation':p + 1, 'feature':param.feature_name, 'function':param.discretize_function.__name__, 'bins':param.buckets_amount})
         if f_score_mean > self.best_fcs[dataset]:
             self.best_discretize_parameters[dataset] = perm
             self.best_fcs[dataset] = f_score_mean
     logging.error("[Parameters Tester][{}] Best mean FCS: {}, Best parameters: {}".format(dataset, self.best_fcs[dataset], self.best_discretize_parameters[dataset]))
def results(regressors, datasets, epochs=1000, verbose=False):
    '''Evaluates each regressor on each dataset; returns (results, col_names, row_names), where results holds the test divergence and miss-rate for every pair.'''

    col_names = [''] * len(regressors)
    row_names = [''] * len(datasets)
    results = np.zeros((len(datasets), len(regressors), 2))

    for dataset_i, DatasetInitializer in enumerate(datasets):
        # initialize dataset
        dataset = DatasetInitializer()
        row_names[dataset_i] = dataset.name
        if verbose:
            print(dataset.name)

        for regressor_i, Regressor in enumerate(regressors):
            # initialize model
            regualizer = getattr(dataset.regualizer, Regressor.transform_type)
            regression = Regressor(input_size=dataset.input_size,
                                   output_size=dataset.output_size,
                                   random_state=42,
                                   regualizer=regualizer,
                                   learning_rate=dataset.learning_rate)
            col_names[regressor_i] = regression.name
            if verbose:
                print('  ' + regression.name)

            with regression as model:
                model.reset()
                model.update(dataset.train.inputs,
                             dataset.train.targets,
                             epochs=min(epochs, dataset.epochs))

                divergence = ModelEvaluator.evaluate(model,
                                                     dataset.test.inputs,
                                                     dataset.test.targets)
                results[dataset_i, regressor_i, 0] = divergence

                if dataset.multi_class:
                    missrate = np.nan
                else:
                    missrate = model.error(dataset.test.inputs,
                                           dataset.test.targets)
                results[dataset_i, regressor_i, 1] = missrate

                if verbose:
                    print('    %f / %f' % (divergence, missrate))

    return (results, col_names, row_names)
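A hypothetical follow-up with dummy values, showing how the returned tensor splits into labelled divergence and miss-rate tables (the last axis holds those two metrics, as built above).

import numpy as np
import pandas as pd

row_names, col_names = ['dataset_a', 'dataset_b'], ['ridge', 'lasso']   # placeholders
results = np.random.default_rng(0).random((len(row_names), len(col_names), 2))
divergence = pd.DataFrame(results[:, :, 0], index=row_names, columns=col_names)
missrate = pd.DataFrame(results[:, :, 1], index=row_names, columns=col_names)
print(divergence, missrate, sep='\n\n')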
Example #14
 def train_model(self, symbol):
     models = []
     # clean up previous training plots
     for file in glob.glob(f'training_plots/{symbol}/*'):
         os.remove(file)
     df = yf.get_ticker(symbol, start_date=self.start_date)
     num_time_steps_to_try = [30]
     for num_time_steps in num_time_steps_to_try:
         data_prep_params = DataPrepParameters(
             scaler=StandardScaler(),
             num_time_steps=num_time_steps,
             features=['change', 'sp500_change'])
         data = self.data_chef.prepare_model_data(df, data_prep_params)
         for batch_size in [
                 1, 5
         ]:  # more batch sizes could be tried: stateless LSTMs only keep state/context within a batch, so batch size is an important hyperparameter to explore
             hyperparams = ModelHyperparameters(
                 epochs=100,
                 number_hidden_layers=2,
                 number_units_in_hidden_layers=20,
                 hidden_activation_fn='tanh',
                 optimizer='adam',
                 dropout=0,
                 kernel_initializer="glorot_uniform",
                 batch_size=batch_size)
             model = Trainer.train_model(symbol, data_prep_params, data,
                                         hyperparams)
             model_container = StockatronCore.__reduce_underfitting(
                 symbol, model, hyperparams, data, data_prep_params)
             models.append(model_container)
             if model_container.train_score > 0.85:
                 break
     best_fit_model_container = max(models,
                                    key=operator.attrgetter("train_score"))
     best_fit_model_container = StockatronCore.__reduce_overfitting(
         symbol, best_fit_model_container)
     # Only now that the model has been selected, evaluate its worth using the untouched test set
     best_fit_model_container.test_score = ModelEvaluator.evaluate(
         best_fit_model_container.model,
         best_fit_model_container.data.test_X,
         best_fit_model_container.data.test_y)
     print(
         f'Best Model for {symbol} has train score={best_fit_model_container.train_score} validation score={best_fit_model_container.val_score} & test score={best_fit_model_container.test_score}'
     )
     best_fit_model_container.version = f'{symbol}_{date.today().strftime("%Y-%m-%d")}'
     StockatronCore.__save_new_model(best_fit_model_container)
Example #15
 def __reduce_underfitting(symbol, model, hyperparams, data,
                           data_prep_params):
     """ Recursive method to reduce Bias & get a better Training score for the metric """
     print('-- Exploring Model Fit --')
     train_score = ModelEvaluator.evaluate(model, data.train_X,
                                           data.train_y)
     if train_score < 0.7 and hyperparams.number_hidden_layers < 3:
         if hyperparams.epochs < 800:  # first run for longer
             hyperparams.epochs += 100
         elif hyperparams.number_hidden_layers < 5:  # if still not meeting the training score threshold then increase complexity of model
             hyperparams.number_hidden_layers += 1
         model = Trainer.train_model(symbol, data_prep_params, data,
                                     hyperparams)
         return StockatronCore.__reduce_underfitting(
             symbol, model, hyperparams, data, data_prep_params)
     else:
         return ModelContainer(model=model,
                               hyperparams=hyperparams,
                               data_prep_params=data_prep_params,
                               data=data,
                               train_score=train_score)
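The escalation rule above, restated as a pure function for readability; a sketch with the thresholds copied from the code, where 'accept' stands for the branch that returns the ModelContainer.

def next_adjustment(train_score: float, epochs: int, hidden_layers: int) -> str:
    if train_score >= 0.7 or hidden_layers >= 3:
        return 'accept'            # fit is good enough, or the model is already deep
    if epochs < 800:
        return 'more_epochs'       # first try training for longer
    if hidden_layers < 5:
        return 'more_layers'       # then increase model capacity
    return 'accept'

assert next_adjustment(0.6, 300, 2) == 'more_epochs'
assert next_adjustment(0.6, 900, 2) == 'more_layers'
assert next_adjustment(0.8, 300, 2) == 'accept'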
Example #16
class ModelEvaluatorTest(unittest.TestCase):

    def setUp(self):
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')

        self.model_evaluator = ModelEvaluator(self.model, self.vocabulary)

    def test_compute_loglikelihood(self):
        doc_tokens = ['macbook', 'ipad',  # exist in vocabulary and model
                'mac os x', 'chrome',  # only exist in vocabulary
                'nokia', 'null']  # inexistent
        document = Document(self.model.num_topics)
        rand = random.Random()
        rand.seed(0)
        document.parse_from_tokens(
                doc_tokens, rand, self.vocabulary, self.model)
        documents = [document, document]
        self.assertEqual(-14.113955684239654,
                self.model_evaluator.compute_loglikelihood(documents))
Example #17
 def evaluate_model_with_mean_absolute_error(self, prediction: np.ndarray):
     return ModelEvaluator(self.target, prediction).evaluate_mean_absolute_error()
class AlphaLayer:
    # Constructor taking the dataset path and a debug toggle; the count algorithm can
    # also be overridden. Loading spaCy can be disabled, but only to speed up debugging
    # runs, never in production.
    def __init__(self, path, debug=False, count_algorithm=StandardPresenceBoolean(), load_spacy=True):
        self.spacy_loaded = load_spacy
        self.path = path
        self.debug = debug

        self.fe = FeatureExtractor(path, self.debug, count_algorithm=count_algorithm, load_spacy=load_spacy)
        
        self.load(True)
        
        if self.debug:
            print("Accuracy score: " + str(
                self.accuracy_score) + " with classifier " + self.chosen_model + " out of " + str(self.model_array))

    def load(self, select_new_best_model=False):
        
        '''
        Reloads data from the file and selects the best model.
        
        Useful when there are automated updates to datasets.
        '''
        
        self.fe.load()
        self.me = ModelEvaluator(self.fe.headers, self.fe.features)        
        
        if select_new_best_model:
            self.me = ModelEvaluator(self.fe.headers, self.fe.features)
            self.model_array, self.highest_rate = self.me.search_initial_best_fit_algorithm()
            self.chosen_model = self.model_array[
                random.randint(0, len(self.model_array) - 1)]
            self.mb = ModelBuilder(self.chosen_model)

        self.X_train, self.X_validation, self.Y_train, self.Y_validation = self.me.split_dataset()
        self.mb.fit_model(self.X_train, self.Y_train)
        self.accuracy_score = self.mb.accuracy_score(self.X_validation,
                                                     self.Y_validation)

    # Change dataset path
    def change_path(self, path):
        self.path = path
        self.fe.path = path

    # Append a line to the dataset. Caution: no formatting checks are done in this method.
    def add_line(self, line):
        with open(self.path, "a") as datafile:
            datafile.write('\n' + line)

    def handle_buy_item(self, sentence):
        return "Got classifier: buy\nThank you for purchasing " + self.evaluate(sentence)
        
    def handle_open_shop(self, sentence):
        return "Got classifier: shop\nHere you go, take a look at my wares.\n"
    
    def handle_conversation(self, sentence):
        return "Got classifier: convo\nI don't feel like talking to you"
    
    def handle_undo(self, sentence):
        return "Got classifier: undo\nReally mate? You sure you want to take it back?"

    def handle_sell(self, sentence):
        return "Got classifier: sell\nI'll gladly accept your " + self.evaluate(sentence)

    # Evaluate a string with spacy classifier
    def evaluate(self, line):
        doc = self.fe.parser(unicode(line))
        file_str = StringIO()
        current_string = ""
        compound_number = ""
        for word in doc:
            if word.pos_ == u'NOUN' or word.pos_ == u'PROPN':
                # Probably the thing we want to buy
                current_string += word.text + " "
            if word.pos_ == u'NUM':
                # This is an amount.
                compound_number += word.text + " "
            if word.pos_ == u'CONJ':
                # Consider this termination of the current item. Record amount and such.
                if current_string != "":
                    # Only terminate if they specified a thing to terminate with
                    current_amount = parse(compound_number.strip())
                    file_str.write(str(current_amount))
                    file_str.write(" ")
                    file_str.write(current_string.strip())
                    file_str.write("; ")
                    current_string = ""
            if word.pos_ == u'PUNCT':
                if word.text == u';':
                    current_amount = parse(compound_number.strip())
                    file_str.write(str(current_amount))
                    file_str.write(" ")
                    file_str.write(current_string.strip())
                    file_str.write("; ")
                    current_string = ""
                if word.text == u',':
                    current_amount = parse(compound_number.strip())
                    file_str.write(str(current_amount))
                    file_str.write(" ")
                    file_str.write(current_string.strip())
                    file_str.write("; ")
                    current_string = ""
        if current_string != "":
            current_amount = parse(compound_number.strip()) if compound_number.strip() != "" else 1
            file_str.write(str(current_amount))
            file_str.write(" ")
            file_str.write(current_string.strip())
        return file_str.getvalue()        
Example #19
def main():
    #from sklearn.datasets import load_boston
    #boston = load_boston()
    #print(boston.data.shape)
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG if DEBUG else logging.INFO)
    dataFile = "data/housing.data"

    col_names = ["crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "b", "lstat", "medv"]
    train_df = pd.read_csv(dataFile, names = col_names, delim_whitespace = True)
    test_df = train_df.iloc[::7, :]
    train_df.drop(train_df.index[::7], inplace=True)

    train_df_features = train_df.iloc[:, :-1]
    train_df_targets = train_df.iloc[:, -1]
    test_df_features = test_df.iloc[:, :-1]
    test_df_targets = test_df.iloc[:, -1]

    # Data analysis
    print("Data analysis:")
    print("No. of attributes: ", len(train_df.iloc[0]))
    print("No. of features usable for classifcation: ", len(train_df.iloc[0])-1)
    print("Size of training data: ", len(train_df))
    print("Size of testing data: ", len(test_df))
    print("Histogram of attributes will be shown at the end of generating all results")

    print("\nPearson correlations:")
    target_col = col_names[-1]
    for col in col_names:
        if col.lower() == 'chas': # categorical. Also, see dtypes
            continue
        print("Correlation of %s with target(%s): %f" % (col, target_col, train_df[[col, target_col]].corr(method='pearson').iloc[0,1]))

    normalizer = DataFrameStdNormalizer(train_df_features)
    train_df_features_normalized = normalizer.get_normalized_data(train_df_features)
    test_df_features_normalized = normalizer.get_normalized_data(test_df_features)

    print("\n*********************Linear Regression*******************")
    regmodel = LinearRegression()
    eval = ModelEvaluator(regmodel)
    regmodel.train(train_df_features_normalized, train_df_targets)
    trainingError = eval.mean_squared_error(train_df_features_normalized, train_df_targets)
    print("Mean squared error on training data: %f" % trainingError)
    print("Mean squared error on test data: %f" % eval.mean_squared_error(test_df_features_normalized, test_df_targets))

    print("\n***********Ridge regression with lambda 0.01m 0.1, 1.0***************")
    for lambdaval in (0.01, 0.1, 1.0):
        regmodel = RidgeRegression(lambdaval)
        eval = ModelEvaluator(regmodel)
        regmodel.train(train_df_features_normalized, train_df_targets)
        trainingError = eval.mean_squared_error(train_df_features_normalized, train_df_targets)
        testingError = eval.mean_squared_error(test_df_features_normalized, test_df_targets)
        print("Ridge regression model with lambda = %f" % lambdaval)
        print("Mean squared error on training data = %f" % trainingError)
        print("Mean squared error on test data = %f" % testingError)
        print("")

    print("\n*********************Cross Validation*******************")
    lambdaval = float(10.0)

    # Shuffle data
    shuffled_train_df = train_df.reindex(np.random.permutation(train_df.index))
    shuffled_train_df_features = shuffled_train_df.iloc[:, :-1]
    shuffled_train_df_targets = shuffled_train_df.iloc[:, -1]
    shuffled_train_df_features_normalized = (DataFrameStdNormalizer(shuffled_train_df_features)).get_normalized_data(shuffled_train_df_features)

    lambda_error_map = {}
    for i in range(0,6):
        lambdaval = float(10.0) / (10**i)
        # cross validation
        mean_cv_error = 0
        regmodel = RidgeRegression(lambdaval)
        eval = ModelEvaluator(regmodel)
        for k in range(0, 10):
            chunksize = len(train_df) // 10  # integer fold size so iloc slicing works
            fold_slice = slice(k * chunksize, k * chunksize + chunksize)
            test_df_cv = shuffled_train_df_features_normalized.iloc[fold_slice]
            test_df_cv_targets = shuffled_train_df_targets.iloc[fold_slice]

            train_df_cv = shuffled_train_df_features_normalized.drop(shuffled_train_df_features_normalized.index[fold_slice])
            train_df_cv_targets = shuffled_train_df_targets.drop(shuffled_train_df_targets.index[fold_slice])
            regmodel.train(train_df_cv, train_df_cv_targets)
            #print(eval.mean_squared_error(test_df_cv, test_df_cv_targets))
            mean_cv_error += eval.mean_squared_error(test_df_cv, test_df_cv_targets)
        mean_cv_error /= 10
        print("MSE for lambda %f = %f" % (lambdaval, mean_cv_error))
        lambda_error_map[lambdaval] = mean_cv_error

    lambdabest = min(lambda_error_map, key=lambda_error_map.get)
    print("Lowest MSE for lambda = %f" % lambdabest)
    regmodel = RidgeRegression(lambdabest)
    regmodel.train(train_df_features_normalized, train_df_targets)
    eval = ModelEvaluator(regmodel)
    test_meansquarederror = eval.mean_squared_error(test_df_features_normalized, test_df_targets)
    print("Test error for model with lambda %f = %f" % (lambdabest, test_meansquarederror))
    print("")

    print("\n*********************Feature Selection*******************")
    print("*********************i. Max correlation*******************")
    target_col = col_names[-1]
    corr = {}
    for col in col_names:
        if col.lower() == 'chas': # categorical. Also, see dtypes
            continue
        corr[col] = abs(train_df[[col, target_col]].corr(method='pearson').iloc[0,1])
    maxcorrcols = heapq.nlargest(5, corr, key=corr.get)[1:]
    print("Selecting the following coluns with max correlation: ")
    print(maxcorrcols)
    train_df_features_normalized_maxcorr = train_df_features[maxcorrcols]
    regmodel = LinearRegression()
    regmodel.train(train_df_features_normalized_maxcorr, train_df_targets)
    eval = ModelEvaluator(regmodel)
    trainingError = eval.mean_squared_error(train_df_features[maxcorrcols], train_df_targets)
    print("Mean squared error on training data: %f" % trainingError)
    print("Mean squared error on test data: %f" % eval.mean_squared_error(test_df_features[maxcorrcols], test_df_targets))

    print("*******************ii. Max correlation with residue*****************")
    residue = train_df_targets.copy(deep=True)
    cols = []
    regmodel = LinearRegression()
    eval = ModelEvaluator(regmodel)
    for i in range(0, 4):
        corr = {}
        for col in col_names:
            if col.lower() in ('medv', 'chas') or col in cols: # categorical. Also, see dtypes
                continue
            # corr[col] = train_df[[col]].corrwith(residue).iloc[0]
            corr[col] = abs(pd.concat([train_df[[col]], residue], axis = 1).corr(method='pearson').iloc[0,1])
        maxcorrcol = max(corr, key=corr.get)
        cols.append(maxcorrcol)
        print("Taking cols: %s" % maxcorrcol)
        regmodel.train(train_df_features[cols], train_df_targets)
        for j in range(0, len(residue)):
            residue.at[residue.index[j]] = train_df_targets.iloc[j] - regmodel.predict(train_df_features[cols].iloc[j])
        #trainingError = eval.mean_squared_error(train_df_features_normalized, train_df_targets)
        #print("Mean squared error on training data: %f" % trainingError)
        #print(cols)
        print("Mean squared error on train data: %f" % eval.mean_squared_error(train_df_features[cols], train_df_targets))
    print("Mean squared error on test data: %f" % eval.mean_squared_error(test_df_features[cols], test_df_targets))

    print("*********************iii. All 4 feature combinations*******************")
    bestcols = None
    besttrainmse = 999999
    regmodel = LinearRegression()
    eval = ModelEvaluator(regmodel)
    for cols in list(list(x) for x in itertools.combinations(train_df_features_normalized.columns, 4)):
        regmodel.train(train_df_features_normalized[cols], train_df_targets)
        mse_train = eval.mean_squared_error(train_df_features_normalized[cols], train_df_targets)
        #print("Mean squared error on train data: %f" % )
        #print("Mean squared error on test data: %f" % eval.mean_squared_error(train_df_features_normalized[cols], train_df_targets))
        if mse_train < besttrainmse:
            bestcols = cols
            besttrainmse = mse_train
    print("Best training MSE = %f for columns:" % besttrainmse)
    print(bestcols)
    regmodel.train(train_df_features_normalized[bestcols], train_df_targets)
    print("Testing MSE of this model: %f" % eval.mean_squared_error(test_df_features_normalized[cols], test_df_targets))

    print("\n*********************Feature Expansion*******************")
    df_train_featuregen = train_df_features_normalized.copy(deep=True)
    df_test_featuregen = test_df_features_normalized.copy(deep=True)
    #i = 0
    for cols in list(list(x) for x in itertools.combinations(train_df_features_normalized.columns, 2)) + [[col,col] for col in train_df_features_normalized.columns]:
        #i += 1
        #print("Gen %d: %s" % (i,cols[0]+cols[1]))
        #df_train_featuregen[cols[0]+cols[1]] = df_train_featuregen.apply(lambda x: [[x[cols[0]], x[cols[1]]]], axis=1)
        df_train_featuregen[cols[0]+cols[1]] = df_train_featuregen[cols[0]]*df_train_featuregen[cols[1]]
        df_test_featuregen[cols[0]+cols[1]] = df_test_featuregen[cols[0]]*df_test_featuregen[cols[1]]
        #df_test_featuregen[cols[0]+cols[1]] = df_test_featuregen.apply(lambda x: [[x[cols[0]], x[cols[1]]]], axis=1)
    regmodel = LinearRegression()
    regmodel.train(df_train_featuregen, train_df_targets)
    eval = ModelEvaluator(regmodel)
    trainingError = eval.mean_squared_error(df_train_featuregen, train_df_targets)
    print("Mean squared error on training data: %f" % trainingError)
    print("Mean squared error on test data: %f" % eval.mean_squared_error(df_test_featuregen, test_df_targets))

    print("\n******************************** Showing histogram of attributes********************************")
    Histogrammer.plot_histgram_of_features(train_df, 3, 5)
    print("\nClose window to terminate")
    #plt.show(block=False) #.draw()
    #plt.pause(0.001)
    #raw_input("Press enter to continue")
    plt.show()
    return
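For comparison, the ridge lambda sweep in the cross-validation block above can also be expressed with scikit-learn's RidgeCV; this is a sketch on synthetic data, not the housing script itself, using the same lambda grid 10/10^i for i in 0..5.

import numpy as np
from sklearn.linear_model import RidgeCV

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
y = X @ rng.normal(size=5) + rng.normal(scale=0.1, size=100)
model = RidgeCV(alphas=[10.0 / 10**i for i in range(6)], cv=10).fit(X, y)
print("Lowest MSE for lambda = %f" % model.alpha_)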
Example #20
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path",
                        default='../config.ini',
                        required=False)

    args = parser.parse_args()

    cfg = OCTConfig(args.config_path)
    oct_logger = OCTLogger(cfg, RUN_TIMESTAMP)
    oct_logger.print_cfg()

    generator_resolver = GeneratorResolver(cfg)
    training_data_iterator, test_data_iterator, val_data_iterator = generator_resolver.resolve_data_iterators(
    )

    model_resolver = ModelResolver(cfg)
    model = model_resolver.resolve_model()

    augmented_image_data_generator = generator_resolver.provide_augmented_image_data_generator(
    )
    augmentation_processor = AugmentationProcessor(
        cfg, augmented_image_data_generator)
    augmentation_processor.perform_data_augmentation()

    model_trainer = ModelTrainer(cfg, model, training_data_iterator,
                                 val_data_iterator, RUN_TIMESTAMP)
    model_trainer.train_model()

    model_evaluator = ModelEvaluator(cfg, model, test_data_iterator)
    model_evaluator.evaluate_model()
Example #21
def main():
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG if DEBUG else logging.INFO)

    g_x_list = []
    g_x_list.append(lambda xi: [1]) #g1
    g_x_list.append(lambda xi: [1]) #g2
    g_x_list.append(lambda xi: [1, xi]) #g3
    g_x_list.append(lambda xi: [1, xi, xi*xi]) #g4
    g_x_list.append(lambda xi: [1, xi, xi*xi, xi*xi*xi]) #g5
    g_x_list.append(lambda xi: [1, xi, xi*xi, xi*xi*xi, xi*xi*xi*xi]) #g6

    pretty_print_header("Bias Variance Trade-off")
    pretty_print_header("Generating datasets")
    datasets_10 = []
    datasets_100 = []
    for i in range(0, 100):
        datasets_10.append(generate_fx_dataset(10))
        datasets_100.append(generate_fx_dataset(100))

    pretty_print_header("(a) and (b)")
    for datasets, title in ((datasets_10, "Part (a)"), (datasets_100, "Part (b)")):
        avg_ws = []
        mses_g = []
        # g_1
        mses_g1 = []
        for i in range(0, 100):
            mses_g1.append(np.mean([(x[1] - 1)**2 for x in datasets[i]]))
        mses_g.append(mses_g1)
        avg_ws.append([1])

        # g_2
        mses_g2 = []
        avg_w2 = 0
        for i in range(0, len(datasets)):
            w_0 = sum([x[1] for x in datasets[i]]) / len(datasets[i])  # mean target of dataset i
            mses_g2.append(np.mean([(x[1] - w_0)**2 for x in datasets[i]]))
            avg_w2 += w_0
        mses_g.append(mses_g2)
        avg_w2 /= len(datasets)
        avg_ws.append([avg_w2])

        # g3-6
        for g_x in g_x_list[2:]:
            wmean = [0]*len(g_x(0))
            regmodel = KernelizedLinearRegression(g_x, 0)
            eval = ModelEvaluator(regmodel)
            meanerror = []
            mses = []
            for i in range(0, len(datasets)):
                regmodel.train([x[0] for x in datasets[i]], [x[1] for x in datasets[i]])
                meansquarederror_dataseti = eval.mean_squared_error([x[0] for x in datasets[i]], [x[1] for x in datasets[i]])
                mses.append(meansquarederror_dataseti)
                wmean = np.add(wmean, regmodel.w)
            mses_g.append(mses)
            wmean /= len(datasets)
            avg_ws.append(wmean)


        #print(mses_g)
        #print(np.subtract(mses_g[0],mses_g[1]))
        # print(mses_g[0])
        # print(mses_g[1])
        plot_histgram(pd.DataFrame(mses_g), 2, 3, "g", title)
    plt.show()

    pretty_print_header("Linear and Kernel SVM")
    train_data = loadmat('data/phishing-train.mat')
    test_data = loadmat('data/phishing-test.mat')
    train_features = train_data['features']
    train_labels = train_data['label']
    test_features = test_data['features']
    test_labels = test_data['label']

    train_df_features = pd.DataFrame(train_data['features'])
    train_df_labels = pd.DataFrame(train_data['label'])
    test_df_features = pd.DataFrame(test_data['features'])
    test_df_labels = pd.DataFrame(test_data['label'])

    categoricals_feature_columns = [1, 6, 7, 13, 14, 15, 25, 28]
    other_feature_columns = sorted(list(set(range(0,len(train_df_features.columns))) - set(categoricals_feature_columns)))

    df = train_df_features
    df_cat_train = pd.get_dummies(df[categoricals_feature_columns].applymap(str))
    df_others = df[other_feature_columns]
    train_df_features = df_others.join(df_cat_train.applymap(float))

    df = test_df_features
    df_cat_test = pd.get_dummies(df[categoricals_feature_columns].applymap(str))
    df_others = df[other_feature_columns]
    test_df_features = df_others.join(df_cat_test[df_cat_train.columns])

    # Feed to svmutil
    start = time.clock()
    crange = range(-6, 3)
    for c in crange:
        print("Evaluating svm for c=4^%d"%c)
        m = svm_train(train_labels.tolist()[0], train_df_features.values.tolist(), '-c %f -v 3' % math.pow(4,c))
    print("Average training time=%fs" % ((time.clock() - start)/len(crange)))

    crange = range(-3,8)
    degrees = (1,2,3)
    start = time.clock()
    for c in crange:
        for degree in degrees:
            print("Evaluating svm for c=4^%d"%c)
            print("Degree = %d" % degree)
            m = svm_train(train_labels.tolist()[0], train_df_features.values.tolist(), '-t %d -c %f -v 3 -d %d' % (1, math.pow(4,c), degree))
    print("Average training time=%fs" % ((time.clock() - start)/(len(crange)*len(degrees))))

    gamma_degrees = range(-7, -1)
    start = time.clock()
    for c in crange:
        for degree in gamma_degrees:
            print("Evaluating svm for c=4^%d" % c)
            print("Gamma = %f" % math.pow(4, degree))
            m = svm_train(train_labels.tolist()[0], train_df_features.values.tolist(), '-t %d -c %f -v 3 -g %f' % (2, math.pow(4, c), math.pow(4, degree)))
    print("Average training time=%fs" % ((time.clock() - start) / (len(crange) * len(gamma_degrees))))
    #m = svm_load_model('heart_scale.model')
    #p_label, p_acc, p_val = svm_predict(y, x, m, '-b 1')
    #ACC, MSE, SCC = evaluations(y, p_label)

    #m = svm_train(train_labels.tolist()[0], train_df_features.values.tolist(), '-t %d -c %f -v 3 -g %f' % (2, math.pow(4,c), math.pow(4,degree)))
    m = svm_train(train_labels.tolist()[0], train_df_features.values.tolist(), '-t %d -c %f -g %f' % (2, math.pow(4,2), math.pow(4,-2)))
    p_labs, p_acc, p_vals = svm_predict(test_labels.tolist()[0], test_df_features.values.tolist(), m)
    print(p_acc)
    print("Best performing model: rbf-kernel svm with c=4^2, gamma = -2")
Example #22
from evaluator import ZeroEvaluator
from match import Match
from mcts import MonteCarloTS
from model_evaluator import ModelEvaluator
from strategy_factory import StrategyFactory

if __name__ == "__main__":
    modelFactory = StrategyFactory(
        MonteCarloTS,
        playerOneEvaluator=ModelEvaluator(1, "../models/6x6eval.tf"),
        playerTwoEvaluator=ModelEvaluator(-1, "../models/deepeval.tf"),
        maxSeconds=3,
        alpha=1)
    modelFactoryTwo = StrategyFactory(
        MonteCarloTS,
        playerOneEvaluator=ModelEvaluator(2, "../models/6x6eval.tf"),
        playerTwoEvaluator=ModelEvaluator(-1, "../models/deepeval.tf"),
        maxSeconds=3,
        alpha=1)

    won, lost, drawn = 0, 0, 0
    for _ in range(100):
        for _ in range(100):
            match = Match(modelFactory, modelFactoryTwo)
            match.play()
            one, two = match.playerOne.state.score()
            if one > two:
                won += 1
            elif two > one:
                lost += 1
            else:
                drawn += 1

# [Splice from another snippet: the tail of a Keras Sequential model definition
#  (L2-regularized Dense hidden layers and a sigmoid output) being appended to the
#  `models` list that the ModelEvaluator below consumes; its opening lines are missing.]

# Create model evaluator.
model_evaluator = ModelEvaluator(
    dataset_extractor=dataset_extractor,
    seed_list=dataset_split_seeds,
    models=models,
    num_validation_runs=20,
    num_test_runs=100,
    percent_of_models_tested=0.2,
    evaluation_number=30
)

# Start timing model evaluation, if requested.
if args.show_evaluation_time:
    start_time = time.time()

# Evaluate models.
model_evaluator.evaluate_models()
model_evaluator.save_results_as_csv()
model_evaluator.print_results()

# Print model evaluation time, if requested.