Example #1
    def __init__(self, number_of_gaussian):
        self.ng = number_of_gaussian
        self.gmm = GMM(number_of_gaussian)
        self.comparator_with_impostor = ComparatorWithImpostor(
            number_of_gaussian)
        self.comparator = Comparator(number_of_gaussian)
        self.cross_validator = CrossValidation(0.3)
Example #2
    def prepare(self):
        # Generate author ids and make sure everything is prepared for training
        # without db
        
        # Make sure author_id are generated (shared by both nets)
        net = self.create_net('hindex_cumulative')
        with CrossValidation(net) as cv:
            for _ in cv(num=self.num_cv, load=False):
                pass
        
        # Make sure x data is generated (shared by both nets)
        net.load_data_x()
        
        # Make sure y data are generated
        for target, tasks in self.targets.items():
            net = self.create_net(target)
        
            for task in tasks(net):
                for years in task.get_years_range():
                    net.predict_after_years = years
                    net.load_data_y()

        # Make sure author id files from NoHindex0 tasks are generated
        # May be nicer to put a general prepare() method in Task class
        for target, tasks in self.targets.items():
            net = self.create_net(target)
            with CrossValidation(net) as cv:
                for _ in cv(num=self.num_cv, load=True, load_to_db=True):
                    for task in tasks(net):
                        if isinstance(task, PredictivityOverTimeNoHindex0):
                            task.get_nonzero_author_ids()
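Examples 2, 4, 15, and 20 come from the same h-index prediction project and share an unusual protocol: CrossValidation wraps a net, acts as a context manager, and the resulting object is then called to iterate over splits. A minimal sketch of that protocol, purely as a reading aid; the internals, including the cv_index attribute, are assumptions rather than the project's actual code:

class CrossValidation:
    def __init__(self, net):
        self.net = net

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        return False  # clean-up would go here; exceptions propagate

    def __call__(self, num=None, load=False, load_to_db=False):
        # Yield once per split; `load` may be a flag or an explicit list
        # of split indices (e.g. load=[0] in some of the snippets).
        splits = load if isinstance(load, list) else range(num or 1)
        for split in splits:
            self.net.cv_index = split  # hypothetical attribute
            yield split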
Example #3
    def test_code_string_reports_correctly(self):
        """Check that a CrossValidation analysis object collects and reports
        the set of EntityList code strings correctly"""
        # Create two mock entities
        ent1 = mock.create_autospec(Entity)
        ent2 = mock.create_autospec(Entity)

        # Create a mock EntityList. Set the code_string attribute
        # to be another mock, then set a side_effect to fake out a return
        # value. Set a fake category
        entity_list1 = mock.create_autospec(EntityList)
        entity_list1.code_string = Mock()
        entity_list1.code_string.side_effect = ["261u"]
        entity_list1.category = "category1"

        # Repeat for a second mock EntityList using the same category
        entity_list2 = mock.create_autospec(EntityList)
        entity_list2.code_string = Mock()
        entity_list2.code_string.side_effect = ["35r"]
        entity_list2.category = "category1"

        # Repeat for a third mock EntityList using a different category
        entity_list3 = mock.create_autospec(EntityList)
        entity_list3.code_string = Mock()
        entity_list3.code_string.side_effect = ["299r"]
        entity_list3.category = "category2"

        entities = [ent1, ent2]
        entity_lists = [entity_list1, entity_list2, entity_list3]

        tested_object = CrossValidation(entities, entity_lists, 0.5, 99)
        self.assertEqual("261u.35r|299r", tested_object.code_string(),
                         "Unexpected code string from CrossValidation object")
Example #4
    def summarize_examples(self):
        
        print("Example trajectories...")
        net = self.create_net('hindex_cumulative')
        with CrossValidation(net) as cv:
            for _ in cv(load=[0], load_to_db=True):
                net.plot_example_trajectories()
        
        print("Example scatter net...")
        net = self.create_net('sqrt_nc_after')
        with CrossValidation(net) as cv:
            # Need DB for hindex predictor
            for _ in cv(load=[0], load_to_db=True):
                # Simulate exclude = []
                net.suffix += '-' + '-'.join([])
                net.plot_correlation()
                net.plot_correlation_hindex()

        print("Example scatter rf...")
        rf = self.create_rf('sqrt_nc_after')
        with CrossValidation(rf) as cv:
            for _ in cv(load=[0], load_to_db=False):
                # Simulate exclude = []
                rf.suffix += '-' + '-'.join([])
                rf.plot_correlation()
Example #5
def question_4(points):
    """
    question 4
    :param points: list of Point
    """
    k_list = [5, 7]
    normalization_list = [[DummyNormalizer, "DummyNormalizer"],
                          [SumNormalizer, "SumNormalizer"],
                          [MinMaxNormalizer, "MinMaxNormalizer"],
                          [ZNormalizer, "ZNormalizer"]]
    print("Question 4:")
    for k in k_list:
        print("K=", k, sep="")
        m = KNN(k)
        m.train(points)
        cv = CrossValidation()
        for i in normalization_list:
            normalize_object = i[0]()
            normalize_object.fit(points)
            new_points = normalize_object.transform(points)
            #  2 is the best n-fold
            average_score = cv.run_cv(new_points, 2, m, accuracy_score, False,
                                      True)
            formatted_average_score = "{:.2f}".format(average_score)
            print("Accuracy of", i[1], "is", formatted_average_score)
            print()
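The lab exercises in this and the neighbouring examples all call cv.run_cv(points, n_folds, model, score_fn, ...) and treat the return value as an average score; the trailing positional arguments vary by project (a print flag in some, a normalization callable in others). A rough sketch of the flag variant, with parameter names and the fold-splitting logic assumed rather than taken from the lab code:

class CrossValidation:
    def run_cv(self, points, n_folds, model, score_fn,
               print_final_score=True, print_fold_score=False):
        # Contiguous, equally sized folds; leftovers are ignored for brevity
        fold_size = len(points) // n_folds
        scores = []
        for fold in range(n_folds):
            test = points[fold * fold_size:(fold + 1) * fold_size]
            train = points[:fold * fold_size] + points[(fold + 1) * fold_size:]
            model.train(train)
            predicted = [model.predict(p) for p in test]
            score = score_fn([p.label for p in test], predicted)
            scores.append(score)
            if print_fold_score:
                print(f"fold {fold + 1}: {score:.2f}")
        average = sum(scores) / len(scores)
        if print_final_score:
            print(f"avg score: {average:.2f}")
        return average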
Example #6
File: main.py Project: crab-a/lab4
def run_knn(points):
    m = KNN(5)
    m.train(points)
    print(f'predicted class: {m.predict(points[0])}')
    print(f'true class: {points[0].label}')
    cv = CrossValidation()
    cv.run_cv(points, 10, m, accuracy_score)
Example #7
    def __init__(self,
                 train,
                 test,
                 id_column,
                 y_column_name,
                 num_of_features_to_select=138):
        self.num_of_features_to_select = num_of_features_to_select
        self.y_column_name = y_column_name
        self.id_column = id_column
        self.number_of_train = train.shape[0]
        self.processed_data = ProcessedData(train, test, self.id_column,
                                            self.y_column_name)
        self.data = self.processed_data.preprocess_my_data(
            self.num_of_features_to_select)
        self.train = self.data[:self.number_of_train]
        self.test = self.data[self.number_of_train:]
        self.ts_id = self.test[self.id_column].reset_index()
        self.ytrain = self.train[self.y_column_name]
        self.xtrain = self.train.drop([self.id_column, self.y_column_name],
                                      axis=1)
        self.xtest = self.test.drop([self.id_column, self.y_column_name],
                                    axis=1)
        self.cv = CrossValidation(train, test, self.id_column,
                                  self.y_column_name,
                                  num_of_features_to_select)
        self.model_predict = ModelPrediction(train, test, self.id_column,
                                             self.y_column_name,
                                             num_of_features_to_select)
        self.model_optim = ModelOptimization(train, test, self.id_column,
                                             self.y_column_name,
                                             num_of_features_to_select)
        self.torch_nn = TorchModel(train, test, self.id_column,
                                   self.y_column_name,
                                   num_of_features_to_select)
Example #8
    def test_delta_threshold_check_handles_negatives(self):
        """Check that the scoring continues if a list has a large negative
        delta - we are interested in abs() values"""
        __MAX_ITERATIONS = 10
        __THRESHOLD = 0.1
        __SUPRA_THRESHOLD = 2.0
        __SUPRA_NEGATIVE_THRESHOLD = -2.0
        __SUB_THRESHOLD = 0.0001
        __SUB_NEGATIVE_THRESHOLD = -0.0001
        __EXPECTED_ITERATIONS = 6

        # Create two mock entities
        ent1 = mock.create_autospec(Entity)
        ent2 = mock.create_autospec(Entity)

        # Create a mock EntityList. Set the calculate_new_weight attribute
        # to be another mock, then set a side_effect to fake out successive
        # return values. The first one is above threshold values and then we
        # drop below threshold for subsequent calls.
        entity_list1 = mock.create_autospec(EntityList)
        entity_list1.calculate_new_weight = Mock()
        entity_list1.calculate_new_weight.side_effect = [
            __SUPRA_THRESHOLD, __SUB_THRESHOLD, __SUB_THRESHOLD,
            __SUB_THRESHOLD, __SUB_THRESHOLD, __SUB_THRESHOLD, __SUB_THRESHOLD,
            __SUB_THRESHOLD
        ]
        # Repeat for a second mock EntityList
        # NOTE - the second list returns __SUPRA_NEGATIVE_THRESHOLD for five
        # iterations and then reports __SUB_NEGATIVE_THRESHOLD after that
        entity_list2 = mock.create_autospec(EntityList)
        entity_list2.calculate_new_weight = Mock()
        entity_list2.calculate_new_weight.side_effect = [
            __SUPRA_NEGATIVE_THRESHOLD, __SUPRA_NEGATIVE_THRESHOLD,
            __SUPRA_NEGATIVE_THRESHOLD, __SUPRA_NEGATIVE_THRESHOLD,
            __SUPRA_NEGATIVE_THRESHOLD, __SUB_NEGATIVE_THRESHOLD,
            __SUB_NEGATIVE_THRESHOLD, __SUB_NEGATIVE_THRESHOLD
        ]

        entities = [ent1, ent2]
        entity_lists = [entity_list1, entity_list2]

        tested_object = CrossValidation(entities, entity_lists, __THRESHOLD,
                                        __MAX_ITERATIONS)

        tested_object.run_analysis()

        expected_calls = list(itertools.repeat(call(), __EXPECTED_ITERATIONS))
        ent1.calculate_new_score.assert_has_calls(expected_calls)
        self.assertEqual(__EXPECTED_ITERATIONS,
                         ent1.calculate_new_score.call_count)
        ent2.calculate_new_score.assert_has_calls(expected_calls)
        self.assertEqual(__EXPECTED_ITERATIONS,
                         ent2.calculate_new_score.call_count)
        entity_list1.calculate_new_weight.assert_has_calls(expected_calls)
        self.assertEqual(__EXPECTED_ITERATIONS,
                         entity_list1.calculate_new_weight.call_count)
        entity_list2.calculate_new_weight.assert_has_calls(expected_calls)
        self.assertEqual(__EXPECTED_ITERATIONS,
                         entity_list2.calculate_new_weight.call_count)
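The iteration counts asserted here, together with the two convergence tests further down, imply a run_analysis() loop of roughly this shape: each round rescores every entity and reweights every list, stopping once all weight deltas fall below the threshold in absolute value or after max_iterations rounds. A sketch of that loop, reconstructed from the tests rather than taken from the library:

    def run_analysis(self):
        for _ in range(self.max_iterations):
            for entity in self.entities:
                entity.calculate_new_score()
            deltas = [el.calculate_new_weight() for el in self.entity_lists]
            # abs() matters here: a large negative delta must keep looping
            if all(abs(delta) < self.threshold for delta in deltas):
                break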
Example #9
def run_knn(points):
    """
    Runs knn with given set of data
    :param points: set of data
    """
    m = KNN(5)
    m.train(points)
    print(f'predicted class: {m.predict(points[0])}')
    print(f'true class: {points[0].label}')
    cv = CrossValidation()
    cv.run_cv(points, 10, m, accuracy_score)
Example #10
class JaneMarketModel:
    def __init__(self,
                 train,
                 test,
                 id_column,
                 y_column_name,
                 num_of_features_to_select=138):
        self.num_of_features_to_select = num_of_features_to_select
        self.y_column_name = y_column_name
        self.id_column = id_column
        self.number_of_train = train.shape[0]
        self.processed_data = ProcessedData(train, test, self.id_column,
                                            self.y_column_name)
        self.data = self.processed_data.preprocess_my_data(
            self.num_of_features_to_select)
        self.train = self.data[:self.number_of_train]
        self.test = self.data[self.number_of_train:]
        self.ts_id = self.test[self.id_column].reset_index()
        self.ytrain = self.train[self.y_column_name]
        self.xtrain = self.train.drop([self.id_column, self.y_column_name],
                                      axis=1)
        self.xtest = self.test.drop([self.id_column, self.y_column_name],
                                    axis=1)
        self.cv = CrossValidation(train, test, self.id_column,
                                  self.y_column_name,
                                  num_of_features_to_select)
        self.model_predict = ModelPrediction(train, test, self.id_column,
                                             self.y_column_name,
                                             num_of_features_to_select)
        self.model_optim = ModelOptimization(train, test, self.id_column,
                                             self.y_column_name,
                                             num_of_features_to_select)
        self.torch_nn = TorchModel(train, test, self.id_column,
                                   self.y_column_name,
                                   num_of_features_to_select)

    def kfold_cross_validation(self):
        return self.cv.kfold_cross_validation()

    def show_kfold_cross_validation_result(self):
        return self.cv.show_kfold_cv_results()

    def model_optimization_training(self):
        return self.model_optim.model_random_optimization()

    def model_prediction(self):
        return self.model_predict.predict_output()

    def train_model_with_pytorch(self):
        return self.torch_nn.train_model_with_torch()

    def predict_model_with_pytorch(self):
        return self.torch_nn.model_prediction_pytorch()
Example #11
    def test_can_set_callback(self):
        """Check that we can stuff a callback object into the relevant slot
        in our CrossValidation object"""
        test_object = CrossValidation(['fake'], ['fake'], 1, 1)

        fake_callback_object = mock.MagicMock()

        test_object.register_callback(POST_ITERATION_CALLBACK,
                                      fake_callback_object)
        post_iteration_callbacks = test_object.callbacks[
            POST_ITERATION_CALLBACK]
        self.assertTrue(post_iteration_callbacks)
        self.assertTrue(fake_callback_object in post_iteration_callbacks)
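register_callback() evidently appends the callback to a list stored under its type in a callbacks dict, since the test checks membership afterwards. A one-method sketch matching those assertions (POST_ITERATION_CALLBACK is presumably a module-level key constant):

    def register_callback(self, callback_type, callback):
        # callbacks maps a callback type to the list of registered callables
        self.callbacks.setdefault(callback_type, []).append(callback)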
Example #12
def ques_two(points):
    max_accuracy = 0
    best_k = 0
    for k in range(1, 31):
        m = KNN(k)
        m.train(points)
        cv = CrossValidation()
        # print("current k=", k ,"  ", end="")
        a = cv.run_cv(points, len(points), m, accuracy_score, False)
        if max_accuracy < a:
            max_accuracy = a
            best_k = k
    return best_k
Example #13
def k_fold_cross_validation(points, k):
    """
    Runs a knn for a given k value on a set of data and each time with different fold
    :param points: set of data
    :param k: value for knn
    """
    folds = [2, 10, 20]
    print(f"K={k}")
    for fold in folds:
        a = KNN(k)
        a.train(points)
        cv = CrossValidation()
        print(f"{fold}-fold-cross-validation:")
        cv.run_cv(points, fold, a, accuracy_score, False, True)
Example #14
def question_3(points, k):
    """
    question 3
    :param points: list of Point
    :param k: the best classifier for the given data, based on question 2
    """
    m = KNN(k)
    m.train(points)
    n_folds_list = [2, 10, 20]
    print("Question 3:")
    print("K=", k, sep="")
    for i in n_folds_list:
        print(i, "-fold-cross-validation:", sep="")
        cv = CrossValidation()
        cv.run_cv(points, i, m, accuracy_score, False, True)
Example #15
    def evaluate_linear_naive(self, i=None):
        if i is None:
            load = True
        elif type(i) is int:
            load = i
        else:
            raise Exception("Invalid i")

        # Evaluate naive hindex / linear in time predictor performance
        for target, tasks in self.targets.items():
            net = self.create_net(target)

            with CrossValidation(net) as cv:
                for _ in cv(num=self.num_cv, load=load, load_to_db=True):

                    # Trick to use Task logic for naive hindex/linear in time
                    # predictors for both Net and TimeSeriesNet
                    orig_evaluate = net.evaluate
                    orig_suffix = net.suffix

                    # naive h-index predictor
                    net.evaluate = net.hindex_evaluate
                    net.suffix = orig_suffix + '-naive'
                    with PredictivityOverTime(net, epochs_steps=0) as task:
                        task.evaluate_all()

                    # linear in time predictor
                    net.evaluate = net.linear_in_time_predictor
                    net.suffix = orig_suffix + '-linear'
                    with PredictivityOverTime(net, epochs_steps=0) as task:
                        task.evaluate_all()

                    # Revert trick
                    net.evaluate = orig_evaluate
                    net.suffix = orig_suffix
Example #16
def knn_n_fold(k, n, points, normal_type):
    """
    run knn with n folds with normalized points
    :param k: k-nn
    :param n: n folds
    :param points: the points to use
    :param normal_type: the normalization of those points
    :return:
    """
    m = KNN(k)
    cv = CrossValidation()
    cv.run_cv(normal_type(points),
              n,
              m,
              accuracy_score,
              normal_type,
              print_fold_score=True)
Example #17
def run_knn_k(points):
    """
    a function for question 2
    :param points: list of Point
    :return: a number, which is the best classifier for the given data
    """
    best_classifier = 0
    best_accuracy_score = 0.0
    for k in range(1, 31):
        m = KNN(k)
        m.train(points)
        cv = CrossValidation()
        current_accuracy = cv.run_cv(points, len(points), m, accuracy_score,
                                     False, False)
        if current_accuracy > best_accuracy_score:
            best_accuracy_score = current_accuracy
            best_classifier = k
    return best_classifier
Example #18
def run_with_population(pop_size):
    # Set functions and terminals
    functions = [AddNode(), SubNode(), MulNode(), AnalyticQuotientNode()]  # chosen function nodes
    terminals = [EphemeralRandomConstantNode()]  # use one ephemeral random constant node

    # Run GP
    tuner = Tuner()
    sgp = SimpleGP(tuner=tuner, functions=functions, pop_size=pop_size, max_generations=100)

    CrossValidation(sgp, terminals).validate()
Example #19
def question_3(points, k):
    m = KNN(k)
    m.train(points)
    cv = CrossValidation()
    print("Question 3:")
    print("K=" + str(k))
    print("2-fold-cross-validation:")
    cv.run_cv(points, 2, m, accuracy_score, False, True)
    print("10-fold-cross-validation:")
    cv.run_cv(points, 10, m, accuracy_score, False, True)
    print("20-fold-cross-validation:")
    cv.run_cv(points, 20, m, accuracy_score, False, True)
Example #20
    def subsets_evaluate(self):
        for target, tasks in self.targets.items():
            net = self.create_net(target)
            with CrossValidation(net) as cv:
                # TODO: Can do this for each cv in an "evaluate" step or so
                # and then let the usual summarize*() do its thing / average
                # / etc.? For now only cv 0
                for _ in cv(load=[0], load_to_db=True):
                    for task in tasks(net):
                        task.subsets_evaluate()
Example #21
    def test_stop_on_max_iterations(self):
        """If the EntityLists never get their delta down below the threshold
        then we are not converging so we continue to process. Eventually we
        need to stop after the specified number of iterations
        """
        __MAX_ITERATIONS = 10
        __THRESHOLD = 0.1
        __SUPRA_THRESHOLD = 2.0
        __EXPECTED_ITERATIONS = __MAX_ITERATIONS

        # Create two mock entities
        ent1 = mock.create_autospec(Entity)
        ent2 = mock.create_autospec(Entity)

        # Create a mock EntityList. Set the calculate_new_weight attribute
        # to be another mock, then set a side_effect to fake out successive
        # return values. All of them are above threshold values
        entity_list1 = mock.create_autospec(EntityList)
        entity_list1.calculate_new_weight = Mock()
        entity_list1.calculate_new_weight.side_effect = list(
            itertools.repeat(__SUPRA_THRESHOLD, __MAX_ITERATIONS))
        # Repeat for a second mock EntityList
        entity_list2 = mock.create_autospec(EntityList)
        entity_list2.calculate_new_weight = Mock()
        entity_list2.calculate_new_weight.side_effect = list(
            itertools.repeat(__SUPRA_THRESHOLD, __MAX_ITERATIONS))

        entities = [ent1, ent2]
        entity_lists = [entity_list1, entity_list2]

        tested_object = CrossValidation(entities, entity_lists, __THRESHOLD,
                                        __MAX_ITERATIONS)

        tested_object.run_analysis()

        expected_calls = list(itertools.repeat(call(), __MAX_ITERATIONS))
        ent1.calculate_new_score.assert_has_calls(expected_calls)
        self.assertEqual(__EXPECTED_ITERATIONS,
                         ent1.calculate_new_score.call_count)
        ent2.calculate_new_score.assert_has_calls(expected_calls)
        entity_list1.calculate_new_weight.assert_has_calls(expected_calls)
        entity_list2.calculate_new_weight.assert_has_calls(expected_calls)
Example #22
def run_knn(points):
    # for k in range(1, 31):
    #     m = KNN(k=k)
    #     m.train(points)
    #     print(f'predicted class: {m.predict(points[0])}')
    # print(f'true class: {points[0].label}')
    # cv = CrossValidation()
    # cv.run_cv(points, len(points), m, accuracy_score, d.transform(points))
    print("Question 3:\nK=19")
    m = KNN(k=19)
    m.train(points)
    cv = CrossValidation()
    z = ZNormalizer()
    z.fit(points)
    d = DummyNormalizer()
    sum = SumNormalizer()
    min_max = MinMaxNormalizer()
    min_max.fit(points)
    print("2-fold-cross-validation:")
    cv.run_cv(points,
              2,
              m,
              accuracy_score,
              d.transform,
              print_final_score=False,
              print_fold_score=True)
    print("10-fold-cross-validation:")
    cv.run_cv(points,
              10,
              m,
              accuracy_score,
              d.transform,
              print_final_score=False,
              print_fold_score=True)
    print("20-fold-cross-validation:")
    cv.run_cv(points,
              20,
              m,
              accuracy_score,
              d.transform,
              print_final_score=False,
              print_fold_score=True)
    print("Question 4:\nK=5")
    knn_n_fold(5, 2, points, d.transform)
    knn_n_fold(5, 2, points, sum.l1)
    knn_n_fold(5, 2, points, min_max.transform)
    knn_n_fold(5, 2, points, z.transform)
    print("K=7")
    knn_n_fold(7, 2, points, d.transform)
    knn_n_fold(7, 2, points, sum.l1)
    knn_n_fold(7, 2, points, min_max.transform)
    knn_n_fold(7, 2, points, z.transform)
Example #23
File: main.py Project: crab-a/lab4
def q3(k, points):
    m = KNN(k)
    m.train(points)
    cv = CrossValidation()

    print("Question 3:")
    print(f'K={k}')
    print("2-fold-cross-validation:")
    cv.run_cv(points, 2, m, accuracy_score, False, True)
    print("10-fold-cross-validation:")
    cv.run_cv(points, 10, m, accuracy_score, False, True)
    print("20-fold-cross-validation:")
    cv.run_cv(points, 20, m, accuracy_score, False, True)
Example #24
    def test_one_list_never_hits_delta_threshold(self):
        """Check that the scoring continues for __MAX_ITERATIONS when one list
        fails to converge below __THRESHOLD.
        (i.e. all but one list are below the threshold but one remains
        stubbornly above and hence we continue to go around the loop)."""
        __MAX_ITERATIONS = 10
        __THRESHOLD = 0.1
        __SUPRA_THRESHOLD = 2.0
        __SUB_THRESHOLD = 0.0001

        # Create two mock entities
        ent1 = mock.create_autospec(Entity)
        ent2 = mock.create_autospec(Entity)

        # Create a mock EntityList. Set the calculate_new_weight attribute
        # to be another mock, then set a side_effect to fake out successive
        # return values. All of them are above threshold values
        entity_list1 = mock.create_autospec(EntityList)
        entity_list1.calculate_new_weight = Mock()
        entity_list1.calculate_new_weight.side_effect = list(
            itertools.repeat(__SUPRA_THRESHOLD, __MAX_ITERATIONS))
        # Repeat for a second mock EntityList
        # NOTE - the second list returns __SUB_THRESHOLD
        entity_list2 = mock.create_autospec(EntityList)
        entity_list2.calculate_new_weight = Mock()
        entity_list2.calculate_new_weight.side_effect = list(
            itertools.repeat(__SUB_THRESHOLD, __MAX_ITERATIONS))

        entities = [ent1, ent2]
        entity_lists = [entity_list1, entity_list2]

        tested_object = CrossValidation(entities, entity_lists, __THRESHOLD,
                                        __MAX_ITERATIONS)

        tested_object.run_analysis()

        expected_calls = list(itertools.repeat(call(), __MAX_ITERATIONS))
        ent1.calculate_new_score.assert_has_calls(expected_calls)
        ent2.calculate_new_score.assert_has_calls(expected_calls)
        entity_list1.calculate_new_weight.assert_has_calls(expected_calls)
        entity_list2.calculate_new_weight.assert_has_calls(expected_calls)
Example #25
    def test_summary_data(self):
        """Check that the CrossValidation object does the right thing when
        asked to calculate the summary data"""

        # make a couple of Entities
        ent1 = Entity("1")
        ent1.score = 1
        ent2 = Entity("2")
        ent2.score = 2
        ent3 = Entity("3")
        ent3.score = 3

        entities = [ent1, ent2, ent3]
        entity_lists = ["fool the assertions"]
        test_object = CrossValidation(entities, entity_lists, 0.001, 999)

        returned_details = test_object.summary_data()
        self.assertAlmostEqual(2.0, returned_details[0], 14,
                               "Unexpected average")
        self.assertAlmostEqual(0.816496580927726, returned_details[1], 14,
                               "Unexpected standard deviation")
Example #26
def run_1_to_30_knn(points):
    """
    Runs knn with k=0 to k=30 on a given set of data
    :param points: set of data
    """
    k = 0
    accuracy = 0
    num_of_points = len(points)
    for index in range(1, 31):
        a = KNN(index)
        a.train(points)
        print(f"classifier {index}:")
        print(f'predicted class: {a.predict(points[0])}')
        print(f'true class: {points[0].label}')
        cv = CrossValidation()
        temp_score = cv.run_cv(points, num_of_points, a, accuracy_score)
        if temp_score > accuracy:
            accuracy = temp_score
            k = index
        print()
    print(f"best classifier is: {k}, best accuracy is: {accuracy}")
Example #27
def ques_three(points):
    print("Question 3:")
    # best_k = ques_two(points)
    best_k = 19
    print("K={}".format(best_k))
    m = KNN(best_k)
    m.train(points)
    cv = CrossValidation()
    print("2-fold-cross-validation:")
    cv.run_cv(points, 2, m, accuracy_score, False, True)
    print("10-fold-cross-validation:")
    cv.run_cv(points, 10, m, accuracy_score, False, True)
    print("20-fold-cross-validation:")
    cv.run_cv(points, 20, m, accuracy_score, False, True)
Example #28
def two_fold_cross_validation(points):
    """
    Runs two fold cross validation on specific k values and each time test another norm
    :param points: set of data
    """
    knns = [5, 7]
    norms = [DummyNormalizer, SumNormalizer, MinMaxNormalizer, ZNormalizer]
    prints = 0
    for knn in knns:
        print(f"K={knn}")
        for norm in norms:
            a = KNN(knn)
            nor = norm()
            nor.fit(points)
            temp_points = nor.transform(points)
            a.train(temp_points)
            cv = CrossValidation()
            accuracy = cv.run_cv(temp_points, 2, a, accuracy_score, True, True)
            print(f"Accuracy of {norm.__name__} is {accuracy}")
            prints += 1
            if prints != len(knns) * len(norms):
                print()
Example #29
def run_with_range(range_settings):
    # Set functions and terminals
    functions = [AddNode(), SubNode(), MulNode(), AnalyticQuotientNode()]  # chosen function nodes
    terminals = [EphemeralRandomConstantNode()]  # use one ephemeral random constant node

    # Run GP
    tuner = Tuner(
        scale_range=(range_settings[0], range_settings[1]),
        translation_range=(range_settings[0], range_settings[1]),
        run_generations=(range(0, 20))
    )
    sgp = SimpleGP(tuner=tuner, functions=functions, pop_size=100, max_generations=20)

    CrossValidation(sgp, terminals).validate()
Example #30
def question_4(points, normalizers):
    print("Question 4:")
    m = KNN(5)
    m.train(points)
    cv = CrossValidation()
    print("K=5")
    for key in normalizers.keys():
        norm = normalizers.get(key)
        n = norm()
        n.fit(points)
        new_points = n.transform(points)
        print(f"Accuracy of {key} is " + str(cv.run_cv(new_points, 2, m, accuracy_score, False, True)))
        print("")
    m = KNN(7)
    m.train(points)
    print("K=7")
    for key in normalizers.keys():
        norm = normalizers.get(key)
        n = norm()
        n.fit(points)
        new_points = n.transform(points)
        print(f"Accuracy of {key} is " + str(cv.run_cv(new_points, 2, m, accuracy_score, False, True)))
        print("")
Example #31
def runExperiment():
    with open("data_banknote_authentication.txt", "r") as dataset:
        bankNoteData = removeAttrsRandom(np.asarray([line.split(",") for line in dataset.readlines()], dtype=np.float64))
        numFolds = 10
        crossVal = CrossValidation(bankNoteData, numFolds)
        for fold in range(crossVal.folds):
            print("\nRunning Fold : " + str(fold + 1))
            trainingData = crossVal.getTrainingData(fold)
            testingData = crossVal.getTestingData(fold)
            attrIndices = [i for i in range(bankNoteData.shape[1] - 1)]
            decisionTree = DecisionTree(bankNoteData, trainingData, attrIndices)
            makeDecisionTree(bankNoteData, decisionTree.root, decisionTree.root.allowedAttrs, None)
            outputs = [int(bankNoteData[item, 4]) for item in testingData]
            crossVal.addOutputs(outputs)
            queries = [bankNoteData[item, :4] for item in testingData]
            predictedOutputs = [classifyQuery([query], decisionTree.root) for query in queries]
            crossVal.addPredictedOutputs(predictedOutputs)
            print("OUTPUTS = " + str(outputs))
            print("PREDICTED OUTPUTS = " + str(predictedOutputs))
        
        crossVal.printConfusionMatrix()
        
    print("ACCURACY  = " + str(crossVal.getAverageAccuracy() * 100) + "%")
    print("PRECISION = " + str(crossVal.getAveragePrecision() * 100) + "%")
    print("RECALL    = " + str(crossVal.getAverageRecall() * 100) + "%")
Example #32
def runExperiment():
    with open("data_banknote_authentication.txt", "r") as dataset:
        bankNoteData = np.asarray([line.split(",") for line in dataset.readlines()], dtype=np.float64)

        numFolds = 10
        crossVal = CrossValidation(bankNoteData, numFolds)
        for fold in range(crossVal.folds):
            print("\nRunning Fold : " + str(fold + 1))
            trainingData = crossVal.getTrainingData(fold)
            testingData = crossVal.getTestingData(fold)
            outputs = [int(bankNoteData[item, 4]) for item in testingData]
            crossVal.addOutputs(outputs)
            randomForest = buildRandomForest(bankNoteData, trainingData)
            queries = [bankNoteData[item, :4] for item in testingData]
            predictedOutputs = [classifyQuery(randomForest, query) for query in queries]
            crossVal.addPredictedOutputs(predictedOutputs)
            print("OUTPUTS = " + str(outputs))
            print("PREDICTED OUTPUTS = " + str(predictedOutputs))

        crossVal.printConfusionMatrix()

    print("ACCURACY  = " + str(crossVal.getAverageAccuracy() * 100) + "%")
    print("PRECISION = " + str(crossVal.getAveragePrecision() * 100) + "%")
    print("RECALL    = " + str(crossVal.getAverageRecall() * 100) + "%")
Example #33
# from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import numpy as np

from feature_engineering import FeatureEngineering
from cross_validation import CrossValidation
from multi_log_loss import multi_log_loss

f = FeatureEngineering('../data/gender_age_train.csv',
                       '../data/gender_age_test.csv',
                       'device_id',
                       wide_files=[#'../features/apps_per_event.csv', '../features/avg_position.csv',
                                   #'../features/count_by_hour.csv', '../features/count_by_period.csv',
                                   '../features/event_counts.csv', '../features/sd_position.csv'],
                       long_files=[#'../features/active_app_category_counts.csv',
                                   #'../features/installed_app_category_counts.csv',
                                   '../features/phone_brand.csv'])
labels, features, colnames = f.extract_features()
labels.set_index(np.arange(labels.shape[0]), inplace=True)
colnames.set_index(np.arange(colnames.shape[0]), inplace=True)
train_filter = [i for i, x in enumerate(labels['age'].tolist()) if not np.isnan(x)]
test_filter = [i for i, x in enumerate(labels['age'].tolist()) if np.isnan(x)]

cv = CrossValidation(features[train_filter, :],
                     labels.loc[train_filter, 'group'],
                     features[test_filter, :],
                     multi_log_loss)
model = MultinomialNB()
model.predict = model.predict_proba
out = cv.run(model, 'test')
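Note the monkey-patch on the final model: rebinding model.predict to model.predict_proba makes cv.run score probability outputs rather than hard labels, which a multi-class log-loss metric such as multi_log_loss presumably requires.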