def run(train_sample, train_label, test_sample, test_label, k):
    """Score a k-nearest-neighbour majority vote on the test set.

    Every test row is compared with every training row by Euclidean
    distance; the ``k`` closest training rows vote, and the winning class is
    compared with the test row's own label.

    Args:
        train_sample: Handed to ``Load.loadSample`` to obtain the training
            features (presumably a file path -- confirm against ``Load``).
        train_label: Handed to ``Load.loadLabel``; the loaded object must
            expose an ``'x'`` column holding the class labels.
        test_sample: Same as ``train_sample``, for the test features.
        test_label: Same as ``train_label``, for the test labels.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        ``correct_count / test_sample_size``, i.e. the fraction of test rows
        whose predicted label equals the first column of their label row.

    Raises:
        AssertionError: If the sample and label sizes reported by ``Load``
            differ for the training or the test data.
    """
    train_sample, train_sample_size = Load.loadSample(train_sample)
    train_label, train_label_size = Load.loadLabel(train_label)
    assert train_sample_size == train_label_size, 'train_sample_size does not match train_label_size'

    test_sample, test_sample_size = Load.loadSample(test_sample)
    test_label, test_label_size = Load.loadLabel(test_label)
    assert test_sample_size == test_label_size, 'test_sample_size does not match test_label_size'

    train_sample = Preprocess.normalize(train_sample).values.tolist()  # list
    test_sample = Preprocess.normalize(test_sample).values.tolist()  # list

    # Class label -> integer index, in the iteration order of the label set.
    label_to_index = {
        label: index
        for index, label in enumerate(set(train_label['x'].tolist()))
    }
    train_index = Preprocess.labelMap(train_label, label_to_index)  # list
    # The mapped test labels are not used below; the call is kept in case
    # labelMap rejects labels unseen in training -- NOTE(review): confirm.
    Preprocess.labelMap(test_label, label_to_index)

    # Loop invariants, built once instead of once per test row.
    train_matrix = np.array(train_sample)
    test_label_rows = test_label.values.tolist()
    index_to_label = list(label_to_index)  # position == mapped index

    correct_count = 0

    for i, one in enumerate(test_sample):
        euclid_dist = np.linalg.norm(np.array(one) - train_matrix, axis=1)
        nn_idx = euclid_dist.argsort()[:k]

        # The vote assumes exactly two classes mapped to 0 and 1: summing the
        # neighbours' indices counts the votes for class 1. A tie goes to 0.
        votes_for_one = sum(train_index[idx] for idx in nn_idx)
        nn_decision = 1 if votes_for_one > k / 2 else 0

        if test_label_rows[i][0] == index_to_label[nn_decision]:
            correct_count += 1

    test_correct = correct_count / test_sample_size
    # NOTE(review): `filename` is neither a parameter nor a local here; it
    # must exist as a module-level name or this line raises NameError.
    Log.log(filename, 'k: {}; correct rate: {}\n'.format(k, test_correct))
    return test_correct
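# Example 2 -- a separate snippet. NOTE: it defines another `run`, which
# replaces the one above if both are kept in the same module.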
def run(filename, train_sample, train_label, test_sample, test_label, title, M,
        thresh, CART_step):
    """Fit ``M`` weighted threshold classifiers and track test accuracy.

    Each round asks ``Calc.CART`` for a threshold on one feature position,
    derives a classifier weight from the weighted error, updates the sample
    weights, and then scores the ensemble built so far on the test set. A
    classifier adds its weight to a sample's vote when the feature value is
    ``>=`` its threshold and subtracts it when the value is below; a vote
    ``>= 0`` predicts index 1, a negative vote predicts index 0.

    Args:
        filename: Passed to ``Log.log`` and ``Graph.draw``.
        train_sample: Handed to ``Load.loadSample`` for the training features.
        train_label: Handed to ``Load.loadLabel``; the loaded object must
            expose an ``'x'`` column holding the class labels.
        test_sample: Same as ``train_sample``, for the test features.
        test_label: Same as ``train_label``, for the test labels.
        title: Passed through to ``Graph.draw``.
        M: Number of boosting rounds (one classifier is added per round).
        thresh: Passed through to ``Calc.CART``.
        CART_step: Passed through to ``Calc.CART``.

    Returns:
        A list with one entry per round: the test accuracy of the ensemble
        after that round, rounded to three decimals.

    Raises:
        AssertionError: If the sample and label sizes reported by ``Load``
            differ for the training or the test data.
    """
    train_sample, train_sample_size = Load.loadSample(train_sample)
    train_label, train_label_size = Load.loadLabel(train_label)
    assert train_sample_size == train_label_size, 'train_sample_size does not match train_label_size'

    test_sample, test_sample_size = Load.loadSample(test_sample)
    test_label, test_label_size = Load.loadLabel(test_label)
    assert test_sample_size == test_label_size, 'test_sample_size does not match test_label_size'

    train_sample = Preprocess.normalize(train_sample,
                                        True).values.tolist()  # list
    test_sample = Preprocess.normalize(test_sample,
                                       True).values.tolist()  # list

    # Class label -> integer index, in the iteration order of the label set.
    label_to_index = {
        label: index
        for index, label in enumerate(set(train_label['x'].tolist()))
    }
    train_index = Preprocess.labelMap(train_label, label_to_index)  # list
    test_index = Preprocess.labelMap(test_label, label_to_index)  # list

    sample_size = len(train_sample)
    # Start from uniform sample weights.
    sample_weights = [1 / sample_size for _ in range(sample_size)]
    classifier_weights = []
    classifier_thresholds = []
    threshold_positions = []
    test_corrs = []
    test_times = list(range(1, M + 1))

    test_size = len(test_sample)
    # Running ensemble vote per test sample. Each round adds only the newest
    # classifier's contribution, so earlier classifiers are not re-evaluated;
    # the terms are accumulated in the same order as a full re-evaluation.
    votes = [0] * test_size

    for i in range(M):
        threshold, position, errors = Calc.CART(train_sample, train_index,
                                                sample_weights, thresh,
                                                CART_step)
        total_error = Calc.gentleError(np.array(sample_weights),
                                       np.array(errors))
        weight = round(Calc.classifierError(total_error), 3)
        classifier_weights.append(weight)
        classifier_thresholds.append(threshold)
        threshold_positions.append(position)
        sample_weights = Calc.updateVariableWeights(np.array(sample_weights),
                                                    total_error, errors)
        print('total_error: {}'.format(total_error))
        print('threshold_positions:   {}'.format(threshold_positions))
        print('classifier_thresholds: {}'.format(classifier_thresholds))
        print('classifier_weights:    {}'.format(classifier_weights))

        for j, sample in enumerate(test_sample):
            # Two explicit comparisons on purpose: a value that is neither
            # >= nor < the threshold (e.g. NaN) leaves the vote untouched.
            if sample[position] >= threshold:
                votes[j] += weight
            elif sample[position] < threshold:
                votes[j] -= weight

        test_corr = sum(
            1 for vote, index in zip(votes, test_index)
            if (vote >= 0 and index == 1) or (vote < 0 and index == 0))
        test_corrs.append(round(test_corr / test_size, 3))
        # NOTE(review): this logs the total round count M on every round, not
        # the current round (i + 1) -- confirm that is intended.
        Log.log(filename, 'M: {}; correction: {}\n'.format(M, test_corrs[-1]))
        print(
            '-----------------thresh: {}; CART_step: {}; iter: {}-----------------'
            .format(thresh, CART_step, i + 1))

    Graph.draw(filename, test_times, test_corrs, test_times[-1], 1.0, title)
    return test_corrs