Example #1
def evaluate(params, stream, study_size, metrics=['accuracy', 'kappa']):
    # params[0] is passed through untouched in the return value;
    # indices 1-4 hold the ARSLVQ hyperparameters.
    clf = ARSLVQ(gamma=params[1],
                 sigma=params[2],
                 prototypes_per_class=int(params[3]),
                 confidence=params[4])
    stream.prepare_for_use()
    evaluator = EvaluatePrequential(show_plot=False,
                                    batch_size=10,
                                    max_samples=study_size,
                                    metrics=metrics)

    model = evaluator.evaluate(stream=stream, model=clf)

    print(evaluator.get_mean_measurements())
    return list(params) + evaluator._data_buffer.get_data(
        metric_id="accuracy", data_id="mean")
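A hedged usage sketch for the helper above. ARSLVQ is defined in the surrounding research code, not in core scikit-multiflow, and the parameter-vector layout is an assumption read off the indexing; SEAGenerator merely stands in for whatever stream the original script supplied.

from skmultiflow.data import SEAGenerator

# Assumed layout: [id, gamma, sigma, prototypes_per_class, confidence]
params = [0, 0.9, 1.0, 2, 0.01]
row = evaluate(params, stream=SEAGenerator(), study_size=5000)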
Example #2
def test_pipeline(test_path):
    n_categories = 5

    # Load test data generated using:
    # RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
    #                     n_cat_features=n_categories, n_num_features=0)
    test_file = os.path.join(test_path, 'data-one-hot.npz')
    data = np.load(test_file)
    X = data['X']
    y = data['y']
    stream = DataStream(data=X, y=y.astype(int))  # np.int is removed in recent NumPy; plain int is equivalent

    # Setup transformer
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_categories * n_categories, n_categories)
                   ]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNADWINClassifier(n_neighbors=2,
                                    max_window_size=50,
                                    leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer),
                     ('KNNADWINClassifier', classifier)])
    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=False,
                                    pretrain_size=10,
                                    max_samples=100)
    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

    metrics = evaluator.get_mean_measurements()

    expected_accuracy = 0.5555555555555556
    assert np.isclose(expected_accuracy, metrics[0].accuracy_score())

    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_kappa, metrics[0].kappa_score())
    print(pipe.get_info())
    expected_info = "Pipeline: [OneHotToCategorical(categorical_list=[[0, 1, 2, 3, 4], " \
                    "[5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19], " \
                    "[20, 21, 22, 23, 24]]) KNNADWINClassifier(leaf_size=40, " \
                    "max_window_size=50, metric='euclidean', n_neighbors=2)]"
    info = " ".join([line.strip() for line in pipe.get_info().split()])
    assert info == expected_info
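The nested comprehension above simply partitions the 25 one-hot columns into five contiguous blocks of five, one block per original categorical attribute; evaluating it on its own confirms the grouping:

n_categories = 5
cat_att_idx = [[i + j for i in range(n_categories)]
               for j in range(0, n_categories * n_categories, n_categories)]
print(cat_att_idx)
# [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14],
#  [15, 16, 17, 18, 19], [20, 21, 22, 23, 24]]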
Example #3
# Same pipeline test as Example #2, but written against an older
# scikit-multiflow release: KNNAdwin (renamed KNNADWINClassifier in 0.5.0),
# the get_accuracy()/get_kappa() accessors, and an explicit
# stream.prepare_for_use().
def test_pipeline(test_path):
    n_categories = 5

    # Load test data generated using:
    # RandomTreeGenerator(tree_random_state=1, sample_random_state=1,
    #                     n_cat_features=n_categories, n_num_features=0)
    test_file = os.path.join(test_path, 'data-one-hot.npz')
    data = np.load(test_file)
    X = data['X']
    y = data['y']
    stream = DataStream(data=X, y=y)
    stream.prepare_for_use()

    # Setup transformer
    cat_att_idx = [[i + j for i in range(n_categories)]
                   for j in range(0, n_categories * n_categories, n_categories)
                   ]
    transformer = OneHotToCategorical(categorical_list=cat_att_idx)

    # Set up the classifier
    classifier = KNNAdwin(n_neighbors=2, max_window_size=50, leaf_size=40)
    # Setup the pipeline
    pipe = Pipeline([('one-hot', transformer), ('KNNAdwin', classifier)])
    # Setup the evaluator
    evaluator = EvaluatePrequential(show_plot=False,
                                    pretrain_size=10,
                                    max_samples=100)
    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)

    metrics = evaluator.get_mean_measurements()

    expected_accuracy = 0.5555555555555556
    assert np.isclose(expected_accuracy, metrics[0].get_accuracy())

    expected_kappa = 0.11111111111111116
    assert np.isclose(expected_kappa, metrics[0].get_kappa())
    print(pipe.get_info())
    expected_info = "Pipeline:\n" \
                    "[OneHotToCategorical(categorical_list=[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9],\n" \
                    "                                      [10, 11, 12, 13, 14],\n" \
                    "                                      [15, 16, 17, 18, 19],\n" \
                    "                                      [20, 21, 22, 23, 24]])\n" \
                    "KNNAdwin(leaf_size=40, max_window_size=50, n_neighbors=2,\n" \
                    "         nominal_attributes=None)]"
    assert pipe.get_info() == expected_info
Example #4
def train(name, clusters, window, normalize=False):
    input_csv = '{}{}_clusters={}_window={}_prepared.csv'.format(
        DATA_LOCATION, name, clusters, window)
    data = pd.read_csv(input_csv, index_col=0)

    if normalize:
        # Scale only the sensor columns, then re-attach the (unscaled)
        # state columns.
        states = data.filter(['current_state', 'next_state'])
        sensors = data.drop(columns=['current_state', 'next_state'])
        scaler = StandardScaler()
        data = pd.DataFrame(data=scaler.fit_transform(X=sensors),
                            index=data.index,
                            columns=sensors.columns)
        data = pd.concat([data, states], axis='columns')

    stream = DataStream(data)

    hf = HoeffdingTreeClassifier()
    sgd = SGDClassifier()

    evaluator = EvaluatePrequential()
    evaluator.evaluate(stream=stream, model=[hf, sgd])
    # print('---------------------------------------------')
    # measurements = evaluator.get_mean_measurements()[0]
    # print(measurements.confusion_matrix)
    # print(measurements.accuracy_score())
    rows = []  # renamed from `data` to avoid shadowing the DataFrame above
    for i, measurements in enumerate(evaluator.get_mean_measurements()):
        rows.append([
            name, clusters, window, MODEL_NAMES[i], normalize,
            measurements.accuracy_score(),
            measurements.precision_score(),
            measurements.recall_score(),
            measurements.f1_score()
        ])
    return pd.DataFrame(data=rows,
                        columns=[
                            'name', 'clusters', 'window', 'model',
                            'normalized', 'accuracy', 'precision', 'recall',
                            'f1'
                        ])
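train() reads two module-level constants from the surrounding script; the values below are hypothetical stand-ins, shown only so the call is runnable in isolation:

DATA_LOCATION = 'prepared/'             # hypothetical: directory of *_prepared.csv files
MODEL_NAMES = ['HoeffdingTree', 'SGD']  # one label per model passed to evaluate()

results = train('kitchen', clusters=8, window=60, normalize=True)
print(results)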
Example #5
        return X
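Only the tail of this example's MyKNNClassifier survives above (the class body was truncated). A minimal sketch of the wrapper it implies, assuming it extends skmultiflow's KNNClassifier with the two constructor flags used below; the actual standardization and vote-weighting logic is unknown:

from skmultiflow.lazy import KNNClassifier

class MyKNNClassifier(KNNClassifier):
    # Sketch only: stores the flags; the truncated original presumably
    # standardized inputs and weighted votes when these are enabled.
    def __init__(self, standardize=False, weighted_vote=False, **kwargs):
        super().__init__(**kwargs)
        self.standardize = standardize
        self.weighted_vote = weighted_vote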

# Read the stream
stream = FileStream("C:/Users/jeffr/OneDrive/Desktop/Data Stream/Assignment_One/dataset/data_n30000.csv")
stream.prepare_for_use()

#stream.next_sample(10)
#stream.n_remaining_samples()
#X, y = stream.next_sample(5000)

metrics = ['accuracy', 'kappa', 'kappa_m', 'kappa_t', 'running_time', 'model_size']
evaluator = EvaluatePrequential(max_samples=30000, n_wait=100, show_plot=True, metrics=metrics)

my_knn = MyKNNClassifier(standardize=True, weighted_vote=False)
evaluator.evaluate(stream=stream, model=[my_knn], model_names=['My_KNN'])
cm = evaluator.get_mean_measurements(0).confusion_matrix
print("Recall per class")
for i in range(cm.n_classes):
    recall = (cm.data[(i, i)] / cm.sum_col[i]
              if cm.sum_col[i] != 0 else 'Ill-defined')
    print("Class {}: {}".format(i, recall))

# All the methods we need to test
knn = KNNClassifier()
ht = HoeffdingTreeClassifier(leaf_prediction='mc')
htnb = HoeffdingTreeClassifier(leaf_prediction='nb')
nb = NaiveBayes()
hoef = HoeffdingTreeClassifier()

# Evaluate all methods together
evaluator.evaluate(stream=stream,
                   model=[knn, ht, htnb, nb, hoef],
                   model_names=['KNN', 'HTMC', 'HTNB', 'NB', 'HT'])
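When several models are evaluated in one run, get_mean_measurements() returns one measurement object per model, in the order the models were passed; a short sketch of reading each model's mean accuracy afterwards (the accuracy_score() accessor assumes scikit-multiflow >= 0.5):

for name, m in zip(['KNN', 'HTMC', 'HTNB', 'NB', 'HT'],
                   evaluator.get_mean_measurements()):
    print('{}: {:.4f}'.format(name, m.accuracy_score()))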
Example #6
def custom_evaluation(datastreams, clfs, stream_length, Prequential=False):
    # datastreams and clfs are (object, name) pairs: index 0 holds the
    # stream / classifier object, index 1 its label (used in paths and prints).
    eval_results = []
    eval_time = 0
    eval_acc = 0
    eval_kappa = 0
    eval_kappam = 0
    eval_kappat = 0
    ev = ['Holdout', 'Prequential']
    mod = clfs[0]
    resultpath = ""
    rdf = []

    stream = datastreams[0]
    stream.prepare_for_use()
    #print(stream.get_data_info())
    #print(datastream_names[index])

    if Prequential:
        resultpath = ("results/Prequential/" + ev[1] + "_" +
                      datastreams[1] + "_" + clfs[1] + ".csv")
        evaluator = EvaluatePrequential(max_samples=stream_length,
                                        metrics=[
                                            'accuracy', 'kappa', 'kappa_t',
                                            'kappa_m', 'running_time'
                                        ])
        eval_text = ev[1]
    else:
        resultpath = ("results/Holdout/" + ev[0] + "_" +
                      datastreams[1] + "_" + clfs[1] + ".csv")
        evaluator = EvaluateHoldout(max_samples=stream_length,
                                    metrics=[
                                        'accuracy', 'kappa', 'kappa_t',
                                        'kappa_m', 'running_time'
                                    ])
        eval_text = ev[0]

    print('')
    print(eval_text + ' evaluation for ' + datastreams[1] + ' stream:')
    try:
        evaluator.evaluate(stream=stream, model=mod)

        eval_results.append(evaluator.get_mean_measurements())
        eval_time = evaluator.running_time_measurements[0]._total_time

        for item in eval_results:
            eval_acc = item[0].get_accuracy()
            eval_kappa = item[0].get_kappa()
            eval_kappam = item[0].get_kappa_m()
            eval_kappat = item[0].get_kappa_t()
    except Exception as e:
        print(e)

    print('')
    print(eval_text + ' evaluation for ' + datastreams[1] + ' stream finished')

    print('')
    print('Results for the ' + eval_text + ' eval:')
    print('')
    print(clfs[1] + ' :')
    print('Accuracy: ' + str(round(eval_acc, 4)))
    print('Kappa: ' + str(round(eval_kappa, 4)))
    print('Kappa_m: ' + str(round(eval_kappam, 4)))
    print('Kappa_t: ' + str(round(eval_kappat, 4)))
    print('Total comp. time: ' + str(round(eval_time, 2)))

    try:
        # Create a dataframe with the results for the datastream and the
        # active clf, and save it as CSV.
        rdf_data = [[
            datastreams[1], clfs[1],
            str(round(eval_acc, 4)),
            str(round(eval_kappa, 4)),
            str(round(eval_kappam, 4)),
            str(round(eval_kappat, 4)),
            str(round(eval_time, 2))
        ]]
        rdf = pd.DataFrame(rdf_data,
                           columns=[
                               'Stream', 'Clf', 'Accuracy', 'Kappa', 'Kappa_m',
                               'Kappa_t', 'total comp. time'
                           ])
        rdf.to_csv(resultpath, index=False, header=True)
    except Exception as e:
        print(e)

    print('')
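A hedged invocation sketch. The (object, name) pair convention follows the indexing inside the function; SEAGenerator and HoeffdingTreeClassifier are stand-ins for whatever the surrounding script builds, the results/Prequential/ directory must already exist for the CSV write to succeed, and the internal prepare_for_use() call assumes a scikit-multiflow release that still provides it (deprecated in 0.5.0):

from skmultiflow.data import SEAGenerator
from skmultiflow.trees import HoeffdingTreeClassifier

custom_evaluation(datastreams=[SEAGenerator(), 'sea'],
                  clfs=[HoeffdingTreeClassifier(), 'HT'],
                  stream_length=10000,
                  Prequential=True)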