Exemple #1
0
# Ground-truth rows as hashable tuples so predictions can be matched against
# them by set intersection.
unlabeled_truth_set = {tuple(row) for row in unlabeled_truth}

# --- Co-training: record model confidences, then score its labels ---
avg_nn_pred_confidence.append(cotrain_model.avg_nn_pred_confidence)
avg_svm_pred_confidence.append(cotrain_model.avg_svm_pred_confidence)
cotrain_unlabeled_predictions = cotrain_model.get_unlabeled_predictions()
cotrain_unlabeled_predictions_set = {
    tuple(row) for row in cotrain_unlabeled_predictions
}
# A prediction counts as correct only when the entire row equals a
# ground-truth row.
cotrain_correct_matches = np.array(
    list(cotrain_unlabeled_predictions_set & unlabeled_truth_set)
)
# NOTE(review): the numerator counts distinct matching rows (set
# intersection) while the denominator counts every predicted row,
# duplicates included.
cotrain_label_acc = (
    len(cotrain_correct_matches) / len(cotrain_unlabeled_predictions)
)
print('cotraining labeling accuracy: %s' % cotrain_label_acc)
print()

# Rows the co-training model left unlabeled; they are labeled further below
# by the clustering model.
cotrain_unpredicted_data = cotrain_model.get_unpredicted_data()

# --- Semi-supervised k-means: seed with the labeled data, fit the rest ---
# The meaning of the second argument (4500) is defined by
# SemiSupervisedKMeans, which is not visible here.
kmeans = SemiSupervisedKMeans(num_clusters=2)
kmeans.initialize(training_data, training_labels)
kmeans.fit(unlabeled_data, 4500)

# Clustering prediction accuracy, scored the same way as co-training above.
cluster_unlabeled_predictions = kmeans.get_unlabeled_predictions()
cluster_unlabeled_predictions_set = {
    tuple(row) for row in cluster_unlabeled_predictions
}
cluster_correct_matches = np.array(
    list(cluster_unlabeled_predictions_set & unlabeled_truth_set)
)
cluster_label_acc = (
    len(cluster_correct_matches) / len(cluster_unlabeled_predictions)
)
print('clustering labeling accuracy: %s' % cluster_label_acc)
print()

# --- Assemble one training set: labeled rows + both sets of predictions ---
# Put the known label in column 0 of each labeled row.
# Presumably the prediction arrays share this layout; verify against the
# model classes.
labeled_data = np.insert(
    training_data,
    0,
    training_labels,
    axis=1,
)
remaining_unlabeled_predictions = kmeans.predict(
    cotrain_unpredicted_data, 4500)
complete_training_data = np.vstack((
    labeled_data,
    cotrain_unlabeled_predictions,
    remaining_unlabeled_predictions,
))

# Remaining data labeling accuracy
remaining_unlabeled_predictions_set = {
    tuple(row) for row in remaining_unlabeled_predictions
}
Exemple #2
0
# Cross-validation splitter: 10 stratified folds, with shuffling enabled.
kfold = StratifiedKFold(shuffle=True, n_splits=10)

# Run counter; the loop below keeps going while it is less than total_runs.
sample = 0

while sample < total_runs:
    try:
        training_data = df.values
        training_labels = training_data[:, 0]
        training_data = np.delete(training_data, 0, 1)
        unlabeled_data = udf.values
        unlabeled_labels = unlabeled_data[:, 0]
        unlabeled_data = np.delete(unlabeled_data, 0, 1)

        kmeans = SemiSupervisedKMeans(num_clusters=2)
        kmeans.initialize(training_data, training_labels)
        kmeans.fit(unlabeled_data, 4)

        # Label prediction accuracy setup
        unlabeled_truth = np.insert(unlabeled_data,
                                    0,
                                    unlabeled_labels,
                                    axis=1)
        unlabeled_truth_set = set([tuple(x) for x in unlabeled_truth])

        # Clustering prediction accuracy
        cluster_unlabeled_predictions = kmeans.get_unlabeled_predictions()
        cluster_unlabeled_predictions_set = set(
            [tuple(x) for x in cluster_unlabeled_predictions])
        cluster_correct_matches = np.array([
            x for x in cluster_unlabeled_predictions_set & unlabeled_truth_set
        ])