Exemple #1
0
# Label prediction accuracy setup
unlabeled_truth = np.insert(unlabeled_data, 0, unlabeled_labels, axis=1)
unlabeled_truth_set = set([tuple(x) for x in unlabeled_truth])

# Cotraining label prediction accuracy
avg_nn_pred_confidence.append(cotrain_model.avg_nn_pred_confidence)
avg_svm_pred_confidence.append(cotrain_model.avg_svm_pred_confidence)
cotrain_unlabeled_predictions = cotrain_model.get_unlabeled_predictions()
cotrain_unlabeled_predictions_set = set([tuple(x) for x in cotrain_unlabeled_predictions])
cotrain_correct_matches = np.array([x for x in cotrain_unlabeled_predictions_set & unlabeled_truth_set])
cotrain_label_acc = len(cotrain_correct_matches) / len(cotrain_unlabeled_predictions)
print('cotraining labeling accuracy: %s' % cotrain_label_acc)
print()

cotrain_unpredicted_data = cotrain_model.get_unpredicted_data()
kmeans = SemiSupervisedKMeans(num_clusters=2)
kmeans.initialize(training_data, training_labels)
kmeans.fit(unlabeled_data, 4500)

# Clustering prediction accuracy
cluster_unlabeled_predictions = kmeans.get_unlabeled_predictions()
cluster_unlabeled_predictions_set = set([tuple(x) for x in cluster_unlabeled_predictions])
cluster_correct_matches = np.array([x for x in cluster_unlabeled_predictions_set & unlabeled_truth_set])
cluster_label_acc = len(cluster_correct_matches) / len(cluster_unlabeled_predictions)
print('clustering labeling accuracy: %s' % cluster_label_acc)
print()

labeled_data = np.insert(training_data, 0, training_labels, axis=1)
remaining_unlabeled_predictions = kmeans.predict(cotrain_unpredicted_data, 4500)
complete_training_data = np.vstack([labeled_data, cotrain_unlabeled_predictions, remaining_unlabeled_predictions])
Exemple #2
0
f = open(labeled_path)
u = open(unlabeled_path)
df = pd.read_csv(f)
udf = pd.read_csv(u)

total_count = df.shape[0]
train_count = int(total_count)

training_data = df.values
training_labels = training_data[:, 0]
training_data = np.delete(training_data, 0, 1)
unlabeled_data = udf.values
unlabeled_labels = unlabeled_data[:, 0]
unlabeled_data = np.delete(unlabeled_data, 0, 1)

kmeans = SemiSupervisedKMeans(num_clusters=2)
kmeans.initialize(training_data, training_labels)
# kmeans.fit(unlabeled_data, 3490)
# kmeans.fit(unlabeled_data, 4500) #39
kmeans.fit(unlabeled_data, 4500)

# Label prediction accuracy setup
unlabeled_truth = np.insert(unlabeled_data, 0, unlabeled_labels, axis=1)
unlabeled_truth_set = set([tuple(x) for x in unlabeled_truth])

# Clustering prediction accuracy
cluster_unlabeled_predictions = kmeans.get_unlabeled_predictions()
cluster_unlabeled_predictions_set = set(
    [tuple(x) for x in cluster_unlabeled_predictions])
cluster_correct_matches = np.array(
    [x for x in cluster_unlabeled_predictions_set & unlabeled_truth_set])