def _fraction_in_truth(predicted_rows, truth_rows):
    """Return the fraction of ``predicted_rows`` that also occur in ``truth_rows``.

    Args:
        predicted_rows: list of hashable rows (tuples), one entry per
            prediction. Duplicates are kept so every prediction counts once.
        truth_rows: set of ground-truth rows used for membership tests.

    Returns:
        Number of matching rows divided by the total number of rows, or NaN
        when there are no predictions (the ratio is undefined in that case).
    """
    if not predicted_rows:
        return float("nan")
    hits = sum(row in truth_rows for row in predicted_rows)
    return hits / len(predicted_rows)


# Label prediction accuracy setup: prepend the label column (index 0) to the
# unlabeled feature matrix and keep every row as a hashable tuple.
unlabeled_truth = np.insert(unlabeled_data, 0, unlabeled_labels, axis=1)
unlabeled_truth_set = {tuple(row) for row in unlabeled_truth}

# Cotraining label prediction accuracy
avg_nn_pred_confidence.append(cotrain_model.avg_nn_pred_confidence)
avg_svm_pred_confidence.append(cotrain_model.avg_svm_pred_confidence)

# NOTE(review): assumes get_unlabeled_predictions() returns rows in the same
# label-first layout as unlabeled_truth -- confirm against the model class.
cotrain_unlabeled_predictions = cotrain_model.get_unlabeled_predictions()
cotrain_prediction_rows = [tuple(row) for row in cotrain_unlabeled_predictions]
cotrain_unlabeled_predictions_set = set(cotrain_prediction_rows)
# Distinct predicted rows that also appear in the ground truth.
cotrain_correct_matches = np.array(
    list(cotrain_unlabeled_predictions_set & unlabeled_truth_set))
# Count per prediction row (not per distinct row) so that repeated rows do
# not deflate the accuracy relative to the row-count denominator.
cotrain_label_acc = _fraction_in_truth(
    cotrain_prediction_rows, unlabeled_truth_set)
print('cotraining labeling accuracy: %s' % cotrain_label_acc)
print()

cotrain_unpredicted_data = cotrain_model.get_unpredicted_data()

# Fresh clustering model seeded with the labeled data, then fit on the
# unlabeled data.  The meaning of the second fit/predict argument (4500) is
# not visible here -- TODO confirm against SemiSupervisedKMeans.
kmeans = SemiSupervisedKMeans(num_clusters=2)
kmeans.initialize(training_data, training_labels)
kmeans.fit(unlabeled_data, 4500)

# Clustering prediction accuracy
cluster_unlabeled_predictions = kmeans.get_unlabeled_predictions()
cluster_prediction_rows = [tuple(row) for row in cluster_unlabeled_predictions]
cluster_unlabeled_predictions_set = set(cluster_prediction_rows)
cluster_correct_matches = np.array(
    list(cluster_unlabeled_predictions_set & unlabeled_truth_set))
cluster_label_acc = _fraction_in_truth(
    cluster_prediction_rows, unlabeled_truth_set)
print('clustering labeling accuracy: %s' % cluster_label_acc)
print()

# Final training set: the labeled rows, the cotraining predictions, and the
# k-means predictions for whatever cotraining left unpredicted.
labeled_data = np.insert(training_data, 0, training_labels, axis=1)
remaining_unlabeled_predictions = kmeans.predict(cotrain_unpredicted_data, 4500)
complete_training_data = np.vstack([
    labeled_data,
    cotrain_unlabeled_predictions,
    remaining_unlabeled_predictions,
])
# Load the labeled and unlabeled CSV files.  The context manager closes both
# handles as soon as pandas has finished reading them.
with open(labeled_path) as f, open(unlabeled_path) as u:
    df = pd.read_csv(f)
    udf = pd.read_csv(u)

total_count = df.shape[0]
train_count = int(total_count)

# Column 0 of each table is taken as the label; the remaining columns are
# passed on as the data matrix.
training_data = df.values
training_labels = training_data[:, 0]
training_data = np.delete(training_data, 0, 1)

unlabeled_data = udf.values
unlabeled_labels = unlabeled_data[:, 0]
unlabeled_data = np.delete(unlabeled_data, 0, 1)

# Seed the clustering model with the labeled data, then fit on the unlabeled
# data.  An earlier experiment used 3490 as the second fit argument; the
# meaning of that argument is not visible here -- TODO confirm against
# SemiSupervisedKMeans.
kmeans = SemiSupervisedKMeans(num_clusters=2)
kmeans.initialize(training_data, training_labels)
kmeans.fit(unlabeled_data, 4500)

# Label prediction accuracy setup: prepend the label column (index 0) to the
# unlabeled feature matrix and keep every row as a hashable tuple.
unlabeled_truth = np.insert(unlabeled_data, 0, unlabeled_labels, axis=1)
unlabeled_truth_set = {tuple(row) for row in unlabeled_truth}

# Clustering prediction accuracy
# NOTE(review): assumes get_unlabeled_predictions() returns rows in the same
# label-first layout as unlabeled_truth -- confirm against the model class.
cluster_unlabeled_predictions = kmeans.get_unlabeled_predictions()
cluster_unlabeled_predictions_set = {
    tuple(row) for row in cluster_unlabeled_predictions
}
# Distinct predicted rows that also appear in the ground truth.
cluster_correct_matches = np.array(
    list(cluster_unlabeled_predictions_set & unlabeled_truth_set))