def test_pairs(self):
    """
    Check that pairwise feature extraction is repeatable: two calls to
    get_pairwise_features with the same database, labels and pair seed must
    produce element-wise identical outputs.
    """
    database = Database(
        'test_annotations_10000_cleaned.csv',
        header_path='test_annotations_10000_cleaned_header.csv',
    )
    labels = fast_strong_cluster(database)
    pair_seed = generate_pair_seed(database, labels, 0.5)

    # NOTE: an earlier variant of this test drew pairs twice through
    # _get_pairs(database, labels, 10, balancing=True) and asserted that the
    # two draws differed; those assertions are currently disabled.

    first_x1, first_x2, first_m = get_pairwise_features(database, labels, pair_seed)
    second_x1, second_x2, second_m = get_pairwise_features(database, labels, pair_seed)

    # Compare the three outputs in the same order as they are returned
    comparisons = (
        (first_x1, second_x1),
        (first_x2, second_x2),
        (first_m, second_m),
    )
    for first_run, second_run in comparisons:
        np.testing.assert_array_equal(first_run, second_run)
def test(self, database_test, labels_test, pair_seed): """ Get testing samples and test the surrogate match function. Evaluated with ROC curve :param database_test: RecordDatabase object :param labels_test: Corresponding labels, of dict form [record_id, label] :param pair_seed: List of pairs, where each pair is a tuple of form (identifierA, identifierB) :return RocCurve: An RocCurve object """ y_test, x_test, _ = get_pairwise_features(database_test, labels_test, pair_seed) self.class_balance_validation = float(sum(y_test)) / len(y_test) x_bar_probability = self._classifier.predict_proba(x_test)[:, 1] #output = np.column_stack((x1_test, x1_bar_probability)) #np.savetxt('roc_labels.csv', output, delimiter=",", header='label,probability', fmt='%.1i,%5.5f') roc = RocCurve(y_test, x_bar_probability) self.roc = roc sorted_indices = np.argsort(-1 * x_bar_probability) for sorted_index in sorted_indices: x1_bar = x_bar_probability[sorted_index] pair = pair_seed[sorted_index] print 'Test pair P(match) = ', x1_bar database_test.records[pair[0]].display(indent=' ') print ' ----' database_test.records[pair[1]].display(indent=' ') return roc
def test(self, database_test, labels_test, pair_seed): """ Get testing samples and test the surrogate match function. Evaluated with ROC curve :param database_test: RecordDatabase object :param labels_test: Corresponding labels, of dict form [record_id, label] :param pair_seed: List of pairs, where each pair is a tuple of form (identifierA, identifierB) :return RocCurve: An RocCurve object """ y_test, x_test, _ = get_pairwise_features(database_test, labels_test, pair_seed) self.class_balance_validation = float(sum(y_test))/len(y_test) # Save to text file # Run executable # Load csv file x_bar_probability = self._classifier.predict_proba(x_test)[:, 1] #output = np.column_stack((x1_test, x1_bar_probability)) #np.savetxt('roc_labels.csv', output, delimiter=",", header='label,probability', fmt='%.1i,%5.5f') roc = RocCurve(y_test, x_bar_probability) self.roc = roc sorted_indices = np.argsort(-1*x_bar_probability) for sorted_index in sorted_indices: x1_bar = x_bar_probability[sorted_index] pair = pair_seed[sorted_index] print 'Test pair P(match) = ', x1_bar database_test.records[pair[0]].display(indent=' ') print ' ----' database_test.records[pair[1]].display(indent=' ') return roc
def _train(self, database_train, labels_train, pair_seed): """ Gets training samples and trains the surrogate match function :param database_train: Training database :param labels_train: Dictionary of [identier, cluster label] :param pair_seed: List of pairs to use :return x2_mean: Mean feature values used in imputation (i.e. to fill in missing features) """ x1_train, x2_train, x2_mean = get_pairwise_features( database_train, labels_train, pair_seed) bounds = list() for _ in range(x2_train.shape[1]): bounds.append(( None, 0)) # restrict all feature weights to be negative (convex set) bounds.append((None, None)) # no bounds for the constant offset term print 'Training Logistic Regression pairwise match function...' print ' Pos/Neg training sample class split: ', sum( x1_train), '/', len(x1_train) - sum(x1_train) print ' Enforcing weight vector bounds:' for counter, bound in enumerate(bounds): print ' Feature', counter, ': ', bounds print ' (last term is intercept)' self._logreg.bounds = bounds self._logreg.fit(x2_train, x1_train) print 'Model coefficients: ', self._logreg.coef_ print 'Intercept: ', self._logreg.intercept_ return x2_mean
def _train(self, database_train, labels_train, pair_seed): """ Gets training samples and trains the surrogate match function :param database_train: Training database :param labels_train: Dictionary of [identier, cluster label] :param pair_seed: List of pairs to use :return x_mean: Mean feature values used in imputation (i.e. to fill in missing features) """ y, x, x_mean = get_pairwise_features(database_train, labels_train, pair_seed) print 'Training decision tree pairwise match function...' print ' Pos/Neg training sample class split: ', sum(y), '/', len(y) - sum(y) self._classifier.fit(x, y) print 'Match function training complete.' return x_mean
def _train(self, database_train, labels_train, pair_seed): """ Gets training samples and trains the surrogate match function :param database_train: Training database :param labels_train: Dictionary of [identier, cluster label] :param pair_seed: List of pairs to use :return x_mean: Mean feature values used in imputation (i.e. to fill in missing features) """ y, x, _ = get_pairwise_features(database_train, labels_train, pair_seed, impute=False) print 'Training decision forest pairwise match function...' print ' Pos/Neg training sample class split: ', sum(y), '/', len(y) - sum(y) # Write features to text file np.savetxt('features.csv', x, delimiter=",", header=) # Run executable # save path to model as the classifier self._classifier.fit(x, y) print 'Match function training complete.'
def _train(self, database_train, labels_train, pair_seed): """ Gets training samples and trains the surrogate match function :param database_train: Training database :param labels_train: Dictionary of [identier, cluster label] :param pair_seed: List of pairs to use :return x2_mean: Mean feature values used in imputation (i.e. to fill in missing features) """ x1_train, x2_train, x2_mean = get_pairwise_features(database_train, labels_train, pair_seed) bounds = list() for _ in range(x2_train.shape[1]): bounds.append((None, 0)) # restrict all feature weights to be negative (convex set) bounds.append((None, None)) # no bounds for the constant offset term print 'Training Logistic Regression pairwise match function...' print ' Pos/Neg training sample class split: ', sum(x1_train), '/', len(x1_train) - sum(x1_train) print ' Enforcing weight vector bounds:' for counter, bound in enumerate(bounds): print ' Feature', counter, ': ', bounds print ' (last term is intercept)' self._logreg.bounds = bounds self._logreg.fit(x2_train, x1_train) print 'Model coefficients: ', self._logreg.coef_ print 'Intercept: ', self._logreg.intercept_ return x2_mean