コード例 #1
0
 def test_pairs(self):
     """
     Check that pairwise feature extraction is repeatable: two calls to
     get_pairwise_features with the same database, labels and pair seed
     must return element-wise equal outputs.
     """
     database = Database(
         'test_annotations_10000_cleaned.csv',
         header_path='test_annotations_10000_cleaned_header.csv')
     labels = fast_strong_cluster(database)
     pair_seed = generate_pair_seed(database, labels, 0.5)
     # NOTE: a former check (disabled) asserted that two _get_pairs(...,
     # balancing=True) draws differ from each other.
     first_x1, first_x2, first_m = get_pairwise_features(database, labels, pair_seed)
     second_x1, second_x2, second_m = get_pairwise_features(database, labels, pair_seed)
     comparisons = (
         (first_x1, second_x1),
         (first_x2, second_x2),
         (first_m, second_m),
     )
     for from_first_call, from_second_call in comparisons:
         np.testing.assert_array_equal(from_first_call, from_second_call)
コード例 #2
0
 def test(self, database_test, labels_test, pair_seed):
     """
     Get testing samples and test the surrogate match function. Evaluated with ROC curve
     :param database_test: RecordDatabase object
     :param labels_test: Corresponding labels, of dict form [record_id, label]
     :param pair_seed: List of pairs, where each pair is a tuple of form (identifierA, identifierB)
     :return RocCurve: An RocCurve object
     """
     y_test, x_test, _ = get_pairwise_features(database_test, labels_test,
                                               pair_seed)
     self.class_balance_validation = float(sum(y_test)) / len(y_test)
     x_bar_probability = self._classifier.predict_proba(x_test)[:, 1]
     #output = np.column_stack((x1_test, x1_bar_probability))
     #np.savetxt('roc_labels.csv', output, delimiter=",", header='label,probability', fmt='%.1i,%5.5f')
     roc = RocCurve(y_test, x_bar_probability)
     self.roc = roc
     sorted_indices = np.argsort(-1 * x_bar_probability)
     for sorted_index in sorted_indices:
         x1_bar = x_bar_probability[sorted_index]
         pair = pair_seed[sorted_index]
         print 'Test pair P(match) = ', x1_bar
         database_test.records[pair[0]].display(indent='     ')
         print '     ----'
         database_test.records[pair[1]].display(indent='     ')
     return roc
コード例 #3
0
    def test(self, database_test, labels_test, pair_seed):
        """
        Get testing samples and test the surrogate match function. Evaluated with ROC curve
        :param database_test: RecordDatabase object
        :param labels_test: Corresponding labels, of dict form [record_id, label]
        :param pair_seed: List of pairs, where each pair is a tuple of form (identifierA, identifierB)
        :return RocCurve: An RocCurve object
        """
        y_test, x_test, _ = get_pairwise_features(database_test, labels_test, pair_seed)
        self.class_balance_validation = float(sum(y_test))/len(y_test)
        # Save to text file
        # Run executable
        # Load csv file

        x_bar_probability = self._classifier.predict_proba(x_test)[:, 1]
        #output = np.column_stack((x1_test, x1_bar_probability))
        #np.savetxt('roc_labels.csv', output, delimiter=",", header='label,probability', fmt='%.1i,%5.5f')
        roc = RocCurve(y_test, x_bar_probability)
        self.roc = roc
        sorted_indices = np.argsort(-1*x_bar_probability)
        for sorted_index in sorted_indices:
            x1_bar = x_bar_probability[sorted_index]
            pair = pair_seed[sorted_index]
            print 'Test pair P(match) = ', x1_bar
            database_test.records[pair[0]].display(indent='     ')
            print '     ----'
            database_test.records[pair[1]].display(indent='     ')
        return roc
コード例 #4
0
 def _train(self, database_train, labels_train, pair_seed):
     """
     Gets training samples and trains the surrogate match function
     :param database_train: Training database
     :param labels_train: Dictionary of [identier, cluster label]
     :param pair_seed: List of pairs to use
     :return x2_mean: Mean feature values used in imputation (i.e. to fill in missing features)
     """
     x1_train, x2_train, x2_mean = get_pairwise_features(
         database_train, labels_train, pair_seed)
     bounds = list()
     for _ in range(x2_train.shape[1]):
         bounds.append((
             None,
             0))  # restrict all feature weights to be negative (convex set)
     bounds.append((None, None))  # no bounds for the constant offset term
     print 'Training Logistic Regression pairwise match function...'
     print '     Pos/Neg training sample class split: ', sum(
         x1_train), '/', len(x1_train) - sum(x1_train)
     print '     Enforcing weight vector bounds:'
     for counter, bound in enumerate(bounds):
         print '         Feature', counter, ': ', bounds
     print '         (last term is intercept)'
     self._logreg.bounds = bounds
     self._logreg.fit(x2_train, x1_train)
     print 'Model coefficients: ', self._logreg.coef_
     print 'Intercept: ', self._logreg.intercept_
     return x2_mean
コード例 #5
0
 def _train(self, database_train, labels_train, pair_seed):
     """
     Gets training samples and trains the surrogate match function
     :param database_train: Training database
     :param labels_train: Dictionary of [identier, cluster label]
     :param pair_seed: List of pairs to use
     :return x_mean: Mean feature values used in imputation (i.e. to fill in missing features)
     """
     y, x, x_mean = get_pairwise_features(database_train, labels_train, pair_seed)
     print 'Training decision tree pairwise match function...'
     print '     Pos/Neg training sample class split: ', sum(y), '/', len(y) - sum(y)
     self._classifier.fit(x, y)
     print 'Match function training complete.'
     return x_mean
コード例 #6
0
 def _train(self, database_train, labels_train, pair_seed):
     """
     Gets training samples and trains the surrogate match function
     :param database_train: Training database
     :param labels_train: Dictionary of [identier, cluster label]
     :param pair_seed: List of pairs to use
     :return x_mean: Mean feature values used in imputation (i.e. to fill in missing features)
     """
     y, x, x_mean = get_pairwise_features(database_train, labels_train, pair_seed)
     print 'Training decision tree pairwise match function...'
     print '     Pos/Neg training sample class split: ', sum(y), '/', len(y) - sum(y)
     self._classifier.fit(x, y)
     print 'Match function training complete.'
     return x_mean
コード例 #7
0
    def _train(self, database_train, labels_train, pair_seed):
        """
        Gets training samples and trains the surrogate match function
        :param database_train: Training database
        :param labels_train: Dictionary of [identier, cluster label]
        :param pair_seed: List of pairs to use
        :return x_mean: Mean feature values used in imputation (i.e. to fill in missing features)
        """
        y, x, _ = get_pairwise_features(database_train, labels_train, pair_seed, impute=False)
        print 'Training decision forest pairwise match function...'
        print '     Pos/Neg training sample class split: ', sum(y), '/', len(y) - sum(y)
        # Write features to text file
        np.savetxt('features.csv', x, delimiter=",", header=)
        # Run executable
        # save path to model as the classifier

        self._classifier.fit(x, y)
        print 'Match function training complete.'
コード例 #8
0
 def _train(self, database_train, labels_train, pair_seed):
     """
     Gets training samples and trains the surrogate match function
     :param database_train: Training database
     :param labels_train: Dictionary of [identier, cluster label]
     :param pair_seed: List of pairs to use
     :return x2_mean: Mean feature values used in imputation (i.e. to fill in missing features)
     """
     x1_train, x2_train, x2_mean = get_pairwise_features(database_train, labels_train, pair_seed)
     bounds = list()
     for _ in range(x2_train.shape[1]):
         bounds.append((None, 0))  # restrict all feature weights to be negative (convex set)
     bounds.append((None, None))  # no bounds for the constant offset term
     print 'Training Logistic Regression pairwise match function...'
     print '     Pos/Neg training sample class split: ', sum(x1_train), '/', len(x1_train) - sum(x1_train)
     print '     Enforcing weight vector bounds:'
     for counter, bound in enumerate(bounds):
         print '         Feature', counter, ': ', bounds
     print '         (last term is intercept)'
     self._logreg.bounds = bounds
     self._logreg.fit(x2_train, x1_train)
     print 'Model coefficients: ', self._logreg.coef_
     print 'Intercept: ', self._logreg.intercept_
     return x2_mean