def test_get_property_valid_df_name_2(self):
    """A candidate set read with ltable/rtable exposes key, foreign keys,
    and the parent tables through the catalog's get_property."""
    self.assertEqual(cm.get_catalog_len(), 0)
    left_table = read_csv_metadata(path_a)
    right_table = read_csv_metadata(path_b, key='ID')
    candset = read_csv_metadata(path_c, ltable=left_table, rtable=right_table)
    # Scalar properties recorded for the candidate set.
    self.assertEqual(cm.get_property(candset, 'key'), '_id')
    self.assertEqual(cm.get_property(candset, 'fk_ltable'), 'ltable_ID')
    self.assertEqual(cm.get_property(candset, 'fk_rtable'), 'rtable_ID')
    # The stored ltable/rtable properties are the very frames we passed in.
    self.assertEqual(cm.get_property(candset, 'ltable').equals(left_table), True)
    self.assertEqual(cm.get_property(candset, 'rtable').equals(right_table), True)
def test_set_property_valid_df_name_value(self):
    """set_property followed by get_property round-trips the stored value."""
    table = pd.read_csv(path_a)
    cm.set_property(table, 'key', 'ID')
    self.assertEqual(cm.get_property(table, 'key'), 'ID')
def test_get_property_df_notin_catalog(self):
    """get_property on a DataFrame that was never registered in the catalog.

    NOTE(review): no assertion follows the call, so this test presumably
    relies on an expected-exception decorator defined outside this method.
    """
    unregistered = pd.read_csv(path_a)
    cm.get_property(unregistered, 'key')
def test_get_property_invalid_path_1(self):
    """get_property called with None as the property name.

    NOTE(review): no assertion follows the call — an expected-exception
    decorator outside this view is presumably doing the checking.
    """
    table = read_csv_metadata(path_a)
    cm.get_property(table, None)
def test_get_property_invalid_df_1(self):
    """get_property called with a non-DataFrame (an int) as first argument;
    the expected failure is presumably asserted by a decorator outside
    this view."""
    cm.get_property(10, 'key')
def test_get_fk_rtable_valid(self):
    """get_fk_rtable mirrors the 'fk_rtable' catalog property."""
    left_table = read_csv_metadata(path_a)
    right_table = read_csv_metadata(path_b)
    candset = read_csv_metadata(path_c, ltable=left_table, rtable=right_table)
    # The convenience accessor and the raw property must agree ...
    via_property = cm.get_property(candset, 'fk_rtable')
    self.assertEqual(cm.get_fk_rtable(candset), via_property)
    # ... and both must be the conventional prefixed column name.
    self.assertEqual(cm.get_fk_rtable(candset), 'rtable_ID')
def test_get_property_valid_df_name_1(self):
    """Reading path_a with metadata records 'ID' as the table's key."""
    table = read_csv_metadata(path_a)
    self.assertEqual(cm.get_property(table, 'key'), 'ID')
def eval_matches(data_frame, gold_label_attr, predicted_label_attr):
    """
    Evaluates the matches from the matcher.

    Specifically, given a DataFrame containing golden labels and predicted
    labels, this function would evaluate the matches and return the accuracy
    results such as precision, recall and F1.

    Args:
        data_frame (DataFrame): The input pandas DataFrame containing "gold"
            labels and "predicted" labels.
        gold_label_attr (string): An attribute in the input DataFrame
            containing "gold" labels.
        predicted_label_attr (string): An attribute in the input DataFrame
            containing "predicted" labels.

    Returns:
        A Python dictionary containing the accuracy measures such as
        precision, recall, F1.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `gold_label_attr` is not of type string.
        AssertionError: If `predicted_label_attr` is not of type string.
        AssertionError: If the `gold_label_attr` is not in the input dataFrame.
        AssertionError: If the `predicted_label_attr` is not in the input
            dataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(G, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
    """
    # Validate input parameters

    # We expect the input object to be of type pandas DataFrame
    validate_object_type(data_frame, pd.DataFrame, 'The input table')

    # We expect the input attribute (gold_label_attr) to be of type string
    validate_object_type(gold_label_attr, six.string_types,
                         'The input gold_label_attr')

    # We expect the input attribute (predicted_label_attr) to be of type
    # string
    validate_object_type(predicted_label_attr, six.string_types,
                         'The input predicted_label_attr')

    # Check whether the gold label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, gold_label_attr):
        logger.error(
            'The gold_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The gold_label_attr is not present in the input DataFrame')

    # Check whether the predicted label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, predicted_label_attr):
        logger.error(
            'The predicted_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The predicted_label_attr is not present in the input DataFrame')

    # Reset the index to get the indices set as 0..len(table); the positional
    # indices computed below are later used with .iloc.
    new_data_frame = data_frame.reset_index(drop=False, inplace=False)

    # Project out the gold and predicted label attributes.
    gold = new_data_frame[gold_label_attr]
    predicted = new_data_frame[predicted_label_attr]

    # Get gold negatives, positives
    gold_negative = gold[gold == 0].index.values
    gold_positive = gold[gold == 1].index.values

    # Get predicted negatives, positives
    predicted_negative = predicted[predicted == 0].index.values
    predicted_positive = predicted[predicted == 1].index.values

    # get false positive indices
    false_positive_indices = \
        list(set(gold_negative).intersection(predicted_positive))

    # get true positive indices
    true_positive_indices = \
        list(set(gold_positive).intersection(predicted_positive))

    # get false negative indices
    false_negative_indices = \
        list(set(gold_positive).intersection(predicted_negative))

    # get true negative indices
    true_negative_indices = \
        list(set(gold_negative).intersection(predicted_negative))

    # Get the number of TP, FP, FN, TN
    num_true_positives = float(len(true_positive_indices))
    num_false_positives = float(len(false_positive_indices))
    num_false_negatives = float(len(false_negative_indices))
    num_true_negatives = float(len(true_negative_indices))

    # Precision = num_tp / (num_tp + num_fp)
    # Get precision numerator, denominator
    precision_numerator = num_true_positives
    precision_denominator = num_true_positives + num_false_positives

    # Recall = num_tp / (num_tp + num_fn)
    # Get recall numerator, denominator
    recall_numerator = num_true_positives
    recall_denominator = num_true_positives + num_false_negatives

    # Compute precision (0.0 when there are no predicted positives)
    if precision_denominator == 0.0:
        precision = 0.0
    else:
        precision = precision_numerator / precision_denominator

    # Compute recall (0.0 when there are no gold positives)
    if recall_denominator == 0.0:
        recall = 0.0
    else:
        recall = recall_numerator / recall_denominator

    # Compute F1 (harmonic mean; 0.0 when both precision and recall are 0)
    if precision == 0.0 and recall == 0.0:
        F1 = 0.0
    else:
        F1 = (2.0 * precision * recall) / (precision + recall)

    # Get the fk_ltable and fk_rtable from the catalog
    fk_ltable = cm.get_property(data_frame, 'fk_ltable')
    fk_rtable = cm.get_property(data_frame, 'fk_rtable')

    # Check if the fk_ltable contains any missing values
    if ch.does_contain_missing_vals(data_frame, fk_ltable):
        logger.error('The fk_ltable (%s) contains missing values' % fk_ltable)
        raise AssertionError('The fk_ltable (%s) contains missing values' %
                             fk_ltable)

    # Check if the fk_rtable contains any missing values
    if ch.does_contain_missing_vals(data_frame, fk_rtable):
        logger.error('The fk_rtable (%s) contains missing values' % fk_rtable)
        raise AssertionError('The fk_rtable (%s) contains missing values' %
                             fk_rtable)

    # Set the index values to fk_ltable and fk_rtable so the reported
    # false positive/negative lists are (ltable key, rtable key) pairs.
    new_data_frame.set_index([fk_ltable, fk_rtable], drop=False, inplace=True)

    # Get the list of false positives and false negatives.
    false_pos_ls = list(
        new_data_frame.iloc[false_positive_indices].index.values)
    false_neg_ls = list(
        new_data_frame.iloc[false_negative_indices].index.values)

    # Store and return the accuracy results.
    accuracy_results = collections.OrderedDict()
    accuracy_results['prec_numerator'] = precision_numerator
    accuracy_results['prec_denominator'] = precision_denominator
    accuracy_results['precision'] = precision
    accuracy_results['recall_numerator'] = recall_numerator
    accuracy_results['recall_denominator'] = recall_denominator
    accuracy_results['recall'] = recall
    accuracy_results['f1'] = F1
    accuracy_results['pred_pos_num'] = num_true_positives + num_false_positives
    accuracy_results['false_pos_num'] = num_false_positives
    accuracy_results['false_pos_ls'] = false_pos_ls
    accuracy_results['pred_neg_num'] = num_false_negatives + num_true_negatives
    accuracy_results['false_neg_num'] = num_false_negatives
    accuracy_results['false_neg_ls'] = false_neg_ls
    return accuracy_results
def test_valid_path_wi_metadata_unknownprop(self):
    """A file whose metadata holds an unknown property ('key1') is still
    loaded, registered in the catalog, and the property is retrievable."""
    cm.del_catalog()
    metadata_path = os.sep.join([io_datasets_path, 'InvalidMetadata1.csv'])
    table = read_csv_metadata(metadata_path)
    self.assertEqual(cm.is_dfinfo_present(table), True)
    self.assertEqual(cm.get_property(table, 'key1'), 'ID')
def test_valid_path_wi_metadata_unknownprop(self):
    """Unknown metadata property ('key1') survives loading and is queryable.

    NOTE(review): this appears to be an exact duplicate of an earlier test
    with the same name — if both live in one class, the later definition
    shadows the earlier; verify whether one should be removed or renamed.
    """
    cm.del_catalog()
    csv_path = os.sep.join([io_datasets_path, 'InvalidMetadata1.csv'])
    loaded = read_csv_metadata(csv_path)
    self.assertEqual(cm.is_dfinfo_present(loaded), True)
    self.assertEqual(cm.get_property(loaded, 'key1'), 'ID')