Python check_attrs_present Examples, magellan.utils.catalog_helper.check_attrs_present Python Examples

Example #1

0

Show file

File: mlmatcherselection.py Project: paulgc/magellan

def get_xy_data_ex(table, exclude_attrs, target_attr):
    """
    Split a table into a feature array and a target array.

    Every column of the table that is neither listed in exclude_attrs nor
    equal to target_attr is projected into the feature array.

    Args:
        table: pandas DataFrame holding the feature and target columns.
        exclude_attrs: attribute name, or list of attribute names, to leave
            out of the feature array. A non-list value is wrapped in a list.
        target_attr: name of the column returned as the target array.

    Returns:
        A tuple (x, y) where x is the `.values` of the projected columns and
        y is the raveled `.values` of the target column.

    Raises:
        AssertionError: if table is not a DataFrame, or if check_attrs_present
            reports that exclude_attrs or target_attr is missing from table.
    """
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type dataframe')
        # The exception must carry the message text; logger.error() returns
        # None and so cannot be used as the exception argument.
        raise AssertionError('Input table is not of type dataframe')

    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]
    if not check_attrs_present(table, exclude_attrs):
        logger.error('The attributes mentioned in exclude_attrs is not present '
                     'in the input table')
        raise AssertionError('The attributes mentioned in exclude_attrs is not present '
                             'in the input table')
    if not check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError('The target_attr is not present in the input table')

    # The target column must never leak into the feature array
    exclude_attrs = list_drop_duplicates(exclude_attrs)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)
    attrs_to_project = list_diff(list(table.columns), exclude_attrs)

    x = table[attrs_to_project].values
    y = table[target_attr].values
    y = y.ravel()  # to mute warnings from svm and cross validation
    return x, y

Example #2

0

Show file

File: mlmatcher.py Project: paulgc/magellan

    def fit_ex_attrs(self, table, exclude_attrs, target_attr):
        """
        Hand the non-excluded columns of table to self.fit_sklearn.

        Args:
            table: pandas DataFrame with the feature and target columns.
            exclude_attrs: attribute name, or list of attribute names, that
                must not be used as features.
            target_attr: name of the column used as the target.

        Raises:
            AssertionError: if table is not a DataFrame, or if
                check_attrs_present reports that exclude_attrs or target_attr
                is missing from table.
        """
        if not isinstance(table, pd.DataFrame):
            msg = 'Input table is not of type dataframe'
            logger.error(msg)
            raise AssertionError(msg)

        # A single attribute name is treated as a one-element list
        excluded = exclude_attrs if isinstance(exclude_attrs, list) else [exclude_attrs]

        if not check_attrs_present(table, excluded):
            msg = ('The attributes mentioned in exclude_attrs is not present '
                   'in the input table')
            logger.error(msg)
            raise AssertionError(msg)

        if not check_attrs_present(table, target_attr):
            msg = 'The target_attr is not present in the input table'
            logger.error(msg)
            raise AssertionError(msg)

        # The target column is always kept out of the feature columns
        excluded = list_drop_duplicates(excluded)
        if target_attr not in excluded:
            excluded.append(target_attr)

        feature_cols = list_diff(list(table.columns), excluded)
        self.fit_sklearn(table[feature_cols], table[target_attr],
                         check_rem=False)

Example #3

0

Show file

    def fit_ex_attrs(self, table, exclude_attrs, target_attr):
        """
        Hand the non-excluded columns of table to self.fit_sklearn.

        Args:
            table: pandas DataFrame with the feature and target columns.
            exclude_attrs: attribute name, or list of attribute names, that
                must not be used as features.
            target_attr: name of the column used as the target.

        Raises:
            AssertionError: if table is not a DataFrame, or if
                check_attrs_present reports that exclude_attrs or target_attr
                is missing from table.
        """
        if not isinstance(table, pd.DataFrame):
            msg = 'Input table is not of type dataframe'
            logger.error(msg)
            raise AssertionError(msg)

        # A single attribute name is treated as a one-element list
        excluded = exclude_attrs if isinstance(exclude_attrs, list) else [exclude_attrs]

        if not check_attrs_present(table, excluded):
            msg = ('The attributes mentioned in exclude_attrs is not present '
                   'in the input table')
            logger.error(msg)
            raise AssertionError(msg)

        if not check_attrs_present(table, target_attr):
            msg = 'The target_attr is not present in the input table'
            logger.error(msg)
            raise AssertionError(msg)

        # The target column is always kept out of the feature columns
        excluded = list_drop_duplicates(excluded)
        if target_attr not in excluded:
            excluded.append(target_attr)

        feature_cols = list_diff(list(table.columns), excluded)
        self.fit_sklearn(table[feature_cols], table[target_attr],
                         check_rem=False)

Example #4

0

Show file

def get_xy_data_ex(table, exclude_attrs, target_attr):
    """
    Split a table into a feature array and a target array.

    Every column of the table that is neither listed in exclude_attrs nor
    equal to target_attr is projected into the feature array.

    Args:
        table: pandas DataFrame holding the feature and target columns.
        exclude_attrs: attribute name, or list of attribute names, to leave
            out of the feature array. A non-list value is wrapped in a list.
        target_attr: name of the column returned as the target array.

    Returns:
        A tuple (x, y) where x is the `.values` of the projected columns and
        y is the raveled `.values` of the target column.

    Raises:
        AssertionError: if table is not a DataFrame, or if check_attrs_present
            reports that exclude_attrs or target_attr is missing from table.
    """
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type dataframe')
        # The exception must carry the message text; logger.error() returns
        # None and so cannot be used as the exception argument.
        raise AssertionError('Input table is not of type dataframe')

    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]
    if not check_attrs_present(table, exclude_attrs):
        logger.error('The attributes mentioned in exclude_attrs is not present '
                     'in the input table')
        raise AssertionError('The attributes mentioned in exclude_attrs is not present '
                             'in the input table')
    if not check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError(
            'The target_attr is not present in the input table')

    # The target column must never leak into the feature array
    exclude_attrs = list_drop_duplicates(exclude_attrs)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)
    attrs_to_project = list_diff(list(table.columns), exclude_attrs)

    x = table[attrs_to_project].values
    y = table[target_attr].values
    y = y.ravel()  # to mute warnings from svm and cross validation
    return x, y

Example #5

0

Show file

File: debug_gui_decisiontree_matcher.py Project: paulgc/magellan

def _vis_debug_dt(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Fit a decision tree matcher on train, predict on test, evaluate the
    predictions and build a MainWindowManager over the results.

    Args:
        matcher: must be a DTMatcher instance.
        train: table passed to matcher.fit.
        test: table passed to matcher.predict.
        exclude_attrs: attribute name, or list of names; must be present in
            both train and test.
        target_attr: string naming the label column; must be present in train.
        show_window: when it compares equal to True, the window is shown and
            app.exec_() is called.

    Raises:
        AssertionError: if matcher is not a DTMatcher, target_attr is not a
            string, or one of the attribute presence checks fails.
    """
    if not isinstance(matcher, DTMatcher):
        msg = 'Input matcher is not of type Decision Tree matcher'
        logger.error(msg)
        raise AssertionError(msg)

    if not isinstance(target_attr, six.string_types):
        msg = 'Target attribute is not of type string'
        logger.error(msg)
        raise AssertionError(msg)

    # The logged text and the exception text differ slightly in the checks
    # below, so both literals are spelled out.
    if not check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError(
            'The exclude attrs are not in the train table columns')

    if not check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError(
            'The target attr is not in the train table columns')

    if not check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError(
            'The exclude attrs are not in the test table columns')

    # Normalise to a duplicate-free list that also contains the target column
    excluded = exclude_attrs if isinstance(exclude_attrs, list) else [exclude_attrs]
    excluded = list_drop_duplicates(excluded)
    if target_attr not in excluded:
        excluded.append(target_attr)

    # fit using training data
    matcher.fit(table=train, exclude_attrs=excluded, target_attr=target_attr)

    # predict using the test data, writing predictions to a fresh column name
    predict_col = get_name_for_predict_column(test.columns)
    predicted = matcher.predict(table=test,
                                exclude_attrs=excluded,
                                target_attr=predict_col,
                                append=True,
                                inplace=False)

    summary = mg.eval_matches(predicted, target_attr, predict_col)
    metric = get_metric(summary)
    fp_dataframe = get_dataframe(predicted, summary['false_pos_ls'])
    fn_dataframe = get_dataframe(predicted, summary['false_neg_ls'])

    app = mg._viewapp
    window = MainWindowManager(matcher, "dt", excluded, metric, predicted,
                               fp_dataframe, fn_dataframe)
    if not (show_window == True):
        return
    window.show()
    app.exec_()

Example #6

0

Show file

File: catalog_manager.py Project: paulgc/magellan

def validate_metadata_for_table(table, key, out_str, lgr, verbose):
    """
    Check that key is a valid key attribute of table.

    Args:
        table: pandas DataFrame to validate.
        key: name of the candidate key column.
        out_str: label for the table, used only in log and error messages.
        lgr: logger object forwarded to ch.log_info.
        verbose: flag forwarded to ch.log_info and ch.is_key_attribute.

    Returns:
        True when every check passes.

    Raises:
        AssertionError: if table is not a DataFrame, key is not a string,
            ch.check_attrs_present does not return True, or
            ch.is_key_attribute does not report True.
        KeyError: if key is not among the columns of table.
    """
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    if key not in table.columns:
        logger.error('Input key ( %s ) not in the dataframe' % key)
        raise KeyError('Input key ( %s ) not in the dataframe' % key)

    ch.log_info(lgr, 'Validating ' + out_str + ' key: ' + str(key), verbose)

    # Explicit raises rather than assert statements, so that the validation
    # still runs when Python is started with -O.
    if not isinstance(key, six.string_types):
        raise AssertionError('Key attribute must be a string.')
    if ch.check_attrs_present(table, key) is not True:
        raise AssertionError(
            'Key attribute is not present in the ' + out_str + ' table')
    if not (ch.is_key_attribute(table, key, verbose) == True):
        raise AssertionError(
            'Attribute ' + str(key) + ' in the ' + out_str + ' table '
            'does not qualify to be the key')

    ch.log_info(lgr, '..... Done', verbose)
    return True

Example #7

0

Show file

File: mlmatcher.py Project: paulgc/magellan

    def predict_ex_attrs(self, table, exclude_attrs):
        """
        Run self.predict_sklearn on the non-excluded columns of table.

        Args:
            table: pandas DataFrame to predict on.
            exclude_attrs: attribute name, or list of attribute names, that
                must not be used as features.

        Returns:
            The value returned by self.predict_sklearn for the projected
            columns.

        Raises:
            AssertionError: if table is not a DataFrame, or if
                check_attrs_present reports that exclude_attrs is missing
                from table.
        """
        if not isinstance(table, pd.DataFrame):
            msg = 'Input table is not of type dataframe'
            logger.error(msg)
            raise AssertionError(msg)

        # A single attribute name is treated as a one-element list
        excluded = exclude_attrs if isinstance(exclude_attrs, list) else [exclude_attrs]

        if not check_attrs_present(table, excluded):
            msg = ('The attributes mentioned in exclude_attrs is not present '
                   'in the input table')
            logger.error(msg)
            raise AssertionError(msg)

        feature_cols = list_diff(list(table.columns), excluded)
        return self.predict_sklearn(table[feature_cols], check_rem=False)

Example #8

0

Show file

    def predict_ex_attrs(self, table, exclude_attrs):
        """
        Run self.predict_sklearn on the non-excluded columns of table.

        Args:
            table: pandas DataFrame to predict on.
            exclude_attrs: attribute name, or list of attribute names, that
                must not be used as features.

        Returns:
            The value returned by self.predict_sklearn for the projected
            columns.

        Raises:
            AssertionError: if table is not a DataFrame, or if
                check_attrs_present reports that exclude_attrs is missing
                from table.
        """
        if not isinstance(table, pd.DataFrame):
            msg = 'Input table is not of type dataframe'
            logger.error(msg)
            raise AssertionError(msg)

        # A single attribute name is treated as a one-element list
        excluded = exclude_attrs if isinstance(exclude_attrs, list) else [exclude_attrs]

        if not check_attrs_present(table, excluded):
            msg = ('The attributes mentioned in exclude_attrs is not present '
                   'in the input table')
            logger.error(msg)
            raise AssertionError(msg)

        feature_cols = list_diff(list(table.columns), excluded)
        return self.predict_sklearn(table[feature_cols], check_rem=False)

Example #9

0

Show file

def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function

    Args:
        table: expected to be a pandas DataFrame (a candidate set whose
            metadata is read through cm.get_metadata_for_candset).
        label_column_name: string naming the label column; it must not
            already be present in table.
        verbose: flag forwarded to the logging and metadata helpers.

    Returns:
        True if every validation step succeeds.

    Raises:
        AssertionError: if table is not a DataFrame, label_column_name is not
            a string, or the label column is already present in table.
            Exceptions raised by the cm metadata helpers propagate unchanged.
    """
    # Validate the input parameters

    # # The input table table is expected to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type data frame')
        raise AssertionError('Input object is not of type data frame')

    # # The label column name is expected to be of type string
    if not isinstance(label_column_name, six.string_types):
        logger.error('Input attr. is not of type string')
        raise AssertionError('Input attr. is not of type string')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error(
            'The label column name (%s) is already present in the '
            'input table', label_column_name)
        # Unlike logger.error, an exception does not interpolate extra
        # arguments, so the message is formatted explicitly.
        raise AssertionError(
            'The label column name (%s) is already present '
            'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
        'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm.validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable,
                                     rtable, l_key, r_key, logger, verbose)

    # Return True if everything was successful
    return True

Example #10

0

Show file

File: labeler.py Project: paulgc/magellan

def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function

    Args:
        table: expected to be a pandas DataFrame (a candidate set whose
            metadata is read through cm.get_metadata_for_candset).
        label_column_name: string naming the label column; it must not
            already be present in table.
        verbose: flag forwarded to the logging and metadata helpers.

    Returns:
        True if every validation step succeeds.

    Raises:
        AssertionError: if table is not a DataFrame, label_column_name is not
            a string, or the label column is already present in table.
            Exceptions raised by the cm metadata helpers propagate unchanged.
    """
    # Validate the input parameters

    # # The input table table is expected to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type data frame')
        raise AssertionError('Input object is not of type data frame')

    # # The label column name is expected to be of type string
    if not isinstance(label_column_name, six.string_types):
        logger.error('Input attr. is not of type string')
        raise AssertionError('Input attr. is not of type string')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error('The label column name (%s) is already present in the '
                     'input table', label_column_name)
        # Unlike logger.error, an exception does not interpolate extra
        # arguments, so the message is formatted explicitly.
        raise AssertionError('The label column name (%s) is already present '
                             'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm.validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key,
                                     logger, verbose)

    # Return True if everything was successful
    return True

Example #11

0

Show file

def validate_metadata_for_table(table, key, out_str, lgr, verbose):
    """
    Check that key is a valid key attribute of table.

    Args:
        table: pandas DataFrame to validate.
        key: name of the candidate key column.
        out_str: label for the table, used only in log and error messages.
        lgr: logger object forwarded to ch.log_info.
        verbose: flag forwarded to ch.log_info and ch.is_key_attribute.

    Returns:
        True when every check passes.

    Raises:
        AssertionError: if table is not a DataFrame, key is not a string,
            ch.check_attrs_present does not return True, or
            ch.is_key_attribute does not report True.
        KeyError: if key is not among the columns of table.
    """
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    if key not in table.columns:
        logger.error('Input key ( %s ) not in the dataframe' % key)
        raise KeyError('Input key ( %s ) not in the dataframe' % key)

    ch.log_info(lgr, 'Validating ' + out_str + ' key: ' + str(key), verbose)

    # Explicit raises rather than assert statements, so that the validation
    # still runs when Python is started with -O.
    if not isinstance(key, six.string_types):
        raise AssertionError('Key attribute must be a string.')
    if ch.check_attrs_present(table, key) is not True:
        raise AssertionError(
            'Key attribute is not present in the ' + out_str + ' table')
    if not (ch.is_key_attribute(table, key, verbose) == True):
        raise AssertionError(
            'Attribute ' + str(key) + ' in the ' + out_str + ' table '
            'does not qualify to be the key')

    ch.log_info(lgr, '..... Done', verbose)
    return True

Example #12

0

Show file

File: extractfeatures.py Project: paulgc/magellan

def extract_feature_vecs(candset,
                         attrs_before=None,
                         feature_table=None,
                         attrs_after=None,
                         verbose=True):
    """
    Build a table of feature values for every tuple pair in candset.

    For each row of candset, the matching ltable and rtable rows are looked
    up by their keys and passed, with feature_table, to apply_feat_fns.

    Args:
        candset: pandas DataFrame whose catalog metadata is read through
            cm.get_metadata_for_candset.
        attrs_before: attribute name, or list of names, from candset to
            insert between the key columns and the feature columns.
        feature_table: table with a 'feature_name' column; must not be None.
        attrs_after: attribute name, or list of names, from candset to
            append after the feature columns.
        verbose: flag forwarded to the logging and metadata helpers.

    Returns:
        A pandas DataFrame whose columns are the candset key, fk_ltable,
        fk_rtable, attrs_before, the feature columns in feature_table order,
        then attrs_after. The candset properties are copied to it.

    Raises:
        AssertionError: if candset is not a DataFrame, attrs_before or
            attrs_after is not present in candset, or feature_table is None.
    """
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # validate input parameters
    if attrs_before is not None:
        if not check_attrs_present(candset, attrs_before):
            logger.error('The attributes mentioned in attrs_before is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in attrs_before is not present '
                                 'in the input table')

    if attrs_after is not None:
        if not check_attrs_present(candset, attrs_after):
            logger.error('The attributes mentioned in attrs_after is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in attrs_after is not present '
                                 'in the input table')

    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    log_info(
        logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
        'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key, logger,
                                     verbose)

    # extract features
    id_list = [(r[fk_ltable], r[fk_rtable]) for _, r in candset.iterrows()]

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # apply feature functions; rows are looked up by key label with .loc
    # # because the .ix indexer no longer exists in current pandas
    feat_vals = [
        apply_feat_fns(l_df.loc[x[0]], r_df.loc[x[1]], feature_table)
        for x in id_list
    ]

    # construct output table
    table = pd.DataFrame(feat_vals)

    # # rearrange the feature names in the given order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # # insert attrs_before; reversing first means repeated insertion at
    # # position 0 leaves the attributes in their listed order
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            table.insert(0, a, candset[a])

    # # insert keys
    table.insert(0, fk_rtable, candset[fk_rtable])
    table.insert(0, fk_ltable, candset[fk_ltable])
    table.insert(0, key, candset[key])

    # # insert attrs after
    # NOTE(review): the list is reversed and then appended left to right, so
    # attrs_after ends up in reverse of the listed order - confirm intended.
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(table.columns)
        for a in attrs_after:
            table.insert(col_pos, a, candset[a])
            col_pos += 1

    # reset the index
    table.reset_index(inplace=True, drop=True)

    # # update the catalog
    cm.init_properties(table)
    cm.copy_properties(candset, table)

    return table

Example #13

0

Show file

File: extractfeatures.py Project: paulgc/magellan

def extract_feature_vecs(candset, attrs_before=None, feature_table=None, attrs_after=None, verbose=True):
    """
    Build a table of feature values for every tuple pair in candset.

    For each row of candset, the matching ltable and rtable rows are looked
    up by their keys and passed, with feature_table, to apply_feat_fns.

    Args:
        candset: pandas DataFrame whose catalog metadata is read through
            cm.get_metadata_for_candset.
        attrs_before: attribute name, or list of names, from candset to
            insert between the key columns and the feature columns.
        feature_table: table with a 'feature_name' column; must not be None.
        attrs_after: attribute name, or list of names, from candset to
            append after the feature columns.
        verbose: flag forwarded to the logging and metadata helpers.

    Returns:
        A pandas DataFrame whose columns are the candset key, fk_ltable,
        fk_rtable, attrs_before, the feature columns in feature_table order,
        then attrs_after. The candset properties are copied to it.

    Raises:
        AssertionError: if candset is not a DataFrame, attrs_before or
            attrs_after is not present in candset, or feature_table is None.
    """
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # validate input parameters
    if attrs_before is not None:
        if not check_attrs_present(candset, attrs_before):
            logger.error('The attributes mentioned in attrs_before is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in attrs_before is not present '
                                 'in the input table')

    if attrs_after is not None:
        if not check_attrs_present(candset, attrs_after):
            logger.error('The attributes mentioned in attrs_after is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in attrs_after is not present '
                                 'in the input table')

    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
                     'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key,
                                     logger, verbose)

    # extract features
    id_list = [(r[fk_ltable], r[fk_rtable]) for _, r in candset.iterrows()]

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # apply feature functions; rows are looked up by key label with .loc
    # # because the .ix indexer no longer exists in current pandas
    feat_vals = [apply_feat_fns(l_df.loc[x[0]], r_df.loc[x[1]], feature_table) for x in id_list]

    # construct output table
    table = pd.DataFrame(feat_vals)

    # # rearrange the feature names in the given order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # # insert attrs_before; reversing first means repeated insertion at
    # # position 0 leaves the attributes in their listed order
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            table.insert(0, a, candset[a])

    # # insert keys
    table.insert(0, fk_rtable, candset[fk_rtable])
    table.insert(0, fk_ltable, candset[fk_ltable])
    table.insert(0, key, candset[key])

    # # insert attrs after
    # NOTE(review): the list is reversed and then appended left to right, so
    # attrs_after ends up in reverse of the listed order - confirm intended.
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(table.columns)
        for a in attrs_after:
            table.insert(col_pos, a, candset[a])
            col_pos += 1

    # reset the index
    table.reset_index(inplace=True, drop=True)

    # # update the catalog
    cm.init_properties(table)
    cm.copy_properties(candset, table)

    return table

Example #14

0

Show file

 def test_check_attrs_present_valid_1(self):
     """check_attrs_present is expected to return True for 'ID' on table A."""
     table_a = pd.read_csv(path_a)
     self.assertEqual(ch.check_attrs_present(table_a, 'ID'), True)

Example #15

0

Show file

def eval_matches(X, gold_label_attr, predicted_label_attr):
    """
    Compare predicted labels against gold labels and summarise the result.

    Labels equal to 1 are treated as matches and labels equal to 0 as
    non-matches.

    Args:
        X: pandas DataFrame holding both label columns; its 'fk_ltable' and
            'fk_rtable' properties are read through cm.get_property.
        gold_label_attr: string naming the column with the gold labels.
        predicted_label_attr: string naming the column with the predictions.

    Returns:
        An OrderedDict with the precision, recall and f1 values, their
        numerators and denominators, the predicted positive/negative counts,
        the false positive/negative counts, and 'false_pos_ls' /
        'false_neg_ls': lists of (fk_ltable, fk_rtable) index values of the
        false positive and false negative rows.

    Raises:
        AssertionError: if X is not a DataFrame, either attribute name is not
            a string, or either attribute is not present in X.
    """
    if not isinstance(X, pd.DataFrame):
        logger.error('The input table is not of type dataframe')
        raise AssertionError('The input is not of type dataframe')

    if not isinstance(gold_label_attr, six.string_types):
        logger.error('The input gold_label_attr is not of type string')
        raise AssertionError('The input gold_label_attr is not of type string')

    if not isinstance(predicted_label_attr, six.string_types):
        logger.error('The input predicted_label_attr is not of type string')
        raise AssertionError(
            'The input predicted_label_attr is not of type string')

    if not check_attrs_present(X, gold_label_attr):
        logger.error('The gold_label_attr is not present in the input table')
        raise AssertionError(
            'The gold_label_attr is not present in the input table')

    if not check_attrs_present(X, predicted_label_attr):
        logger.error(
            'The predicted_label_attr is not present in the input table')
        raise AssertionError(
            'The predicted_label_attr is not present in the input table')

    # After reset_index the row labels are 0..n-1, so the index values
    # collected below double as row positions.
    Y = X.reset_index(drop=False, inplace=False)
    g = Y[gold_label_attr]
    p = Y[predicted_label_attr]

    # get false label (0) indices
    gf = g[g == 0].index.values
    pf = p[p == 0].index.values

    # get true label (1) indices
    gt = g[g == 1].index.values
    pt = p[p == 1].index.values

    # get false positive indices
    fp_indices = list(set(gf).intersection(pt))

    # get true positive indices
    tp_indices = list(set(gt).intersection(pt))

    # get false negative indices
    fn_indices = list(set(gt).intersection(pf))

    # get true negative indices
    tn_indices = list(set(gf).intersection(pf))

    n_tp = float(len(tp_indices))
    n_fp = float(len(fp_indices))
    n_fn = float(len(fn_indices))
    n_tn = float(len(tn_indices))
    prec_num = n_tp
    prec_den = n_tp + n_fp
    rec_num = n_tp
    rec_den = n_tp + n_fn

    # Guard the divisions: an empty denominator yields a score of 0.0
    precision = 0.0 if prec_den == 0.0 else prec_num / prec_den
    recall = 0.0 if rec_den == 0.0 else rec_num / rec_den

    if precision == 0.0 and recall == 0.0:
        f1 = 0.0
    else:
        f1 = (2.0 * precision * recall) / (precision + recall)

    fk_ltable, fk_rtable = cm.get_property(X, 'fk_ltable'), cm.get_property(
        X, 'fk_rtable')

    # Re-index by the foreign key pair, then select rows by position: the
    # collected indices are row numbers, not (fk_ltable, fk_rtable) labels.
    Y.set_index([fk_ltable, fk_rtable], drop=False, inplace=True)
    false_pos_ls = list(Y.iloc[fp_indices].index.values)
    false_neg_ls = list(Y.iloc[fn_indices].index.values)

    ret_dict = OrderedDict()
    ret_dict['prec_numerator'] = prec_num
    ret_dict['prec_denominator'] = prec_den
    ret_dict['precision'] = precision
    ret_dict['recall_numerator'] = rec_num
    ret_dict['recall_denominator'] = rec_den
    ret_dict['recall'] = recall
    ret_dict['f1'] = f1
    ret_dict['pred_pos_num'] = n_tp + n_fp
    ret_dict['false_pos_num'] = n_fp
    ret_dict['false_pos_ls'] = false_pos_ls
    ret_dict['pred_neg_num'] = n_fn + n_tn
    ret_dict['false_neg_num'] = n_fn
    ret_dict['false_neg_ls'] = false_neg_ls
    return ret_dict

Example #16

0

Show file

 def test_check_attrs_present_valid_3(self):
     """check_attrs_present is expected to return False for ['_ID'] on table A."""
     table_a = pd.read_csv(path_a)
     self.assertEqual(ch.check_attrs_present(table_a, ['_ID']), False)

Example #17

0

Show file

def _vis_debug_dt(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Fit a decision tree matcher on train, predict on test, evaluate the
    predictions and build a MainWindowManager over the results.

    Args:
        matcher: must be a DTMatcher instance.
        train: table passed to matcher.fit.
        test: table passed to matcher.predict.
        exclude_attrs: attribute name, or list of names; must be present in
            both train and test.
        target_attr: string naming the label column; must be present in train.
        show_window: when it compares equal to True, the window is shown and
            app.exec_() is called.

    Raises:
        AssertionError: if matcher is not a DTMatcher, target_attr is not a
            string, or one of the attribute presence checks fails.
    """
    if not isinstance(matcher, DTMatcher):
        msg = 'Input matcher is not of type Decision Tree matcher'
        logger.error(msg)
        raise AssertionError(msg)

    if not isinstance(target_attr, six.string_types):
        msg = 'Target attribute is not of type string'
        logger.error(msg)
        raise AssertionError(msg)

    # The logged text and the exception text differ slightly in the checks
    # below, so both literals are spelled out.
    if not check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError(
            'The exclude attrs are not in the train table columns')

    if not check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError(
            'The target attr is not in the train table columns')

    if not check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError(
            'The exclude attrs are not in the test table columns')

    # Normalise to a duplicate-free list that also contains the target column
    excluded = exclude_attrs if isinstance(exclude_attrs, list) else [exclude_attrs]
    excluded = list_drop_duplicates(excluded)
    if target_attr not in excluded:
        excluded.append(target_attr)

    # fit using training data
    matcher.fit(table=train, exclude_attrs=excluded, target_attr=target_attr)

    # predict using the test data, writing predictions to a fresh column name
    predict_col = get_name_for_predict_column(test.columns)
    predicted = matcher.predict(table=test,
                                exclude_attrs=excluded,
                                target_attr=predict_col,
                                append=True,
                                inplace=False)

    summary = mg.eval_matches(predicted, target_attr, predict_col)
    metric = get_metric(summary)
    fp_dataframe = get_dataframe(predicted, summary['false_pos_ls'])
    fn_dataframe = get_dataframe(predicted, summary['false_neg_ls'])

    app = mg._viewapp
    window = MainWindowManager(matcher, "dt", excluded, metric, predicted,
                               fp_dataframe, fn_dataframe)
    if not (show_window == True):
        return
    window.show()
    app.exec_()

Example #18

0

Show file

 def test_check_attrs_present_invalid_df(self):
     """Call check_attrs_present with None in place of a table."""
     # NOTE(review): no assertion is visible here; presumably an expected
     # exception is declared by a decorator outside this snippet - confirm.
     invalid_table = None
     ch.check_attrs_present(invalid_table, 'ID')

Example #19

0

Show file

 def test_check_attrs_invalid_None(self):
     """check_attrs_present is expected to return False when the attribute is None."""
     table_a = pd.read_csv(path_a)
     self.assertEqual(ch.check_attrs_present(table_a, None), False)

Example #20

0

Show file

File: evaluation.py Project: paulgc/magellan

def eval_matches(X, gold_label_attr, predicted_label_attr):
    """
    Compare predicted labels against gold labels and summarise the result.

    Labels equal to 1 are treated as matches and labels equal to 0 as
    non-matches.

    Args:
        X: pandas DataFrame holding both label columns; its 'fk_ltable' and
            'fk_rtable' properties are read through cm.get_property.
        gold_label_attr: string naming the column with the gold labels.
        predicted_label_attr: string naming the column with the predictions.

    Returns:
        An OrderedDict with the precision, recall and f1 values, their
        numerators and denominators, the predicted positive/negative counts,
        the false positive/negative counts, and 'false_pos_ls' /
        'false_neg_ls': lists of (fk_ltable, fk_rtable) index values of the
        false positive and false negative rows.

    Raises:
        AssertionError: if X is not a DataFrame, either attribute name is not
            a string, or either attribute is not present in X.
    """
    if not isinstance(X, pd.DataFrame):
        logger.error('The input table is not of type dataframe')
        raise AssertionError('The input is not of type dataframe')

    if not isinstance(gold_label_attr, six.string_types):
        logger.error('The input gold_label_attr is not of type string')
        raise AssertionError('The input gold_label_attr is not of type string')

    if not isinstance(predicted_label_attr, six.string_types):
        logger.error('The input predicted_label_attr is not of type string')
        raise AssertionError('The input predicted_label_attr is not of type string')

    if not check_attrs_present(X, gold_label_attr):
        logger.error('The gold_label_attr is not present in the input table')
        raise AssertionError('The gold_label_attr is not present in the input table')

    if not check_attrs_present(X, predicted_label_attr):
        logger.error('The predicted_label_attr is not present in the input table')
        raise AssertionError('The predicted_label_attr is not present in the input table')

    # After reset_index the row labels are 0..n-1, so the index values
    # collected below double as row positions.
    Y = X.reset_index(drop=False, inplace=False)
    g = Y[gold_label_attr]
    p = Y[predicted_label_attr]

    # get false label (0) indices
    gf = g[g == 0].index.values
    pf = p[p == 0].index.values

    # get true label (1) indices
    gt = g[g == 1].index.values
    pt = p[p == 1].index.values

    # get false positive indices
    fp_indices = list(set(gf).intersection(pt))

    # get true positive indices
    tp_indices = list(set(gt).intersection(pt))

    # get false negative indices
    fn_indices = list(set(gt).intersection(pf))

    # get true negative indices
    tn_indices = list(set(gf).intersection(pf))

    n_tp = float(len(tp_indices))
    n_fp = float(len(fp_indices))
    n_fn = float(len(fn_indices))
    n_tn = float(len(tn_indices))
    prec_num = n_tp
    prec_den = n_tp + n_fp
    rec_num = n_tp
    rec_den = n_tp + n_fn

    # Guard the divisions: an empty denominator yields a score of 0.0
    precision = 0.0 if prec_den == 0.0 else prec_num / prec_den
    recall = 0.0 if rec_den == 0.0 else rec_num / rec_den

    if precision == 0.0 and recall == 0.0:
        f1 = 0.0
    else:
        f1 = (2.0 * precision * recall) / (precision + recall)

    fk_ltable, fk_rtable = cm.get_property(X, 'fk_ltable'), cm.get_property(X, 'fk_rtable')

    # Re-index by the foreign key pair, then select rows by position: the
    # collected indices are row numbers, not (fk_ltable, fk_rtable) labels.
    Y.set_index([fk_ltable, fk_rtable], drop=False, inplace=True)
    false_pos_ls = list(Y.iloc[fp_indices].index.values)
    false_neg_ls = list(Y.iloc[fn_indices].index.values)

    ret_dict = OrderedDict()
    ret_dict['prec_numerator'] = prec_num
    ret_dict['prec_denominator'] = prec_den
    ret_dict['precision'] = precision
    ret_dict['recall_numerator'] = rec_num
    ret_dict['recall_denominator'] = rec_den
    ret_dict['recall'] = recall
    ret_dict['f1'] = f1
    ret_dict['pred_pos_num'] = n_tp + n_fp
    ret_dict['false_pos_num'] = n_fp
    ret_dict['false_pos_ls'] = false_pos_ls
    ret_dict['pred_neg_num'] = n_fn + n_tn
    ret_dict['false_neg_num'] = n_fn
    ret_dict['false_neg_ls'] = false_neg_ls
    return ret_dict