def user_labeling(
        self, candidates,
        output_path='/home/beidan/AutoPhrase/tmp/labeled_patterns.txt'):
        """Label candidate phrases in a GUI and export the labels to a file.

        The candidates are converted to a DataFrame, shown in the
        py_entitymatching table editor with every ``gold_label`` pre-set to
        1, post-processed, and finally written to ``output_path`` as one
        ``<idx><TAB><gold_label>`` line per row.

        Args:
            candidates: The candidate phrases to label. They are passed
                unchanged to ``self.phrases_to_dataframe``; the resulting
                table must contain an ``idx`` column.
            output_path (string): Destination of the tab-separated label
                file. The default is the location that used to be
                hard-coded in this method, so existing callers are
                unaffected.

        Returns:
            None. The results are the files written and the text printed.

        Raises:
            ImportError: If PyQt5 cannot be imported.

        Side effects:
            Writes ``temp.csv`` (the unlabeled table) to the current working
            directory, opens a GUI window, overwrites ``output_path`` and
            prints the head of the labeled table plus a summary line.
        """
        df = self.phrases_to_dataframe(candidates)
        # Snapshot of the unlabeled candidates; nothing below reads it back.
        df.to_csv(r'temp.csv', index=False)

        # Table handed to the GUI. NOTE(review): _init_label_table is a
        # private py_entitymatching helper that presumably returns a copy of
        # `df` with the label column added -- confirm against the library.
        df_cc = em._init_label_table(df, 'gold_label')
        # Every row starts out labeled 1; the user edits from there.
        df_cc['gold_label'] = 1

        # Invoke the GUI. The import is only an availability probe so that a
        # missing PyQt5 produces an actionable message.
        try:
            from PyQt5 import QtGui  # noqa: F401
        except ImportError as err:
            # Chain the original error so the real import failure stays
            # visible in the traceback.
            raise ImportError(
                'PyQt5 is not installed. Please install PyQt5 to use '
                'GUI related functions in py_entitymatching.') from err

        from py_entitymatching.gui.table_gui import edit_table
        edit_table(df_cc)

        # NOTE(review): presumably validates the labels entered in the GUI;
        # the helper is defined elsewhere on this class -- confirm.
        df_cc = self._post_process_labelled_table(df_cc, 'gold_label')

        print(df_cc.head())

        with open(output_path, 'w', encoding='utf-8') as f:
            # Walk the two columns directly rather than iterrows(): values
            # keep their column dtype (iterrows may upcast ints to floats)
            # and a non-string idx is formatted instead of raising TypeError.
            f.writelines(
                '{}\t{}\n'.format(idx, label)
                for idx, label in zip(df_cc['idx'], df_cc['gold_label']))

        print("successfully write {} records to Autophrase".format(len(df_cc)))
Exemple #2
0
def label_table(table, label_column_name, verbose=False):
    """
    Collect 0/1 labels for a pandas DataFrame through a GUI.

    Intended for building labeled data for supervised learning. The input is
    expected to be a DataFrame that carries the metadata of a candidate set
    (key, fk_ltable, fk_rtable, ltable, rtable). The input is copied, a label
    column filled with 0 is appended to the copy, and a GUI is opened so the
    user can enter the labels (0: non-match, 1: match). The labeled copy is
    returned with the properties of the input DataFrame copied onto it.

    Args:
        table (DataFrame): The DataFrame to label; specifically, one whose
            candidate-set metadata (key, fk_ltable, fk_rtable, ltable,
            rtable) is present in the catalog.
        label_column_name (string): Name of the column that will hold the
            labels entered by the user.
        verbose (boolean): Whether more detailed information about the
            execution steps should be printed (defaults to False).

    Returns:
        A new DataFrame holding the user's labels, with its properties set
        to match those of the input DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If `label_column_name` is not of type string.
        AssertionError: If the `label_column_name` is already present in the
            input table.
        ImportError: If PyQt4 cannot be imported.

    """
    # Check the argument types and the input table's metadata first; that
    # metadata is copied onto the labeled result at the end.
    _validate_inputs(table, label_column_name, verbose)

    # Work on a copy of the input whose label column starts out as all 0s.
    table_to_label = _init_label_table(table, label_column_name)

    # The table editor needs PyQt4, so probe for it before opening the GUI.
    try:
        from PyQt4 import QtGui
    except ImportError:
        missing_qt_message = (
            'PyQt4 is not installed. Please install PyQt4 to use '
            'GUI related functions in py_entitymatching.')
        raise ImportError(missing_qt_message)

    from py_entitymatching.gui.table_gui import edit_table
    edit_table(table_to_label)

    # Confirm that only 0/1 labels were entered, carry the input table's
    # catalog properties over to the labeled copy, and hand it back.
    return _post_process_labelled_table(
        table, table_to_label, label_column_name)
def label_table(table, label_column_name, verbose=False):
    """
    Interactively label a pandas DataFrame with 0/1 values.

    Typically used to produce labeled data for supervised learning. The
    input DataFrame is expected to contain the metadata of a candidate set
    (such as key, fk_ltable, fk_rtable, ltable, rtable). A copy of the input
    is made, a label column filled with 0 is added at its end, and a GUI is
    invoked in which the user enters labels (0: non-match, 1: match). The
    labeled copy is returned, and the properties of the input DataFrame are
    copied to it.

    Args:
        table (DataFrame): The input DataFrame to be labeled; specifically,
            a DataFrame whose candidate-set metadata (key, fk_ltable,
            fk_rtable, ltable, rtable) is in the catalog.
        label_column_name (string): The name to give the column holding the
            labels entered by the user.
        verbose (boolean): Flag indicating whether more detailed information
            about the execution steps should be printed out (default value
            is False).

    Returns:
        A new DataFrame containing the labels entered by the user, with the
        same properties as the input DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If `label_column_name` is not of type string.
        AssertionError: If the `label_column_name` is already present in the
            input table.
        ImportError: If PyQt5 cannot be imported.

    Examples:
        >>> import py_entitymatching as em
        >>> G = em.label_table(S, label_column_name='label') # S is the (sampled) table that has to be labeled.

    """
    # Input checks come first: argument types plus the table's metadata,
    # which is copied to the labeled DataFrame later on.
    _validate_inputs(table, label_column_name, verbose)

    # Copy of the input with the new label column initialised to 0s.
    pending_labels = _init_label_table(table, label_column_name)

    # Opening the editor requires PyQt5; report its absence explicitly.
    try:
        from PyQt5 import QtGui
    except ImportError:
        install_hint = (
            'PyQt5 is not installed. Please install PyQt5 to use '
            'GUI related functions in py_entitymatching.')
        raise ImportError(install_hint)

    from py_entitymatching.gui.table_gui import edit_table
    edit_table(pending_labels)

    # Final step: check that the labels are only 0/1s and copy the input
    # table's catalog properties onto the labeled table before returning it.
    return _post_process_labelled_table(
        table, pending_labels, label_column_name)