Example #1
def mosaic_features(label_column,
                    num_char,
                    num_tokens,
                    file_path=None,
                    df=None):
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    ffv = FFV()
    if not (ffv.is_candidates_file(df)):
        raise UnsupportTypeError("The input file is not a candidate file!")

    if not (num_char) and not (num_tokens):
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "num_char", "num_tokens"))

    if num_char:
        df['num_char'] = df[label_column].apply(lambda label: len(label)
                                                if not (pd.isna(label)) else 0)

    if num_tokens:
        df['num_tokens'] = df[label_column].apply(
            lambda label: len(label.split()) if not (pd.isna(label)) else 0)

    return df
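
A minimal usage sketch on hypothetical data; the DataFrame is assumed to already be in candidates format so that the FFV check passes:

import pandas as pd

candidates = pd.DataFrame({
    'column': ['0', '0'],
    'row': ['0', '1'],
    'label': ['Barack Obama', 'New York City'],
    'kg_id': ['Q76', 'Q60'],
})
out = mosaic_features('label', num_char=True, num_tokens=True, df=candidates)
# adds num_char (12, 13) and num_tokens (2, 3) columns
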
Example #2
def generate_reciprocal_rank(score_column, output_column, file_path=None, df=None):
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format("file_path", "df"))

    if score_column is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {}'.format('score_column'))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)
        
    df.fillna("", inplace=True)
    df = df.astype(dtype={score_column: "float64"})
    ffv = FFV()
    if not (ffv.is_candidates_file(df)):
        raise UnsupportTypeError("The input file is not a candidate file!")

    final_list = []
    grouped_obj = df.groupby(['row', 'column'])
    for key, gdf in grouped_obj:
        gdf[output_column] = list(1 / gdf[score_column].rank(method='first', ascending=False))
        final_list.extend(gdf.to_dict(orient='records'))
    
    odf = pd.DataFrame(final_list)
    return odf
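
For intuition, the reciprocal-rank computation on a single cell's scores, as a self-contained sketch:

import pandas as pd

scores = pd.Series([0.9, 0.5, 0.7])
rr = 1 / scores.rank(method='first', ascending=False)
# ranks are 1, 3, 2 (highest score first), so rr is 1.0, 0.333..., 0.5
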
Example #3
def predict(features,
            output_column,
            ranking_model,
            min_max_scaler_path,
            file_path=None,
            df=None):
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    ffv = FFV()
    if not (ffv.is_candidates_file(df)):
        raise UnsupportTypeError("The input file is not a candidate file!")

    if not ranking_model or not min_max_scaler_path:
        raise RequiredInputParameterMissingException(
            'Both input parameters are required: {} and {}'.format(
                "ranking_model", "min_max_scaler_path"))

    model = PairwiseNetwork(14)
    model.load_state_dict(torch.load(ranking_model))
    scaler = pickle.load(open(min_max_scaler_path, 'rb'))

    normalize_features = features.split(",")
    df[normalize_features] = df[normalize_features].astype('float64')
    grouped_obj = df.groupby(['column', 'row'])
    new_df_list = []
    pred = []
    for cell in grouped_obj:
        cell[1][normalize_features] = scaler.transform(
            cell[1][normalize_features])
        df_copy = cell[1].copy()
        df_features = df_copy[normalize_features]
        new_df_list.append(df_copy)
        test_tensor = torch.tensor(df_features.to_numpy()).float()
        scores = model.predict(test_tensor)
        pred.extend(torch.squeeze(scores).tolist())
    out_df = pd.concat(new_df_list)
    out_df[output_column] = pred

    return out_df
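
The pickled scaler is presumably a scikit-learn MinMaxScaler fitted offline; a sketch of the per-cell scaling step under that assumption, with random data standing in for the 14 features implied by PairwiseNetwork(14):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(np.random.rand(100, 14))  # stands in for the pickled scaler
group_features = np.random.rand(5, 14)                # 5 candidates for one cell
scaled = scaler.transform(group_features)             # each feature rescaled by the fitted min/max
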
Example #4
    def get_exact_matches(self,
                          column,
                          properties="labels,aliases",
                          size=50,
                          file_path=None,
                          df=None):
        """
        retrieves the identifiers of KG entities whose labels or aliases match the input values, with some edit distance allowed.

        Args:
            column: the column used for retrieving candidates.
            properties: a comma-separated list of property names in the KG to search for the exact match query; default is labels,aliases
            size: maximum number of candidates to retrieve, default is 50.
            file_path: input file in canonical format
            df: input dataframe in canonical format
        Returns: a dataframe in candidates format

        """
        if file_path is None and df is None:
            raise RequiredInputParameterMissingException(
                'One of the input parameters is required: {} or {}'.format(
                    "file_path", "df"))

        if file_path:
            df = pd.read_csv(file_path, dtype=object)

        df.fillna(value="", inplace=True)

        return self.utility.create_candidates_df(df, column, size, properties,
                                                 'fuzzy-match')
Example #5
    def __init__(self,
                 output_column_name,
                 feature_file,
                 feature_name,
                 total_docs,
                 singleton_column,
                 input_file=None,
                 df=None):
        """
        initialize the qnodes_dict as original tfidf required input, it is a dict with
            key: Q node id
            value: list of edges in format "property#node2"
        :param kwargs:
        """
        if df is None and input_file is None:
            raise RequiredInputParameterMissingException(
                'One of the input parameters is required: {} or {}'.format(
                    "input_file", "df"))

        if input_file is not None:
            self.input_df = pd.read_csv(input_file)
        elif df is not None:
            self.input_df = df
        self.input_df = self.input_df.sort_values(['column', 'row'])
        self.output_col_name = output_column_name
        self.N = float(total_docs)

        self.feature_dict, self.feature_count_dict = self.build_qnode_feature_dict(
            feature_file, feature_name)
        self.feature_idf_dict = self.calculate_idf_features()
        self.singleton_column = singleton_column
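
calculate_idf_features is not shown in this listing; a standard IDF sketch of what it presumably computes, with N = total_docs and hypothetical feature counts:

import math

N = 1000.0
feature_count_dict = {'P31#Q5': 420, 'P106#Q82955': 37}   # hypothetical counts
feature_idf_dict = {f: math.log(N / c) for f, c in feature_count_dict.items()}
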
Example #6
def drop_by_score(column, file_path=None, df=None, k=20):
    """
    group the dataframe by (column, row) and, within each group, keep only the top k
    candidates ranked by the given score column, from highest score to lowest

    Args:
        column: column with ranking score
        file_path: input file path
        df: or input dataframe
        k: top k candidates

    Returns:
        filtered dataframe
    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format('file_path', 'df'))

    if file_path:
        df = pd.read_csv(file_path)

    # replace na to 0.0
    df[column] = df[column].astype(float).fillna(0.0)
    # astype float first to prevent error of "invalid literal for int() with base 10: '0.0'"
    df["column"] = df["column"].astype(float).astype(int)
    df["row"] = df["row"].astype(float).astype(int)

    # DataFrame.append is removed in pandas >= 2.0; collect groups and concat instead
    res = []
    for key, gdf in df.groupby(by=['column', 'row']):
        res.append(gdf.sort_values(by=[column, 'kg_id'], ascending=[False, True]).iloc[:k, :])
    return pd.concat(res)
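
A usage sketch on hypothetical data:

import pandas as pd

df = pd.DataFrame({
    'column': [0, 0, 0],
    'row': [0, 0, 0],
    'kg_id': ['Q1', 'Q2', 'Q3'],
    'score': [0.2, 0.9, 0.5],
})
top2 = drop_by_score('score', df=df, k=2)
# keeps Q2 (0.9) and Q3 (0.5); Q1 (0.2) is dropped
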
Example #7
def normalize_scores(column='retrieval_score', output_column=None, weights=None, file_path=None, df=None,
                     norm_type=None):
    """
    normalizes the retrieval scores for all the candidate knowledge graph objects for each retrieval method for all
    input cells in a column

    Args:
        column: column name which has the retrieval scores. Default is retrieval_score
        output_column: the output column name where the normalized scores will be stored. Default is the input
        column name appended with the suffix _normalized
        weights: a comma separated string in the format <retrieval_method_1>:<weight_1>,<retrieval_method_2>:<weight_2>,...
        specifying the weights for each retrieval method. By default, all retrieval method weights are set to 1.0
        file_path: input file path
        df: or input dataframe
        norm_type: normalization strategy, either max_norm or zscore

    Returns: a dataframe with the normalized scores stored in output_column

    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format("file_path", "df"))

    if output_column is None:
        output_column = '{}_normalized'.format(column)

    method_weights = {}
    if weights is not None:
        m_ws = weights.split(',')
        for m_w in m_ws:
            _ = m_w.split(':')

            method_weights[_[0]] = float(_[1])

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    df[column] = df[column].map(lambda x: float(x))

    grouped_df = df.groupby(by=['column', 'method'])

    o_df = list()
    if norm_type == 'max_norm':
        for i, gdf in grouped_df:
            max_score = gdf[column].max()
            # TODO find a better way to do this without having to make a copy
            fdf = gdf.copy(deep=True)
            fdf[output_column] = gdf[column].map(lambda x: divide_a_by_b(x, max_score) * method_weights.get(i[1], 1.0))
            o_df.append(fdf)
    elif norm_type == 'zscore':
        for i, gdf in grouped_df:
            mean_score = gdf[column].mean()
            std_score = gdf[column].std()
            # TODO find a better way to do this without having to make a copy
            fdf = gdf.copy(deep=True)
            fdf[output_column] = gdf[column].map(
                lambda x: zscore_normalization(x, mean_score, std_score) * method_weights.get(i[1], 1.0))
            o_df.append(fdf)

    out_df = Utility.sort_by_col_and_row(pd.concat(o_df))
    return out_df
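
divide_a_by_b and zscore_normalization are helpers not shown in this listing; hypothetical stand-ins consistent with how they are used above:

def divide_a_by_b(a, b):
    # hypothetical: guard against division by zero when a group's max score is 0
    return a / b if b else 0.0

def zscore_normalization(x, mean, std):
    # hypothetical: standard z-score, returning 0 when a group's scores are constant
    return (x - mean) / std if std else 0.0
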
Example #8
def canonicalize(columns,
                 output_column='label',
                 file_path=None,
                 df=None,
                 file_type='csv',
                 add_context=False,
                 context_column_name="context"):
    """
    translate an input CSV or TSV file to canonical form

    Args:
        columns: the columns in the input file to be linked to KG entities. Multiple columns are specified as a comma separated string.
        output_column: specifies the name of a new column to be added. Default output column name is label
        file_path: input file path
        df: or input dataframe
        file_type: csv or tsv
        add_context: when True, the values of the other columns are concatenated and stored as context
        context_column_name: the name of the column where the context is stored
    Returns: a dataframe in canonical form

    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path,
                         sep=',' if file_type == 'csv' else '\t',
                         dtype=object)

    columns = columns.split(',')
    for column in columns:
        if column not in df.columns:
            raise RequiredColumnMissingException(
                "The input column {} does not exist in given data.".format(
                    column))
    out = list()
    for i, v in df.iterrows():
        for column in columns:
            if add_context:
                remaining_columns = v.keys().tolist()
                remaining_columns.remove(column)
                remaining_values = "|".join(
                    v[remaining_columns].dropna().values.tolist())
                out.append({
                    'column': df.columns.get_loc(column),
                    'row': i,
                    output_column: v[column],
                    context_column_name: remaining_values
                })
            else:
                out.append({
                    'column': df.columns.get_loc(column),
                    'row': i,
                    output_column: v[column]
                })
    return pd.DataFrame(out).sort_values(by=['column', 'row'])
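
A usage sketch on a hypothetical two-column table:

import pandas as pd

table = pd.DataFrame({'Person': ['Barack Obama'], 'City': ['New York']})
canonical = canonicalize('Person,City', df=table)
# one output row per (column, row) pair:
#   column=0, row=0, label='Barack Obama'
#   column=1, row=0, label='New York'
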
Example #9
    def get_exact_matches(self,
                          column,
                          lower_case=True,
                          size=50,
                          file_path=None,
                          df=None,
                          auxiliary_fields: List[str] = None,
                          auxiliary_folder: str = None,
                          isa: str = None):
        """
        retrieves the identifiers of KG entities whose label or aliases match the input values exactly.

        Args:
            column: the column used for retrieving candidates.
            lower_case: case insensitive retrieval, default is case sensitive.
            size: maximum number of candidates to retrieve, default is 50.
            file_path: input file in canonical format
            df: input dataframe in canonical format
            auxiliary_fields: extra candidate fields to fetch and write out to auxiliary files
            auxiliary_folder: folder where the auxiliary files are stored
            isa: if given, restrict candidates to instances of this Qnode
        Returns: a dataframe in candidates format

        """
        if file_path is None and df is None:
            raise RequiredInputParameterMissingException(
                'One of the input parameters is required: {} or {}'.format(
                    "file_path", "df"))

        if file_path:
            df = pd.read_csv(file_path, dtype=object)

        df.fillna(value="", inplace=True)

        extra_musts = None
        if isa:
            extra_musts = {
                "term": {
                    "instance_ofs.keyword_lower": {
                        "value": isa.lower()
                    }
                }
            }

        properties = "all_labels.en"

        return self.utility.create_candidates_df(
            df,
            column,
            size,
            properties,
            'exact-match',
            lower_case=lower_case,
            auxiliary_fields=auxiliary_fields,
            auxiliary_folder=auxiliary_folder,
            auxiliary_file_prefix='exact_matches_',
            extra_musts=extra_musts)
Example #10
def vote_by_classifier(model_file,
                       input_file=None,
                       df=None,
                       prob_threshold=0.995):
    if input_file is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "input_file", "df"))
    if not model_file:
        raise RequiredInputParameterMissingException(
            'Model path cannot be None')

    if input_file:
        df = pd.read_csv(input_file, dtype=object)
    features_list = [
        'aligned_pagerank', 'smallest_qnode_number', 'monge_elkan',
        'des_cont_jaccard_normalized'
    ]
    for ft in features_list:
        assert ft in df, f'There\'s no {ft} column in the table!'

    with open(model_file, 'rb') as fid:
        model_loaded = pickle.load(fid)

    try:
        prob_threshold = float(prob_threshold)
    except (TypeError, ValueError):
        prob_threshold = 0.0

    # make prediction on target file
    odf = df.copy()

    test_features = df.loc[:, features_list]

    prob = model_loaded.predict_proba(test_features)

    df['prob_1'] = [p[1] for p in prob]
    odf['vote_by_classifier'] = (df['prob_1'] > prob_threshold).astype(int)
    return odf
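
The voting step reduces to thresholding P(class 1) from predict_proba; a self-contained sketch:

import numpy as np

prob = np.array([[0.01, 0.99], [0.60, 0.40]])   # [P(class 0), P(class 1)] per row
votes = (prob[:, 1] > 0.995).astype(int)        # -> [0, 0] at the default threshold
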
Example #11
def drop_duplicate(column: str, score_col: typing.List[str], keep_method: str = None, file_path: str = None,
                   df: pd.DataFrame = None):
    """
    group the dataframe by (column, row) and check for duplicate rows on the given column;
    remove the duplicates, keeping only the highest-scoring one

    Args:
        column: column with labels
        score_col: columns with ranking scores
        keep_method: when duplicates come from several retrieval methods, prefer rows from this method
        file_path: input file path
        df: or input dataframe
    Returns:
        filtered dataframe
    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format('file_path', 'df'))

    if file_path:
        df = pd.read_csv(file_path)

    for each_col in [column] + score_col:
        if each_col not in df.columns:
            raise RequiredColumnMissingException("Column {} does not exist in given dataframe.".format(each_col))

    # replace na to 0.0
    df[score_col] = df[score_col].astype(float).fillna(0.0)
    # astype float first to prevent error of "invalid literal for int() with base 10: '0.0'"
    df["column"] = df["column"].astype(float).astype(int)
    df["row"] = df["row"].astype(float).astype(int)

    res = []
    for key, gdf in df.groupby(by=['column', 'row']):
        # for those nodes with no candidates, we need to check here
        temp = gdf[column].unique()
        if len(temp) == 1 and not isinstance(temp[0], str) and np.isnan(temp[0]):
            res.append(gdf.iloc[0].to_dict())
            continue

        for candidate_id, candidate_df in gdf.groupby(by=[column]):
            if len(candidate_df) > 1:
                # only do keep method when the method specified exists
                if keep_method is not None and keep_method in candidate_df["method"].unique():
                    candidate_df = candidate_df[candidate_df["method"] == keep_method]
                if score_col and len(candidate_df) > 1:
                    candidate_df = candidate_df.sort_values(by=score_col, ascending=[False]).iloc[:1, :]
            res.append(candidate_df.iloc[0].to_dict())

    # the column order may have changed; reindex to restore the original order
    res = pd.DataFrame(res)
    res = res.reindex(columns=df.columns)
    return res
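
A usage sketch on hypothetical data:

import pandas as pd

df = pd.DataFrame({
    'column': [0, 0],
    'row': [0, 0],
    'kg_id': ['Q42', 'Q42'],
    'method': ['exact-match', 'fuzzy-augmented'],
    'score': [0.7, 0.9],
})
deduped = drop_duplicate('kg_id', ['score'], keep_method='exact-match', df=df)
# both rows are the same candidate Q42; the exact-match row is kept
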
Example #12
    def get_matches(self,
                    column,
                    size=100,
                    file_path=None,
                    df=None,
                    auxiliary_fields: List[str] = None,
                    auxiliary_folder: str = None,
                    isa: str = None):
        """
        Uses the ElasticSearch index that has the labels, aliases, wikipedia/wikitable anchor text and redirect text
        :param column: the column used to retrieve the candidates
        :param size: the number of candidates to be retrieved by the query
        :param file_path: input file in canonical format
        :param df: input dataframe in canonical format
        :param auxiliary_fields: extra candidate fields to fetch and write out to auxiliary files
        :param auxiliary_folder: folder where the auxiliary files are stored
        :param isa: if given, restrict candidates to instances of this Qnode
        :return: a dataframe in candidates format
        """
        if file_path is None and df is None:
            raise RequiredInputParameterMissingException(
                'One of the input parameters is required: {} or {}'.format(
                    "file_path", "df"))

        if file_path:
            df = pd.read_csv(file_path, dtype=object)

        df.fillna(value="", inplace=True)
        properties = self.properties

        extra_musts = None
        if isa:
            extra_musts = {
                "term": {
                    "instance_ofs.keyword_lower": {
                        "value": isa.lower()
                    }
                }
            }
        return self.utility.create_candidates_df(
            df,
            column,
            size,
            properties,
            'fuzzy-augmented',
            lower_case=False,
            auxiliary_fields=auxiliary_fields,
            auxiliary_folder=auxiliary_folder,
            auxiliary_file_prefix='fuzzy_augmented_',
            extra_musts=extra_musts)
Example #13
def extract_ground_truth(target_column,
                         kg_id_column,
                         kg_label_column,
                         file_path=None,
                         df=None,
                         file_type='csv'):
    """
    Returns ground truth dataframe by extracting columns from input dataframe

    Args:
        target_column: the column in the input file to be linked to KG entities
        kg_id_column: the column in the input file containing the kg identifier
        kg_label_column: the column in the input file containing the kg label
        file_path: input file path
        df: or input dataframe
        file_type: csv or tsv
    Returns: ground truth dataframe in canonical format
    """

    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path,
                         sep=',' if file_type == 'csv' else '\t',
                         dtype=object)

    # read the file before validating the columns, otherwise df may still be None here
    for column in [target_column, kg_id_column, kg_label_column]:
        if column not in df.columns:
            raise RequiredColumnMissingException(
                "The input column {} does not exist in given data.".format(
                    column))

    target_column_index = df.columns.get_loc(target_column)
    out = list()
    for i, v in df.iterrows():
        out.append({
            'column': target_column_index,
            'row': i,
            'kg_id': v[kg_id_column],
            'kg_label': v[kg_label_column]
        })
    return pd.DataFrame(out).sort_values(by=['column', 'row'])
Example #14
def ground_truth_labeler(gt_file_path, file_path=None, df=None):
    """
    compares each candidate for the input cells with the ground truth value for that cell and adds an evaluation label.

    Args:
        gt_file_path: ground truth file path.
        file_path: input file path
        df: or input dataframe

    Returns: a dataframe with added column `evaluation_label`

    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                'file_path', 'df'))

    gt_df = pd.read_csv(gt_file_path, dtype=object)
    gt_df.rename(columns={
        'kg_id': 'GT_kg_id',
        'kg_label': 'GT_kg_label'
    },
                 inplace=True)

    if file_path:
        df = pd.read_csv(file_path, dtype=object)
    df.fillna('', inplace=True)

    # kyao: Use only columns defined ground truth file format
    evaluation_df = pd.merge(
        df,
        gt_df.loc[:, ['column', 'row', 'GT_kg_id', 'GT_kg_label']],
        on=['column', 'row'],
        how='left')

    evaluation_df['GT_kg_id'] = evaluation_df['GT_kg_id'].fillna("")
    evaluation_df['GT_kg_label'] = evaluation_df['GT_kg_label'].fillna("")

    evaluation_df['evaluation_label'] = evaluation_df.apply(
        lambda row: assign_evaluation_label(row), axis=1)

    # evaluation_df.drop(columns=['max_score'], inplace=True)
    return evaluation_df
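
assign_evaluation_label is not shown in this listing; a hypothetical stand-in consistent with the merge above:

def assign_evaluation_label(row):
    # hypothetical: 0 when the cell has no ground truth, 1 when the candidate
    # matches it, -1 otherwise
    if row['GT_kg_id'] == '':
        return 0
    return 1 if row['kg_id'] == row['GT_kg_id'] else -1
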
Example #15
def align_page_rank(input_file=None, df=None):
    if input_file is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "input_file", "df"))
    if input_file:
        df = pd.read_csv(input_file, dtype=object)
    assert 'pagerank' in df, 'There\'s no page rank column in the table!'

    # DataFrame.append is removed in pandas >= 2.0; collect groups and concat instead
    odf_list = []
    for ((col, row), group) in df.groupby(['column', 'row']):
        exact_match_df = group[group['method'] == 'exact-match'].copy()
        exact_match_df['aligned_pagerank'] = exact_match_df['pagerank'].astype(float)
        odf_list.append(exact_match_df)

        fuzzy_match_df = group[group['method'] == 'fuzzy-augmented'].copy()
        fuzzy_match_df['aligned_pagerank'] = 0
        odf_list.append(fuzzy_match_df)
    odf = pd.concat(odf_list)
    return odf
Example #16
    def get_phrase_matches(self, column, properties="labels^2,aliases", size=50, file_path=None, df=None, filter_condition=None):
        """
        retrieves the identifiers of KG entities based on phrase match queries.

        Args:
            column: the column used for retrieving candidates.
            properties: a comma-separated list of property names in the KG to search for the phrase match query; default is labels^2,aliases
            size: maximum number of candidates to retrieve, default is 50.
            file_path: input file in canonical format
            df: input dataframe in canonical format
            filter_condition: a string indicating the filter requirement
        Returns: a dataframe in candidates format

        """
        need_filter = False
        if filter_condition is not None:
            need_filter = True

        if file_path is None and df is None:
            raise RequiredInputParameterMissingException(
                'One of the input parameters is required: {} or {}'.format("file_path", "df"))

        if file_path:
            df = pd.read_csv(file_path, dtype=object)

        df.fillna(value="", inplace=True)

        if need_filter:
            query_input_df = Filter.remove_previous_match_res(df)
        else:
            query_input_df = df

        from tl.utility.utility import Utility

        output_df = self.utility.create_candidates_df(query_input_df, column, size, properties, 'phrase-match')
        Utility.eprint(output_df)
        if need_filter:
            output_df = Filter.combine_result(df, output_df, filter_condition)

        return output_df
Example #17
def clean(column,
          output_column=None,
          file_path=None,
          df=None,
          symbols='!@#$%^&*()+={}[]:;\'"/<>',
          replace_by_space=True,
          keep_original=False):
    """
    cleans the cell values in a column, creating a new column with the clean values.

    Args:
        column: the column to be cleaned.
        output_column: the name of the column where cleaned column values are stored. If not provided, the name of the
        new column is the name of the input column with the suffix _clean.
        file_path: input file path
        df: or input dataframe
        symbols: a string containing the set of characters to be removed: default is !@#$%^&*()+={}[]:;'"/<>
        replace_by_space: when True (default) all instances of the symbols are replaced by a space. In case of
        removal of multiple consecutive characters, they'll be replaced by a single space. The value False causes
        the symbols to be deleted.
        keep_original: when True, the output column will contain the original value and the clean value will be
        appended, separated by |. Default is False

    Returns: a dataframe with the new output column containing the cleaned values

    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))
    symbols = list(symbols)

    if output_column is None:
        output_column = '{}_clean'.format(column)
    if file_path:
        df = pd.read_csv(file_path)

    df[output_column] = df[column].map(
        lambda x: string_clean(x, symbols, replace_by_space, keep_original))
    return df
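
string_clean is not shown in this listing; a hypothetical sketch of the replace_by_space=True behavior, where a run of consecutive symbols becomes a single space:

import re

def string_clean_sketch(x, symbols):
    # hypothetical: collapse any run of the listed symbols into one space
    pattern = '[' + re.escape(''.join(symbols)) + ']+'
    return re.sub(pattern, ' ', x)

string_clean_sketch('A&B;;C', list('!@#$%^&*()+={}[]:;\'"/<>'))
# -> 'A B C'
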
Example #18
def create_singleton_feature(output_column, file_path=None, df=None):
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    ffv = FFV()
    if not (ffv.is_candidates_file(df)):
        raise UnsupportTypeError("The input file is not a candidate file!")

    exact_match_count = df[df['method'] == 'exact-match'].groupby(
        ['column', 'row'])[['kg_id']].count()
    exact_match_singleton = list(
        exact_match_count[exact_match_count['kg_id'] == 1].index)

    df[output_column] = df.apply(lambda x: is_singleton(
        x.column, x.row, exact_match_singleton, x.method),
                                 axis=1)
    return df
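
is_singleton is not shown in this listing; a hypothetical stand-in consistent with its call above, where exact_match_singleton holds the (column, row) pairs that have exactly one exact-match candidate:

def is_singleton(column, row, singleton_index, method):
    # hypothetical: 1 only for the exact-match row of a singleton cell
    return 1 if method == 'exact-match' and (column, row) in singleton_index else 0
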
Example #19
def combine_linearly(weights, output_column='ranking_score', file_path=None, df=None):
    """
    combines two or more score-columns for candidate knowledge graph objects for each input cell value. Takes as input weights
    for columns being combined to adjust influence.

    Args:
        weights: a comma separated string, in the format <score-column-1>:<weight-1>,<score-column-2>:<weight-2>,...
        representing weights for each score-column. Default weight for each score-column is 1.0.
        output_column: the output column name where the linearly combined scores will be stored. Default is ranking_score
        file_path: input file path
        df: input dataframe

    Returns: a dataframe in ranking score file format

    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format('file_path', 'df'))

    column_weights = {}
    if weights is not None:
        c_ws = weights.split(',')
        for c_w in c_ws:
            _ = c_w.split(':')
            if len(_) > 1:
                column_weights[_[0]] = float(_[1])
            else:
                column_weights[_[0]] = 1.0

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    # fill up na to 0 for computation
    target_column_names = list(column_weights.keys())
    df[target_column_names] = df[target_column_names].fillna(0)

    df[output_column] = df.apply(lambda row: linear_combination(row, column_weights), axis=1)
    return df
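
linear_combination is not shown in this listing; a hypothetical stand-in matching its use above:

def linear_combination(row, column_weights):
    # hypothetical: weighted sum of the chosen score columns for one candidate row
    return sum(float(row[col]) * w for col, w in column_weights.items())
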
Example #20
    def __init__(self,
                 df: pd.DataFrame,
                 sort_by_gt: bool = False,
                 gt_score_column: str = None,
                 output_path: str = None):
        if not output_path:
            raise RequiredInputParameterMissingException(
                "output path must be given.")

        self.df = df
        self._preprocess()
        if sort_by_gt:
            self.df = self.sort_by_gt(gt_score_column)
        self.writer = pd.ExcelWriter(output_path, engine='xlsxwriter')
        self.workbook = self.writer.book
        self.worksheet = self.workbook.add_worksheet('Sheet1')
        self._write_to_excel()

        self.parts = []
        for key, each_group in self.df.groupby(["column", "row"]):
            each_part_range = [
                each_group.index[0] + 1, each_group.index[-1] + 1
            ]
            self.parts.append(each_part_range)
Example #21
    def get_matches(self,
                    column,
                    size=50,
                    file_path=None,
                    df=None,
                    output_column_name: str = "retrieval_score"):
        """
        uses KGTK search API to retrieve identifiers of KG entities matching the input search term.

        Args:
            column: the column used for retrieving candidates.
            size: maximum number of candidates to retrieve, default is 50.
            file_path: input file in canonical format
            df: input dataframe in canonical format,
            output_column_name: the output column name where the retrieval scores will be stored. Default is retrieval_score
        Returns: a dataframe in candidates format

        """
        if file_path is None and df is None:
            raise RequiredInputParameterMissingException(
                'One of the input parameters is required: {} or {}'.format(
                    "file_path", "df"))

        if file_path:
            df = pd.read_csv(file_path, dtype=object)

        df.fillna(value="", inplace=True)
        columns = df.columns

        uniq_labels = list(df[column].unique())

        results_dict = {}
        for uniq_label in uniq_labels:
            api_search_url = f"{self.api_url}/" \
                             f"{uniq_label}?extra_info=true&language=en&type=ngram&size={size}&lowercase=true"
            results_dict[uniq_label] = requests.get(api_search_url,
                                                    verify=False).json()

        new_df_list = list()
        seen_dict = {}
        for i, row in df.iterrows():
            row_key = f"{row['column']}_{row['row']}_{row[column]}"
            if row_key not in seen_dict:
                search_results = results_dict.get(row[column], [])
                if len(search_results) > 0:
                    for sr in search_results:
                        _ = {}
                        for c in columns:
                            _[c] = row[c]

                        _['kg_id'] = sr['qnode']
                        _['pagerank'] = sr['pagerank']
                        kg_label = []
                        kg_description = ''

                        if 'label' in sr and len(sr['label']) > 0:
                            kg_label.extend(sr['label'])
                        if 'alias' in sr and len(sr['alias']) > 0:
                            kg_label.extend(sr['alias'])
                        _['kg_labels'] = "|".join(kg_label)

                        _['method'] = 'kgtk-search'

                        if 'description' in sr and len(sr['description']) > 0:
                            kg_description = "|".join(sr['description'])
                        _['kg_descriptions'] = kg_description

                        _[output_column_name] = sr['score']
                        new_df_list.append(_)
                else:
                    _ = {}
                    for c in columns:
                        _[c] = row[c]

                    _['kg_id'] = ''
                    _['pagerank'] = ''
                    _['kg_labels'] = ''
                    _['method'] = ''
                    _['kg_descriptions'] = ''
                    _[output_column_name] = ''
                    new_df_list.append(_)
                seen_dict[row_key] = 1

        if self.ffv.is_canonical_file(df):
            return pd.DataFrame(new_df_list)

        if self.ffv.is_candidates_file(df):
            return pd.concat([df, pd.DataFrame(new_df_list)
                              ]).sort_values(by=['column', 'row', column])

        raise UnsupportTypeError(
            "The input file is neither a canonical file nor a candidate file!")
Example #22
def metrics(column,
            file_path=None,
            df=None,
            k: typing.Union[int, typing.List[int]] = 1,
            tag=""):
    """
    computes the precision, recall and f1 score for the tl pipeline.

    Args:
        column: column with ranking score
        file_path: input file path
        df: or input dataframe
        k: calculate recall at top k candidates
        tag: a tag to use in the output file to identify the results of running the given pipeline

    Returns: a dataframe with one row per k, containing the k value, precision, recall, f1 score and tag

    """
    # always ensure k is a list
    if isinstance(k, int):
        k = [k]

    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                'file_path', 'df'))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    # remove duplicate candidates if exist
    df = normalize_scores.drop_duplicate("kg_id", [column], df=df)

    # replace na to 0.0
    df[column] = df[column].astype(float).fillna(0.0)
    df['max_score'] = df.groupby(by=['column', 'row'])[column].transform('max')

    # relevant df
    rdf = df[df['evaluation_label'] != '0']

    # true positive for precision at 1
    tp_ps = []

    # true positive for recall at k
    tp_rs = defaultdict(list)

    grouped = rdf.groupby(by=['column', 'row'])
    n = len(grouped)
    for key, gdf in grouped:
        gdf = gdf.sort_values(by=[column, 'kg_id'],
                              ascending=[False, True]).reset_index()

        for i, row in gdf.iterrows():
            if (row['evaluation_label'] == '1' or row['evaluation_label']
                    == 1.0) and row[column] == row['max_score']:
                tp_ps.append(key)

            # this df is sorted by score, so highest ranked candidate is rank 1 and so on...
            rank = i + 1
            for each_k in k:
                # get multiple k in one time
                if rank <= each_k and (row['evaluation_label'] == '1'
                                       or row['evaluation_label'] == 1.0):
                    tp_rs[each_k].append(key)

    precision = float(len(tp_ps)) / float(n)
    recall = {
        k: float(len(each_tp_rs)) / float(n)
        for k, each_tp_rs in tp_rs.items()
    }
    # sort as k value increasing
    recall = {k: v for k, v in sorted(recall.items(), key=lambda x: x[0])}
    result_dict = {}

    # combine all things and output
    i = 0
    for k, each_recall in recall.items():
        if precision == 0 and each_recall == 0:
            f1_score = 0.0
        else:
            f1_score = (2 * precision * each_recall) / (precision +
                                                        each_recall)
        result_dict[i] = {
            "k": k,
            'f1': f1_score,
            'precision': precision,
            'recall': each_recall,
            'tag': tag
        }
        i += 1

    output_df = pd.DataFrame.from_dict(result_dict, orient="index")
    return output_df
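
A worked numeric sketch of the metric definitions: with n = 2 ground-truth cells, suppose cell A has its correct candidate ranked 1 and cell B has its correct candidate ranked 3.

precision = 1 / 2                      # only cell A has the correct candidate on top
recall_at = {1: 1 / 2, 3: 2 / 2}       # cell B's candidate enters at k = 3
f1_at_1 = 2 * precision * recall_at[1] / (precision + recall_at[1])   # 0.5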