def mosaic_features(label_column, num_char, num_tokens, file_path=None, df=None):
    """
    Add length-based features computed from the text in ``label_column``.

    Args:
        label_column: name of the column holding the text to measure.
        num_char: when truthy, add a ``num_char`` column with the number of
            characters of each label.
        num_tokens: when truthy, add a ``num_tokens`` column with the number of
            whitespace-separated tokens of each label.
        file_path: CSV file to read; takes precedence over ``df`` when given.
        df: input dataframe, used when ``file_path`` is not given.

    Returns:
        the dataframe with the requested column(s) added in place; missing
        labels are counted as 0.

    Raises:
        RequiredInputParameterMissingException: neither ``file_path`` nor ``df``
            was given, or neither ``num_char`` nor ``num_tokens`` was requested.
        UnsupportTypeError: FFV does not accept the data as a candidates file.
    """
    if df is None and file_path is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    if not FFV().is_candidates_file(df):
        raise UnsupportTypeError("The input file is not a candidate file!")

    if not (num_char or num_tokens):
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "num_char", "num_tokens"))

    def _char_count(label):
        return 0 if pd.isna(label) else len(label)

    def _token_count(label):
        return 0 if pd.isna(label) else len(label.split())

    if num_char:
        df['num_char'] = df[label_column].apply(_char_count)
    if num_tokens:
        df['num_tokens'] = df[label_column].apply(_token_count)
    return df
def generate_reciprocal_rank(score_column, output_column, file_path=None, df=None):
    """
    Add the reciprocal rank of every candidate within its cell.

    Rows are grouped by (row, column). Inside each group the values of
    ``score_column`` are ranked in descending order (ties resolved by order of
    appearance) and ``1 / rank`` is written to ``output_column``.

    Args:
        score_column: column holding the scores to rank; cast to float64.
        output_column: name of the column receiving the reciprocal rank.
        file_path: CSV file to read; takes precedence over ``df`` when given.
        df: input dataframe, used when ``file_path`` is not given.

    Returns:
        a new dataframe rebuilt group by group, with ``output_column`` added.

    Raises:
        RequiredInputParameterMissingException: no input, or no score column.
        UnsupportTypeError: FFV does not accept the data as a candidates file.
    """
    if df is None and file_path is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format("file_path", "df"))

    if score_column is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {}'.format('score_column'))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)
    df.fillna("", inplace=True)
    df = df.astype(dtype={score_column: "float64"})

    if not FFV().is_candidates_file(df):
        raise UnsupportTypeError("The input file is not a candidate file!")

    records = []
    for _, cell_df in df.groupby(['row', 'column']):
        ranks = cell_df[score_column].rank(method='first', ascending=False)
        cell_df[output_column] = list(1 / ranks)
        records.extend(cell_df.to_dict(orient='records'))

    return pd.DataFrame(records)
def predict(features, output_column, ranking_model, min_max_scaler_path, file_path=None, df=None):
    """
    Score every candidate with a saved pairwise ranking network.

    Args:
        features: comma separated names of the feature columns fed to the model.
        output_column: name of the column receiving the predicted score.
        ranking_model: path of the saved model state dict (loaded with torch.load).
        min_max_scaler_path: path of the pickled scaler applied to the features.
        file_path: CSV file to read; takes precedence over ``df`` when given.
        df: input dataframe, used when ``file_path`` is not given.

    Returns:
        a dataframe rebuilt cell by cell (grouped by column, row) in which the
        feature columns hold the scaled values and ``output_column`` holds the
        model score.

    Raises:
        RequiredInputParameterMissingException: no input data, or the model or
            scaler path is missing.
        UnsupportTypeError: FFV does not accept the data as a candidates file.
    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    # Both artefacts are loaded unconditionally below, so both are required.
    # (The previous check referenced an undefined name and could only fail
    # with a NameError.)
    if not ranking_model or not min_max_scaler_path:
        raise RequiredInputParameterMissingException(
            'Both input parameters are required: {} and {}'.format(
                "ranking_model", "min_max_scaler_path"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    ffv = FFV()
    if not ffv.is_candidates_file(df):
        raise UnsupportTypeError("The input file is not a candidate file!")

    # NOTE(review): the network input width is hard-coded to 14 — presumably the
    # number of features the saved model was trained on; confirm against the
    # training code before passing a different feature list.
    model = PairwiseNetwork(14)
    model.load_state_dict(torch.load(ranking_model))

    # SECURITY: pickle.load executes arbitrary code from the file; only point
    # this at scaler files from a trusted source.
    with open(min_max_scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    normalize_features = features.split(",")
    df[normalize_features] = df[normalize_features].astype('float64')

    new_df_list = []
    pred = []
    for _, cell_df in df.groupby(['column', 'row']):
        scaled_df = cell_df.copy()
        scaled_df[normalize_features] = scaler.transform(scaled_df[normalize_features])
        new_df_list.append(scaled_df)

        test_tensor = torch.tensor(scaled_df[normalize_features].to_numpy()).float()
        scores = model.predict(test_tensor)
        # reshape(-1) always yields a 1-D tensor, so a cell with a single
        # candidate contributes a one-element list instead of a bare float
        # (which pred.extend cannot consume).
        pred.extend(scores.reshape(-1).tolist())

    out_df = pd.concat(new_df_list)
    out_df[output_column] = pred
    return out_df
def get_exact_matches(self, column, properties="labels,aliases", size=50, file_path=None, df=None):
    """
    Retrieve candidate KG entities for the values of ``column`` by issuing a
    'fuzzy-match' query through ``self.utility``.

    Args:
        column: the column used for retrieving candidates.
        properties: comma separated names of the KG properties to search,
            default is labels,aliases
        size: maximum number of candidates to retrieve, default is 50.
        file_path: input file in canonical format; takes precedence over ``df``.
        df: input dataframe in canonical format; missing values are replaced
            by empty strings in place.

    Returns:
        whatever ``self.utility.create_candidates_df`` returns (a dataframe in
        candidates format).

    Raises:
        RequiredInputParameterMissingException: neither input was given.
    """
    if df is None and file_path is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    df.fillna(value="", inplace=True)

    candidates = self.utility.create_candidates_df(
        df, column, size, properties, 'fuzzy-match')
    return candidates
def __init__(self, output_column_name, feature_file, feature_name, total_docs, singleton_column,
             input_file=None, df=None):
    """
    Load the candidates table and precompute the per-feature IDF data.

    :param output_column_name: name stored as ``self.output_col_name``
    :param feature_file: passed to ``self.build_qnode_feature_dict``
    :param feature_name: passed to ``self.build_qnode_feature_dict``
    :param total_docs: total number of documents, stored as a float in ``self.N``
    :param singleton_column: name stored as ``self.singleton_column``
    :param input_file: CSV file to read; takes precedence over ``df``
    :param df: input dataframe, used when ``input_file`` is not given
    :raises RequiredInputParameterMissingException: neither input was given
    """
    if input_file is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "input_file", "df"))

    self.input_df = pd.read_csv(input_file) if input_file is not None else df
    # keep the rows of one cell contiguous
    self.input_df = self.input_df.sort_values(['column', 'row'])

    self.output_col_name = output_column_name
    self.N = float(total_docs)

    feature_dicts = self.build_qnode_feature_dict(feature_file, feature_name)
    self.feature_dict, self.feature_count_dict = feature_dicts
    self.feature_idf_dict = self.calculate_idf_features()
    self.singleton_column = singleton_column
def drop_by_score(column, file_path=None, df=None, k=20):
    """
    group the dataframe by column, row and keep only the k best candidates of
    each group, ordered from highest score to lowest score (ties ordered by
    ``kg_id``)

    Args:
        column: column with ranking score
        file_path: input file path; takes precedence over ``df``
        df: or input dataframe. Its score, ``column`` and ``row`` columns are
            converted in place (score to float with NaN -> 0.0, the other two
            to int).
        k: top k candidates

    Returns:
        filtered dataframe, groups in ascending (column, row) order; an empty
        frame with the input columns when there is nothing to keep

    Raises:
        RequiredInputParameterMissingException: neither input was given.
    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format('file_path', 'df'))

    if file_path:
        df = pd.read_csv(file_path)

    # replace na to 0.0
    df[column] = df[column].astype(float).fillna(0.0)
    # astype float first to prevent error of "invalid literal for int() with base 10: '0.0'"
    df["column"] = df["column"].astype(float).astype(int)
    df["row"] = df["row"].astype(float).astype(int)

    # Collect the per-cell slices and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and copied the accumulated frame on every call.
    kept = [
        gdf.sort_values(by=[column, 'kg_id'], ascending=[False, True]).iloc[:k, :]
        for _, gdf in df.groupby(by=['column', 'row'])
    ]
    if not kept:
        return df.iloc[0:0]
    return pd.concat(kept)
def normalize_scores(column='retrieval_score', output_column=None, weights=None, file_path=None, df=None,
                     norm_type=None):
    """
    normalizes the retrieval scores for all the candidate knowledge graph objects for each retrieval method
    for all input cells in a column

    Args:
        column: column name which has the retrieval scores. Default is retrieval_score
        output_column: the output column name where the normalized scores will be stored. Default is input
            column name appended with the suffix _normalized
        weights: a comma separated string of the format
            <retrieval_method_1>:<weight_1>, <retrieval_method_2>:<weight_2>, ... specifying the weights for
            each retrieval method. Whitespace around method names is ignored. Methods that are not listed,
            or listed without a weight, get 1.0.
        file_path: input file path; takes precedence over ``df``
        df: or input dataframe; ``column`` is converted to float in place
        norm_type: 'max_norm' (divide by the maximum score of the group) or 'zscore'. Any other value,
            including the default None, is rejected.

    Returns:
        the groups (one per column/method pair) concatenated and passed through
        ``Utility.sort_by_col_and_row``

    Raises:
        RequiredInputParameterMissingException: neither input was given.
        ValueError: ``norm_type`` is not one of the supported values.
    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format("file_path", "df"))

    # Fail early and explicitly: an unsupported norm_type used to fall through
    # both branches and die in pd.concat with "No objects to concatenate".
    if norm_type not in ('max_norm', 'zscore'):
        raise ValueError(
            "Unsupported norm_type {!r}: expected 'max_norm' or 'zscore'".format(norm_type))

    if output_column is None:
        output_column = '{}_normalized'.format(column)

    method_weights = {}
    if weights is not None:
        for m_w in weights.split(','):
            parts = m_w.split(':')
            # the documented format has a space after each comma, so strip the name
            method_weights[parts[0].strip()] = float(parts[1]) if len(parts) > 1 else 1.0

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    df[column] = df[column].map(float)

    o_df = []
    for (_, method), gdf in df.groupby(by=['column', 'method']):
        weight = method_weights.get(method, 1.0)
        # TODO find a better way to do this without having to make a copy
        fdf = gdf.copy(deep=True)
        if norm_type == 'max_norm':
            max_score = gdf[column].max()
            fdf[output_column] = gdf[column].map(
                lambda x: divide_a_by_b(x, max_score) * weight)
        else:
            mean_score = gdf[column].mean()
            std_score = gdf[column].std()
            fdf[output_column] = gdf[column].map(
                lambda x: zscore_normalization(x, mean_score, std_score) * weight)
        o_df.append(fdf)

    out_df = Utility.sort_by_col_and_row(pd.concat(o_df))
    return out_df
def canonicalize(columns, output_column='label', file_path=None, df=None, file_type='csv', add_context=False,
                 context_column_name="context"):
    """
    translate an input CSV or TSV file to canonical form

    Args:
        columns: the columns in the input file to be linked to KG entities. Multiple columns are specified as a
            comma separated string.
        output_column: specifies the name of a new column to be added. Default output column name is label
        file_path: input file path; takes precedence over ``df``
        df: or input dataframe
        file_type: csv or tsv
        add_context: when True, add for every cell the non-missing values of the other columns of the same
            row, joined with "|"
        context_column_name: the column name for that context information

    Returns:
        a dataframe in canonical form (``column``, ``row``, ``output_column`` and optionally the context
        column) sorted by column then row; ``column`` is the position of the source column in the input and
        ``row`` is the input index label

    Raises:
        RequiredInputParameterMissingException: neither input was given.
        RequiredColumnMissingException: one of ``columns`` is not in the data.
    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, sep=',' if file_type == 'csv' else '\t', dtype=object)

    columns = columns.split(',')
    for column in columns:
        if column not in df.columns:
            raise RequiredColumnMissingException(
                "The input column {} does not exist in given data.".format(
                    column))

    # Loop invariants: position of each requested column and the columns that
    # make up its context.
    column_positions = {column: df.columns.get_loc(column) for column in columns}
    context_columns = {}
    if add_context:
        for column in columns:
            remaining_columns = df.columns.tolist()
            remaining_columns.remove(column)
            context_columns[column] = remaining_columns

    out = []
    for i, v in df.iterrows():
        for column in columns:
            record = {
                'column': column_positions[column],
                'row': i,
                output_column: v[column]
            }
            if add_context:
                remaining_values = v[context_columns[column]].dropna().values.tolist()
                # str() so that non-string cells (frames not read with dtype=object) can be joined
                record[context_column_name] = "|".join(str(value) for value in remaining_values)
            out.append(record)

    if not out:
        # sort_values would raise KeyError on a frame without columns
        out_columns = ['column', 'row', output_column]
        if add_context:
            out_columns.append(context_column_name)
        return pd.DataFrame(columns=out_columns)

    return pd.DataFrame(out).sort_values(by=['column', 'row'])
def get_exact_matches(self, column, lower_case=True, size=50, file_path=None, df=None,
                      auxiliary_fields: List[str] = None, auxiliary_folder: str = None, isa: str = None):
    """
    Retrieve candidate KG entities for the values of ``column`` by issuing an
    'exact-match' query on the ``all_labels.en`` property through ``self.utility``.

    Args:
        column: the column used for retrieving candidates.
        lower_case: forwarded to the candidate search, default is True.
        size: maximum number of candidates to retrieve, default is 50.
        file_path: input file in canonical format; takes precedence over ``df``.
        df: input dataframe in canonical format; missing values are replaced
            by empty strings in place.
        auxiliary_fields: forwarded to the candidate search.
        auxiliary_folder: forwarded to the candidate search.
        isa: when given, adds a term filter on ``instance_ofs.keyword_lower``
            with the lower-cased value.

    Returns:
        whatever ``self.utility.create_candidates_df`` returns (a dataframe in
        candidates format).

    Raises:
        RequiredInputParameterMissingException: neither input was given.
    """
    if df is None and file_path is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    df.fillna(value="", inplace=True)

    if isa:
        extra_musts = {
            "term": {
                "instance_ofs.keyword_lower": {
                    "value": isa.lower()
                }
            }
        }
    else:
        extra_musts = None

    search_options = dict(
        lower_case=lower_case,
        auxiliary_fields=auxiliary_fields,
        auxiliary_folder=auxiliary_folder,
        auxiliary_file_prefix='exact_matches_',
        extra_musts=extra_musts,
    )
    return self.utility.create_candidates_df(
        df, column, size, "all_labels.en", 'exact-match', **search_options)
def vote_by_classifier(model_file, input_file=None, df=None, prob_threshold=0.995):
    """
    Add a binary ``vote_by_classifier`` column computed by a pickled classifier.

    The classifier's ``predict_proba`` is applied to the four feature columns
    ``aligned_pagerank``, ``smallest_qnode_number``, ``monge_elkan`` and
    ``des_cont_jaccard_normalized``; a candidate gets vote 1 when the second
    probability of its row is strictly greater than ``prob_threshold``.

    Args:
        model_file: path of the pickled classifier.
        input_file: CSV file to read; takes precedence over ``df``.
        df: input dataframe, used when ``input_file`` is not given. It is not
            modified.
        prob_threshold: decision threshold; a value that cannot be converted to
            float falls back to 0.

    Returns:
        a copy of the input with the ``vote_by_classifier`` column (0 or 1).

    Raises:
        RequiredInputParameterMissingException: no input data or no model path.
        AssertionError: one of the feature columns is missing.
    """
    if input_file is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "input_file", "df"))

    if not model_file:
        raise RequiredInputParameterMissingException(
            'Model path cannot be None')

    if input_file:
        df = pd.read_csv(input_file, dtype=object)

    features_list = [
        'aligned_pagerank', 'smallest_qnode_number', 'monge_elkan',
        'des_cont_jaccard_normalized'
    ]
    for ft in features_list:
        # Raised explicitly (same exception type as before) so the check is not
        # stripped when Python runs with -O.
        if ft not in df:
            raise AssertionError(f'There\'s no {ft} column in the table!')

    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # model files from a trusted source.
    with open(model_file, 'rb') as fid:
        model_loaded = pickle.load(fid)

    try:
        prob_threshold = float(prob_threshold)
    except (TypeError, ValueError):
        prob_threshold = 0

    # make prediction on target file
    test_features = df.loc[:, features_list]
    prob = model_loaded.predict_proba(test_features)

    odf = df.copy()
    # computed positionally, without leaving a helper column in the caller's frame
    odf['vote_by_classifier'] = [int(p[1] > prob_threshold) for p in prob]
    return odf
def drop_duplicate(column: str, score_col: typing.List[str], keep_method: str = None,
                   file_path: str = None, df: pd.DataFrame = None):
    """
    group the dataframe by column, row and then check if there are duplicate rows on given column,
    remove the duplicated ones and only keep the highest scoring one

    Args:
        column: column with labels
        score_col: columns with ranking scores; duplicates are ordered by these columns, all descending
        keep_method: when given and present among the duplicates, only rows retrieved with this method
            compete for being kept
        file_path: input file path; takes precedence over ``df``
        df: or input dataframe. Its score, ``column`` and ``row`` columns are converted in place.

    Returns:
        filtered dataframe with the columns in the input order. A cell whose ``column`` values are all
        missing is represented by its first row.

    Raises:
        RequiredInputParameterMissingException: neither input was given.
        RequiredColumnMissingException: ``column`` or a score column is not in the data.
    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format('file_path', 'df'))

    if file_path:
        df = pd.read_csv(file_path)

    for each_col in [column] + score_col:
        if each_col not in df.columns:
            raise RequiredColumnMissingException("Column {} does not exist in given dataframe.".format(each_col))

    # replace na to 0.0
    if score_col:
        df[score_col] = df[score_col].astype(float).fillna(0.0)
    # astype float first to prevent error of "invalid literal for int() with base 10: '0.0'"
    df["column"] = df["column"].astype(float).astype(int)
    df["row"] = df["row"].astype(float).astype(int)

    res = []
    for _, gdf in df.groupby(by=['column', 'row']):
        # for those nodes with no candidates, we need to check here.
        # isna() handles both NaN and None (np.isnan raises TypeError on None).
        if gdf[column].isna().all():
            res.append(gdf.iloc[0].to_dict())
            continue

        for _, candidate_df in gdf.groupby(by=column):
            if len(candidate_df) > 1:
                # only do keep method when the method specified exists
                if keep_method is not None and keep_method in candidate_df["method"].unique():
                    candidate_df = candidate_df[candidate_df["method"] == keep_method]
                if score_col and len(candidate_df) > 1:
                    # scalar ascending applies to every score column; the former
                    # one-element list failed with more than one score column
                    candidate_df = candidate_df.sort_values(by=score_col, ascending=False)
            res.append(candidate_df.iloc[0].to_dict())

    # sometimes the column order may changed, resort it to ensure follow original order
    res = pd.DataFrame(res)
    res = res.reindex(columns=df.columns)
    return res
def get_matches(self, column, size=100, file_path=None, df=None, auxiliary_fields: List[str] = None,
                auxiliary_folder: str = None, isa: str = None):
    """
    Retrieve candidates with a 'fuzzy-augmented' query over ``self.properties``.

    :param column: the column used to retrieve the candidates
    :param size: maximum number of candidates to retrieve, default is 100
    :param file_path: input file in canonical format; takes precedence over ``df``
    :param df: input dataframe in canonical format; missing values are replaced
        by empty strings in place
    :param auxiliary_fields: forwarded to the candidate search
    :param auxiliary_folder: forwarded to the candidate search
    :param isa: when given, adds a term filter on ``instance_ofs.keyword_lower``
        with the lower-cased value
    :return: whatever ``self.utility.create_candidates_df`` returns (a dataframe
        in candidates format)
    :raises RequiredInputParameterMissingException: neither input was given
    """
    if df is None and file_path is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    df.fillna(value="", inplace=True)

    properties = self.properties

    if isa:
        extra_musts = {
            "term": {
                "instance_ofs.keyword_lower": {
                    "value": isa.lower()
                }
            }
        }
    else:
        extra_musts = None

    search_options = dict(
        lower_case=False,
        auxiliary_fields=auxiliary_fields,
        auxiliary_folder=auxiliary_folder,
        auxiliary_file_prefix='fuzzy_augmented_',
        extra_musts=extra_musts,
    )
    return self.utility.create_candidates_df(
        df, column, size, properties, 'fuzzy-augmented', **search_options)
def extract_ground_truth(target_column, kg_id_column, kg_label_column, file_path=None, df=None, file_type='csv'):
    """
    Returns ground truth dataframe by extracting columns from input dataframe

    Args:
        target_column: the column in the input file to be linked to KG entities
        kg_id_column: the column in the input file containing the kg identifier
        kg_label_column: the column in the input file containing the kg label
        file_path: input file path; takes precedence over ``df``
        df: or input dataframe
        file_type: csv or tsv

    Returns:
        ground truth dataframe with the columns ``column`` (position of ``target_column`` in the input),
        ``row`` (input index label), ``kg_id`` and ``kg_label``, sorted by column then row

    Raises:
        RequiredInputParameterMissingException: neither input was given.
        RequiredColumnMissingException: one of the three columns is not in the data.
    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    # Load first: the columns must be validated against the data that is
    # actually used (df is None when only file_path is given).
    if file_path:
        df = pd.read_csv(file_path, sep=',' if file_type == 'csv' else '\t', dtype=object)

    for column in [target_column, kg_id_column, kg_label_column]:
        if column not in df.columns:
            raise RequiredColumnMissingException(
                "The input column {} does not exist in given data.".format(
                    column))

    target_column_index = df.columns.get_loc(target_column)

    out = [
        {
            'column': target_column_index,
            'row': i,
            'kg_id': v[kg_id_column],
            'kg_label': v[kg_label_column]
        }
        for i, v in df.iterrows()
    ]
    # explicit columns keep sort_values working when there are no rows
    out_df = pd.DataFrame(out, columns=['column', 'row', 'kg_id', 'kg_label'])
    return out_df.sort_values(by=['column', 'row'])
def ground_truth_labeler(gt_file_path, file_path=None, df=None):
    """
    compares each candidate for the input cells with the ground truth value for that cell and adds an evaluation label.

    Args:
        gt_file_path: ground truth file path.
        file_path: input file path; takes precedence over ``df``
        df: or input dataframe; missing values are replaced by empty strings in place

    Returns:
        the input left-joined with the ground truth on (column, row): the ground truth columns are added as
        `GT_kg_id` and `GT_kg_label` (empty string when a cell has no ground truth) together with the column
        `evaluation_label` computed by ``assign_evaluation_label`` for every row

    Raises:
        RequiredInputParameterMissingException: neither input was given.
    """
    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                'file_path', 'df'))

    gt_df = read_csv(gt_file_path, dtype=object)
    gt_df.rename(columns={
        'kg_id': 'GT_kg_id',
        'kg_label': 'GT_kg_label'
    }, inplace=True)

    if file_path:
        df = read_csv(file_path, dtype=object)
    df.fillna('', inplace=True)

    # kyao: Use only columns defined ground truth file format
    gt_columns = ['GT_kg_id', 'GT_kg_label']
    evaluation_df = pd.merge(
        df,
        gt_df.loc[:, ['column', 'row'] + gt_columns],
        on=['column', 'row'],
        how='left')

    # Assign the filled columns back: an in-place fillna on a selected column
    # acts on a temporary object and is a no-op under pandas copy-on-write.
    evaluation_df[gt_columns] = evaluation_df[gt_columns].fillna(value="")

    evaluation_df['evaluation_label'] = evaluation_df.apply(
        lambda row: assign_evaluation_label(row), axis=1)
    return evaluation_df
def align_page_rank(input_file=None, df=None):
    """
    Add an ``aligned_pagerank`` column that keeps the page rank of exact
    matches and zeroes it for fuzzy-augmented matches.

    Args:
        input_file: CSV file to read; takes precedence over ``df``.
        df: input dataframe, used when ``input_file`` is not given.

    Returns:
        a new dataframe built cell by cell (grouped by column, row): the
        'exact-match' rows of the cell with ``aligned_pagerank`` set to their
        ``pagerank`` as float, followed by its 'fuzzy-augmented' rows with
        ``aligned_pagerank`` set to 0. Rows retrieved by any other method are
        not part of the output.

    Raises:
        RequiredInputParameterMissingException: neither input was given.
        AssertionError: the data has no ``pagerank`` column.
    """
    if input_file is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "input_file", "df"))

    if input_file:
        df = pd.read_csv(input_file, dtype=object)

    # Raised explicitly (same exception type as before) so the check is not
    # stripped when Python runs with -O.
    if 'pagerank' not in df:
        raise AssertionError('There\'s no page rank column in the table!')

    # Collect the pieces and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and copied the accumulated frame on every call.
    pieces = []
    for _, group in df.groupby(['column', 'row']):
        exact_match_df = group[group['method'] == 'exact-match'].copy()
        exact_match_df['aligned_pagerank'] = exact_match_df['pagerank'].astype(float)

        fuzzy_match_df = group[group['method'] == 'fuzzy-augmented'].copy()
        fuzzy_match_df['aligned_pagerank'] = 0

        pieces.extend(piece for piece in (exact_match_df, fuzzy_match_df) if not piece.empty)

    if not pieces:
        return pd.DataFrame(columns=list(df.columns) + ['aligned_pagerank'])
    return pd.concat(pieces)
def get_phrase_matches(self, column, properties="labels^2,aliases", size=50, file_path=None, df=None,
                       filter_condition=None):
    """
    Retrieve candidate KG entities for the values of ``column`` by issuing a
    'phrase-match' query through ``self.utility``.

    Args:
        column: the column used for retrieving candidates.
        properties: comma separated names of the KG properties to search,
            default is labels^2,aliases
        size: maximum number of candidates to retrieve, default is 50.
        file_path: input file in canonical format; takes precedence over ``df``.
        df: input dataframe in canonical format; missing values are replaced
            by empty strings in place.
        filter_condition: when given, the query input is first reduced with
            ``Filter.remove_previous_match_res`` and the query result is merged
            back with ``Filter.combine_result`` using this condition.

    Returns:
        a dataframe in candidates format

    Raises:
        RequiredInputParameterMissingException: neither input was given.
    """
    need_filter = filter_condition is not None

    if df is None and file_path is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format("file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    df.fillna(value="", inplace=True)

    query_input_df = Filter.remove_previous_match_res(df) if need_filter else df

    from tl.utility.utility import Utility
    output_df = self.utility.create_candidates_df(
        query_input_df, column, size, properties, 'phrase-match')
    Utility.eprint(output_df)

    if not need_filter:
        return output_df
    return Filter.combine_result(df, output_df, filter_condition)
def clean(column, output_column=None, file_path=None, df=None, symbols='!@#$%^&*()+={}[]:;’\\”/<>',
          replace_by_space=True, keep_original=False):
    """
    cleans the cell values in a column, creating a new column with the clean values.

    Args:
        column: the column to be cleaned.
        output_column: the name of the column where cleaned column values are stored. If not provided, the name of
            the new column is the name of the input column with the suffix _clean.
        file_path: input file path; takes precedence over ``df``
        df: or input dataframe; the output column is added to it in place
        symbols: a string containing the set of characters to be removed: default is “!@#$%^&*()+={}[]:;’\\”/<>”
        replace_by_space: when True (default) all instances of the symbols are replaced by a space. In case of
            removal of multiple consecutive characters, they’ll be replaced by a single space. The value False
            causes the symbols to be deleted.
        keep_original: when True, the output column will contain the original value and the clean value will be
            appended, separated by |. Default is False

    Returns:
        a dataframe with the new output column containing clean values

    Raises:
        RequiredInputParameterMissingException: neither input was given.
    """
    if file_path is None and df is None:
        # name the missing parameters; formatting their values printed "None or None"
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    symbols = list(symbols)

    if output_column is None:
        output_column = '{}_clean'.format(column)

    if file_path:
        df = pd.read_csv(file_path)

    df[output_column] = df[column].map(
        lambda x: string_clean(x, symbols, replace_by_space, keep_original))
    return df
def create_singleton_feature(output_column, file_path=None, df=None):
    """
    Add a feature derived from the cells that have exactly one exact-match candidate.

    The (column, row) pairs whose 'exact-match' rows contain exactly one
    non-missing ``kg_id`` are collected, and ``is_singleton`` is evaluated for
    every row with its column, row, that collection and its method.

    Args:
        output_column: name of the column receiving the feature.
        file_path: CSV file to read; takes precedence over ``df`` when given.
        df: input dataframe, used when ``file_path`` is not given.

    Returns:
        the dataframe with ``output_column`` added in place.

    Raises:
        RequiredInputParameterMissingException: neither input was given.
        UnsupportTypeError: FFV does not accept the data as a candidates file.
    """
    if df is None and file_path is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    if not FFV().is_candidates_file(df):
        raise UnsupportTypeError("The input file is not a candidate file!")

    exact_rows = df[df['method'] == 'exact-match']
    exact_match_count = exact_rows.groupby(['column', 'row'])[['kg_id']].count()
    singleton_mask = exact_match_count['kg_id'] == 1
    exact_match_singleton = list(exact_match_count[singleton_mask].index)

    def _row_feature(candidate):
        return is_singleton(candidate['column'], candidate['row'],
                            exact_match_singleton, candidate['method'])

    df[output_column] = df.apply(_row_feature, axis=1)
    return df
def combine_linearly(weights, output_column='ranking_score', file_path=None, df=None):
    """
    combines two or more score-columns for candidate knowledge graph objects
    into one score per row, using ``linear_combination`` with the given weights.

    Args:
        weights: a comma separated string, in the format
            <score-column-1>:<weight-1>,<score-column-2>:<weight-2>,...
            A score-column listed without a weight gets 1.0.
        output_column: the output column name where the combined scores will
            be stored. Default is ranking_score
        file_path: input file path; takes precedence over ``df``
        df: input dataframe; the weighted columns have their missing values
            replaced by 0 and the output column is added, both in place

    Returns:
        a dataframe in ranking score file format

    Raises:
        RequiredInputParameterMissingException: neither input was given.
    """
    if df is None and file_path is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format('file_path', 'df'))

    column_weights = {}
    if weights is not None:
        for entry in weights.split(','):
            pieces = entry.split(':')
            column_weights[pieces[0]] = float(pieces[1]) if len(pieces) > 1 else 1.0

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    # fill up na to 0 for computation
    target_column_names = list(column_weights)
    df[target_column_names] = df[target_column_names].fillna(0)

    def _combine(row):
        return linear_combination(row, column_weights)

    df[output_column] = df.apply(_combine, axis=1)
    return df
def __init__(self, df: pd.DataFrame, sort_by_gt: bool = False, gt_score_column: str = None,
             output_path: str = None):
    """
    Preprocess the dataframe, write it to an Excel workbook and record the row
    range of every cell.

    :param df: dataframe to export; stored as ``self.df``
    :param sort_by_gt: when True, ``self.df`` is replaced by the result of
        ``self.sort_by_gt(gt_score_column)``
    :param gt_score_column: column passed to ``self.sort_by_gt``
    :param output_path: path of the workbook created with the xlsxwriter engine
    :raises RequiredInputParameterMissingException: ``output_path`` is empty
    """
    if not output_path:
        raise RequiredInputParameterMissingException(
            "output path must be given.")

    self.df = df
    self._preprocess()
    if sort_by_gt:
        self.df = self.sort_by_gt(gt_score_column)

    self.writer = pd.ExcelWriter(output_path, engine='xlsxwriter')
    self.workbook = self.writer.book
    self.worksheet = self.workbook.add_worksheet('Sheet1')
    self._write_to_excel()

    # first and last index label of every (column, row) group, shifted by one
    self.parts = []
    for _, each_group in self.df.groupby(["column", "row"]):
        first_label = each_group.index[0]
        last_label = each_group.index[-1]
        self.parts.append([first_label + 1, last_label + 1])
def get_matches(self, column, size=50, file_path=None, df=None, output_column_name: str = "retrieval_score"):
    """
    uses KGTK search API to retrieve identifiers of KG entities matching the input search term.

    Args:
        column: the column used for retrieving candidates.
        size: maximum number of candidates to retrieve, default is 50.
        file_path: input file in canonical format; takes precedence over ``df``
        df: input dataframe in canonical format; missing values are replaced by empty strings in place
        output_column_name: the output column name where the score returned by the API is stored.
            Default is retrieval_score

    Returns:
        a dataframe in candidates format. For a canonical input only the retrieved candidates are returned
        (one row with empty candidate fields for a cell without results); for a candidates input they are
        appended to the input and the result is sorted by column, row and ``column``.

    Raises:
        RequiredInputParameterMissingException: neither input was given.
        UnsupportTypeError: the input is neither a canonical nor a candidates file.
    """
    from urllib.parse import quote

    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                "file_path", "df"))

    if file_path:
        df = pd.read_csv(file_path, dtype=object)

    df.fillna(value="", inplace=True)

    columns = df.columns

    results_dict = {}
    for uniq_label in df[column].unique():
        # The label is a URL path segment: percent-encode it so that characters
        # such as '?', '#', '/' or '%' cannot truncate or corrupt the request.
        api_search_url = f"{self.api_url}/" \
                         f"{quote(str(uniq_label), safe='')}?extra_info=true&language=en&type=ngram&size={size}&lowercase=true"
        # SECURITY(review): verify=False disables TLS certificate validation.
        # Kept because the deployment may rely on it — confirm whether it is
        # still required.
        results_dict[uniq_label] = requests.get(api_search_url, verify=False).json()

    new_df_list = []
    seen_keys = set()
    for _, row in df.iterrows():
        row_key = f"{row['column']}_{row['row']}_{row[column]}"
        if row_key in seen_keys:
            continue
        seen_keys.add(row_key)

        input_values = {c: row[c] for c in columns}
        search_results = results_dict.get(row[column], [])
        if len(search_results) > 0:
            for sr in search_results:
                candidate = dict(input_values)
                candidate['kg_id'] = sr['qnode']
                candidate['pagerank'] = sr['pagerank']

                kg_label = []
                if 'label' in sr and len(sr['label']) > 0:
                    kg_label.extend(sr['label'])
                if 'alias' in sr and len(sr['alias']) > 0:
                    kg_label.extend(sr['alias'])
                candidate['kg_labels'] = "|".join(kg_label)

                candidate['method'] = 'kgtk-search'

                kg_description = ''
                if 'description' in sr and len(sr['description']) > 0:
                    kg_description = "|".join(sr['description'])
                candidate['kg_descriptions'] = kg_description

                candidate[output_column_name] = sr['score']
                new_df_list.append(candidate)
        else:
            # keep the cell in the output even though nothing was retrieved
            candidate = dict(input_values)
            candidate['kg_id'] = ''
            candidate['pagerank'] = ''
            candidate['kg_labels'] = ''
            candidate['method'] = ''
            candidate['kg_descriptions'] = ''
            candidate[output_column_name] = ''
            new_df_list.append(candidate)

    if self.ffv.is_canonical_file(df):
        return pd.DataFrame(new_df_list)

    if self.ffv.is_candidates_file(df):
        return pd.concat([df, pd.DataFrame(new_df_list)
                          ]).sort_values(by=['column', 'row', column])

    raise UnsupportTypeError(
        "The input file is neither a canonical file or a candidate file!")
def metrics(column, file_path=None, df=None, k: typing.Union[int, typing.List[int]] = 1, tag=""):
    """
    computes the precision, recall and f1 score for the tl pipeline.

    Precision is the fraction of relevant cells whose correct candidate has the highest score of the cell;
    recall at k is the fraction of relevant cells whose correct candidate is ranked within the top k
    (rank taken among the rows whose evaluation label is not '0', ordered by score then kg_id). A cell is
    relevant when it has at least one row whose evaluation label is not '0'.

    Args:
        column: column with ranking score
        file_path: input file path; takes precedence over ``df``
        df: or input dataframe
        k: calculate recall at top k candidates; a single int or a list of ints
        tag: a tag to use in the output file to identify the results of running the given pipeline

    Returns:
        a dataframe with one row per requested k (ascending) and the columns
        k, f1, precision, recall, tag. All scores are 0.0 when there is no relevant cell.

    Raises:
        RequiredInputParameterMissingException: neither input was given.
    """
    # always ensure k is a list
    if isinstance(k, int):
        k = [k]

    if file_path is None and df is None:
        raise RequiredInputParameterMissingException(
            'One of the input parameters is required: {} or {}'.format(
                'file_path', 'df'))

    if file_path:
        df = read_csv(file_path, dtype=object)

    # remove duplicate candidates if exist
    df = normalize_scores.drop_duplicate("kg_id", [column], df=df)

    # replace na to 0.0
    df[column] = df[column].astype(float).fillna(0.0)
    # the string alias avoids the deprecated path for builtin callables
    df['max_score'] = df.groupby(by=['column', 'row'])[column].transform('max')

    # relevant df
    rdf = df[df['evaluation_label'] != '0']

    grouped = rdf.groupby(by=['column', 'row'])
    n = len(grouped)

    # Sets, so a cell is counted once even when several correct candidates tie;
    # one entry per requested k, so a k without any hit is still reported.
    top1_cells = set()
    topk_cells = {each_k: set() for each_k in k}

    for key, gdf in grouped:
        gdf = gdf.sort_values(by=[column, 'kg_id'], ascending=[False, True]).reset_index()
        # this df is sorted by score, so highest ranked candidate is rank 1 and so on...
        for rank, (_, row) in enumerate(gdf.iterrows(), start=1):
            if not (row['evaluation_label'] == '1' or row['evaluation_label'] == 1.0):
                continue
            if row[column] == row['max_score']:
                top1_cells.add(key)
            for each_k in k:
                if rank <= each_k:
                    topk_cells[each_k].add(key)

    precision = float(len(top1_cells)) / float(n) if n else 0.0

    result_rows = []
    for each_k in sorted(topk_cells):
        each_recall = float(len(topk_cells[each_k])) / float(n) if n else 0.0
        if precision == 0 and each_recall == 0:
            f1_score = 0.0
        else:
            f1_score = (2 * precision * each_recall) / (precision + each_recall)
        result_rows.append({
            "k": each_k,
            'f1': f1_score,
            'precision': precision,
            'recall': each_recall,
            'tag': tag
        })

    output_df = pd.DataFrame(result_rows, columns=['k', 'f1', 'precision', 'recall', 'tag'])
    return output_df