def __init__(self, es_url, es_index, es_user=None, es_pass=None,
             output_column_name: str = "retrieval_score"):
    self.es = Search(es_url, es_index, es_user=es_user, es_pass=es_pass)
    self.utility = Utility(self.es, output_column_name)
def __init__(self, es_url, es_index, es_user=None, es_pass=None,
             score_column_name: str = "retrieval_score",
             previous_match_column_name: str = "retrieval_score"):
    self.es = Search(es_url, es_index, es_user=es_user, es_pass=es_pass)
    self.utility = Utility(self.es, score_column_name, previous_match_column_name)
import math
from collections import defaultdict

import pandas as pd

# Search and RequiredColumnMissingException are defined elsewhere in the repository.


class TFIDF(object):
    def __init__(self, **kwargs):
        """
        Initialize qnodes_dict as the input required by the original tfidf implementation.
        It is a dict with
            key: Q node id
            value: list of edges in the format "property#node2"
        :param kwargs:
        """
        self.input_df = pd.read_csv(kwargs['input_file'], dtype=object)
        self.output_col_name = kwargs["output_column_name"]
        self.similarity_column = kwargs["similarity_column"]
        if self.similarity_column not in self.input_df.columns:
            raise RequiredColumnMissingException(
                "Similarity column {} does not exist in input.".format(self.similarity_column))
        self.es = Search(kwargs["url"], kwargs["index"],
                         es_user=kwargs.get("user"), es_pass=kwargs.get("password"))
        self.qnodes_dict = {}
        nodes_candidates = self.input_df["kg_id"].dropna().unique().tolist()
        for each in self.es.get_node_info(nodes_candidates):
            node_id = each["_source"]["id"]
            node_edges_info = each["_source"]["edges"]
            self.qnodes_dict[node_id] = node_edges_info
        # properties_classes_map is a dict mapping the P nodes or Q nodes to a unique integer id (starting from 0)
        self.properties_classes_map = self.create_all_properties_classes_map()

    @staticmethod
    def get_properties_classes_for_qnode(edges):
        properties_classes_set = set()
        for wd_prop_val in edges:
            edge, value = wd_prop_val.split('#', 1)
            # strip surrounding quote marks from the value, if any
            if len(value) > 6 and value[:3] == '"""' and value[-3:] == '"""':
                value = value[3:-3]
            elif len(value) > 2:
                if value[0] == "'" and value[-1] == "'":
                    value = value[1:-1]
                elif value[0] == '"' and value[-1] == '"':
                    value = value[1:-1]
            # add edges
            properties_classes_set.add(edge)
            # if "instance of" (P31), also add the class value
            if edge == 'P31':
                properties_classes_set.add(value)
        return properties_classes_set

    def create_all_properties_classes_map(self):
        # map each property / class to a corresponding unique number id
        properties_classes_set = set()
        for qnode in self.qnodes_dict:
            v = self.qnodes_dict[qnode]
            properties_classes_set.update(self.get_properties_classes_for_qnode(v))
        return {p: idx for idx, p in enumerate(properties_classes_set)}

    def create_feature_vector_dict(self, label_candidates_dict):
        # creates input for tfidf computation
        feature_vector_dict = {}
        _p_c_len = len(self.properties_classes_map)
        for label, candidates in label_candidates_dict.items():
            feature_vector_dict[label] = {}
            for candidate in candidates:
                feature_vector = [0] * _p_c_len
                if candidate in self.qnodes_dict:
                    prop_class_list = self.get_properties_classes_for_qnode(self.qnodes_dict[candidate])
                    for _p_c in prop_class_list:
                        if _p_c in self.properties_classes_map:
                            feature_vector[self.properties_classes_map[_p_c]] = 1
                feature_vector_dict[label][candidate] = feature_vector
        return feature_vector_dict

    def compute_tfidf(self):
        """
        Compute TF/IDF for all candidates.

        Args:
            candidates:
            ```
            {
                e1: {
                    q1: [f1, f2, f3],
                    q2: [f1, f2, f3]
                },
                'e2': ...
            }
            ```
            `[f1, f2, f3]` is a feature vector. All vectors should have the same length.
            feature_count: Length of the feature vector.
            high_precision_candidates: `{e1: q1, e2: q2}`. If None, all qnodes will be used to compute tf.

        Returns:
            ```
            {
                e1: {q1: 1.0, q2: 0.9},
                e2: {q3: 0.1}
            }
            ```
        """
        label_candidates_dict = defaultdict(list)
        high_precision_candidates = defaultdict(set)
        for _, each in self.input_df.iterrows():
            if isinstance(each["kg_id"], str) and each["kg_id"] != "":
                label_candidates_dict[each["label"]].append(each["kg_id"])
                if each["method"] == "exact-match":
                    high_precision_candidates[each["label"]].add(each["kg_id"])
        candidates = self.create_feature_vector_dict(label_candidates_dict)
        feature_count = len(self.properties_classes_map)
        tfidf_values = [{'tf': 0, 'df': 0, 'idf': 0} for _ in range(feature_count)]
        corpus_num = sum(len(qs) for _, qs in candidates.items())

        # get normalized similarity score
        similarity_score_col = self.input_df[self.similarity_column].astype(float)
        max_score = max(similarity_score_col)
        min_score = min(similarity_score_col)
        temp = self.input_df.copy()
        if max_score != 1.0 or min_score < 0:
            score_range = max_score - min_score
            temp["||similarity_score_col_normalized||"] = similarity_score_col.apply(
                lambda x: (x - min_score) / score_range)
        else:
            temp["||similarity_score_col_normalized||"] = similarity_score_col
        similarity_score_dict = {}
        for _, each_row in temp.iterrows():
            similarity_score_dict[(each_row["label"], each_row["kg_id"])] = \
                each_row["||similarity_score_col_normalized||"]

        # compute tf
        for f_idx in range(feature_count):
            for e in candidates:
                for q, v in candidates[e].items():
                    if high_precision_candidates.get(e) and q in high_precision_candidates[e]:
                        if v[f_idx] == 1:
                            tfidf_values[f_idx]['tf'] += 1
                    else:
                        tfidf_values[f_idx]['tf'] += 1

        # compute df
        for f_idx in range(feature_count):
            for e in candidates:
                for q, v in candidates[e].items():
                    if v[f_idx] == 1:
                        tfidf_values[f_idx]['df'] += 1

        # compute idf
        for f_idx in range(len(tfidf_values)):
            if tfidf_values[f_idx]['df'] == 0:
                tfidf_values[f_idx]['idf'] = 0
            else:
                tfidf_values[f_idx]['idf'] = math.log(float(corpus_num) / tfidf_values[f_idx]['df'], 10)

        # compute final score
        ret = {}
        for e in candidates:
            for q, v in candidates[e].items():
                ret[q] = 0
                for f_idx in range(feature_count):
                    ret[q] += tfidf_values[f_idx]['tf'] * tfidf_values[f_idx]['idf'] * v[f_idx] \
                        * similarity_score_dict.get((e, q), 1)

        output_df = self.input_df.copy()
        output_df[self.output_col_name] = output_df['kg_id'].map(ret)
        return output_df
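# Usage sketch (not part of the class above): how a TFIDF instance might be built and run.
# The kwargs keys mirror the ones read in __init__; the file names, URL and index name below
# are hypothetical placeholders, not values taken from this repository.
# The final score of candidate q for cell e is essentially
#   score(q) = sum_f tf[f] * idf[f] * feature_vector_q[f] * normalized_similarity(e, q)
if __name__ == "__main__":
    tfidf_kwargs = {
        "input_file": "candidates.csv",        # hypothetical candidate file in table-linker format
        "output_column_name": "tfidf_score",
        "similarity_column": "retrieval_score",
        "url": "http://localhost:9200",        # hypothetical Elasticsearch URL
        "index": "wikidata_index",             # hypothetical index name
        "user": None,
        "password": None,
    }
    scorer = TFIDF(**tfidf_kwargs)
    scored_df = scorer.compute_tfidf()
    scored_df.to_csv("candidates_with_tfidf.csv", index=False)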
def __init__(self, es_url, es_index, es_user, es_pass, properties, output_column_name):
    self.properties = properties
    self.es = Search(es_url, es_index, es_user=es_user, es_pass=es_pass)
    self.utility = Utility(self.es, output_column_name)
import math
import os
import random
import sys
import tempfile
import typing
from collections import defaultdict
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine, euclidean

# Search, Utility, TLException and main_embedding_function are provided elsewhere in the repository.


class EmbeddingVector:
    """
    A class supporting embedding-vector ranking operations.
    """

    def __init__(self, parameters):
        self.vectors_map = {}
        self.sentence_map = {}
        self.kwargs = parameters
        self.loaded_file = None
        self.kgtk_format_input = None
        self.centroid = {}
        self.groups = defaultdict(set)
        self.es = Search(self.kwargs["url"], self.kwargs["index"],
                         es_user=self.kwargs.get("user"),
                         es_pass=self.kwargs.get("password"))

    def load_input_file(self, input_file):
        """
        Read the input file.
        """
        self.loaded_file = pd.read_csv(input_file, dtype=object)
        self._to_kgtk_test_format()

    def _to_kgtk_test_format(self):
        """
        Wrap the input file into kgtk format input.
        :return:
        """
        # remove rows whose evaluation label equals 0 (which means no ground truth)
        self.groups = defaultdict(set)
        if "evaluation_label" in self.loaded_file.columns:
            self.loaded_file = self.loaded_file[self.loaded_file['evaluation_label'] != '0']
        all_info = {}
        count = 0
        correspond_key = {"label_clean": "label", "kg_id": "candidates", "GT_kg_id": "kg_id"}
        for i, each_part in self.loaded_file.groupby(["column", "row"]):
            info = {}
            for each_choice in correspond_key.keys():
                if each_choice in each_part.columns:
                    temp = list(set(each_part[each_choice].unique()))
                    temp_filtered = []
                    for each in temp:
                        if each != "" and not isinstance(each, float):
                            temp_filtered.append(each)
                    info[correspond_key[each_choice]] = temp_filtered
                else:
                    info[correspond_key[each_choice]] = []
            if len(info['kg_id']) > 1 or len(info['label']) > 1:
                Utility.eprint("WARNING: pair {} has multiple ground truths?".format(i))
            self.groups[i[0]].update(info["candidates"])
            self.groups[i[0]].update(info["kg_id"])
            info["label"] = info["label"][0]
            if len(info["kg_id"]) > 0:
                info["kg_id"] = info["kg_id"][0]
            else:
                info["kg_id"] = " "
            info["candidates"] = "|".join(info["candidates"])
            all_info[count] = info
            count += 1
        self.kgtk_format_input = pd.DataFrame.from_dict(all_info, orient='index')

    def process_vectors(self):
        """
        Apply the corresponding vector strategy to process the calculated vectors.
        :return:
        """
        vector_strategy = self.kwargs.get("column_vector_strategy", "exact-matches")
        if vector_strategy == "page-rank":
            self._calculate_page_rank()
        elif vector_strategy == "page-rank-precomputed":
            self._get_precomputed_page_rank()
        else:
            self._get_centroid(vector_strategy)

    def _generate_graph(self):
        """
        Build the per-column similarity graphs used to calculate page rank.
        :return:
        """
        Utility.eprint("start calculating page rank, it may take some time.")
        import networkx as nx
        # calculate probability to next stage
        # calculate probability based on columns
        col_memo = {}
        nodes_memo = {}
        graph_memo = {}
        similarity_memo = {}
        for col_number, each_part in self.loaded_file.groupby(["column"]):
            # first calculate all distances for the memo
            all_nodes = set(each_part['kg_id']) - {"", np.nan}
            all_nodes_list = list(all_nodes)
            for i, each_node in enumerate(all_nodes):
                col_memo[each_node] = col_number
            for i in range(len(all_nodes_list)):
                for j in range(i + 1, len(all_nodes_list)):
                    similarity = self.compute_distance(self.vectors_map[all_nodes_list[i]],
                                                       self.vectors_map[all_nodes_list[j]])
                    similarity_memo[(all_nodes_list[i], all_nodes_list[j])] = similarity
                    similarity_memo[(all_nodes_list[j], all_nodes_list[i])] = similarity
            similarity_graph = nx.DiGraph()
            similarity_graph.add_nodes_from(all_nodes)
            graph_memo[col_number] = similarity_graph
            nodes_memo[col_number] = all_nodes

        for i, each_row in self.kgtk_format_input.iterrows():
            each_surface = each_row["candidates"].split("|")
            if len(each_surface) > 0:
                for each_node_i in each_surface:
                    if each_node_i == "":
                        continue
                    col_number = col_memo[each_node_i]
                    all_nodes_set = nodes_memo[col_number]
                    remained_nodes = all_nodes_set - set(each_surface)
                    # calculate the sum of scores first
                    sum_score = 0
                    for each_node_j in remained_nodes:
                        sum_score += similarity_memo[(each_node_i, each_node_j)]
                    for each_node_j in remained_nodes:
                        # pos = (pos_memo[each_node_i], pos_memo[each_node_j])
                        each_weight = similarity_memo[(each_node_i, each_node_j)] / sum_score
                        graph_memo[col_number].add_edge(each_node_i, each_node_j, weight=each_weight)
        return graph_memo

    def _calculate_page_rank(self):
        import networkx as nx
        # just get the initial page rank to do filtering
        weights_original = {}
        graph_memo = self._generate_graph()
        for each_graph in graph_memo.values():
            weights_original.update(dict(each_graph.degree(weight='weight')))
        self.loaded_file['|pr|'] = self.loaded_file['kg_id'].map(weights_original)
        from tl.features.normalize_scores import drop_by_score
        self.loaded_file = drop_by_score(column="|pr|", df=self.loaded_file, k=20)
        # also we need to update the kgtk format input
        self._to_kgtk_test_format()
        # create the graph again based on the filtered result
        res = {}
        graph_memo = self._generate_graph()
        # it seems pagerank_numpy runs quickest
        for each_graph in graph_memo.values():
            res.update(nx.pagerank_numpy(each_graph, alpha=0.9))
        self.loaded_file['|pr|'] = self.loaded_file['kg_id'].map(res)

    def _get_precomputed_page_rank(self):
        """
        Get the precomputed pagerank from the whole wikidata graph.
        :return:
        """
        pageranks = {
            k: v[0] if len(v) > 0 else 0
            for k, v in self.es.search_node_pagerank(
                self.loaded_file['kg_id'].dropna().unique().tolist()).items()
        }
        self.loaded_file["|pr|"] = self.loaded_file['kg_id'].map(pageranks).fillna(0)

    def _get_centroid(self, vector_strategy: str):
        """
        Calculate the column-vector (centroid) value.
        """
        n_value = int(self.kwargs.pop("n_value"))
        if vector_strategy == "ground-truth":
            if "GT_kg_id" not in self.loaded_file:
                raise TLException("The input file does not have `GT_kg_id` column! "
                                  "Can't run with ground-truth strategy")
            candidate_nodes = list(set(self.loaded_file["GT_kg_id"].tolist()))
        elif vector_strategy == "exact-matches":
            candidate_nodes = list(set(self.loaded_file["kg_id"].tolist()))
        else:
            raise TLException("Unknown vector strategy {}".format(vector_strategy))

        candidate_nodes = [each for each in candidate_nodes if each != "" and each is not np.nan]

        # get the corresponding column of each candidate node
        nodes_map = defaultdict(set)
        for each_node in candidate_nodes:
            for group, nodes in self.groups.items():
                if each_node in nodes:
                    nodes_map[group].add(each_node)

        # randomly sample nodes if needed
        nodes_map_updated = {}
        for group, nodes in nodes_map.items():
            if n_value != 0 and n_value < len(nodes):
                nodes_map_updated[group] = random.sample(nodes, n_value)
            else:
                nodes_map_updated[group] = nodes

        # get the centroid for each column
        for group, nodes in nodes_map_updated.items():
            temp = []
            for each_node in sorted(list(nodes)):
                temp.append(self.vectors_map[each_node])
            each_centroid = np.mean(np.array(temp), axis=0)
            self.centroid[group] = each_centroid

    def compute_distance(self, v1: typing.List[float], v2: typing.List[float]):
        if self.kwargs["distance_function"] == "cosine":
            val = 1 - cosine(v1, v2)
        elif self.kwargs["distance_function"] == "euclidean":
            val = euclidean(v1, v2)
            # because we need a higher score to be better, here we use the reciprocal value
            if val == 0:
                val = float("inf")
            else:
                val = 1 / val
        else:
            raise TLException("Unknown distance function {}".format(self.kwargs["distance_function"]))
        return val

    def add_score_column(self):
        score_column_name = self.kwargs["output_column_name"]
        if score_column_name is None:
            score_column_name = "score_{}".format(self.kwargs["column_vector_strategy"])
            i = 1
            while score_column_name in self.loaded_file:
                i += 1
                score_column_name = "score_{}_{}".format(self.kwargs["column_vector_strategy"], i)

        if self.kwargs["column_vector_strategy"] in {"page-rank", "page-rank-precomputed"}:
            self.loaded_file = self.loaded_file.rename(columns={'|pr|': score_column_name})
        else:
            scores = []
            for i, each_row in self.loaded_file.iterrows():
                # the nan value can also be a float
                if (isinstance(each_row["kg_id"], float) and math.isnan(each_row["kg_id"])) \
                        or each_row["kg_id"] is np.nan:
                    each_score = ""
                else:
                    each_score = self.compute_distance(self.centroid[each_row["column"]],
                                                       self.vectors_map[each_row["kg_id"]])
                scores.append(each_score)
            self.loaded_file[score_column_name] = scores

        if self.kwargs["save_embedding_feature"]:
            self.loaded_file['sentence'] = self.loaded_file['kg_id'].map(self.sentence_map)
            self.loaded_file['vector'] = self.loaded_file['kg_id'].map(self.vectors_map)

        if self.kwargs["ignore_empty_sentences"]:
            # remove sentences which are the same as the kg ids
            self.loaded_file = self.loaded_file[
                self.loaded_file['kg_id'] != self.loaded_file['sentence'].apply(
                    lambda x: x[:-1] if isinstance(x, str) else x)]

    def _create_detail_has_properties(self):
        """
        Load the property file, remove unnecessary properties and collect the ones
        whose values should be inspected, if needed.
        :return: None
        """
        model_file_path = os.path.join(
            repr(__file__).replace("'", "").replace("/text_embedding.py", ""),
            "predicate_counts_and_labels.tsv")
        if os.path.exists(model_file_path):
            properties_df = pd.read_csv(model_file_path, sep='\t')
        else:
            return
        # process
        need_isa_properties = {"P31"}
        need_has_properties = set()
        for _, each_row in properties_df.iterrows():
            if not isinstance(each_row["label"], str) and np.isnan(each_row["label"]):
                continue
            if each_row["operation"] == "check_inside" or each_row["label"].endswith("of'@en"):
                need_isa_properties.add(each_row["predicate"])
                continue
            elif each_row["operation"] == "bl":
                continue
            else:
                if "ID" in each_row["label"] or \
                        "identifier" in each_row["label"].lower() or \
                        "common" in each_row["label"].lower():
                    continue
                need_has_properties.add(each_row["predicate"])
        self.kwargs["has_properties"] = list(need_has_properties)
        self.kwargs["isa_properties"] = list(need_isa_properties)

    def get_vectors(self):
        """
        Send the table-linker format data to kgtk vector embedding,
        then load the output and get the vector map.
        """
        # no vector calculation needed for precomputed pagerank
        if self.kwargs.get("column_vector_strategy") == "page-rank-precomputed":
            return
        # transform format to kgtk format input
        temp_file = tempfile.NamedTemporaryFile(mode='r+', suffix=".csv")
        self.kgtk_format_input.to_csv(temp_file, index=False)
        temp_file.seek(0)
        self.kwargs["input_file"] = Path(temp_file.name)
        self.kwargs["input_format"] = "test_format"
        self.kwargs["_debug"] = self.kwargs["debug"]
        self.kwargs["output_uri"] = "none"
        self.kwargs["use_cache"] = True
        # always send true to kgtk, so that we can get the sentences to check whether they are empty or not
        self.kwargs["save_embedding_sentence"] = True
        if self.kwargs["has_properties"] == ["all"] and self.kwargs["isa_properties"] == ["P31"] \
                and self.kwargs["use_default_file"]:
            self._create_detail_has_properties()

        # catch the stdout to a string
        old_stdout = sys.stdout
        sys.stdout = output_vectors = StringIO()

        main_embedding_function(**self.kwargs)
        sys.stdout = old_stdout

        # read the output vectors
        output_vectors.seek(0)
        _ = output_vectors.readline()
        for each_line in output_vectors.readlines():
            each_line = each_line.replace("\n", "").split("\t")
            each_q = each_line[0]
            each_edge = each_line[1]
            if each_edge == "embedding_sentence":
                each_sentence = each_line[2]
                self.sentence_map[each_q] = each_sentence
            else:
                each_vector = np.array([float(each_v) for each_v in each_line[2].split(",")])
                self.vectors_map[each_q] = each_vector

        # save the kgtk output vector file if needed
        if self.kwargs["projector_file_name"] is not None:
            self.save_vector_file(output_vectors)
        output_vectors.close()

    def save_vector_file(self, vector_io):
        output_path = self.kwargs["projector_file_name"]
        if "/" not in output_path:
            output_path = os.path.join(os.getcwd(), output_path)
        vector_io.seek(0)
        with open(output_path, "w") as f:
            f.writelines(vector_io.readlines())

    def print_output(self):
        self.loaded_file.to_csv(sys.stdout, index=False)
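# Usage sketch (not part of the class above): the typical call order for EmbeddingVector.
# Every value below is a hypothetical placeholder; in the repository this parameter
# dictionary is normally assembled from command-line options, and main_embedding_function
# may require additional keys beyond the ones shown here.
if __name__ == "__main__":
    params = {
        "url": "http://localhost:9200",            # hypothetical Elasticsearch URL
        "index": "wikidata_index",                 # hypothetical index name
        "user": None,
        "password": None,
        "column_vector_strategy": "exact-matches",
        "distance_function": "cosine",
        "output_column_name": None,
        "n_value": 0,
        "save_embedding_feature": False,
        "ignore_empty_sentences": False,
        "projector_file_name": None,
        "debug": False,
        "has_properties": ["all"],
        "isa_properties": ["P31"],
        "use_default_file": True,
    }
    ranker = EmbeddingVector(params)
    ranker.load_input_file("candidates.csv")       # hypothetical table-linker candidate file
    ranker.get_vectors()                           # run kgtk text embedding and collect vectors
    ranker.process_vectors()                       # centroid or page-rank strategy
    ranker.add_score_column()
    ranker.print_output()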