def get_candidates(self, table: Table) -> List[GeneratorResult]:
    """
    Annotate each table column separately, finding the column types and the
    relationships between the current column and the others.
    :param table: a Table object
    :return: a list of GeneratorResult
    """
    col_search_keys = {}
    for cell in table.get_gt_cells():
        if cell.col_id not in col_search_keys:
            col_search_keys[cell.col_id] = []
        col_search_keys[cell.col_id].append(table.get_search_key(cell))

    # Split each column into chunks of at most 500 search keys
    col_search_keys = {col: chunk_list(search_keys, 500)
                       for col, search_keys in col_search_keys.items()}

    if self._config.max_workers == 1:
        results = [self._get_candidates_for_column(search_keys)
                   for search_keys_list in col_search_keys.values()
                   for search_keys in search_keys_list]
    else:
        with ProcessPoolExecutor(self._config.max_workers) as pool:
            results = pool.map(self._get_candidates_for_column,
                               [search_keys
                                for search_keys_list in col_search_keys.values()
                                for search_keys in search_keys_list])

    return functools.reduce(operator.iconcat, results, [])
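
# Illustrative sketch only: a chunking helper consistent with how chunk_list is
# called above (chunk_list(search_keys, 500) -> list of batches). The project's
# actual implementation may differ; the name below is hypothetical.
def chunk_list_sketch(items, chunk_size):
    """Split `items` into consecutive batches of at most `chunk_size` elements."""
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]

# Example: chunk_list_sketch(list(range(5)), 2) -> [[0, 1], [2, 3], [4]]
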
def _tables_to_pkl(self):
    cea = pd.read_csv(self._gt_path('CEA'),
                      names=['tab_id', 'col_id', 'row_id', 'entities'],
                      dtype={'tab_id': str, 'col_id': int, 'row_id': int, 'entities': str})
    cea['entities'] = cea['entities'].apply(str.split)

    cta_groups = None
    if os.path.exists(self._gt_path('CTA')):
        cta = pd.read_csv(self._gt_path('CTA'),
                          names=['tab_id', 'col_id', 'perfect', 'okay'],
                          dtype={'tab_id': str, 'col_id': int, 'perfect': str, 'okay': str},
                          keep_default_na=False)  # the "okay" value might be empty
        cta['perfect'] = cta['perfect'].apply(str.split)
        cta['okay'] = cta['okay'].apply(str.split)
        cta_groups = cta.groupby('tab_id')

    cpa_groups = None
    if os.path.exists(self._gt_path('CPA')):
        cpa = pd.read_csv(self._gt_path('CPA'),
                          names=['tab_id', 'source_id', 'target_id', 'properties'],
                          dtype={'tab_id': str, 'source_id': int, 'target_id': int, 'properties': str})
        cpa['properties'] = cpa['properties'].apply(str.split)
        cpa_groups = cpa.groupby('tab_id')

    cea_groups = cea.groupby('tab_id')
    for tab_id, cea_group in cea_groups:
        table = Table(tab_id, self.value, self._table_path(tab_id))
        table.set_gt_cell_annotations(zip(cea_group['row_id'], cea_group['col_id'], cea_group['entities']))

        if cta_groups is not None and tab_id in cta_groups.groups:
            cta_group = cta_groups.get_group(tab_id)
            table.set_gt_column_annotations(zip(cta_group['col_id'], cta_group['perfect'], cta_group['okay']))

        if cpa_groups is not None and tab_id in cpa_groups.groups:
            cpa_group = cpa_groups.get_group(tab_id)
            table.set_gt_property_annotations(zip(cpa_group['source_id'], cpa_group['target_id'], cpa_group['properties']))

        pickle.dump(table, open(f"{self._pickle_table_folder_path()}/{table.tab_id}.pkl", 'wb'))
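
# Hedged usage sketch: what a CEA ground-truth frame looks like after the
# read_csv + str.split steps above. The table ids and URIs below are invented
# for illustration only.
import pandas as pd

cea_example = pd.DataFrame(
    {'tab_id': ['tbl_001', 'tbl_001'],
     'col_id': [0, 0],
     'row_id': [1, 2],
     'entities': ['http://dbpedia.org/resource/Rome http://dbpedia.org/resource/Roma',
                  'http://dbpedia.org/resource/Paris']})
cea_example['entities'] = cea_example['entities'].apply(str.split)
# -> each 'entities' cell becomes a list of URIs, e.g. ['http://dbpedia.org/resource/Rome', ...]
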
def get_candidates(self, table: Table) -> List[GeneratorResult]:
    """
    Candidate selection method. This implementation simply forwards the LookupService results.
    :param table: a Table object
    :return: a list of GeneratorResult
    """
    search_keys = [table.get_search_key(cell_) for cell_ in table.get_gt_cells()]
    if self._config.max_workers == 1:
        # wrap in a list so the final reduce flattens a list of result lists,
        # matching the shape produced by the parallel branch
        results = [self._lookup_candidates(search_keys)]
    else:
        # Parallelize at cell level (no dependencies between cells in the same col/row)
        with ProcessPoolExecutor(self._config.max_workers) as pool:
            results = pool.map(self._lookup_candidates,
                               chunk_list(search_keys, self._config.chunk_size))

    return functools.reduce(operator.iconcat, results, [])
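
# Minimal sketch of the flattening idiom used above: reduce + operator.iconcat
# concatenates the per-chunk result lists into a single flat list.
import functools
import operator

chunked_results = [['r1', 'r2'], ['r3'], ['r4', 'r5']]   # e.g. one list per chunk/worker
flat_results = functools.reduce(operator.iconcat, chunked_results, [])
assert flat_results == ['r1', 'r2', 'r3', 'r4', 'r5']
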
def annotate_table(self, table: Table):
    folder_path = os.path.join(os.path.dirname(__file__), 'annotations', table.dataset_id, self._generator.id)
    Path(folder_path).mkdir(parents=True, exist_ok=True)
    filename = os.path.join(folder_path, '%s.pkl' % table.tab_id)

    if not os.path.exists(filename):  # check for an existing result
        # keep the cell-search_key pairs -> results may be shuffled!
        search_key_cell_dict = table.get_search_keys_cells_dict()

        if isinstance(self._generator, HybridGenerator):
            # try to reuse already annotated tables
            tables = []
            for generator in self._generator.generators:
                subfolder_path = os.path.join(os.path.dirname(__file__), 'annotations', table.dataset_id, generator.id)
                subfilename = os.path.join(subfolder_path, '%s.pkl' % table.tab_id)
                if not os.path.exists(subfilename):
                    break
                tables.append(pickle.load(open(subfilename, 'rb')))

            if len(tables) == len(self._generator.generators):
                # combine results from the already annotated tables
                results = HybridGeneratorSimulator.get_candidates(*tables)
            else:
                results = self._generator.get_candidates(table)
        else:
            results = self._generator.get_candidates(table)

        for search_key, candidates in results:
            if candidates:
                for cell in search_key_cell_dict[search_key]:
                    table.annotate_cell(cell, Entity(candidates[0]))  # first candidate = best

        pickle.dump(table, open(filename, 'wb'))
        gc.collect()

    return pickle.load(open(filename, 'rb'))
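
# Hedged sketch: annotate_table caches one pickle per (dataset, generator, table)
# under annotations/<dataset_id>/<generator_id>/<tab_id>.pkl, as built by the
# os.path.join calls above. The helper below is hypothetical and only recomputes
# that path for illustration.
import os

def cached_annotation_path(base_dir, dataset_id, generator_id, tab_id):
    """Return the cache location assumed by the sketch (base_dir is illustrative)."""
    return os.path.join(base_dir, 'annotations', dataset_id, generator_id, '%s.pkl' % tab_id)

# e.g. cached_annotation_path('/repo/experiments', 'Round1', 'LookupGenerator', 'tbl_001')
#      -> '/repo/experiments/annotations/Round1/LookupGenerator/tbl_001.pkl'
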
def get_test_dataset(cls, size, from_dataset=None, rand=False):
    """
    Helper method to generate a test dataset on-the-fly.
    :param size: dimension of the test dataset to create (# cells)
    :param from_dataset: dataset to sample rows from. Default: Round1
    :param rand: True if the rows should be sampled randomly; otherwise, the top ``size`` rows are returned.
    :return: a dataset enum member whose value is the list of sampled tables
    """
    if from_dataset is None:
        from_dataset = cls.ST19_Round1

    cea = pd.read_csv(from_dataset._gt_path('CEA'),
                      names=['tab_id', 'col_id', 'row_id', 'entities'],
                      dtype={'tab_id': str, 'col_id': int, 'row_id': int, 'entities': str})
    if rand:
        cea = cea.sample(size).reset_index()
    else:
        cea = cea[:size]

    cta_groups = None
    if os.path.exists(from_dataset._gt_path('CTA')):
        cta = pd.read_csv(from_dataset._gt_path('CTA'),
                          names=['tab_id', 'col_id', 'perfect', 'okay'],
                          dtype={'tab_id': str, 'col_id': int, 'perfect': str, 'okay': str},
                          keep_default_na=False)  # the "okay" value might be empty
        cta['perfect'] = cta['perfect'].apply(str.split)
        cta['okay'] = cta['okay'].apply(str.split)
        cta_groups = cta.groupby('tab_id')

    cpa_groups = None
    if os.path.exists(from_dataset._gt_path('CPA')):
        cpa = pd.read_csv(from_dataset._gt_path('CPA'),
                          names=['tab_id', 'source_id', 'target_id', 'properties'],
                          dtype={'tab_id': str, 'source_id': int, 'target_id': int, 'properties': str})
        cpa['properties'] = cpa['properties'].apply(str.split)
        cpa_groups = cpa.groupby('tab_id')

    cea_groups = cea.groupby('tab_id')
    tables = []
    for tab_id, cea_group in cea_groups:
        table = Table(tab_id, f'{from_dataset.value}_test', from_dataset._table_path(tab_id))
        table.set_gt_cell_annotations(zip(cea_group['row_id'], cea_group['col_id'], cea_group['entities']))

        if cta_groups is not None and tab_id in cta_groups.groups:
            cta_group = cta_groups.get_group(tab_id)
            # keep only the CTA annotations of the sampled columns
            cta_group = cta_group[cta_group['col_id'].isin(cea_group['col_id'].unique())]
            table.set_gt_column_annotations(zip(cta_group['col_id'], cta_group['perfect'], cta_group['okay']))

        if cpa_groups is not None and tab_id in cpa_groups.groups:
            cpa_group = cpa_groups.get_group(tab_id)
            # keep only the CPA annotations whose source and target columns were both sampled
            cpa_group = cpa_group[(cpa_group['source_id'].isin(cea_group['col_id'].unique()))
                                  & (cpa_group['target_id'].isin(cea_group['col_id'].unique()))]
            table.set_gt_property_annotations(zip(cpa_group['source_id'], cpa_group['target_id'], cpa_group['properties']))

        tables.append(table)

    # create a temporary enum that mimics a GT dataset
    tmp = Enum('GTTestEnum', {'%s_TEST_%d' % (from_dataset.name, size): tables})
    setattr(tmp, 'get_tables', lambda x: x.value)  # add get_tables, which returns the sampled tables
    setattr(tmp, 'get_table_categories', lambda x: from_dataset.get_table_categories())
    setattr(tmp, 'total_tables', lambda x: len(tables))

    return list(tmp)[0]
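
# Hedged usage sketch for get_test_dataset. "GTEnum" below is a placeholder for
# the enclosing dataset enum (the class that exposes ST19_Round1 above); the
# call site is illustrative only and kept commented out.
#
#   test_gt = GTEnum.get_test_dataset(size=100, rand=True)
#   for table in test_gt.get_tables():   # get_tables() returns the sampled Table objects
#       print(table.tab_id, len(table.get_gt_cells()))
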
def get_candidates(self, table: Table) -> List[GeneratorResult]:
    """
    Return a list of candidates, sorted by the cosine distance between the search key
    (label + context) embeddings and the candidates' abstract embeddings.
    :param table: a Table object
    :return: a list of GeneratorResult
    """
    search_keys = [table.get_search_key(cell_) for cell_ in table.get_gt_cells()]
    lookup_results = dict(self._lookup_candidates(search_keys))  # collect lookup results from the super class

    # create an embedding for each label and context pair
    cached_entries, to_compute = self._get_cached_entries(search_keys)
    new_results = self._embed_search_keys(to_compute)
    self._update_cache(new_results)  # write new entries to cache
    search_keys_embs = dict(cached_entries + new_results)

    # create an embedding for the candidates' abstracts
    candidates_list = functools.reduce(operator.iconcat, lookup_results.values(), [])
    if self._config.abstract == 'short':
        abstracts = self._abstract_helper.fetch_short_abstracts(candidates_list)
    else:
        abstracts = self._abstract_helper.fetch_long_abstracts(candidates_list)
    abstracts = {candidate: truncate_string(abstract, self._config.abstract_max_tokens)
                 for candidate, abstract in abstracts.items()}

    cached_entries, to_compute = self._get_cached_entries(abstracts.values())
    new_results = self._embed_abstracts(to_compute)
    self._update_cache(new_results)
    abstracts_embeddings = dict(cached_entries + new_results)

    # do not zip! abstracts.values() might contain duplicates...
    abstracts_embs = {candidate: abstracts_embeddings[abstract]
                      for candidate, abstract in abstracts.items()}

    results = []
    for search_key in search_keys:
        candidates_embeddings = []
        context_emb = np.nan
        if search_key.context and search_keys_embs[search_key].size:
            context_emb = search_keys_embs[search_key]
        for candidate in lookup_results[search_key]:
            abstract_emb = np.nan
            if candidate in abstracts and abstracts_embs[candidate].size:
                abstract_emb = abstracts_embs[candidate]
            candidates_embeddings.append(CandidateEmbeddings(candidate, context_emb, abstract_emb))

        results.append(GeneratorResult(
            search_key,
            [c.candidate for c in weighting_by_ranking(candidates_embeddings,
                                                       self._config.alpha,
                                                       self._config.default_score)]))
    return results
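
# Minimal sketch (not the project's weighting_by_ranking) of the underlying idea:
# score each candidate by the cosine similarity between the search-key context
# embedding and the candidate's abstract embedding, then sort descending.
# Both helper names below are hypothetical.
import numpy as np

def cosine_sim_sketch(v1, v2):
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0

def rank_by_context_similarity(context_emb, candidate_abstract_embs):
    """candidate_abstract_embs: dict mapping candidate -> abstract embedding (np.ndarray)."""
    scored = [(cand, cosine_sim_sketch(context_emb, emb))
              for cand, emb in candidate_abstract_embs.items()]
    return [cand for cand, _ in sorted(scored, key=lambda t: t[1], reverse=True)]
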
def get_candidates(self, table: Table) -> List[GeneratorResult]:
    col_search_keys = {}
    row_search_keys = {}
    for cell in table.get_gt_cells():
        if cell.col_id not in col_search_keys:
            col_search_keys[cell.col_id] = []
        if cell.row_id not in row_search_keys:
            row_search_keys[cell.row_id] = []
        col_search_keys[cell.col_id].append(table.get_search_key(cell))
        row_search_keys[cell.row_id].append(table.get_search_key(cell))

    lookup_results_col = {}
    for col, search_key in col_search_keys.items():
        lookup_results_col[col] = dict(self._lookup_candidates(search_key))
    lookup_results_row = {}
    for row, search_key in row_search_keys.items():
        lookup_results_row[row] = dict(self._lookup_candidates(search_key))

    # Create a complete directed k-partite disambiguation graph, where k is the number of search keys.
    disambiguation_graph_col = []
    disambiguation_graph_row = []
    sk_nodes_col = []
    sk_nodes_row = []
    personalization = {}  # prepare dict for PageRank with normalized priors
    embeddings = {}

    for (col, lookup) in enumerate(lookup_results_col.values()):
        disambiguation_graph_col.append(nx.DiGraph())
        sk_nodes_col.append(dict())
        for search_key, candidates in lookup.items():
            degrees = self._dbp.get_degree_for_uris(candidates)
            embeddings.update(self._w2v.get_vectors(candidates))

            # Filter candidates that have an embedding in w2v.
            nodes = sorted([(candidate, {'weight': degrees[candidate]})
                            for candidate in candidates
                            if embeddings[candidate] is not None],
                           key=lambda x: x[1]['weight'], reverse=True)

            # Take only the max_candidates most relevant (highest prior probability) candidates.
            nodes = nodes[:self._config.max_candidates]
            disambiguation_graph_col[col].add_nodes_from(nodes)
            sk_nodes_col[col][search_key] = [n[0] for n in nodes]

            # Store normalized priors
            weights_sum = sum([x[1]['weight'] for x in nodes])
            for node, props in nodes:
                if node not in personalization:
                    personalization[node] = []
                personalization[node].append(props['weight'] / weights_sum if weights_sum > 0 else 0)

    for (row, lookup) in enumerate(lookup_results_row.values()):
        disambiguation_graph_row.append(nx.DiGraph())
        sk_nodes_row.append(dict())
        for search_key, candidates in lookup.items():
            degrees = self._dbp.get_degree_for_uris(candidates)
            embeddings.update(self._w2v.get_vectors(candidates))

            # Filter candidates that have an embedding in w2v.
            nodes = sorted([(candidate, {'weight': degrees[candidate]})
                            for candidate in candidates
                            if embeddings[candidate] is not None],
                           key=lambda x: x[1]['weight'], reverse=True)

            # Take only the max_candidates most relevant (highest prior probability) candidates.
            nodes = nodes[:self._config.max_candidates]
            disambiguation_graph_row[row].add_nodes_from(nodes)
            sk_nodes_row[row][search_key] = [n[0] for n in nodes]

            # Store normalized priors
            weights_sum = sum([x[1]['weight'] for x in nodes])
            for node, props in nodes:
                if node not in personalization:
                    personalization[node] = []
                personalization[node].append(props['weight'] / weights_sum if weights_sum > 0 else 0)

    # Predict types using the classifier, for both the column- and row-wise candidates
    node_types = self._type_predictor.predict_types([node
                                                     for sk_nodes in sk_nodes_col
                                                     for nodes in list(sk_nodes.values())
                                                     for node in nodes])
    node_types.update(self._type_predictor.predict_types([node
                                                          for sk_nodes in sk_nodes_row
                                                          for nodes in list(sk_nodes.values())
                                                          for node in nodes]))

    # Get type embeddings. A set is used to remove duplicates.
    type_embeddings = self._tee.get_vectors(list({t for types in list(node_types.values()) for t in types}))

    # Add weighted edges among the nodes in the disambiguation graph.
    # Avoid connecting nodes in the same partition.
    # Edge weights are the cosine similarity between the two nodes the edge connects.
    # Only positive weights are considered.
    for (col, k) in enumerate(sk_nodes_col):
        for search_key, nodes in k.items():
            other_nodes = set(disambiguation_graph_col[col].nodes()) - set(nodes)
            for node, other_node in product(nodes, other_nodes):
                if type_embeddings[node_types[node][0]] is not None \
                        and type_embeddings[node_types[other_node][0]] is not None:
                    v1 = type_embeddings[node_types[node][0]]
                    v2 = type_embeddings[node_types[other_node][0]]
                    cos_sim = cosine_similarity(v1, v2)
                    if cos_sim > 0:
                        disambiguation_graph_col[col].add_weighted_edges_from([(node, other_node, cos_sim)])

    for (row, k) in enumerate(sk_nodes_row):
        for search_key, nodes in k.items():
            other_nodes = set(disambiguation_graph_row[row].nodes()) - set(nodes)
            for node, other_node in product(nodes, other_nodes):
                v1 = embeddings[node]
                v2 = embeddings[other_node]
                cos_sim = cosine_similarity(v1, v2)
                if cos_sim > 0:
                    disambiguation_graph_row[row].add_weighted_edges_from([(node, other_node, cos_sim)])

    # Merge the column- and row-wise graphs into a single disambiguation graph
    disambiguation_graph = nx.DiGraph()
    for col in disambiguation_graph_col:
        disambiguation_graph = nx.compose(disambiguation_graph, col)
    for row in disambiguation_graph_row:
        disambiguation_graph = nx.compose(disambiguation_graph, row)

    # Thin out the fraction of edges whose weights are the lowest
    thin_out = int(self._config.thin_out_frac * len(disambiguation_graph.edges.data("weight")))
    disambiguation_graph.remove_edges_from(
        sorted(disambiguation_graph.edges.data("weight"), key=lambda tup: tup[2])[:thin_out])

    # PageRank computation - epsilon is doubled until convergence
    page_rank = None
    epsilon = 1e-6
    while page_rank is None:
        try:
            page_rank = nx.pagerank(disambiguation_graph,
                                    tol=epsilon, max_iter=50, alpha=0.9,
                                    personalization={node: np.mean(weights)
                                                     for node, weights in personalization.items()})
        except nx.PowerIterationFailedConvergence:
            epsilon *= 2  # a smaller factor can be used too, since PageRank is extremely fast

    # Sort candidates -> the higher the score, the better the candidate (reverse=True)
    return [GeneratorResult(search_key,
                            [c.candidate
                             for c in sorted([ScoredCandidate(candidate, page_rank[candidate])
                                              for candidate in candidates], reverse=True)])
            for sk_nodes in sk_nodes_col
            for search_key, candidates in sk_nodes.items()]
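
# Hedged, self-contained sketch of the PageRank step above on a toy graph:
# personalization carries the (averaged) normalized prior of each candidate, and
# the tolerance is doubled whenever power iteration fails to converge. The graph
# and priors below are invented for illustration.
import networkx as nx
import numpy as np

toy = nx.DiGraph()
toy.add_weighted_edges_from([('A', 'B', 0.9), ('B', 'A', 0.4), ('A', 'C', 0.2)])
priors = {'A': [0.5, 0.7], 'B': [0.3], 'C': [0.2]}   # per-partition normalized priors

scores, epsilon = None, 1e-6
while scores is None:
    try:
        scores = nx.pagerank(toy, alpha=0.9, max_iter=50, tol=epsilon,
                             personalization={n: np.mean(w) for n, w in priors.items()})
    except nx.PowerIterationFailedConvergence:
        epsilon *= 2  # relax the tolerance and retry, mirroring the loop above
# scores: dict node -> PageRank score; higher means a better candidate
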