def search_loose(label, relation, value):
    """
    Execute a search operation on a given label allowing a big margin of edit distance
    (Levenshtein), restricting the results to those that have at least one fact
    <result, relation, value>.
    :param label: the label to look for
    :param relation: a relation
    :param value: a value
    :return: a list of results
    """
    eslookup = ESLookupFuzzy(ESLookupConfig('titan', 'dbpedia'))
    abc = DBpediaWrapper()
    results = eslookup._lookup(labels=[label])[0][1]
    removeList = []
    for res in results:
        relations = abc.get_relations([(res, value)])
        remove = True
        for pair, col_relations in relations.items():
            if relation in col_relations:
                remove = False
        if remove:
            removeList.append(res)
    for rem in removeList:
        results.remove(rem)
    return results
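# Illustrative usage sketch (not part of the original code): how search_loose()
# might be called once a relation has been learned for a support column. The
# label, relation URI, and value below are assumptions made only for the example.
def _example_search_loose():
    matches = search_loose(label='Rome',
                           relation='http://dbpedia.org/ontology/country',
                           value='Italy')
    return matches[0] if matches else None  # best fuzzy match that also holds the fact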
def getDescriptionTokens(uri):
    """
    Get the description tokens of the given URI
    :param uri: the URI to be read
    :return: a list of keywords
    """
    abc = DBpediaWrapper()
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    result = abc._get_es_docs_by_ids(uri)
    stop_words = set(stopwords.words('english'))
    for _, doc in result:
        if len(doc['description']) > 0:
            word = simplify_string(doc['description'][0],
                                   dates=False, numbers=False, single_char=False, brackets=True)
            word = first_sentence(word)
            if word is not None:
                word_tokens = tokenizer.tokenize(word.lower())
                return [word for word in word_tokens if word not in stop_words]
    return []
def containsFact(uri, a, v):
    """
    For a given URI, retrieve all the relations whose value is v
    :param uri: the URI to be read
    :param a: the relation's name
    :param v: the value of the relation a
    :return: a list of tuples (a, relation found)
    """
    abc = DBpediaWrapper()
    toRemove = [
        'http://dbpedia.org/ontology/abstract',
        'http://dbpedia.org/ontology/wikiPageWikiLink',
        'http://www.w3.org/2000/01/rdf-schema#comment',
        'http://purl.org/dc/terms/subject',
        'http://www.w3.org/2000/01/rdf-schema#label',
        'http://www.w3.org/2002/07/owl#Thing'
    ]
    relations = abc.get_relations([(uri, v)])
    candidateRelations = []
    for pair, col_relations in relations.items():
        for rel in col_relations:
            if rel not in toRemove:
                candidateRelations.append((a, rel))
    return candidateRelations
def getTypes(uri):
    """
    Get the types of the given URI
    :param uri: the URI to be read
    :return: a list of types
    """
    abc = DBpediaWrapper()
    toRemove = ['http://www.w3.org/2002/07/owl#Thing']
    result = abc._get_es_docs_by_ids(uri)
    types = []
    for _, doc in result:
        for x in doc['type']:
            if x not in toRemove:
                types.append(x)
    return types
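# Illustrative sketch (not part of the original code): how the helpers above are
# meant to be combined to profile a single top candidate. The URI and the
# attribute/value pair are assumptions made only for the example.
def _example_profile_candidate():
    uri = 'http://dbpedia.org/resource/Rome'
    types = getTypes(uri)                           # direct types, owl:Thing excluded
    tokens = getDescriptionTokens(uri)              # keywords from the first sentence of the description
    facts = containsFact(uri, 'country', 'Italy')   # relations of uri whose value is 'Italy'
    return types, tokens, facts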
def annotate_table(table):
    # for table in tables:
    #     count += index
    #     index = 0
    table_filename = f"./gianluca/{table.tab_id}.pkl"
    if not os.path.exists(table_filename):
        abc = DBpediaWrapper()
        dblookup = DBLookup()
        allTypes, descTokens, candidateRelations, relations = [], [], [], []
        firstResult = {}
        search_key_cell_dict = {table.get_search_key(cell_): cell_ for cell_ in table.get_gt_cells()}

        for search_key, cell in search_key_cell_dict.items():
            label = search_key.label
            # labelList.append(label)
            results = dblookup._lookup(labels=[label])[0][1]  # all URIs of a given label
            if len(results) > 0:
                topResult = results[0]
                firstResult[cell] = results[0]
                allTypes += getTypes(topResult)
                descTokens += getDescriptionTokens(topResult)
                if len(results) == 1:
                    table.annotate_cell(cell, Entity(topResult))
                    # for a, v in referenceColumns[row][index].items():
                    for a, v in search_key.context:
                        candidateRelations += containsFact(topResult, a, v)
            # else:
            #     firstResult.append("not annotated")
            # index += 1

        acceptableTypes = getMostFrequent(allTypes, n=5)
        descriptionTokens = getMostFrequent(descTokens)
        candidateRelations = atLeast5(candidateRelations)

        # for attr in referenceColumns[row][index - 1]:
        for attr, _ in table.get_search_key(table.get_gt_cells()[0]).context:  # iterate over the context attributes
            relations += getFirst(candidateRelations, attr)

        # index = 0
        # for label in labelColumn[row]:
        for search_key, cell in search_key_cell_dict.items():
            # if annotationList[count + index] != "not annotated":
            if search_key in table.cell_annotations:
                # index += 1
                continue
            label = search_key.label
            results = search_strict(label=label, types=acceptableTypes, description=descriptionTokens)
            if len(results) > 0:
                topResult = results[0]
                table.annotate_cell(cell, Entity(topResult))
                # annotationList[count + index] = topResult
                # index += 1
                continue

            for r in relations:
                # results = search_loose(label=label, relation=r[1], value=referenceColumns[row][index][r[0]])
                results = search_loose(label=label, relation=r[1], value=dict(search_key.context)[r[0]])
                if len(results) > 0:
                    topResult = results[0]
                    # annotationList[count + index] = topResult
                    table.annotate_cell(cell, Entity(topResult))
                    break

            # if annotationList[count + index] == "not annotated" and firstResult[index] != "not annotated":
            if search_key not in table.cell_annotations and cell in firstResult:
                label_ = abc.get_labels_for_uris([firstResult[cell]])[firstResult[cell]]
                if len(label_) > 0:
                    if label_[0] == label:
                        table.annotate_cell(cell, Entity(firstResult[cell]))
            # index += 1

        with open(table_filename, 'wb') as f:
            pickle.dump(table, f)

    with open(table_filename, 'rb') as f:
        return pickle.load(f)
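# Usage sketch (illustrative, not part of the original code): annotate_table()
# expects a Table exposing tab_id, get_gt_cells(), get_search_key(),
# annotate_cell() and cell_annotations, and caches each annotated table under
# ./gianluca/<tab_id>.pkl. How the tables are loaded depends on the project.
def _example_annotate(tables):
    annotated = [annotate_table(table) for table in tables]
    return annotated[0].cell_annotations if annotated else {}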
class EmbeddingCandidateGenerator(CandidateGenerator):
    """
    Abstract generator that re-ranks candidates according to vector similarities.
    For each candidate, both the abstract and label embeddings are computed and
    then compared using the cosine distance measure.
    """

    def __init__(self, *lookup_services: LookupService, config: EmbeddingCandidateGeneratorConfig):
        super().__init__(*lookup_services, config=config)
        self._abstract_helper = DBpediaWrapper()
        self._cache = Cache(os.path.join(os.path.dirname(__file__),
                                         '.cache',
                                         self.__class__.__name__,
                                         self._config.cache_dir()),
                            size_limit=int(8e9))

    def _embed_search_keys(self, search_keys: List[SearchKey]) -> List[Embedding]:
        """
        Abstract method to compute search keys embeddings.
        :param search_keys: the list of SearchKey to embed
        :return: a list of embeddings
        """
        raise NotImplementedError

    def _embed_abstracts(self, abstracts: List[str]) -> List[Embedding]:
        """
        Abstract method to compute abstracts embeddings.
        :param abstracts: the list of abstracts to embed
        :return: a list of embeddings
        """
        raise NotImplementedError

    def _update_cache(self, embeddings: List[Embedding]):
        """
        Update cache entries with new embeddings
        :param embeddings: a list of Embedding
        :return:
        """
        for embedding in embeddings:
            self._cache.set(embedding.key, embedding)  # ALWAYS override!

    def _get_cached_entries(self, keys: List[Union[str, SearchKey]]
                            ) -> Tuple[List[Embedding], List[Union[str, SearchKey]]]:
        """
        Retrieve already computed embeddings from cache
        :param keys: a list of keys to retrieve
        :return: a tuple (<cached results>, <labels to embed>)
        """
        to_compute = []
        cached_entries = []
        for key in keys:
            entry = self._cache.get(key)
            if entry is None:
                to_compute.append(key)
            else:
                cached_entries.append(entry)
        return cached_entries, to_compute

    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        """
        Return a list of candidates, sorted by the cosine distance between their
        label and context embeddings.
        :param table: a Table object
        :return: a list of GeneratorResult
        """
        search_keys = [table.get_search_key(cell_) for cell_ in table.get_gt_cells()]
        lookup_results = dict(self._lookup_candidates(search_keys))  # collect lookup results from the super class

        # create embeddings for each label and context pair
        cached_entries, to_compute = self._get_cached_entries(search_keys)
        new_results = self._embed_search_keys(to_compute)
        self._update_cache(new_results)  # write new entries to cache
        search_keys_embs = dict(cached_entries + new_results)

        # create embeddings for the candidates' abstracts
        candidates_list = functools.reduce(operator.iconcat, lookup_results.values(), [])
        if self._config.abstract == 'short':
            abstracts = self._abstract_helper.fetch_short_abstracts(candidates_list)
        else:
            abstracts = self._abstract_helper.fetch_long_abstracts(candidates_list)
        abstracts = {candidate: truncate_string(abstract, self._config.abstract_max_tokens)
                     for candidate, abstract in abstracts.items()}

        cached_entries, to_compute = self._get_cached_entries(abstracts.values())
        new_results = self._embed_abstracts(to_compute)
        self._update_cache(new_results)
        abstracts_embeddings = dict(cached_entries + new_results)

        # do not zip! abstracts.values() might contain duplicates...
        abstracts_embs = {candidate: abstracts_embeddings[abstract]
                          for candidate, abstract in abstracts.items()}

        results = []
        for search_key in search_keys:
            candidates_embeddings = []
            context_emb = np.nan
            if search_key.context and search_keys_embs[search_key].size:
                context_emb = search_keys_embs[search_key]
            for candidate in lookup_results[search_key]:
                abstract_emb = np.nan
                if candidate in abstracts and abstracts_embs[candidate].size:
                    abstract_emb = abstracts_embs[candidate]
                candidates_embeddings.append(CandidateEmbeddings(candidate, context_emb, abstract_emb))

            results.append(GeneratorResult(
                search_key,
                [c.candidate for c in weighting_by_ranking(candidates_embeddings,
                                                           self._config.alpha,
                                                           self._config.default_score)]))
        return results
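# Minimal sketch (illustrative, not part of the original code) of a concrete
# subclass: only the two abstract hooks need to be provided. It is assumed that
# Embedding is a (key, vector) pair, as its use in get_candidates() suggests, and
# the `encoder` object (exposing encode(list_of_str) -> 2-D array) is a
# hypothetical stand-in for whatever text-embedding model the project plugs in.
class ToyEmbeddingGenerator(EmbeddingCandidateGenerator):

    def __init__(self, *lookup_services, config, encoder):
        super().__init__(*lookup_services, config=config)
        self._encoder = encoder

    def _embed_search_keys(self, search_keys):
        # Embed "label + context values" as a single string per search key.
        texts = [' '.join([sk.label] + [value for _, value in sk.context]) for sk in search_keys]
        vectors = self._encoder.encode(texts) if texts else []
        return [Embedding(sk, vector) for sk, vector in zip(search_keys, vectors)]

    def _embed_abstracts(self, abstracts):
        vectors = self._encoder.encode(abstracts) if abstracts else []
        return [Embedding(abstract, vector) for abstract, vector in zip(abstracts, vectors)]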
class FactBase(CandidateGenerator):
    """
    Candidate generation method that implements the FactBase lookup [Efthymiou+, 2017].
    """

    def __init__(self, *lookup_services: LookupService, config: FactBaseConfig):
        super().__init__(*lookup_services, config=config)
        self._dbp = DBpediaWrapper()
        self._stats = FactBaseStats(self.__class__.__name__)

    def _get_descriptions_tokens(self, uris: List[str]) -> Dict[str, List[str]]:
        """
        Get the tokens of the descriptions of a given list of entities
        :param uris: a list of URIs
        :return: a dict uri: tokens
        """
        tokens = {}
        for uri, descriptions in self._dbp.get_descriptions_for_uris(uris).items():
            if descriptions:
                desc = simplify_string(descriptions[0],
                                       dates=False, numbers=False, single_char=False, brackets=True)
                short_desc = first_sentence(desc)
                if short_desc:
                    tokens[uri] = tokenize(short_desc)
                else:
                    tokens[uri] = []
            else:
                tokens[uri] = []
        return tokens

    def _contains_facts(self, facts: Dict[Any, List[Tuple[str, str]]], min_occurrences: int):
        """
        Return a dict with a list of properties sorted by fact occurrences.
        Properties with less than `min_occurrences` are filtered out.
        :param facts: a dict {id: [uri-literal pairs]}
        :param min_occurrences: minimum number of occurrences
        :return: a dict {id: [list of prop-#occurrences]}
        """
        relations = {}
        for col, pairs in facts.items():
            all_col_relations = []
            for pair, col_relations in self._dbp.get_relations(pairs).items():
                all_col_relations += col_relations
            relations[col] = sorted([(rel, count)
                                     for rel, count in Counter(all_col_relations).items()
                                     if count >= min_occurrences],
                                    key=lambda item: item[1], reverse=True)
        return relations

    def _search_strict(self, candidates: List[str],
                       acceptable_types: List[str], types: Dict[str, List[str]],
                       acceptable_tokens: List[str], description_tokens: Dict[str, List[str]]) -> List[str]:
        """
        Execute a search operation on a given label, restricting the results to those
        of an acceptable type that have one of the most frequent tokens in their
        description values. If the acceptable types (or tokens) are not given, all
        types (tokens) are considered valid.
        :param candidates: a list of candidates from LookupResult
        :param acceptable_types: a list of acceptable types
        :param types: a dict uri: types
        :param acceptable_tokens: a list of acceptable tokens
        :param description_tokens: a dict uri: tokens
        :return: a list of candidates
        """
        refined_candidates = []
        acceptable_types_set = set(acceptable_types)
        acceptable_tokens_set = set(acceptable_tokens)
        for candidate in candidates:
            type_hit = True
            if acceptable_types:
                type_hit = set(types[candidate]) & acceptable_types_set
            token_hit = True
            if acceptable_tokens:
                token_hit = set(description_tokens[candidate]) & acceptable_tokens_set
            if type_hit and token_hit:
                refined_candidates.append(candidate)  # preserve ordering
        return refined_candidates

    def _search_loose(self, label: str, relation: str, value: str, threshold: float = -1) -> List[str]:
        """
        Execute a fuzzy search (Levenshtein) and keep the results for which there
        exists a fact <result, relation, value>. Return the subjects sorted by their
        edit distance from the label.
        :param label: the label to look for
        :param relation: a relation
        :param value: a value
        :param threshold: max edit distance to consider. Set a negative value to ignore it
        :return: a list of results
        """
        candidates = []
        for candidate, c_labels in self._dbp.get_subjects(relation, value).items():
            scores = sorted([(candidate, edit_distance(label, c_label) / max(len(label), len(c_label)))
                             for c_label in c_labels],
                            key=lambda s: s[1])
            if scores:  # keep the best label for each candidate
                if threshold < 0:
                    candidates.append(scores[0])
                elif scores[0][1] <= threshold:
                    candidates.append(scores[0])
        return [c[0] for c in sorted(candidates, key=lambda s: s[1])]  # sort by edit distance

    def _get_candidates_for_column(self, search_keys: List[SearchKey]) -> List[GeneratorResult]:
        """
        Generate candidates for a set of search keys.
        The assumption is that all the search keys belong to the same column.
        :param search_keys: a list of search keys
        :return:
        """
        lookup_results = dict(self._lookup_candidates(search_keys))
        generator_results = {}

        # Pre-fetch types and descriptions of the top candidate of each candidates set
        candidates_set = list({candidates[0] for candidates in lookup_results.values() if candidates})
        types = functools.reduce(operator.iconcat,
                                 self._dbp.get_direct_types_for_uris(candidates_set).values(), [])
        description_tokens = functools.reduce(operator.iconcat,
                                              self._get_descriptions_tokens(candidates_set).values(), [])
        facts = {}  # dict of possible facts in table (fact := <top_concept, ?p, support_col_value>)

        # First scan - raw results
        for search_key, candidates in lookup_results.items():
            if candidates:  # Handle cells with some candidates (higher confidence)
                if len(candidates) == 1:
                    generator_results[search_key] = GeneratorResult(search_key, candidates)
                    # Check for relationships if there is only one candidate (very high confidence)
                    for col_id, col_value in search_key.context:
                        if col_id not in facts:
                            facts[col_id] = []
                        facts[col_id].append((candidates[0], col_value))
                    self._stats.incr_exact()

        acceptable_types = get_most_frequent(types, n=5)
        acceptable_tokens = get_most_frequent(description_tokens)
        relations = {col_id: candidate_relations[0][0]
                     for col_id, candidate_relations in self._contains_facts(facts, min_occurrences=5).items()
                     if candidate_relations}

        # Second scan - refinement and loose searches
        for search_key, candidates in lookup_results.items():
            # Skip already annotated cells
            if search_key in generator_results:
                continue

            if candidates:
                # Pre-fetch types and descriptions of all the candidates of not annotated cells
                types = self._dbp.get_direct_types_for_uris(candidates)
                description_tokens = self._get_descriptions_tokens(candidates)

                # Strict search: filter the candidates by removing entities that do not match types and tokens
                refined_candidates = self._search_strict(candidates,
                                                         acceptable_types, types,
                                                         acceptable_tokens, description_tokens)
                if refined_candidates:
                    generator_results[search_key] = GeneratorResult(search_key, refined_candidates)
                    self._stats.incr_strict()
                    continue

            # Loose search: increase the recall by allowing a big margin of edit distance (Levenshtein)
            context_dict = dict(search_key.context)
            for col_id, relation in relations.items():
                refined_candidates = self._search_loose(search_key.label, relation, context_dict[col_id])
                if len(refined_candidates) > 0:
                    generator_results[search_key] = GeneratorResult(search_key, refined_candidates)
                    self._stats.incr_loose()
                    break

            # Coarse- and fine-grained searches failed: no results
            if search_key not in generator_results:
                generator_results[search_key] = GeneratorResult(search_key, [])
                self._stats.incr_empty()

        return list(generator_results.values())

    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        """
        Annotate each table column separately, by finding the column types and the
        relationships between the current column and the others.
        :param table: a Table object
        :return: a list of GeneratorResult
        """
        self._stats.init(table.dataset_id, table.tab_id)
        col_search_keys = {}
        for cell in table.get_gt_cells():
            if cell.col_id not in col_search_keys:
                col_search_keys[cell.col_id] = []
            col_search_keys[cell.col_id].append(table.get_search_key(cell))

        if self._config.max_workers == 1:
            results = [self._get_candidates_for_column(search_keys)
                       for search_keys in col_search_keys.values()]
        else:
            with ProcessPoolExecutor(self._config.max_workers) as pool:
                results = pool.map(self._get_candidates_for_column, col_search_keys.values())

        return functools.reduce(operator.iconcat, results, [])
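# Usage sketch (illustrative, not part of the original code): FactBase is driven
# entirely through get_candidates(table). The lookup-service and config arguments
# are placeholders whose construction depends on the project setup. It is assumed
# that GeneratorResult unpacks as a (search_key, candidates) pair, as its
# construction above suggests.
def _example_factbase(lookup_service, factbase_config, table):
    generator = FactBase(lookup_service, config=factbase_config)
    results = generator.get_candidates(table)  # one GeneratorResult per cell of the table
    return {search_key: candidates[:1] for search_key, candidates in results}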
class EmbeddingOnGraph(CandidateGenerator):

    def __init__(self, *lookup_services: LookupService,
                 config: EmbeddingOnGraphConfig = EmbeddingOnGraphConfig(max_subseq_len=0,
                                                                         max_candidates=8,
                                                                         thin_out_frac=0.25)):
        super().__init__(*lookup_services, config=config)
        self._dbp = DBpediaWrapper()
        self._w2v = RDF2Vec()

    def _get_candidates_for_column(self, search_keys: List[SearchKey]) -> List[GeneratorResult]:
        lookup_results = dict(self._lookup_candidates(search_keys))

        # Create a complete directed k-partite disambiguation graph, where k is the number of search keys.
        disambiguation_graph = nx.DiGraph()
        sk_nodes = {}
        personalization = {}  # prepare dict for PageRank with normalized priors
        embeddings = {}

        for search_key, candidates in lookup_results.items():
            degrees = self._dbp.get_degree_for_uris(candidates)
            embeddings.update(self._w2v.get_vectors(candidates))

            # Filter candidates that have an embedding in w2v.
            nodes = sorted([(candidate, {'weight': degrees[candidate]})
                            for candidate in candidates
                            if embeddings[candidate] is not None],
                           key=lambda x: x[1]['weight'], reverse=True)

            # Take only the max_candidates most relevant (highest prior probability) candidates.
            nodes = nodes[:self._config.max_candidates]
            disambiguation_graph.add_nodes_from(nodes)
            sk_nodes[search_key] = [n[0] for n in nodes]

            # Store normalized priors
            weights_sum = sum([x[1]['weight'] for x in nodes])
            for node, props in nodes:
                if node not in personalization:
                    personalization[node] = []
                personalization[node].append(props['weight'] / weights_sum if weights_sum > 0 else 0)

        # Add weighted edges among the nodes in the disambiguation graph.
        # Avoid connecting nodes in the same partition.
        # Edge weights are the cosine similarity between the nodes the edge connects.
        # Only positive weights are considered.
        for search_key, nodes in sk_nodes.items():
            other_nodes = set(disambiguation_graph.nodes()) - set(nodes)
            for node, other_node in product(nodes, other_nodes):
                v1 = embeddings[node]
                v2 = embeddings[other_node]
                cos_sim = cosine_similarity(v1, v2)
                if cos_sim > 0:
                    disambiguation_graph.add_weighted_edges_from([(node, other_node, cos_sim)])

        # Thin out the fraction of edges with the lowest weights
        thin_out = int(self._config.thin_out_frac * len(disambiguation_graph.edges.data("weight")))
        disambiguation_graph.remove_edges_from(
            sorted(disambiguation_graph.edges.data("weight"), key=lambda tup: tup[2])[:thin_out])

        # PageRank computation - epsilon is increased by a factor of 2 until convergence
        page_rank = None
        epsilon = 1e-6
        while page_rank is None:
            try:
                page_rank = nx.pagerank(disambiguation_graph,
                                        tol=epsilon, max_iter=50, alpha=0.9,
                                        personalization={node: np.mean(weights)
                                                         for node, weights in personalization.items()})
            except nx.PowerIterationFailedConvergence:
                epsilon *= 2  # a lower factor can be used too, since PageRank is extremely fast

        # Sort candidates -> the higher the score, the better the candidate (reverse=True)
        return [GeneratorResult(search_key,
                                [c.candidate for c in sorted([ScoredCandidate(candidate, page_rank[candidate])
                                                              for candidate in candidates],
                                                             reverse=True)])
                for search_key, candidates in sk_nodes.items()]

    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        """
        Annotate each table column separately, by finding the column types and the
        relationships between the current column and the others.
        :param table: a Table object
        :return: a list of GeneratorResult
        """
        col_search_keys = {}
        for cell in table.get_gt_cells():
            if cell.col_id not in col_search_keys:
                col_search_keys[cell.col_id] = []
            col_search_keys[cell.col_id].append(table.get_search_key(cell))

        col_search_keys = {col: chunk_list(search_keys, 500)
                           for col, search_keys in col_search_keys.items()}

        if self._config.max_workers == 1:
            results = [self._get_candidates_for_column(search_keys)
                       for search_keys_list in col_search_keys.values()
                       for search_keys in search_keys_list]
        else:
            with ProcessPoolExecutor(self._config.max_workers) as pool:
                results = pool.map(self._get_candidates_for_column,
                                   [search_keys
                                    for search_keys_list in col_search_keys.values()
                                    for search_keys in search_keys_list])

        return functools.reduce(operator.iconcat, results, [])
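# Usage sketch (illustrative, not part of the original code): EmbeddingOnGraph
# ships a default EmbeddingOnGraphConfig, so only the lookup service(s) need to
# be supplied; the lookup_service argument is a placeholder whose construction
# depends on the project setup.
def _example_embedding_on_graph(lookup_service, table):
    generator = EmbeddingOnGraph(lookup_service)
    # Candidates are ranked per cell by personalized PageRank over the disambiguation graph.
    return generator.get_candidates(table)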