Esempio n. 1
0
    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        """
        Annotate the table one column at a time.

        The search keys of the ground-truth cells are grouped by column id, each group is split
        with ``chunk_list`` (size argument 500), and every chunk is handed to
        ``_get_candidates_for_column``. Chunks are processed sequentially when
        ``max_workers`` is 1, otherwise through a process pool.

        :param table: a Table object
        :return: a list of GeneratorResult (the concatenation of the per-chunk results)
        """
        keys_by_column = {}
        for gt_cell in table.get_gt_cells():
            keys_by_column.setdefault(gt_cell.col_id, []).append(table.get_search_key(gt_cell))

        # Flatten once: every chunk contains search keys of a single column only.
        batches = []
        for column_keys in keys_by_column.values():
            batches.extend(chunk_list(column_keys, 500))

        if self._config.max_workers == 1:
            per_batch = [self._get_candidates_for_column(batch) for batch in batches]
        else:
            with ProcessPoolExecutor(self._config.max_workers) as pool:
                per_batch = pool.map(self._get_candidates_for_column, batches)

        merged = []
        for batch_result in per_batch:
            merged += batch_result
        return merged
Esempio n. 2
0
    def _tables_to_pkl(self):
        """
        Build a Table object for every table id found in the CEA ground truth and pickle it.

        The CEA file is always read; the CTA and CPA files are used only if they exist.
        Space-separated annotation strings are split into lists before being attached to
        the tables. Each table is written to ``<pickle folder>/<tab_id>.pkl``.

        :return: None
        """
        cea = pd.read_csv(self._gt_path('CEA'),
                          names=['tab_id', 'col_id', 'row_id', 'entities'],
                          dtype={
                              'tab_id': str,
                              'col_id': int,
                              'row_id': int,
                              'entities': str
                          })
        cea['entities'] = cea['entities'].apply(str.split)

        cta_groups = None
        if os.path.exists(self._gt_path('CTA')):
            cta = pd.read_csv(
                self._gt_path('CTA'),
                names=['tab_id', 'col_id', 'perfect', 'okay'],
                dtype={
                    'tab_id': str,
                    'col_id': int,
                    'perfect': str,
                    'okay': str
                },
                keep_default_na=False)  # the "okay" value might be empty
            cta['perfect'] = cta['perfect'].apply(str.split)
            cta['okay'] = cta['okay'].apply(str.split)
            cta_groups = cta.groupby('tab_id')

        cpa_groups = None
        if os.path.exists(self._gt_path('CPA')):
            cpa = pd.read_csv(
                self._gt_path('CPA'),
                names=['tab_id', 'source_id', 'target_id', 'properties'],
                dtype={
                    'tab_id': str,
                    'source_id': int,
                    'target_id': int,
                    'properties': str
                })
            cpa['properties'] = cpa['properties'].apply(str.split)
            cpa_groups = cpa.groupby('tab_id')

        for tab_id, cea_group in cea.groupby('tab_id'):
            table = Table(tab_id, self.value, self._table_path(tab_id))
            table.set_gt_cell_annotations(
                zip(cea_group['row_id'], cea_group['col_id'],
                    cea_group['entities']))

            # Explicit None checks: the optional ground truths are either absent (None)
            # or a GroupBy object; membership in .groups handles the "no rows" case.
            if cta_groups is not None and tab_id in cta_groups.groups:
                cta_group = cta_groups.get_group(tab_id)
                table.set_gt_column_annotations(
                    zip(cta_group['col_id'], cta_group['perfect'],
                        cta_group['okay']))
            if cpa_groups is not None and tab_id in cpa_groups.groups:
                cpa_group = cpa_groups.get_group(tab_id)
                table.set_gt_property_annotations(
                    zip(cpa_group['source_id'], cpa_group['target_id'],
                        cpa_group['properties']))

            # Use a context manager so the file handle is closed (and flushed) for every
            # table instead of being left to the garbage collector.
            pickle_path = f"{self._pickle_table_folder_path()}/{table.tab_id}.pkl"
            with open(pickle_path, 'wb') as pickle_file:
                pickle.dump(table, pickle_file)
0
    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        """
        Candidate selection method. This implementation just forwards the lookup results.

        :param table: a Table object
        :return: a list with the results returned by ``_lookup_candidates`` for the search keys
                 of the ground-truth cells of the table
        """
        search_keys = [table.get_search_key(cell_) for cell_ in table.get_gt_cells()]
        if self._config.max_workers == 1:
            # Wrap the single batch in a list so that, exactly like in the parallel branch,
            # ``results`` is an iterable of per-batch result lists. Without the wrapper the
            # final concatenation would splice the fields of every single result into the
            # output instead of the results themselves.
            results = [self._lookup_candidates(search_keys)]
        else:  # Parallelize at cell level (no dependencies between cells in the same col/row)
            with ProcessPoolExecutor(self._config.max_workers) as pool:
                results = pool.map(self._lookup_candidates, chunk_list(search_keys, self._config.chunk_size))

        # Concatenate the per-batch lists into a single flat list.
        return functools.reduce(operator.iconcat, results, [])
Esempio n. 4
0
    def annotate_table(self, table: Table):
        """
        Annotate the cells of a table with the best candidate of the configured generator.

        Results are cached on disk under ``annotations/<dataset_id>/<generator id>/<tab_id>.pkl``
        (relative to this module). If the cache file already exists, nothing is computed.
        For a HybridGenerator, the cached tables of all its sub-generators are combined via
        ``HybridGeneratorSimulator`` when every one of them is available; otherwise the
        generator itself is run.

        NOTE(review): the cache is read with pickle, so the annotations folder is assumed to
        contain only trusted, locally produced files.

        :param table: a Table object
        :return: the annotated table, as loaded back from the pickle cache
        """
        annotations_root = os.path.join(os.path.dirname(__file__), 'annotations',
                                        table.dataset_id)
        folder_path = os.path.join(annotations_root, self._generator.id)
        Path(folder_path).mkdir(parents=True, exist_ok=True)

        pickle_name = '%s.pkl' % table.tab_id
        filename = os.path.join(folder_path, pickle_name)

        # check existing result
        if not os.path.exists(filename):

            # keep the cell-search_key pair -> results may be shuffled!
            search_key_cell_dict = table.get_search_keys_cells_dict()

            if isinstance(self._generator, HybridGenerator):
                # try to reuse already annotated tables
                tables = []
                for generator in self._generator.generators:
                    subfilename = os.path.join(annotations_root, generator.id,
                                               pickle_name)

                    # One missing sub-result is enough to make the reuse impossible.
                    if not os.path.exists(subfilename):
                        break

                    with open(subfilename, 'rb') as cached_file:
                        tables.append(pickle.load(cached_file))

                if len(tables) == len(self._generator.generators):
                    results = HybridGeneratorSimulator.get_candidates(
                        *tables)  # combine results from many tables
                else:
                    results = self._generator.get_candidates(table)

            else:
                results = self._generator.get_candidates(table)

            for search_key, candidates in results:
                if candidates:
                    for cell in search_key_cell_dict[search_key]:
                        table.annotate_cell(cell, Entity(
                            candidates[0]))  # first candidate = best

            # Context managers guarantee the handles are closed (and the data flushed)
            # before the file is read back below.
            with open(filename, 'wb') as out_file:
                pickle.dump(table, out_file)

            gc.collect()

        with open(filename, 'rb') as in_file:
            return pickle.load(in_file)
Esempio n. 5
0
    def get_test_dataset(cls, size, from_dataset=None, rand=False):
        """
        Helper method to generate a test dataset on-the-fly.

        The first ``size`` (or ``size`` random) rows of the CEA ground truth of ``from_dataset``
        are grouped by table; CTA/CPA annotations, when available, are restricted to the
        columns that appear in the selected CEA rows.

        :param size: dimension of the test dataset to create (# cells)
        :param from_dataset: dataset to sample rows from. Default: Round1
        :param rand: True if the rows should be sampled randomly; otherwise, the top ``size`` rows are used.
        :return: the single member of a temporary Enum whose value is the list of Table objects
                 and which exposes ``get_tables``, ``get_table_categories`` and ``total_tables``
        """
        if from_dataset is None:
            from_dataset = cls.ST19_Round1
        cea = pd.read_csv(from_dataset._gt_path('CEA'),
                          names=['tab_id', 'col_id', 'row_id', 'entities'],
                          dtype={
                              'tab_id': str,
                              'col_id': int,
                              'row_id': int,
                              'entities': str
                          })
        # NOTE(review): unlike _tables_to_pkl-style loaders, the 'entities' strings are not
        # split into lists here - confirm this is what Table expects for test datasets.
        if rand:
            # sample() raises when asked for more rows than available, while the slice
            # below silently truncates: clamp so both modes accept an oversized ``size``.
            cea = cea.sample(min(size, len(cea))).reset_index()
        else:
            cea = cea[:size]

        cta_groups = None
        if os.path.exists(from_dataset._gt_path('CTA')):
            cta = pd.read_csv(
                from_dataset._gt_path('CTA'),
                names=['tab_id', 'col_id', 'perfect', 'okay'],
                dtype={
                    'tab_id': str,
                    'col_id': int,
                    'perfect': str,
                    'okay': str
                },
                keep_default_na=False)  # the "okay" value might be empty
            cta['perfect'] = cta['perfect'].apply(str.split)
            cta['okay'] = cta['okay'].apply(str.split)
            cta_groups = cta.groupby('tab_id')

        cpa_groups = None
        if os.path.exists(from_dataset._gt_path('CPA')):
            cpa = pd.read_csv(
                from_dataset._gt_path('CPA'),
                names=['tab_id', 'source_id', 'target_id', 'properties'],
                dtype={
                    'tab_id': str,
                    'source_id': int,
                    'target_id': int,
                    'properties': str
                })
            cpa['properties'] = cpa['properties'].apply(str.split)
            cpa_groups = cpa.groupby('tab_id')

        tables = []
        for tab_id, cea_group in cea.groupby('tab_id'):
            table = Table(tab_id, f'{from_dataset.value}_test',
                          from_dataset._table_path(tab_id))
            table.set_gt_cell_annotations(
                zip(cea_group['row_id'], cea_group['col_id'],
                    cea_group['entities']))

            # Columns of this table that survived the row selection; computed once and
            # reused to filter both the CTA and the CPA annotations.
            selected_cols = cea_group['col_id'].unique()

            if cta_groups is not None and tab_id in cta_groups.groups:
                cta_group = cta_groups.get_group(tab_id)
                cta_group = cta_group[cta_group['col_id'].isin(selected_cols)]
                table.set_gt_column_annotations(
                    zip(cta_group['col_id'], cta_group['perfect'],
                        cta_group['okay']))
            if cpa_groups is not None and tab_id in cpa_groups.groups:
                cpa_group = cpa_groups.get_group(tab_id)
                # Keep a property only if both of its endpoints are selected columns.
                cpa_group = cpa_group[
                    cpa_group['source_id'].isin(selected_cols)
                    & cpa_group['target_id'].isin(selected_cols)]
                table.set_gt_property_annotations(
                    zip(cpa_group['source_id'], cpa_group['target_id'],
                        cpa_group['properties']))

            tables.append(table)

        tmp = Enum('GTTestEnum',
                   {'%s_TEST_%d' % (from_dataset.name, size): tables
                    })  # create a temp enum
        setattr(tmp, 'get_tables', lambda x: x.value
                )  # add the get_tables function, that returns the tables
        setattr(tmp, 'get_table_categories',
                lambda x: from_dataset.get_table_categories())
        setattr(tmp, 'total_tables', lambda x: len(tables))
        return list(tmp)[0]
Esempio n. 6
0
    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        """
        Re-rank the looked-up candidates using context and abstract embeddings.

        For every search key, each candidate is paired with the embedding of the search key
        (only when the key has a context and the embedding is non-empty) and with the embedding
        of the candidate's abstract (when available and non-empty); missing embeddings are
        represented by NaN. The final order is delegated to ``weighting_by_ranking``.

        :param table: a Table object
        :return: a list of GeneratorResult, one per search key of the ground-truth cells
        """
        search_keys = []
        for gt_cell in table.get_gt_cells():
            search_keys.append(table.get_search_key(gt_cell))

        # lookup results coming from the super class, indexed by search key
        candidates_by_key = dict(self._lookup_candidates(search_keys))

        # embeddings of the search keys: reuse the cache, compute and store what is missing
        cached_key_embs, missing_keys = self._get_cached_entries(search_keys)
        fresh_key_embs = self._embed_search_keys(missing_keys)
        self._update_cache(fresh_key_embs)
        key_embeddings = dict(cached_key_embs + fresh_key_embs)

        # abstracts of all the candidates, truncated to the configured number of tokens
        all_candidates = [
            candidate
            for key_candidates in candidates_by_key.values()
            for candidate in key_candidates
        ]
        if self._config.abstract == 'short':
            fetch_abstracts = self._abstract_helper.fetch_short_abstracts
        else:
            fetch_abstracts = self._abstract_helper.fetch_long_abstracts
        abstracts = {}
        for candidate, abstract in fetch_abstracts(all_candidates).items():
            abstracts[candidate] = truncate_string(
                abstract, self._config.abstract_max_tokens)

        # embeddings of the abstracts, with the same cache-first strategy
        cached_abs_embs, missing_abstracts = self._get_cached_entries(
            abstracts.values())
        fresh_abs_embs = self._embed_abstracts(missing_abstracts)
        self._update_cache(fresh_abs_embs)
        embedding_by_abstract = dict(cached_abs_embs + fresh_abs_embs)

        # index by candidate through the abstract text: several candidates may share
        # the same abstract, so a positional zip would be wrong
        embedding_by_candidate = {}
        for candidate, abstract in abstracts.items():
            embedding_by_candidate[candidate] = embedding_by_abstract[abstract]

        results = []
        for search_key in search_keys:
            if search_key.context and key_embeddings[search_key].size:
                context_emb = key_embeddings[search_key]
            else:
                context_emb = np.nan

            to_rank = []
            for candidate in candidates_by_key[search_key]:
                if candidate in abstracts and embedding_by_candidate[candidate].size:
                    abstract_emb = embedding_by_candidate[candidate]
                else:
                    abstract_emb = np.nan
                to_rank.append(
                    CandidateEmbeddings(candidate, context_emb, abstract_emb))

            ranked = weighting_by_ranking(to_rank, self._config.alpha,
                                          self._config.default_score)
            results.append(
                GeneratorResult(search_key, [entry.candidate for entry in ranked]))

        return results
Esempio n. 7
0
    def get_candidates(self, table: Table) -> List[GeneratorResult]:
        """
        Rank candidates by running PageRank on a disambiguation graph built per column and per row.

        Steps, as implemented below:
        1. group the search keys of the ground-truth cells by column id and by row id, and look
           up candidates for every group;
        2. for every group build a directed graph whose nodes are the candidates that have an
           embedding, weighted by their degree and limited to ``max_candidates`` per search key;
        3. connect candidates of different search keys: in column graphs using the cosine
           similarity of the embeddings of their first predicted type, in row graphs using the
           cosine similarity of the candidate embeddings; only positive similarities are kept;
        4. merge all graphs, drop the ``thin_out_frac`` fraction of lowest-weight edges and run
           personalized PageRank.

        :param table: a Table object
        :return: a list of GeneratorResult built from the column partitions only, with the
                 candidates ordered by PageRank score
        """
        # Group search keys by the column and by the row of their cell.
        col_search_keys = {}
        row_search_keys = {}
        for cell in table.get_gt_cells():
            if cell.col_id not in col_search_keys:
                col_search_keys[cell.col_id] = []
            if cell.row_id not in row_search_keys:
                row_search_keys[cell.row_id] = []
            col_search_keys[cell.col_id].append(table.get_search_key(cell))
            row_search_keys[cell.row_id].append(table.get_search_key(cell))

        # One lookup per column/row; each value maps search key -> candidates.
        # NOTE: `search_key` here is the whole list of search keys of the column/row.
        lookup_results_col = {}
        for col, search_key in col_search_keys.items():
            lookup_results_col[col] = dict(self._lookup_candidates(search_key))
        lookup_results_row = {}
        for row, search_key in row_search_keys.items():
            lookup_results_row[row] = dict(self._lookup_candidates(search_key))

        # Create a complete directed k-partite disambiguation graph where k is the number of search keys.
        disambiguation_graph_col = []
        disambiguation_graph_row = []
        sk_nodes_col = []
        sk_nodes_row = []
        personalization = {}  # prepare dict for pagerank with normalized priors
        embeddings = {}  # candidate -> embedding, shared by the column and the row passes

        # `col` is the position of the column in the lookup dict (used as list index),
        # not the original column id.
        for (col, lookup) in enumerate(lookup_results_col.values()):
            disambiguation_graph_col.append(nx.DiGraph())
            sk_nodes_col.append(dict())
            for search_key, candidates in lookup.items():
                degrees = self._dbp.get_degree_for_uris(candidates)
                embeddings.update(self._w2v.get_vectors(candidates))

                # Filter candidates that have an embedding in w2v.
                nodes = sorted([(candidate, {'weight': degrees[candidate]})
                                for candidate in candidates
                                if embeddings[candidate] is not None],
                               key=lambda x: x[1]['weight'], reverse=True)

                # Take only the max_candidates most relevant (highest priors probability) candidates.
                nodes = nodes[:self._config.max_candidates]
                disambiguation_graph_col[col].add_nodes_from(nodes)
                sk_nodes_col[col][search_key] = [n[0] for n in nodes]

                # Store normalized priors
                # A node may appear under several search keys: all its priors are collected
                # and averaged when PageRank is invoked.
                weights_sum = sum([x[1]['weight'] for x in nodes])
                for node, props in nodes:
                    if node not in personalization:
                        personalization[node] = []
                    personalization[node].append(props['weight'] / weights_sum if weights_sum > 0 else 0)

        # Same construction as above, for the row partitions.
        for (row, lookup) in enumerate(lookup_results_row.values()):
            disambiguation_graph_row.append(nx.DiGraph())
            sk_nodes_row.append(dict())
            for search_key, candidates in lookup.items():
                degrees = self._dbp.get_degree_for_uris(candidates)
                embeddings.update(self._w2v.get_vectors(candidates))

                # Filter candidates that have an embedding in w2v.
                nodes = sorted([(candidate, {'weight': degrees[candidate]})
                                for candidate in candidates
                                if embeddings[candidate] is not None],
                               key=lambda x: x[1]['weight'], reverse=True)

                # Take only the max_candidates most relevant (highest priors probability) candidates.
                nodes = nodes[:self._config.max_candidates]
                disambiguation_graph_row[row].add_nodes_from(nodes)
                sk_nodes_row[row][search_key] = [n[0] for n in nodes]

                # Store normalized priors
                weights_sum = sum([x[1]['weight'] for x in nodes])
                for node, props in nodes:
                    if node not in personalization:
                        personalization[node] = []
                    personalization[node].append(props['weight'] / weights_sum if weights_sum > 0 else 0)

        # Predict types using the classifier (nodes of the column graphs)
        node_types = self._type_predictor.predict_types([node for sk_nodes in sk_nodes_col
                                                         for nodes in list(sk_nodes.values())
                                                         for node in nodes])
        # Predict types using the classifier (nodes of the row graphs)
        node_types.update(self._type_predictor.predict_types([node for sk_nodes in sk_nodes_row
                                                              for nodes in list(sk_nodes.values())
                                                              for node in nodes]))
        # Get type embeddings. Set is used to remove duplicate
        type_embeddings = self._tee.get_vectors(list({t for types in list(node_types.values()) for t in types}))

        # Add weighted edges among the nodes in the disambiguation graph.
        # Avoid to connect nodes in the same partition.
        # Weights of edges are the cosine similarity between the nodes which the edge is connected to.
        # Only positive weights are considered.
        # Column graphs: similarity is computed on the embedding of the FIRST predicted type
        # of each node. NOTE(review): assumes every node has at least one predicted type.
        for (col, k) in enumerate(sk_nodes_col):
            for search_key, nodes in k.items():
                other_nodes = set(disambiguation_graph_col[col].nodes()) - set(nodes)
                for node, other_node in product(nodes, other_nodes):
                    if type_embeddings[node_types[node][0]] is not None \
                            and type_embeddings[node_types[other_node][0]] is not None:
                        v1 = type_embeddings[node_types[node][0]]
                        v2 = type_embeddings[node_types[other_node][0]]
                        cos_sim = cosine_similarity(v1, v2)
                        if cos_sim > 0:
                            disambiguation_graph_col[col].add_weighted_edges_from([(node, other_node, cos_sim)])

        # Row graphs: similarity is computed directly on the candidate embeddings
        # (never None here, since nodes without an embedding were filtered out above).
        for (row, k) in enumerate(sk_nodes_row):
            for search_key, nodes in k.items():
                other_nodes = set(disambiguation_graph_row[row].nodes()) - set(nodes)
                for node, other_node in product(nodes, other_nodes):
                    v1 = embeddings[node]
                    v2 = embeddings[other_node]
                    cos_sim = cosine_similarity(v1, v2)
                    if cos_sim > 0:
                        disambiguation_graph_row[row].add_weighted_edges_from([(node, other_node, cos_sim)])

        # Merge all the column and row graphs into a single graph.
        disambiguation_graph = nx.DiGraph()
        for col in disambiguation_graph_col:
            disambiguation_graph = nx.compose(disambiguation_graph, col)
        for row in disambiguation_graph_row:
            disambiguation_graph = nx.compose(disambiguation_graph, row)

        # Thin out a fraction of edges which weights are the lowest
        thin_out = int(self._config.thin_out_frac * len(disambiguation_graph.edges.data("weight")))
        disambiguation_graph.remove_edges_from(
            sorted(disambiguation_graph.edges.data("weight"), key=lambda tup: tup[2])[:thin_out])

        # Page rank computation - epsilon is increased by a factor 2 until convergence
        page_rank = None
        epsilon = 1e-6
        while page_rank is None:
            try:
                # The personalization value of a node is the mean of the priors collected above.
                page_rank = nx.pagerank(disambiguation_graph,
                                        tol=epsilon, max_iter=50, alpha=0.9,
                                        personalization={node: np.mean(weights)
                                                         for node, weights in personalization.items()})

            except nx.PowerIterationFailedConvergence:
                epsilon *= 2  # lower factor can be used too since pagerank is extremely fast

        # Sort candidates -> the higher the score, the better the candidate (reverse=True)
        # Only the column partitions are iterated: the row partitions contribute to the
        # PageRank scores but produce no additional results. `x` is unused.
        # NOTE(review): the ordering relies on how ScoredCandidate instances compare - confirm
        # that the comparison is driven by the score.
        return [GeneratorResult(search_key,
                                [c.candidate for c in sorted([ScoredCandidate(candidate, page_rank[candidate])
                                                              for candidate in candidates],
                                                             reverse=True)])
                for (x, sk_nodes) in enumerate(sk_nodes_col)
                for search_key, candidates in sk_nodes.items()]