Example #1
    def compute_scores_for_inference(self, clusters_mx, per_example_negs):
        """Embed every example referenced by clusters_mx and per_example_negs,
        score positive (same-cluster) and negative pairs by dot product, and
        return the scores as a sparse coo_matrix on rank 0 (None elsewhere)."""
        args = self.args

        # get all of the unique examples
        examples = clusters_mx.data.tolist()
        examples.extend(flatten(per_example_negs.tolist()))
        examples = unique(examples)
        examples = list(filter(lambda x: x >= 0, examples))

        # create dataset and dataloader
        dataset = InferenceEmbeddingDataset(args, examples,
                                            args.train_cache_dir)
        dataloader = InferenceEmbeddingDataLoader(args, dataset)

        # get the unique idxs and embeds for each idx
        idxs, embeds = self.get_embeddings(dataloader, evaluate=False)

        sparse_graph = None
        if get_rank() == 0:
            # create inverse index for mapping
            inverse_idxs = {v: k for k, v in enumerate(idxs)}

            ## build the list of pairs whose dot products we need
            _row = clusters_mx.row
            # positives:
            local_pos_a, local_pos_b = np.where(
                np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
            pos_a = clusters_mx.data[local_pos_a]
            pos_b = clusters_mx.data[local_pos_b]
            # negatives:
            local_neg_a = np.tile(
                np.arange(per_example_negs.shape[0])[:, np.newaxis],
                (1, per_example_negs.shape[1])).flatten()
            neg_a = clusters_mx.data[local_neg_a]
            neg_b = per_example_negs.flatten()

            neg_mask = (neg_b != -1)
            neg_a = neg_a[neg_mask]
            neg_b = neg_b[neg_mask]

            # create subset of the sparse graph we care about
            a = np.concatenate((pos_a, neg_a), axis=0)
            b = np.concatenate((pos_b, neg_b), axis=0)
            edges = list(zip(a, b))
            affinities = [
                np.dot(embeds[inverse_idxs[i]], embeds[inverse_idxs[j]])
                for i, j in edges
            ]

            # convert to coo_matrix
            edges = np.asarray(edges).T
            affinities = np.asarray(affinities)
            _sparse_num = np.max(edges) + 1
            sparse_graph = coo_matrix((affinities, edges),
                                      shape=(_sparse_num, _sparse_num))

        synchronize()
        return sparse_graph
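The snippet above assumes a specific input layout: clusters_mx is a SciPy coo_matrix whose data holds example ids grouped into clusters via row, and per_example_negs is a 2-D array with one row of (-1-padded) negative ids per entry of clusters_mx.data. A hypothetical toy call, with purely illustrative ids and shapes, might look like this:

import numpy as np
from scipy.sparse import coo_matrix

# Hypothetical toy inputs; ids and shapes below are illustrative only.
# Two clusters: row 0 holds example ids {3, 5}, row 1 holds {7, 9}.
data = np.array([3, 5, 7, 9])
rows = np.array([0, 0, 1, 1])
cols = np.arange(len(data))
clusters_mx = coo_matrix((data, (rows, cols)))

# One row of negative ids per entry of clusters_mx.data; -1 pads empty slots.
per_example_negs = np.array([[7, 9],
                             [9, -1],
                             [3, 5],
                             [5, -1]])

# sparse_graph = trainer.compute_scores_for_inference(clusters_mx, per_example_negs)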
Example #2
    async def _set_actions(self, ctx, query, flags, *, color, default_op):
        flags = unique(flags)
        reduced = reduce(operator.or_, flags)

        default = default_op(reduced)
        channel_id, events = await ctx.db.fetchrow(query, ctx.guild.id, reduced.value, default)

        enabled_flags = ', '.join(flag.name for flag in ActionFlag if events & flag)

        embed = (discord.Embed(description=', '.join(flag.name for flag in flags), color=color)
                 .set_author(name=f'Successfully {ctx.command.name}d the following actions')
                 .add_field(name='The following mod actions will now be logged:', value=enabled_flags, inline=False))

        await self._check_modlog_channel(ctx, channel_id, embed=embed)
Example #3
    def create_val_dataloader(self):
        args = self.args

        # load and cache examples and get the metadata for the dataset
        self.load_and_cache_examples(split='val', evaluate=True)

        if args.available_entities in ['candidates_only', 'knn_candidates']:
            examples = flatten(
                [[k] + v for k, v in self.val_metadata.midx2cand.items()])
        elif args.available_entities == 'open_domain':
            examples = list(self.val_metadata.idx2uid.keys())
        else:
            raise ValueError('Invalid available_entities')
        examples = unique(examples)
        self.val_dataset = InferenceEmbeddingDataset(args, examples,
                                                     args.val_cache_dir)
        self.val_dataloader = InferenceEmbeddingDataLoader(
            args, self.val_dataset)
Example #4
    def _build_temp_sparse_graph(self, clusters_mx, per_example_negs):
        """Build the sparse graph of positive (same-cluster) and sampled
        negative edges with placeholder 0.0 affinities; returns a coo_matrix
        on rank 0 and None on all other ranks."""
        args = self.args

        # get all of the unique examples
        examples = clusters_mx.data.tolist()
        examples.extend(flatten(per_example_negs.tolist()))
        examples = unique(examples)
        examples = list(filter(lambda x: x >= 0, examples))

        sparse_graph = None
        if get_rank() == 0:
            ## build the list of pairs whose dot products we need
            _row = clusters_mx.row
            # positives:
            local_pos_a, local_pos_b = np.where(
                np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
            pos_a = clusters_mx.data[local_pos_a]
            pos_b = clusters_mx.data[local_pos_b]
            # negatives:
            local_neg_a = np.tile(
                np.arange(per_example_negs.shape[0])[:, np.newaxis],
                (1, per_example_negs.shape[1])).flatten()
            neg_a = clusters_mx.data[local_neg_a]
            neg_b = per_example_negs.flatten()

            neg_mask = (neg_b != -1)
            neg_a = neg_a[neg_mask]
            neg_b = neg_b[neg_mask]

            # create subset of the sparse graph we care about
            a = np.concatenate((pos_a, neg_a), axis=0)
            b = np.concatenate((pos_b, neg_b), axis=0)
            edges = list(zip(a, b))
            # placeholder affinities; only the graph structure matters here
            affinities = [0.0] * len(edges)

            # convert to coo_matrix
            edges = np.asarray(edges).T
            affinities = np.asarray(affinities)
            _sparse_num = np.max(edges) + 1
            sparse_graph = coo_matrix((affinities, edges),
                                      shape=(_sparse_num, _sparse_num))

        synchronize()
        return sparse_graph
Example #5
    async def _bulk_set_permissions(connection, guild_id, name, *entities,
                                    whitelist):
        ids = tuple(unique(entity.id for entity in entities))

        # clear any existing permission rows for these entities first
        query = """
            DELETE FROM permissions
            WHERE       guild_id = $1 AND name = $2 AND snowflake = ANY($3::BIGINT[]);
        """
        await connection.execute(query, guild_id, name, ids)

        if not whitelist:
            # Permissions shall not be created during a reset
            return

        columns = ('guild_id', 'name', 'snowflake', 'whitelist')
        to_insert = [(guild_id, name, id, whitelist) for id in ids]

        await connection.copy_records_to_table('permissions',
                                               columns=columns,
                                               records=to_insert)
Example #6
def build_sparse_affinity_graph(args,
                                midxs,
                                example_dir,
                                metadata,
                                knn_index,
                                sub_trainer,
                                build_coref_graph=False,
                                build_linking_graph=False):
    """Build sparse mention-mention (coref) and/or mention-entity (linking)
    affinity graphs. Edge lists are selected on rank 0, edge affinities are
    computed across all ranks by sub_trainer, and the resulting coo_matrix
    graphs are assembled on rank 0 (all other ranks return None)."""

    assert build_coref_graph or build_linking_graph

    coref_graph = None
    linking_graph = None
    if get_rank() == 0:
        mention_knn = None
        if build_coref_graph or args.available_entities == 'knn_candidates':
            ## get all of the mention kNN
            #mention_knn = knn_index.get_knn_limited_index(
            #        midxs, include_index_idxs=midxs, k=args.k+1
            #)
            #mention_knn = mention_knn[:,1:]
            midx2doc = {}
            doc2midx = defaultdict(list)
            for doc_id, wdoc_clusters in metadata.wdoc_clusters.items():
                doc2midx[doc_id] = flatten(list(wdoc_clusters.values()))
                for midx in doc2midx[doc_id]:
                    midx2doc[midx] = doc_id
            mention_knn = []
            for midx in midxs:
                mention_knn.append([
                    x for x in doc2midx[midx2doc[midx]]
                    if x != midx and x >= args.num_entities
                ])

    if build_coref_graph:
        # list of edges for sparse graph we will build
        coref_graph_edges = []
        if get_rank() == 0:
            # add mention-mention edges to list
            coref_graph_edges.extend([
                tuple(sorted((a, b))) for a, l in zip(midxs, mention_knn)
                for b in l if a != b
            ])
            coref_graph_edges = unique(coref_graph_edges)

        # broadcast edges to all processes to compute affinities
        coref_graph_edges = broadcast(coref_graph_edges, src=0)
        affinities = sub_trainer.get_edge_affinities(coref_graph_edges,
                                                     example_dir, knn_index)

        # affinities are gathered to only rank 0 process
        if get_rank() == 0:
            # build the graph
            coref_graph_edges = np.asarray(coref_graph_edges).T
            _sparse_num = metadata.num_mentions + metadata.num_entities
            coref_graph = coo_matrix((affinities, coref_graph_edges),
                                     shape=(_sparse_num, _sparse_num))

    if build_linking_graph:
        # list of edges for sparse graph we will build
        linking_graph_edges = []
        if get_rank() == 0:
            # get mention-entity edges
            if args.available_entities == 'candidates_only':
                for midx in midxs:
                    candidates = metadata.midx2cand.get(midx, [])
                    if len(candidates) > 0:
                        linking_graph_edges.extend([
                            tuple(sorted((midx, eidx))) for eidx in candidates
                        ])
            elif args.available_entities == 'knn_candidates':
                # get all of the mention kNN
                if args.clustering_domain == 'within_doc':
                    for midx in midxs:
                        candidates = metadata.midx2cand.get(midx, [])
                        if len(candidates) > 0:
                            linking_graph_edges.extend([
                                tuple(sorted((midx, eidx)))
                                for eidx in candidates
                            ])
                elif args.clustering_domain == 'cross_doc':
                    raise NotImplementedError('unsupported clustering_domain')
                else:
                    raise ValueError('unsupported clustering_domain')
            else:  # 'open_domain'
                # get all of the mention kNN
                cand_gen_knn = knn_index.get_knn_limited_index(
                    midxs,
                    include_index_idxs=np.arange(metadata.num_entities),
                    k=args.k)
                linking_graph_edges.extend([
                    tuple(sorted((a, b))) for a, l in zip(midxs, cand_gen_knn)
                    for b in l
                ])

            # get all of the edges
            linking_graph_edges = unique(linking_graph_edges)

        # broadcast edges to all processes to compute affinities
        linking_graph_edges = broadcast(linking_graph_edges, src=0)
        affinities = sub_trainer.get_edge_affinities(linking_graph_edges,
                                                     example_dir, knn_index)

        # affinities are gathered to only rank 0 process
        if get_rank() == 0:
            # build the graph
            linking_graph_edges = np.asarray(linking_graph_edges).T
            _sparse_num = metadata.num_mentions + metadata.num_entities
            linking_graph = coo_matrix((affinities, linking_graph_edges),
                                       shape=(_sparse_num, _sparse_num))

        if args.available_entities == 'knn_candidates':
            assert args.clustering_domain == 'within_doc'

            # pick expansion edges based on coref knn mentions
            expansion_factor = 5
            expansion_edges = []
            if get_rank() == 0:

                def _get_top_k(midx, graph, k):
                    row_entries = graph.getrow(midx).tocoo()
                    col_entries = graph.getcol(midx).tocoo()
                    entries = zip(
                        np.concatenate((row_entries.col, col_entries.row),
                                       axis=0),
                        np.concatenate((row_entries.data, col_entries.data),
                                       axis=0))
                    entries = list(entries)
                    if len(entries) == 0:
                        return []

                    sorted_data = sorted(entries,
                                         key=lambda x: x[1],
                                         reverse=True)
                    top_k, _ = zip(*sorted_data[:k])
                    return top_k

                top_k_coref = lambda i: _get_top_k(i, coref_graph,
                                                   expansion_factor)
                top_k_linking = lambda i: _get_top_k(i, linking_graph,
                                                     expansion_factor)
                for midx in midxs:
                    for coref_midx in top_k_coref(midx):
                        expansion_edges.extend([
                            tuple(sorted((x, midx)))
                            for x in top_k_linking(coref_midx)
                            if x not in metadata.midx2cand[midx]
                        ])
                expansion_edges = unique(expansion_edges)

            # score the expanded candidate edges
            expansion_edges = broadcast(expansion_edges, src=0)
            expansion_affinities = sub_trainer.get_edge_affinities(
                expansion_edges, example_dir, knn_index)

            if get_rank() == 0:
                # build the graph
                expansion_edges = np.asarray(expansion_edges).T
                linking_graph_edges = np.concatenate(
                    (linking_graph_edges, expansion_edges), axis=1)
                affinities += expansion_affinities
                _sparse_num = metadata.num_mentions + metadata.num_entities
                linking_graph = coo_matrix((affinities, linking_graph_edges),
                                           shape=(_sparse_num, _sparse_num))

    return coref_graph, linking_graph
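All of the examples above rely on small unique and flatten helpers from their surrounding codebases. A minimal sketch of what they are assumed to do (order-preserving de-duplication and one-level flattening); the actual implementations may differ:

def unique(iterable):
    # Assumed behaviour: drop duplicates while preserving first-seen order.
    seen = set()
    out = []
    for item in iterable:
        if item not in seen:
            seen.add(item)
            out.append(item)
    return out


def flatten(list_of_lists):
    # Assumed behaviour: flatten one level of nesting, e.g. [[1, 2], [3]] -> [1, 2, 3].
    return [item for sublist in list_of_lists for item in sublist]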