Example #1
 def _synchronize_lists(_embeds_list, _idxs_list):
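     # Gathers every rank's embed/idx lists; rank 0 moves the tensors to CPU
     # and extends the enclosing scope's master_embeds_list / master_idxs_list.
     # Empty lists are returned, presumably so callers can reset their buffers.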
     gathered_data = all_gather({
         'embeds_list': _embeds_list,
         'idxs_list': _idxs_list,
     })
     if get_rank() == 0:
         _embeds_list = [d['embeds_list'] for d in gathered_data]
         _embeds_list = flatten(_embeds_list)
         _embeds_list = [x.cpu() for x in _embeds_list]
         _idxs_list = [d['idxs_list'] for d in gathered_data]
         _idxs_list = flatten(_idxs_list)
         _idxs_list = [x.cpu() for x in _idxs_list]
         master_embeds_list.extend(_embeds_list)
         master_idxs_list.extend(_idxs_list)
     synchronize()
     return [], []
Example #2
def _do_open_dlg(func, window, title, filters, flags):
    assert isinstance(title, unicode)

    buf = create_unicode_buffer(1024)
    ofn = OPENFILENAME()

    ofn.lStructSize = sizeof(OPENFILENAME)
    ofn.lpstrFile = cast(pointer(buf), LPWSTR)
    ofn.nMaxFile = 1024
    ofn.lpstrTitle = c_wchar_p(title)
    ofn.flags = flags
    
    if window:
        ofn.hwndOwner = window._hwnd

    filters = flatten(filters) or [u'All files(*.*)', u'*.*']
    assert all([isinstance(i, unicode) for i in filters])
    assert len(filters) % 2 == 0

    filters = u'\0'.join(filters) + u'\0\0'
    ofn.lpstrFilter = c_wchar_p(filters)

    func(byref(ofn))

    rst = buf[:].strip('\0')
    if flags & OFN_ALLOWMULTISELECT:
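        # With OFN_ALLOWMULTISELECT the buffer holds the chosen directory
        # followed by the selected file names, all NUL-separated.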
        return rst.split('\0')
    else:
        return rst
Example #3
def _do_open_dlg(func, window, title, filters, flags):
    assert isinstance(title, unicode)

    buf = create_unicode_buffer(1024)
    ofn = OPENFILENAME()

    ofn.lStructSize = sizeof(OPENFILENAME)
    ofn.lpstrFile = cast(pointer(buf), LPWSTR)
    ofn.nMaxFile = 1024
    ofn.lpstrTitle = c_wchar_p(title)
    ofn.flags = flags

    if window:
        ofn.hwndOwner = window._hwnd

    filters = flatten(filters) or [u"All files(*.*)", u"*.*"]
    assert all([isinstance(i, unicode) for i in filters])
    assert len(filters) % 2 == 0

    filters = u"\0".join(filters) + u"\0\0"
    ofn.lpstrFilter = c_wchar_p(filters)

    func(byref(ofn))

    rst = buf[:].strip("\0")
    if flags & OFN_ALLOWMULTISELECT:
        return rst.split("\0")
    else:
        return rst
Example #4
    def compute_scores_for_inference(self, clusters_mx, per_example_negs):
        # Computes dot-product affinities for the positive (same-cluster) and
        # negative pairs implied by `clusters_mx` and `per_example_negs`, and
        # returns them as a sparse coo_matrix on rank 0 (None on other ranks).
        args = self.args

        # get all of the unique examples
        examples = clusters_mx.data.tolist()
        examples.extend(flatten(per_example_negs.tolist()))
        examples = unique(examples)
        examples = list(filter(lambda x: x >= 0, examples))

        # create dataset and dataloader
        dataset = InferenceEmbeddingDataset(args, examples,
                                            args.train_cache_dir)
        dataloader = InferenceEmbeddingDataLoader(args, dataset)

        # get the unique idxs and embeds for each idx
        idxs, embeds = self.get_embeddings(dataloader, evaluate=False)

        sparse_graph = None
        if get_rank() == 0:
            # create inverse index for mapping
            inverse_idxs = {v: k for k, v in enumerate(idxs)}

            ## make the list of pairs of dot products we need
            _row = clusters_mx.row
            # positives:
            local_pos_a, local_pos_b = np.where(
                np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
            pos_a = clusters_mx.data[local_pos_a]
            pos_b = clusters_mx.data[local_pos_b]
            # negatives:
            local_neg_a = np.tile(
                np.arange(per_example_negs.shape[0])[:, np.newaxis],
                (1, per_example_negs.shape[1])).flatten()
            neg_a = clusters_mx.data[local_neg_a]
            neg_b = per_example_negs.flatten()

            neg_mask = (neg_b != -1)
            neg_a = neg_a[neg_mask]
            neg_b = neg_b[neg_mask]

            # create subset of the sparse graph we care about
            a = np.concatenate((pos_a, neg_a), axis=0)
            b = np.concatenate((pos_b, neg_b), axis=0)
            edges = list(zip(a, b))
            affinities = [
                np.dot(embeds[inverse_idxs[i]], embeds[inverse_idxs[j]])
                for i, j in edges
            ]

            # convert to coo_matrix
            edges = np.asarray(edges).T
            affinities = np.asarray(affinities)
            _sparse_num = np.max(edges) + 1
            sparse_graph = coo_matrix((affinities, edges),
                                      shape=(_sparse_num, _sparse_num))

        synchronize()
        return sparse_graph
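
The positive-pair construction above relies on a small numpy idiom: compare the cluster row labels against themselves, keep the strict upper triangle of the resulting boolean matrix, and np.where then yields every unordered same-cluster pair exactly once. A standalone sketch of just that step on toy data (the names below are illustrative, not part of the trainer code):

import numpy as np

# toy labels: entries 0-1 share one cluster, entries 2-3 another
_row = np.array([0, 0, 1, 1])
same_cluster = _row[np.newaxis, :] == _row[:, np.newaxis]
# k=1 drops the diagonal and the lower triangle, so each pair appears once
local_pos_a, local_pos_b = np.where(np.triu(same_cluster, k=1))
# local_pos_a -> array([0, 2]), local_pos_b -> array([1, 3]),
# i.e. the positive pairs (0, 1) and (2, 3)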
Example #5
def get_triggered_replies(text, chat_id):
    chat_trigger_groups = TriggerGroup.query\
        .filter(TriggerGroup.chat_id == chat_id)\
        .all()
    answers_from_triggered = (
        (a.text for a in group.answers)
        for group in get_triggered_groups(chat_trigger_groups, text))
    return list(flatten(answers_from_triggered))
Example #6
    def test_flatten_only_string_provided(self):
        nested_structure = 'some string'

        output = flatten(nested_structure)
        temp = []
        for thing in output:
            temp.append(thing)

        expected = ['some string']
        assert temp == expected
Example #7
    def test_flatten_list_of_lists(self):
        nested_structure = [[1, 2, 3], [3, 'hello']]

        output = flatten(nested_structure)
        temp = []
        for thing in output:
            temp.append(thing)

        expected = [1, 2, 3, 3, 'hello']

        assert temp == expected
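
The two tests above pin down the behaviour the other snippets rely on: a bare string is treated as a single leaf, and nested lists are flattened into one left-to-right sequence of leaves. The project's own implementation is not included in these excerpts; a minimal sketch consistent with the two tests (returning a list, which also fits the `flatten(filters) or default` idiom in the file-dialog examples) might look like the following. How non-string containers such as tensors or generators should be treated is an assumption here.

def flatten(nested):
    # Return a flat list of leaves in left-to-right order. Strings are kept
    # whole; every other iterable is descended into recursively. (Assumption:
    # the project's real helper may treat containers like tensors differently.)
    if isinstance(nested, (str, bytes)):
        return [nested]
    try:
        iterator = iter(nested)
    except TypeError:
        return [nested]
    flat = []
    for item in iterator:
        flat.extend(flatten(item))
    return flat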
Example #8
    def _train_softmax(self, dataset_list, metadata):
        args = self.args

        losses = []
        time_per_dataset = []
        dataset_sizes = []

        self.model.train()
        self.model.zero_grad()
        criterion = nn.CrossEntropyLoss()
        for dataset in dataset_list:
            _dataset_start_time = time.time()
            dataset_sizes.append(len(dataset))
            dataloader = SoftmaxEmbeddingDataLoader(args, dataset)
            for batch in dataloader:
                batch = tuple(t.to(args.device) for t in batch)
                inputs = {
                    'input_ids': batch[1],
                    'attention_mask': batch[2],
                    'token_type_ids': batch[3],
                    'concat_input': False
                }
                outputs = self.model(**inputs)
                pos_neg_dot_prods = torch.sum(outputs[:, 0:1, :] *
                                              outputs[:, 1:, :],
                                              dim=-1)
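                # column 0 of pos_neg_dot_prods is the positive candidate's
                # score, so the correct class for the cross-entropy loss is 0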
                target = torch.zeros(pos_neg_dot_prods.shape[0],
                                     dtype=torch.long).cuda()
                loss = criterion(pos_neg_dot_prods, target)
                losses.append(loss.item())
                loss.backward()

            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
            time_per_dataset.append(time.time() - _dataset_start_time)

        gathered_data = all_gather({
            'losses': losses,
        })

        if get_rank() == 0:
            losses = flatten([d['losses'] for d in gathered_data])
            loss = np.mean(losses)

            synchronize()
            return {'embed_loss': loss}
        else:
            synchronize()
            return None
Example #9
    def _train_threshold(self, dataset_list, metadata):
        args = self.args

        losses = []
        time_per_dataset = []
        dataset_sizes = []

        self.model.train()
        self.model.zero_grad()
        random.shuffle(dataset_list)
        for dataset in dataset_list:
            _dataset_start_time = time.time()
            dataset_sizes.append(len(dataset))
            dataloader = ScaledPairsEmbeddingDataLoader(args, dataset)
            for batch in dataloader:
                batch = tuple(t.to(args.device) for t in batch)
                inputs = {
                    'input_ids': batch[2],
                    'attention_mask': batch[3],
                    'token_type_ids': batch[4],
                    'concat_input': False
                }
                outputs = self.model(**inputs)
                dot_prods = torch.sum(outputs[:, 0, :] * outputs[:, 1, :],
                                      dim=-1)
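                # batch[0] presumably carries signed pair labels; the hinge
                # loss pushes label * dot product above args.margin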
                loss = torch.mean(F.relu(args.margin - (batch[0] * dot_prods)))
                losses.append(loss.item())
                loss.backward()

            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
            time_per_dataset.append(time.time() - _dataset_start_time)

        gathered_data = all_gather({
            'losses': losses,
        })

        if get_rank() == 0:
            losses = flatten([d['losses'] for d in gathered_data])
            loss = np.mean(losses)

            synchronize()
            return {'embed_loss': loss}
        else:
            synchronize()
            return None
Example #10
    def create_val_dataloader(self):
        args = self.args

        # load and cache examples and get the metadata for the dataset
        self.load_and_cache_examples(split='val', evaluate=True)

        if args.available_entities in ['candidates_only', 'knn_candidates']:
            examples = flatten(
                [[k] + v for k, v in self.val_metadata.midx2cand.items()])
        elif args.available_entities == 'open_domain':
            examples = list(self.val_metadata.idx2uid.keys())
        else:
            raise ValueError('Invalid available_entities')
        examples = unique(examples)
        self.val_dataset = InferenceEmbeddingDataset(args, examples,
                                                     args.val_cache_dir)
        self.val_dataloader = InferenceEmbeddingDataLoader(
            args, self.val_dataset)
Example #11
    def _build_temp_sparse_graph(self, clusters_mx, per_example_negs):
        args = self.args

        # get all of the unique examples
        examples = clusters_mx.data.tolist()
        examples.extend(flatten(per_example_negs.tolist()))
        examples = unique(examples)
        examples = list(filter(lambda x: x >= 0, examples))

        sparse_graph = None
        if get_rank() == 0:
            ## make the list of pairs of dot products we need
            _row = clusters_mx.row
            # positives:
            local_pos_a, local_pos_b = np.where(
                np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
            pos_a = clusters_mx.data[local_pos_a]
            pos_b = clusters_mx.data[local_pos_b]
            # negatives:
            local_neg_a = np.tile(
                np.arange(per_example_negs.shape[0])[:, np.newaxis],
                (1, per_example_negs.shape[1])).flatten()
            neg_a = clusters_mx.data[local_neg_a]
            neg_b = per_example_negs.flatten()

            neg_mask = (neg_b != -1)
            neg_a = neg_a[neg_mask]
            neg_b = neg_b[neg_mask]

            # create subset of the sparse graph we care about
            a = np.concatenate((pos_a, neg_a), axis=0)
            b = np.concatenate((pos_b, neg_b), axis=0)
            edges = list(zip(a, b))
            affinities = [0.0 for i, j in edges]

            # convert to coo_matrix
            edges = np.asarray(edges).T
            affinities = np.asarray(affinities)
            _sparse_num = np.max(edges) + 1
            sparse_graph = coo_matrix((affinities, edges),
                                      shape=(_sparse_num, _sparse_num))

        synchronize()
        return sparse_graph
Example #12
    def _neg_choosing_prep(self):
        args = self.args
        metadata = self.train_metadata

        # be able to map from midx to doc and back
        self.doc2midxs = defaultdict(list)
        self.midx2doc = {}
        for doc_id, wdoc_clusters in metadata.wdoc_clusters.items():
            mentions = flatten(wdoc_clusters.values())
            mentions = [x for x in mentions if x >= metadata.num_entities]
            self.doc2midxs[doc_id] = mentions
            for midx in mentions:
                self.midx2doc[midx] = doc_id

        # need to know the available entities in this case as well
        if args.available_entities not in [
                'candidates_only', 'knn_candidates'
        ]:
            self.avail_entity_idxs = list(range(metadata.num_entities))
Example #13
    def create_train_dataloader(self):
        args = self.args

        # load and cache examples and get the metadata for the dataset
        self.load_and_cache_examples(split='train', evaluate=False)

        # determine the set of gold clusters depending on the setting
        if args.clustering_domain == 'within_doc':
            clusters = flatten([
                list(doc.values())
                for doc in self.train_metadata.wdoc_clusters.values()
            ])
        elif args.clustering_domain == 'cross_doc':
            clusters = list(self.train_metadata.xdoc_clusters.values())
        else:
            raise ValueError('Invalid clustering_domain')

        self.train_dataset = MetaClusterDataset(clusters)
        self.train_dataloader = MetaClusterDataLoader(args, self.train_dataset)
Example #14
def compute_coref_metrics(gold_coref_clusters, coref_graphs, coref_threshold):
    global_gold_clustering = flatten(gold_coref_clusters)
    global_maximum_spanning_tree = _get_global_maximum_spanning_tree(
        coref_graphs)

    # compute metrics and choose a threshold if one isn't specified
    if coref_threshold is None:
        logger.info('Generating candidate thresholds...')
        _edge_weights = global_maximum_spanning_tree.data.reshape(-1, 1)
        _num_thresholds = 1000
        if _edge_weights.shape[0] < _num_thresholds:
            candidate_thresholds = _edge_weights.reshape(-1, ).tolist()
        else:
            kmeans = KMeans(n_clusters=_num_thresholds, random_state=0)
            kmeans.fit(global_maximum_spanning_tree.data.reshape(-1, 1))
            candidate_thresholds = kmeans.cluster_centers_.reshape(
                -1, ).tolist()
        logger.info('Done.')

        logger.info('Choosing threshold...')
        threshold_results = []
        for _threshold in tqdm(candidate_thresholds):
            _metrics = _compute_coref_metrics_threshold(
                global_gold_clustering, global_maximum_spanning_tree,
                _threshold)
            threshold_results.append((_threshold, _metrics))
        logger.info('Done.')
        max_threshold_results = max(threshold_results,
                                    key=lambda x: x[1]['rand_index'])

        coref_results = max_threshold_results[1]
        coref_results['threshold'] = max_threshold_results[0]
    else:
        coref_results = _compute_coref_metrics_threshold(
            global_gold_clustering, global_maximum_spanning_tree,
            coref_threshold)
        coref_results['threshold'] = coref_threshold

    return coref_results
Example #15
def _do_open_dlg(window, title, filters, extra):
    args = ['zenity', '--file-selection', '--title', title] + extra
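    # each filter becomes a ('--file-filter', 'pattern1|pattern2') pair;
    # flatten interleaves the flags and their values into the argument list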
    args += flatten([('--file-filter', u'|'.join(i)) for i in filters])
    handle = subprocess.Popen(args, stdout=subprocess.PIPE)
    return handle.stdout.read().strip()
Example #16
def build_sparse_affinity_graph(args,
                                midxs,
                                example_dir,
                                metadata,
                                knn_index,
                                sub_trainer,
                                build_coref_graph=False,
                                build_linking_graph=False):

    assert build_coref_graph or build_linking_graph

    coref_graph = None
    linking_graph = None
    if get_rank() == 0:
        mention_knn = None
        if build_coref_graph or args.available_entities == 'knn_candidates':
            ## get all of the mention kNN
            #mention_knn = knn_index.get_knn_limited_index(
            #        midxs, include_index_idxs=midxs, k=args.k+1
            #)
            #mention_knn = mention_knn[:,1:]
            midx2doc = {}
            doc2midx = defaultdict(list)
            for doc_id, wdoc_clusters in metadata.wdoc_clusters.items():
                doc2midx[doc_id] = flatten(list(wdoc_clusters.values()))
                for midx in doc2midx[doc_id]:
                    midx2doc[midx] = doc_id
            mention_knn = []
            for midx in midxs:
                mention_knn.append([
                    x for x in doc2midx[midx2doc[midx]]
                    if x != midx and x >= args.num_entities
                ])

    if build_coref_graph:
        # list of edges for sparse graph we will build
        coref_graph_edges = []
        if get_rank() == 0:
            # add mention-mention edges to list
            coref_graph_edges.extend([
                tuple(sorted((a, b))) for a, l in zip(midxs, mention_knn)
                for b in l if a != b
            ])
            coref_graph_edges = unique(coref_graph_edges)

        # broadcast edges to all processes to compute affinities
        coref_graph_edges = broadcast(coref_graph_edges, src=0)
        affinities = sub_trainer.get_edge_affinities(coref_graph_edges,
                                                     example_dir, knn_index)

        # affinities are gathered to only rank 0 process
        if get_rank() == 0:
            # build the graph
            coref_graph_edges = np.asarray(coref_graph_edges).T
            _sparse_num = metadata.num_mentions + metadata.num_entities
            coref_graph = coo_matrix((affinities, coref_graph_edges),
                                     shape=(_sparse_num, _sparse_num))

    if build_linking_graph:
        # list of edges for sparse graph we will build
        linking_graph_edges = []
        if get_rank() == 0:
            # get mention-entity edges
            if args.available_entities == 'candidates_only':
                for midx in midxs:
                    candidates = metadata.midx2cand.get(midx, [])
                    if len(candidates) > 0:
                        linking_graph_edges.extend([
                            tuple(sorted((midx, eidx))) for eidx in candidates
                        ])
            elif args.available_entities == 'knn_candidates':
                # get all of the mention kNN
                if args.clustering_domain == 'within_doc':
                    for midx in midxs:
                        candidates = metadata.midx2cand.get(midx, [])
                        if len(candidates) > 0:
                            linking_graph_edges.extend([
                                tuple(sorted((midx, eidx)))
                                for eidx in candidates
                            ])
                elif args.clustering_domain == 'cross_doc':
                    raise NotImplementedError('unsupported clustering_domain')
                else:
                    raise ValueError('unsupported clustering_domain')
            else:  # 'open_domain'
                # get all of the mention kNN
                cand_gen_knn = knn_index.get_knn_limited_index(
                    midxs,
                    include_index_idxs=np.arange(metadata.num_entities),
                    k=args.k)
                linking_graph_edges.extend([
                    tuple(sorted((a, b))) for a, l in zip(midxs, cand_gen_knn)
                    for b in l
                ])

            # get all of the edges
            linking_graph_edges = unique(linking_graph_edges)

        # broadcast edges to all processes to compute affinities
        linking_graph_edges = broadcast(linking_graph_edges, src=0)
        affinities = sub_trainer.get_edge_affinities(linking_graph_edges,
                                                     example_dir, knn_index)

        # affinities are gathered to only rank 0 process
        if get_rank() == 0:
            # build the graph
            linking_graph_edges = np.asarray(linking_graph_edges).T
            _sparse_num = metadata.num_mentions + metadata.num_entities
            linking_graph = coo_matrix((affinities, linking_graph_edges),
                                       shape=(_sparse_num, _sparse_num))

        if args.available_entities == 'knn_candidates':
            assert args.clustering_domain == 'within_doc'

            # pick expansion edges based on coref knn mentions
            expansion_factor = 5
            expansion_edges = []
            if get_rank() == 0:

                def _get_top_k(midx, graph, k):
                    row_entries = graph.getrow(midx).tocoo()
                    col_entries = graph.getcol(midx).tocoo()
                    entries = zip(
                        np.concatenate((row_entries.col, col_entries.row),
                                       axis=0),
                        np.concatenate((row_entries.data, col_entries.data),
                                       axis=0))
                    entries = list(entries)
                    if len(entries) == 0:
                        return []

                    sorted_data = sorted(entries,
                                         key=lambda x: x[1],
                                         reverse=True)
                    top_k, _ = zip(*sorted_data[:k])
                    return top_k

                top_k_coref = lambda i: _get_top_k(i, coref_graph,
                                                   expansion_factor)
                top_k_linking = lambda i: _get_top_k(i, linking_graph,
                                                     expansion_factor)
                for midx in midxs:
                    for coref_midx in top_k_coref(midx):
                        expansion_edges.extend([
                            tuple(sorted((x, midx)))
                            for x in top_k_linking(coref_midx)
                            if x not in metadata.midx2cand[midx]
                        ])
                expansion_edges = unique(expansion_edges)

            # score the expanded candidate edges
            expansion_edges = broadcast(expansion_edges, src=0)
            expansion_affinities = sub_trainer.get_edge_affinities(
                expansion_edges, example_dir, knn_index)

            if get_rank() == 0:
                # build the graph
                expansion_edges = np.asarray(expansion_edges).T
                linking_graph_edges = np.concatenate(
                    (linking_graph_edges, expansion_edges), axis=1)
                affinities += expansion_affinities
                _sparse_num = metadata.num_mentions + metadata.num_entities
                linking_graph = coo_matrix((affinities, linking_graph_edges),
                                           shape=(_sparse_num, _sparse_num))

    return coref_graph, linking_graph
Example #17
    def _train_triplet(self, dataset_list, metadata):
        args = self.args

        losses = []
        time_per_dataset = []
        dataset_sizes = []
        pos_m_neg_m_losses = []
        pos_m_neg_e_losses = []
        pos_e_neg_m_losses = []
        pos_e_neg_e_losses = []

        self.model.train()
        self.model.zero_grad()

        for dataset in dataset_list:
            _dataset_start_time = time.time()
            dataset_sizes.append(len(dataset))
            dataloader = TripletEmbeddingDataLoader(args, dataset)
            for batch in dataloader:
                batch = tuple(t.to(args.device) for t in batch)
                inputs = {
                    'input_ids': batch[1],
                    'attention_mask': batch[2],
                    'token_type_ids': batch[3],
                    'concat_input': False
                }
                outputs = self.model(**inputs)

                pos_neg_dot_prods = torch.sum(outputs[:, 0:1, :] *
                                              outputs[:, 1:, :],
                                              dim=-1)

                if args.training_method == 'triplet_max_margin':
                    # max-margin
                    per_triplet_loss = F.relu(
                        pos_neg_dot_prods[:, 1]  # negative dot products
                        - pos_neg_dot_prods[:, 0]  # positive dot products
                        + args.margin)
                elif args.training_method == 'triplet_bpr':
                    # BPR
                    per_triplet_loss = torch.sigmoid(
                        pos_neg_dot_prods[:, 1]  # negative dot products
                        - pos_neg_dot_prods[:, 0]  # positive dot products
                        + args.margin)
                else:
                    raise ValueError('unsupported training_method')

                # record triplet specific losses
                _detached_per_triplet_loss = per_triplet_loss.clone().detach(
                ).cpu()
                _mask = batch[0] < metadata.num_entities
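                # _mask is True where an idx refers to an entity (entity idxs
                # precede mention idxs); columns are [anchor, positive, negative]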
                pos_m_neg_m_mask = ~_mask[:, 1] & ~_mask[:, 2]
                pos_m_neg_e_mask = ~_mask[:, 1] & _mask[:, 2]
                pos_e_neg_m_mask = _mask[:, 1] & ~_mask[:, 2]
                pos_e_neg_e_mask = _mask[:, 1] & _mask[:, 2]

                pos_m_neg_m_losses.extend(
                    _detached_per_triplet_loss[pos_m_neg_m_mask].numpy(
                    ).tolist())
                pos_m_neg_e_losses.extend(
                    _detached_per_triplet_loss[pos_m_neg_e_mask].numpy(
                    ).tolist())
                pos_e_neg_m_losses.extend(
                    _detached_per_triplet_loss[pos_e_neg_m_mask].numpy(
                    ).tolist())
                pos_e_neg_e_losses.extend(
                    _detached_per_triplet_loss[pos_e_neg_e_mask].numpy(
                    ).tolist())
                loss = torch.mean(per_triplet_loss)
                loss.backward()

                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               args.max_grad_norm)
                self.optimizer.step()
                self.scheduler.step()
                self.model.zero_grad()

            time_per_dataset.append(time.time() - _dataset_start_time)

        gathered_data = all_gather({
            'pos_m_neg_m_losses': pos_m_neg_m_losses,
            'pos_m_neg_e_losses': pos_m_neg_e_losses,
            'pos_e_neg_m_losses': pos_e_neg_m_losses,
            'pos_e_neg_e_losses': pos_e_neg_e_losses
        })

        if get_rank() == 0:
            pos_m_neg_m_losses = flatten(
                [d['pos_m_neg_m_losses'] for d in gathered_data])
            pos_m_neg_e_losses = flatten(
                [d['pos_m_neg_e_losses'] for d in gathered_data])
            pos_e_neg_m_losses = flatten(
                [d['pos_e_neg_m_losses'] for d in gathered_data])
            pos_e_neg_e_losses = flatten(
                [d['pos_e_neg_e_losses'] for d in gathered_data])
            losses = (pos_m_neg_m_losses + pos_m_neg_e_losses +
                      pos_e_neg_m_losses + pos_e_neg_e_losses)

            pos_m_neg_m_loss = 0.0 if len(
                pos_m_neg_m_losses) == 0 else np.mean(pos_m_neg_m_losses)
            pos_m_neg_e_loss = 0.0 if len(
                pos_m_neg_e_losses) == 0 else np.mean(pos_m_neg_e_losses)
            pos_e_neg_m_loss = 0.0 if len(
                pos_e_neg_m_losses) == 0 else np.mean(pos_e_neg_m_losses)
            pos_e_neg_e_loss = 0.0 if len(
                pos_e_neg_e_losses) == 0 else np.mean(pos_e_neg_e_losses)
            loss = np.mean(losses)

            synchronize()
            return {
                'embed_loss': loss,
                'embed_num_examples': len(losses),
                'embed_pos_m_neg_m_loss': pos_m_neg_m_loss,
                'embed_pos_m_neg_e_loss': pos_m_neg_e_loss,
                'embed_pos_e_neg_m_loss': pos_e_neg_m_loss,
                'embed_pos_e_neg_e_loss': pos_e_neg_e_loss,
                'embed_pos_m_neg_m_num_examples': len(pos_m_neg_m_losses),
                'embed_pos_m_neg_e_num_examples': len(pos_m_neg_e_losses),
                'embed_pos_e_neg_m_num_examples': len(pos_e_neg_m_losses),
                'embed_pos_e_neg_e_num_examples': len(pos_e_neg_e_losses)
            }
        else:
            synchronize()
            return None
Example #18
    def _choose_negs(self, batch):
        args = self.args
        negatives_list = []
        clusters = [
            sorted(batch.getrow(i).data.tolist())
            for i in range(batch.shape[0])
        ]
        for c_idxs in clusters:
            if args.clustering_domain == 'within_doc':
                # get mention idxs within document
                doc_midxs = self.doc2midxs[self.midx2doc[c_idxs[1]]]

                # produce available negative mention idxs
                neg_midxs = [m for m in doc_midxs if m not in c_idxs]

                # initialize the negs tensors
                negs = np.tile(-1, (len(c_idxs), args.k))

                # determine the number of mention negatives
                if args.training_edges_considered != 'm-e':
                    num_m_negs = min(args.k // 2, len(neg_midxs))
                else:
                    num_m_negs = 0

                if args.mention_negatives == 'context_overlap':
                    # use overlapping context negative mentions
                    neg_midxs_objects = [(midx, self.train_metadata.mentions[
                        self.train_metadata.idx2uid[midx]])
                                         for midx in neg_midxs]
                    for i, idx in enumerate(c_idxs):
                        if idx >= self.train_metadata.num_entities:
                            idx_object = self.train_metadata.mentions[
                                self.train_metadata.idx2uid[idx]]
                            neg_context_dists = [
                                (x[0],
                                 abs(idx_object['start_index'] -
                                     x[1]['start_index']))
                                for x in neg_midxs_objects
                            ]
                            neg_context_dists.sort(key=lambda x: x[1])
                            local_neg_midxs, _ = zip(*neg_context_dists)
                            negs[i, :num_m_negs] = np.asarray(
                                local_neg_midxs[:num_m_negs])
                elif args.mention_negatives == 'random':
                    for i, idx in enumerate(c_idxs):
                        if idx >= self.train_metadata.num_entities:
                            negs[i, :num_m_negs] = random.sample(
                                neg_midxs, num_m_negs)
                else:
                    # sample mention negatives according to embedding model
                    negs[:, :
                         num_m_negs] = self.train_knn_index.get_knn_limited_index(
                             c_idxs,
                             include_index_idxs=neg_midxs,
                             k=num_m_negs)

                # produce available negative entity idxs
                # NOTE: this doesn't allow negative e-e edges (there are never any positive ones)
                if args.training_edges_considered != 'm-m':
                    num_e_negs = args.k - num_m_negs
                    if args.available_entities == 'candidates_only':
                        neg_eidxs = [
                            list(
                                filter(
                                    lambda x: x != c_idxs[0],
                                    self.train_metadata.midx2cand.get(
                                        i, [])))[:num_e_negs] for i in c_idxs
                            if i >= self.train_metadata.num_entities
                        ]
                        neg_eidxs = [
                            l + [-1] * (num_e_negs - len(l)) for l in neg_eidxs
                        ]
                        negs[1:, -num_e_negs:] = np.asarray(neg_eidxs)
                    else:
                        if (args.clustering_domain == 'within_doc' and
                                args.available_entities == 'knn_candidates'):
                            # custom w/in doc negative available entities
                            self.avail_entity_idxs = flatten([
                                list(
                                    filter(
                                        lambda x: x != c_idxs[0],
                                        self.train_metadata.midx2cand.get(
                                            i, [])))
                                for i in (c_idxs + neg_midxs)
                                if i >= self.train_metadata.num_entities
                            ])

                        _entity_knn_negs = self.train_knn_index.get_knn_limited_index(
                            c_idxs[1:],
                            include_index_idxs=self.avail_entity_idxs,
                            k=min(num_e_negs, len(self.avail_entity_idxs)))

                        if _entity_knn_negs.shape[1] < num_e_negs:
                            assert _entity_knn_negs.shape[1] == len(
                                self.avail_entity_idxs)
                            _buff = _entity_knn_negs.shape[1] - num_e_negs
                            negs[1:, -num_e_negs:_buff] = _entity_knn_negs
                        else:
                            negs[1:, -num_e_negs:] = _entity_knn_negs

                negatives_list.append(negs)

            else:
                raise NotImplementedError(
                    'xdoc neg sampling not implemented yet')
                # produce knn negatives for cluster and append to list

        negs = np.concatenate(negatives_list, axis=0)
        return negs