def _synchronize_lists(_embeds_list, _idxs_list):
    # NOTE: master_embeds_list and master_idxs_list are expected to be defined
    # in an enclosing scope; the rank-0 process accumulates the gathered
    # results there and every process returns fresh, empty buffers.
    gathered_data = all_gather({
        'embeds_list': _embeds_list,
        'idxs_list': _idxs_list,
    })
    if get_rank() == 0:
        _embeds_list = [d['embeds_list'] for d in gathered_data]
        _embeds_list = flatten(_embeds_list)
        _embeds_list = [x.cpu() for x in _embeds_list]
        _idxs_list = [d['idxs_list'] for d in gathered_data]
        _idxs_list = flatten(_idxs_list)
        _idxs_list = [x.cpu() for x in _idxs_list]
        master_embeds_list.extend(_embeds_list)
        master_idxs_list.extend(_idxs_list)
    synchronize()
    return [], []
def _do_open_dlg(func, window, title, filters, flags):
    assert isinstance(title, unicode)
    buf = create_unicode_buffer(1024)
    ofn = OPENFILENAME()
    ofn.lStructSize = sizeof(OPENFILENAME)
    ofn.lpstrFile = cast(pointer(buf), LPWSTR)
    ofn.nMaxFile = 1024
    ofn.lpstrTitle = c_wchar_p(title)
    ofn.flags = flags
    if window:
        ofn.hwndOwner = window._hwnd
    filters = flatten(filters) or [u'All files(*.*)', u'*.*']
    assert all([isinstance(i, unicode) for i in filters])
    assert len(filters) % 2 == 0
    filters = u'\0'.join(filters) + u'\0\0'
    ofn.lpstrFilter = c_wchar_p(filters)
    func(byref(ofn))
    rst = buf[:].strip('\0')
    if flags & OFN_ALLOWMULTISELECT:
        return rst.split('\0')
    else:
        return rst
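# Hedged usage sketch for the Win32 helper above (an assumption, not code from
# the original module): GetOpenFileNameW from comdlg32 is the standard open
# dialog entry point a wrapper like this receives; flags=0 requests a plain
# single-selection dialog, and the default filter list is passed explicitly.
from ctypes import windll


def example_open_single_file():
    # Returns the selected path as a unicode string (empty if cancelled).
    return _do_open_dlg(windll.comdlg32.GetOpenFileNameW,
                        None,                          # no owner window
                        u'Open file',
                        [u'All files(*.*)', u'*.*'],
                        0)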
def compute_scores_for_inference(self, clusters_mx, per_example_negs):
    # Computes pairwise affinities (embedding dot products) for all positive
    # pairs within each sampled cluster and for each example's sampled
    # negatives, and returns them as a sparse coo_matrix on the rank-0
    # process (None on all other ranks).
    args = self.args

    # get all of the unique examples
    examples = clusters_mx.data.tolist()
    examples.extend(flatten(per_example_negs.tolist()))
    examples = unique(examples)
    examples = list(filter(lambda x: x >= 0, examples))

    # create dataset and dataloader
    dataset = InferenceEmbeddingDataset(args, examples, args.train_cache_dir)
    dataloader = InferenceEmbeddingDataLoader(args, dataset)

    # get the unique idxs and embeds for each idx
    idxs, embeds = self.get_embeddings(dataloader, evaluate=False)

    sparse_graph = None
    if get_rank() == 0:
        # create inverse index for mapping
        inverse_idxs = {v: k for k, v in enumerate(idxs)}

        ## make the list of pairs of dot products we need
        _row = clusters_mx.row
        # positives:
        local_pos_a, local_pos_b = np.where(
            np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
        pos_a = clusters_mx.data[local_pos_a]
        pos_b = clusters_mx.data[local_pos_b]
        # negatives:
        local_neg_a = np.tile(
            np.arange(per_example_negs.shape[0])[:, np.newaxis],
            (1, per_example_negs.shape[1])).flatten()
        neg_a = clusters_mx.data[local_neg_a]
        neg_b = per_example_negs.flatten()
        neg_mask = (neg_b != -1)
        neg_a = neg_a[neg_mask]
        neg_b = neg_b[neg_mask]

        # create subset of the sparse graph we care about
        a = np.concatenate((pos_a, neg_a), axis=0)
        b = np.concatenate((pos_b, neg_b), axis=0)
        edges = list(zip(a, b))
        affinities = [
            np.dot(embeds[inverse_idxs[i]], embeds[inverse_idxs[j]])
            for i, j in edges
        ]

        # convert to coo_matrix
        edges = np.asarray(edges).T
        affinities = np.asarray(affinities)
        _sparse_num = np.max(edges) + 1
        sparse_graph = coo_matrix((affinities, edges),
                                  shape=(_sparse_num, _sparse_num))

    synchronize()
    return sparse_graph
def get_triggered_replies(text, chat_id):
    chat_trigger_groups = TriggerGroup.query\
        .filter(TriggerGroup.chat_id == chat_id)\
        .all()
    answers_from_triggered = (
        (a.text for a in group.answers)
        for group in get_triggered_groups(chat_trigger_groups, text))
    return list(flatten(answers_from_triggered))
def test_flatten_only_string_provided(self):
    nested_structure = 'some string'
    output = flatten(nested_structure)
    temp = []
    for thing in output:
        temp.append(thing)
    expected = ['some string']
    assert temp == expected
def test_flatten_list_of_lists(self):
    nested_structure = [[1, 2, 3], [3, 'hello']]
    output = flatten(nested_structure)
    temp = []
    for thing in output:
        temp.append(thing)
    expected = [1, 2, 3, 3, 'hello']
    assert temp == expected
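# The two tests above pin down the expected behaviour of flatten: it returns
# an iterable, treats strings as atomic values, and recursively flattens other
# iterables. A minimal sketch consistent with those tests (hypothetical --
# flatten_sketch is not the project's actual implementation):
from collections.abc import Iterable


def flatten_sketch(nested):
    # Strings (and bytes) are yielded whole rather than split into characters.
    if isinstance(nested, (str, bytes)) or not isinstance(nested, Iterable):
        yield nested
        return
    for item in nested:
        # Recurse into nested iterables; everything else is yielded as-is.
        for sub in flatten_sketch(item):
            yield sub


# e.g. list(flatten_sketch([[1, 2, 3], [3, 'hello']])) == [1, 2, 3, 3, 'hello']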
def _train_softmax(self, dataset_list, metadata):
    args = self.args
    losses = []
    time_per_dataset = []
    dataset_sizes = []
    self.model.train()
    self.model.zero_grad()
    criterion = nn.CrossEntropyLoss()
    for dataset in dataset_list:
        _dataset_start_time = time.time()
        dataset_sizes.append(len(dataset))
        dataloader = SoftmaxEmbeddingDataLoader(args, dataset)
        for batch in dataloader:
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[1],
                'attention_mask': batch[2],
                'token_type_ids': batch[3],
                'concat_input': False
            }
            outputs = self.model(**inputs)
            pos_neg_dot_prods = torch.sum(
                outputs[:, 0:1, :] * outputs[:, 1:, :], dim=-1)
            target = torch.zeros(pos_neg_dot_prods.shape[0],
                                 dtype=torch.long).cuda()
            loss = criterion(pos_neg_dot_prods, target)
            losses.append(loss.item())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
        time_per_dataset.append(time.time() - _dataset_start_time)

    gathered_data = all_gather({
        'losses': losses,
    })
    if get_rank() == 0:
        losses = flatten([d['losses'] for d in gathered_data])
        loss = np.mean(losses)
        synchronize()
        return {'embed_loss': loss}
    else:
        synchronize()
        return None
def _train_threshold(self, dataset_list, metadata):
    args = self.args
    losses = []
    time_per_dataset = []
    dataset_sizes = []
    self.model.train()
    self.model.zero_grad()
    random.shuffle(dataset_list)
    for dataset in dataset_list:
        _dataset_start_time = time.time()
        dataset_sizes.append(len(dataset))
        dataloader = ScaledPairsEmbeddingDataLoader(args, dataset)
        for batch in dataloader:
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[2],
                'attention_mask': batch[3],
                'token_type_ids': batch[4],
                'concat_input': False
            }
            outputs = self.model(**inputs)
            dot_prods = torch.sum(outputs[:, 0, :] * outputs[:, 1, :], dim=-1)
            loss = torch.mean(F.relu(args.margin - (batch[0] * dot_prods)))
            losses.append(loss.item())
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
        time_per_dataset.append(time.time() - _dataset_start_time)

    gathered_data = all_gather({
        'losses': losses,
    })
    if get_rank() == 0:
        losses = flatten([d['losses'] for d in gathered_data])
        loss = np.mean(losses)
        synchronize()
        return {'embed_loss': loss}
    else:
        synchronize()
        return None
def create_val_dataloader(self):
    args = self.args

    # load and cache examples and get the metadata for the dataset
    self.load_and_cache_examples(split='val', evaluate=True)

    if args.available_entities in ['candidates_only', 'knn_candidates']:
        examples = flatten(
            [[k] + v for k, v in self.val_metadata.midx2cand.items()])
    elif args.available_entities == 'open_domain':
        examples = list(self.val_metadata.idx2uid.keys())
    else:
        raise ValueError('Invalid available_entities')
    examples = unique(examples)

    self.val_dataset = InferenceEmbeddingDataset(args, examples,
                                                 args.val_cache_dir)
    self.val_dataloader = InferenceEmbeddingDataLoader(args, self.val_dataset)
def _build_temp_sparse_graph(self, clusters_mx, per_example_negs):
    args = self.args

    # get all of the unique examples
    examples = clusters_mx.data.tolist()
    examples.extend(flatten(per_example_negs.tolist()))
    examples = unique(examples)
    examples = list(filter(lambda x: x >= 0, examples))

    sparse_graph = None
    if get_rank() == 0:
        ## make the list of pairs of dot products we need
        _row = clusters_mx.row
        # positives:
        local_pos_a, local_pos_b = np.where(
            np.triu(_row[np.newaxis, :] == _row[:, np.newaxis], k=1))
        pos_a = clusters_mx.data[local_pos_a]
        pos_b = clusters_mx.data[local_pos_b]
        # negatives:
        local_neg_a = np.tile(
            np.arange(per_example_negs.shape[0])[:, np.newaxis],
            (1, per_example_negs.shape[1])).flatten()
        neg_a = clusters_mx.data[local_neg_a]
        neg_b = per_example_negs.flatten()
        neg_mask = (neg_b != -1)
        neg_a = neg_a[neg_mask]
        neg_b = neg_b[neg_mask]

        # create subset of the sparse graph we care about
        a = np.concatenate((pos_a, neg_a), axis=0)
        b = np.concatenate((pos_b, neg_b), axis=0)
        edges = list(zip(a, b))
        affinities = [0.0 for i, j in edges]

        # convert to coo_matrix
        edges = np.asarray(edges).T
        affinities = np.asarray(affinities)
        _sparse_num = np.max(edges) + 1
        sparse_graph = coo_matrix((affinities, edges),
                                  shape=(_sparse_num, _sparse_num))

    synchronize()
    return sparse_graph
def _neg_choosing_prep(self):
    args = self.args
    metadata = self.train_metadata

    # be able to map from midx to doc and back
    self.doc2midxs = defaultdict(list)
    self.midx2doc = {}
    for doc_id, wdoc_clusters in metadata.wdoc_clusters.items():
        mentions = flatten(wdoc_clusters.values())
        mentions = [x for x in mentions if x >= metadata.num_entities]
        self.doc2midxs[doc_id] = mentions
        for midx in mentions:
            self.midx2doc[midx] = doc_id

    # need to know the available entities in this case as well
    if args.available_entities not in ['candidates_only', 'knn_candidates']:
        self.avail_entity_idxs = list(range(metadata.num_entities))
def create_train_dataloader(self):
    args = self.args

    # load and cache examples and get the metadata for the dataset
    self.load_and_cache_examples(split='train', evaluate=False)

    # determine the set of gold clusters depending on the setting
    if args.clustering_domain == 'within_doc':
        clusters = flatten([
            list(doc.values())
            for doc in self.train_metadata.wdoc_clusters.values()
        ])
    elif args.clustering_domain == 'cross_doc':
        clusters = list(self.train_metadata.xdoc_clusters.values())
    else:
        raise ValueError('Invalid clustering_domain')

    self.train_dataset = MetaClusterDataset(clusters)
    self.train_dataloader = MetaClusterDataLoader(args, self.train_dataset)
def compute_coref_metrics(gold_coref_clusters, coref_graphs, coref_threshold):
    global_gold_clustering = flatten(gold_coref_clusters)
    global_maximum_spanning_tree = _get_global_maximum_spanning_tree(
        coref_graphs)

    # compute metrics and choose a threshold if one isn't specified
    if coref_threshold is None:
        logger.info('Generating candidate thresholds...')
        _edge_weights = global_maximum_spanning_tree.data.reshape(-1, 1)
        _num_thresholds = 1000
        if _edge_weights.shape[0] < _num_thresholds:
            candidate_thresholds = _edge_weights.reshape(-1,).tolist()
        else:
            kmeans = KMeans(n_clusters=_num_thresholds, random_state=0)
            kmeans.fit(global_maximum_spanning_tree.data.reshape(-1, 1))
            candidate_thresholds = kmeans.cluster_centers_.reshape(
                -1,).tolist()
        logger.info('Done.')

        logger.info('Choosing threshold...')
        threshold_results = []
        for _threshold in tqdm(candidate_thresholds):
            _metrics = _compute_coref_metrics_threshold(
                global_gold_clustering, global_maximum_spanning_tree,
                _threshold)
            threshold_results.append((_threshold, _metrics))
        logger.info('Done.')
        max_threshold_results = max(threshold_results,
                                    key=lambda x: x[1]['rand_index'])
        coref_results = max_threshold_results[1]
        coref_results['threshold'] = max_threshold_results[0]
    else:
        coref_results = _compute_coref_metrics_threshold(
            global_gold_clustering, global_maximum_spanning_tree,
            coref_threshold)
        coref_results['threshold'] = coref_threshold

    return coref_results
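# Minimal sketch of the kind of helper compute_coref_metrics relies on
# (hypothetical -- the real _compute_coref_metrics_threshold may differ): prune
# MST edges below the threshold, treat connected components as predicted
# clusters, and score them against the gold clustering. Assumes the gold
# clustering is a list of clusters, each a list of node indices into the graph.
import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components
from sklearn.metrics import adjusted_rand_score


def _compute_coref_metrics_threshold_sketch(gold_clustering, mst, threshold):
    # keep only edges whose affinity clears the threshold
    mst = mst.tocoo()
    keep = mst.data >= threshold
    pruned = coo_matrix((mst.data[keep], (mst.row[keep], mst.col[keep])),
                        shape=mst.shape)

    # connected components of the pruned graph are the predicted clusters
    _, pred_labels = connected_components(pruned, directed=False)

    # build a gold label vector over the nodes that appear in gold clusters
    gold_labels = np.full(mst.shape[0], -1)
    for label, cluster in enumerate(gold_clustering):
        gold_labels[list(cluster)] = label
    mask = gold_labels != -1

    return {
        'rand_index': adjusted_rand_score(gold_labels[mask],
                                          pred_labels[mask])
    }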
def _do_open_dlg(window, title, filters, extra):
    args = ['zenity', '--file-selection', '--title', title] + extra
    args += flatten([('--file-filter', u'|'.join(i)) for i in filters])
    handle = subprocess.Popen(args, stdout=subprocess.PIPE)
    return handle.stdout.read().strip()
def build_sparse_affinity_graph(args,
                                midxs,
                                example_dir,
                                metadata,
                                knn_index,
                                sub_trainer,
                                build_coref_graph=False,
                                build_linking_graph=False):
    assert build_coref_graph or build_linking_graph

    coref_graph = None
    linking_graph = None
    if get_rank() == 0:
        mention_knn = None
        if build_coref_graph or args.available_entities == 'knn_candidates':
            ## get all of the mention kNN
            #mention_knn = knn_index.get_knn_limited_index(
            #    midxs, include_index_idxs=midxs, k=args.k+1
            #)
            #mention_knn = mention_knn[:,1:]
            midx2doc = {}
            doc2midx = defaultdict(list)
            for doc_id, wdoc_clusters in metadata.wdoc_clusters.items():
                doc2midx[doc_id] = flatten(list(wdoc_clusters.values()))
                for midx in doc2midx[doc_id]:
                    midx2doc[midx] = doc_id
            mention_knn = []
            for midx in midxs:
                mention_knn.append([
                    x for x in doc2midx[midx2doc[midx]]
                    if x != midx and x >= args.num_entities
                ])

    if build_coref_graph:
        # list of edges for sparse graph we will build
        coref_graph_edges = []
        if get_rank() == 0:
            # add mention-mention edges to list
            coref_graph_edges.extend([
                tuple(sorted((a, b)))
                for a, l in zip(midxs, mention_knn) for b in l if a != b
            ])
            coref_graph_edges = unique(coref_graph_edges)

        # broadcast edges to all processes to compute affinities
        coref_graph_edges = broadcast(coref_graph_edges, src=0)
        affinities = sub_trainer.get_edge_affinities(coref_graph_edges,
                                                     example_dir, knn_index)

        # affinities are gathered to only rank 0 process
        if get_rank() == 0:
            # build the graph
            coref_graph_edges = np.asarray(coref_graph_edges).T
            _sparse_num = metadata.num_mentions + metadata.num_entities
            coref_graph = coo_matrix((affinities, coref_graph_edges),
                                     shape=(_sparse_num, _sparse_num))

    if build_linking_graph:
        # list of edges for sparse graph we will build
        linking_graph_edges = []
        if get_rank() == 0:
            # get mention-entity edges
            if args.available_entities == 'candidates_only':
                for midx in midxs:
                    candidates = metadata.midx2cand.get(midx, [])
                    if len(candidates) > 0:
                        linking_graph_edges.extend([
                            tuple(sorted((midx, eidx))) for eidx in candidates
                        ])
            elif args.available_entities == 'knn_candidates':
                # get all of the mention kNN
                if args.clustering_domain == 'within_doc':
                    for midx in midxs:
                        candidates = metadata.midx2cand.get(midx, [])
                        if len(candidates) > 0:
                            linking_graph_edges.extend([
                                tuple(sorted((midx, eidx)))
                                for eidx in candidates
                            ])
                elif args.clustering_domain == 'cross_doc':
                    raise NotImplementedError('unsupported clustering_domain')
                else:
                    raise ValueError('unsupported clustering_domain')
            else:  # 'open_domain'
                # get all of the mention kNN
                cand_gen_knn = knn_index.get_knn_limited_index(
                    midxs,
                    include_index_idxs=np.arange(metadata.num_entities),
                    k=args.k)
                linking_graph_edges.extend([
                    tuple(sorted((a, b)))
                    for a, l in zip(midxs, cand_gen_knn) for b in l
                ])

            # get all of the edges
            linking_graph_edges = unique(linking_graph_edges)

        # broadcast edges to all processes to compute affinities
        linking_graph_edges = broadcast(linking_graph_edges, src=0)
        affinities = sub_trainer.get_edge_affinities(linking_graph_edges,
                                                     example_dir, knn_index)

        # affinities are gathered to only rank 0 process
        if get_rank() == 0:
            # build the graph
            linking_graph_edges = np.asarray(linking_graph_edges).T
            _sparse_num = metadata.num_mentions + metadata.num_entities
            linking_graph = coo_matrix((affinities, linking_graph_edges),
                                       shape=(_sparse_num, _sparse_num))

        if args.available_entities == 'knn_candidates':
            assert args.clustering_domain == 'within_doc'
            # pick expansion edges based on coref knn mentions
            expansion_factor = 5
            expansion_edges = []
            if get_rank() == 0:

                def _get_top_k(midx, graph, k):
                    row_entries = graph.getrow(midx).tocoo()
                    col_entries = graph.getcol(midx).tocoo()
                    entries = zip(
                        np.concatenate((row_entries.col, col_entries.row),
                                       axis=0),
                        np.concatenate((row_entries.data, col_entries.data),
                                       axis=0))
                    entries = list(entries)
                    if len(entries) == 0:
                        return []
                    sorted_data = sorted(entries,
                                         key=lambda x: x[1],
                                         reverse=True)
                    top_k, _ = zip(*sorted_data[:k])
                    return top_k

                top_k_coref = lambda i: _get_top_k(i, coref_graph,
                                                   expansion_factor)
                top_k_linking = lambda i: _get_top_k(i, linking_graph,
                                                     expansion_factor)

                for midx in midxs:
                    for coref_midx in top_k_coref(midx):
                        expansion_edges.extend([
                            tuple(sorted((x, midx)))
                            for x in top_k_linking(coref_midx)
                            if x not in metadata.midx2cand[midx]
                        ])
                expansion_edges = unique(expansion_edges)

            # score the expanded candidate edges
            expansion_edges = broadcast(expansion_edges, src=0)
            expansion_affinities = sub_trainer.get_edge_affinities(
                expansion_edges, example_dir, knn_index)

            if get_rank() == 0:
                # build the graph
                expansion_edges = np.asarray(expansion_edges).T
                linking_graph_edges = np.concatenate(
                    (linking_graph_edges, expansion_edges), axis=1)
                affinities += expansion_affinities
                _sparse_num = metadata.num_mentions + metadata.num_entities
                linking_graph = coo_matrix((affinities, linking_graph_edges),
                                           shape=(_sparse_num, _sparse_num))

    return coref_graph, linking_graph
def _train_triplet(self, dataset_list, metadata):
    args = self.args
    losses = []
    time_per_dataset = []
    dataset_sizes = []
    pos_m_neg_m_losses = []
    pos_m_neg_e_losses = []
    pos_e_neg_m_losses = []
    pos_e_neg_e_losses = []
    self.model.train()
    self.model.zero_grad()
    for dataset in dataset_list:
        _dataset_start_time = time.time()
        dataset_sizes.append(len(dataset))
        dataloader = TripletEmbeddingDataLoader(args, dataset)
        for batch in dataloader:
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[1],
                'attention_mask': batch[2],
                'token_type_ids': batch[3],
                'concat_input': False
            }
            outputs = self.model(**inputs)
            pos_neg_dot_prods = torch.sum(
                outputs[:, 0:1, :] * outputs[:, 1:, :], dim=-1)

            if args.training_method == 'triplet_max_margin':
                # max-margin
                per_triplet_loss = F.relu(
                    pos_neg_dot_prods[:, 1]    # negative dot products
                    - pos_neg_dot_prods[:, 0]  # positive dot products
                    + args.margin)
            elif args.training_method == 'triplet_bpr':
                # BPR
                per_triplet_loss = torch.sigmoid(
                    pos_neg_dot_prods[:, 1]    # negative dot products
                    - pos_neg_dot_prods[:, 0]  # positive dot products
                    + args.margin)
            else:
                raise ValueError('unsupported training_method')

            # record triplet specific losses
            _detached_per_triplet_loss = per_triplet_loss.clone().detach().cpu()
            _mask = batch[0] < metadata.num_entities
            pos_m_neg_m_mask = ~_mask[:, 1] & ~_mask[:, 2]
            pos_m_neg_e_mask = ~_mask[:, 1] & _mask[:, 2]
            pos_e_neg_m_mask = _mask[:, 1] & ~_mask[:, 2]
            pos_e_neg_e_mask = _mask[:, 1] & _mask[:, 2]
            pos_m_neg_m_losses.extend(
                _detached_per_triplet_loss[pos_m_neg_m_mask].numpy().tolist())
            pos_m_neg_e_losses.extend(
                _detached_per_triplet_loss[pos_m_neg_e_mask].numpy().tolist())
            pos_e_neg_m_losses.extend(
                _detached_per_triplet_loss[pos_e_neg_m_mask].numpy().tolist())
            pos_e_neg_e_losses.extend(
                _detached_per_triplet_loss[pos_e_neg_e_mask].numpy().tolist())

            loss = torch.mean(per_triplet_loss)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
        time_per_dataset.append(time.time() - _dataset_start_time)

    gathered_data = all_gather({
        'pos_m_neg_m_losses': pos_m_neg_m_losses,
        'pos_m_neg_e_losses': pos_m_neg_e_losses,
        'pos_e_neg_m_losses': pos_e_neg_m_losses,
        'pos_e_neg_e_losses': pos_e_neg_e_losses
    })
    if get_rank() == 0:
        pos_m_neg_m_losses = flatten(
            [d['pos_m_neg_m_losses'] for d in gathered_data])
        pos_m_neg_e_losses = flatten(
            [d['pos_m_neg_e_losses'] for d in gathered_data])
        pos_e_neg_m_losses = flatten(
            [d['pos_e_neg_m_losses'] for d in gathered_data])
        pos_e_neg_e_losses = flatten(
            [d['pos_e_neg_e_losses'] for d in gathered_data])
        losses = (pos_m_neg_m_losses + pos_m_neg_e_losses
                  + pos_e_neg_m_losses + pos_e_neg_e_losses)
        pos_m_neg_m_loss = (0.0 if len(pos_m_neg_m_losses) == 0
                            else np.mean(pos_m_neg_m_losses))
        pos_m_neg_e_loss = (0.0 if len(pos_m_neg_e_losses) == 0
                            else np.mean(pos_m_neg_e_losses))
        pos_e_neg_m_loss = (0.0 if len(pos_e_neg_m_losses) == 0
                            else np.mean(pos_e_neg_m_losses))
        pos_e_neg_e_loss = (0.0 if len(pos_e_neg_e_losses) == 0
                            else np.mean(pos_e_neg_e_losses))
        loss = np.mean(losses)
        synchronize()
        return {
            'embed_loss': loss,
            'embed_num_examples': len(losses),
            'embed_pos_m_neg_m_loss': pos_m_neg_m_loss,
            'embed_pos_m_neg_e_loss': pos_m_neg_e_loss,
            'embed_pos_e_neg_m_loss': pos_e_neg_m_loss,
            'embed_pos_e_neg_e_loss': pos_e_neg_e_loss,
            'embed_pos_m_neg_m_num_examples': len(pos_m_neg_m_losses),
            'embed_pos_m_neg_e_num_examples': len(pos_m_neg_e_losses),
            'embed_pos_e_neg_m_num_examples': len(pos_e_neg_m_losses),
            'embed_pos_e_neg_e_num_examples': len(pos_e_neg_e_losses)
        }
    else:
        synchronize()
        return None
def _choose_negs(self, batch):
    args = self.args
    negatives_list = []
    clusters = [
        sorted(batch.getrow(i).data.tolist()) for i in range(batch.shape[0])
    ]
    for c_idxs in clusters:
        if args.clustering_domain == 'within_doc':
            # get mention idxs within document
            doc_midxs = self.doc2midxs[self.midx2doc[c_idxs[1]]]
            # produce available negative mention idxs
            neg_midxs = [m for m in doc_midxs if m not in c_idxs]
            # initialize the negs tensors
            negs = np.tile(-1, (len(c_idxs), args.k))

            # determine the number of mention negatives
            if args.training_edges_considered != 'm-e':
                num_m_negs = min(args.k // 2, len(neg_midxs))
            else:
                num_m_negs = 0

            if args.mention_negatives == 'context_overlap':
                # use overlapping context negative mentions
                neg_midxs_objects = [
                    (midx,
                     self.train_metadata.mentions[
                         self.train_metadata.idx2uid[midx]])
                    for midx in neg_midxs
                ]
                for i, idx in enumerate(c_idxs):
                    if idx >= self.train_metadata.num_entities:
                        idx_object = self.train_metadata.mentions[
                            self.train_metadata.idx2uid[idx]]
                        neg_context_dists = [
                            (x[0],
                             abs(idx_object['start_index']
                                 - x[1]['start_index']))
                            for x in neg_midxs_objects
                        ]
                        neg_context_dists.sort(key=lambda x: x[1])
                        local_neg_midxs, _ = zip(*neg_context_dists)
                        negs[i, :num_m_negs] = np.asarray(
                            local_neg_midxs[:num_m_negs])
            elif args.mention_negatives == 'random':
                for i, idx in enumerate(c_idxs):
                    if idx >= self.train_metadata.num_entities:
                        negs[i, :num_m_negs] = random.sample(
                            neg_midxs, num_m_negs)
            else:
                # sample mention negatives according to embedding model
                negs[:, :num_m_negs] = self.train_knn_index.get_knn_limited_index(
                    c_idxs, include_index_idxs=neg_midxs, k=num_m_negs)

            # produce available negative entity idxs
            # NOTE: this doesn't allow negative e-e edges
            #       (there are never any positive ones)
            if args.training_edges_considered != 'm-m':
                num_e_negs = args.k - num_m_negs
                if args.available_entities == 'candidates_only':
                    neg_eidxs = [
                        list(
                            filter(
                                lambda x: x != c_idxs[0],
                                self.train_metadata.midx2cand.get(
                                    i, [])))[:num_e_negs]
                        for i in c_idxs
                        if i >= self.train_metadata.num_entities
                    ]
                    neg_eidxs = [
                        l + [-1] * (num_e_negs - len(l)) for l in neg_eidxs
                    ]
                    negs[1:, -num_e_negs:] = np.asarray(neg_eidxs)
                else:
                    if (args.clustering_domain == 'within_doc'
                            and args.available_entities == 'knn_candidates'):
                        # custom w/in doc negative available entities
                        self.avail_entity_idxs = flatten([
                            list(
                                filter(
                                    lambda x: x != c_idxs[0],
                                    self.train_metadata.midx2cand.get(i, [])))
                            for i in (c_idxs + neg_midxs)
                            if i >= self.train_metadata.num_entities
                        ])
                    _entity_knn_negs = self.train_knn_index.get_knn_limited_index(
                        c_idxs[1:],
                        include_index_idxs=self.avail_entity_idxs,
                        k=min(num_e_negs, len(self.avail_entity_idxs)))
                    if _entity_knn_negs.shape[1] < num_e_negs:
                        assert _entity_knn_negs.shape[1] == len(
                            self.avail_entity_idxs)
                        _buff = _entity_knn_negs.shape[1] - num_e_negs
                        negs[1:, -num_e_negs:_buff] = _entity_knn_negs
                    else:
                        negs[1:, -num_e_negs:] = _entity_knn_negs

            negatives_list.append(negs)
        else:
            raise NotImplementedError('xdoc neg sampling not implemented yet')

    # produce knn negatives for cluster and append to list
    negs = np.concatenate(negatives_list, axis=0)
    return negs