def hungarian_tracking(video, cost=200, nms_overlap_fraction=0.6):
    """Link the bodies of consecutive frames into tracks by optimal assignment.

    Non-maximum suppression is applied to the whole video first. Every valid
    body of the first frame then opens a track, and each pair of consecutive
    frames is matched with ``hungarian``; a match is kept only when its cost
    is strictly below ``cost``.

    Args:
        video: Indexable sequence of frames exposing ``valid_bodies`` and a
            ``_tracks`` mapping (track id -> ``Track``). Mutated in place.
        cost: Value forwarded to ``cost_matrix_tracks_skeleton`` and used as
            the acceptance threshold for a match.
        nms_overlap_fraction: Forwarded to ``non_max_supression_video``.

    Returns:
        None. Ids and ``prev``/``next`` links are written onto the bodies.
    """
    next_id = IntegerIDGen()

    # Suppress overlapping bodies before any id is handed out.
    non_max_supression_video(video, nms_overlap_fraction)

    # Every body visible in the first frame starts its own track.
    for first_body in video[0].valid_bodies:
        first_body.set_id(next_id())
        video._tracks[first_body.id] = Track(first_body)

    for frame_no in tqdm(range(len(video) - 1)):
        sources = video[frame_no].valid_bodies
        targets = video[frame_no + 1].valid_bodies
        cmap = cost_matrix_tracks_skeleton(sources, targets, cost)
        _, assigned = hungarian(cmap)

        for row, source in enumerate(sources):
            col = assigned[row]
            if not (cmap[row, col] < cost):
                continue

            # A body that has no id yet (-1) opens a new track.
            if source.id == -1:
                source.set_id(next_id())
                video._tracks[source.id] = Track(source)

            # Propagate the id to the matched body of the next frame and
            # link both detections to each other.
            target = targets[col]
            target.set_id(source.id)
            target.prev = source
            source.next = target
    return
def optimal_assignment(self, gt_n_cluster=None, assign=None):
    """Reorder the columns of ``self.conf`` according to an assignment.

    When ``assign`` is not given it is computed with ``hungarian`` on the
    negated ``self.conf``, i.e. the assignment that maximises the selected
    entries. ``self.gt_n_cluster`` is overwritten with ``gt_n_cluster``.

    Returns:
        ``self``, to allow chaining.
    """
    if assign is None:
        # The solver minimises cost, so negate to maximise the total.
        negated = -self.conf.cpu().numpy()
        _, assign = hungarian(negated)
    self.gt_n_cluster = gt_n_cluster
    self.conf = self.conf[:, assign]
    return self
def compute_hungarian(m):
    """Solve the assignment problem on a square tensor ``m``.

    Returns:
        A tuple ``(cost, matrix, col)`` where ``matrix`` is a tensor holding
        ``1 / len(m)`` at every assigned ``(row, col)`` position and zero
        elsewhere, ``cost`` is the sum of ``matrix * m`` and ``col`` is the
        column index array returned by ``hungarian``.
    """
    n_rows, n_cols = m.size()[0], m.size()[1]
    assert n_rows == n_cols

    costs = m.cpu().detach().numpy()
    row, col = hungarian(costs)

    # Uniform weight on each selected entry, zero everywhere else.
    weights = np.zeros(m.size())
    weights[row, col] = 1. / float(len(m))

    cost = (weights * costs).sum()
    return cost, torch.tensor(weights), col
def count_disambiguations(tags0, kws0, candidate_keywords_per_tag, n_kws, n_tags):
    """Receives an initial (fixed) assignment of tags to keywords and computes how many disambiguations are solved.

    current_map is a dictionary mapping each tag to its list of candidate keywords.
    Reads ``m_matrix``, ``m_obs_matrix`` and ``window`` from the enclosing scope.

    :param tags0: tags whose keyword is fixed
    :param kws0: keyword fixed for each tag in tags0 (same order)
    :param candidate_keywords_per_tag: dict tag -> list of candidate keywords (left untouched)
    :param n_kws: number of keywords (rows of the matching cost matrix)
    :param n_tags: number of tags (columns of the matching cost matrix)
    :return: (n_disambiguations, dict tag -> keyword), or (0, None) when a tag runs out of
             candidates or no matching restricted to the remaining candidates exists
    """
    # Copy every candidate list: the pruning below must never leak into the caller's
    # dictionary, which is typically reused for many fixed assignments.
    current_map = {tag_id: list(candidates) for tag_id, candidates in candidate_keywords_per_tag.items()}
    for tag_id, kw_id in zip(tags0, kws0):
        current_map[tag_id] = [kw_id]

    ambiguous_tags = [tag_id for tag_id, candidates in current_map.items() if len(candidates) > 1]
    known_tags = [tag_id for tag_id, candidates in current_map.items() if len(candidates) == 1]

    fresh_pass = False
    while not fresh_pass and len(ambiguous_tags) > 1:
        fresh_pass = True

        # Keep only the candidates whose co-occurrence with every known tag is within the window.
        for tag_id in ambiguous_tags:
            current_candidates = current_map[tag_id]
            new_candidates = [
                candidate_kw_id for candidate_kw_id in current_candidates
                if all(m_matrix[current_map[known_tag][0]][candidate_kw_id] - window
                       <= m_obs_matrix[known_tag][tag_id]
                       <= m_matrix[current_map[known_tag][0]][candidate_kw_id] + window
                       for known_tag in known_tags)
            ]
            if len(current_candidates) > len(new_candidates):
                fresh_pass = False
                current_map[tag_id] = new_candidates

        # Iterate over a snapshot: ambiguous_tags shrinks inside the loop, and removing from
        # the list being iterated would silently skip the following tag.
        for tag_id in list(ambiguous_tags):
            if len(current_map[tag_id]) == 1:
                ambiguous_tags.remove(tag_id)
                known_tags.append(tag_id)
                # The keyword is now taken, so no other ambiguous tag may keep it.
                for tag_id_others in ambiguous_tags:
                    if tag_id != tag_id_others and current_map[tag_id][0] in current_map[tag_id_others]:
                        current_map[tag_id_others].remove(current_map[tag_id][0])
            elif len(current_map[tag_id]) == 0:
                return 0, None  # Inconsistency

    n_disambiguations = len(known_tags)

    # Zero cost exactly on the (keyword, tag) pairs that are still candidates: a matching is
    # consistent if and only if its total cost is zero.
    cost_matrix = np.ones((n_kws, n_tags))
    for i_tag in range(n_tags):
        for candidate_kw_id in current_map[i_tag]:
            cost_matrix[candidate_kw_id, i_tag] = 0

    row_ind, col_ind = hungarian(cost_matrix)
    if cost_matrix[row_ind, col_ind].sum() > 0:
        return 0, None

    query_predictions_for_each_tag = {}
    for tag, keyword in zip(col_ind, row_ind):
        query_predictions_for_each_tag[tag] = keyword
    return n_disambiguations, query_predictions_for_each_tag
def maximumANDSum(self, nums, numSlots):
    """
    Maximum AND sum, solved as an assignment problem.

    Every slot is duplicated (each one holds up to two numbers) and ``nums`` is
    padded with zeros, giving a square ``2 * numSlots`` matrix whose entry
    ``[i][x]`` is the negated ``nums[i] & slot``; the minimum-cost assignment
    of that matrix is the negated answer.

    :type nums: List[int]
    :type numSlots: int
    :rtype: int
    """
    size = 2 * numSlots
    # range/zip instead of xrange/itertools.izip: the latter exist on Python 2 only.
    adj = [[-((nums[i] if i < len(nums) else 0) & (1 + x // 2))
            for x in range(size)]
           for i in range(size)]
    return -sum(adj[i][j] for i, j in zip(*hungarian(adj)))
def _run_hungarian_attack_given_matrix(self, c_matrix): """Runs the Hungarian algorithm with the given cost matrix :param c_matrix: cost matrix, (n_keywords x n_tags)""" row_ind, col_ind = hungarian(c_matrix) query_predictions_for_each_tag = {} for tag, keyword in zip(col_ind, row_ind): query_predictions_for_each_tag[tag] = keyword return query_predictions_for_each_tag
def calc_wasserstein(Dj, Mk):
    # calculates the 2-Wasserstein L2 distance between two diagrams
    # inputs: diagram Dj, centre Mk
    # returns: W_2(Dj, Mk), the square root of the summed cost of the
    #          optimal pairing over the first len(Dj) assigned pairs
    m = len(Dj)
    c = calc_cost_matrix(Dj, Mk)
    rows, cols = hungarian(c)
    total = sum(c[rows[i]][cols[i]] for i in range(m))
    return math.sqrt(total)
def maximise_trace(x):
    """
    Find the column permutation of a SQUARE matrix that maximises its trace.

    The solver minimises, so every entry is replaced by its distance to the
    largest entry of the matrix before solving.

    :param x: Numpy 2D SQUARE Array
    :return: Tuple containing (in order):
        * optimal permutation of columns to achieve a maximal trace
        * size of this trace
    """
    ceiling = np.full(len(x), np.max(x))
    _rows, _cols = hungarian(ceiling - x)
    trace = x[_rows, _cols].sum()
    return _cols, trace
def maximumANDSum(self, nums, numSlots):
    """
    Maximum AND sum, solved as a minimum-cost assignment with a built-in solver.

    Every slot is duplicated (each one holds up to two numbers) and ``nums`` is
    padded with zeros, giving a square ``2 * numSlots`` cost matrix of negated
    ``nums[i] & slot`` values.

    :type nums: List[int]
    :type numSlots: int
    :rtype: int
    """
    # Template translated from:
    # https://github.com/kth-competitive-programming/kactl/blob/main/content/graph/WeightedMatching.h
    def hungarian(a):  # Time: O(n^2 * m), Space: O(n + m)
        """Return (min cost, column assigned to each row) for cost matrix ``a``."""
        if not a:
            return 0, []
        n, m = len(a) + 1, len(a[0]) + 1
        # u, v: row / column potentials; p[j]: row currently matched to column j
        # (1-based, 0 means unmatched).
        u, v, p, ans = [0] * n, [0] * m, [0] * m, [0] * (n - 1)
        # range instead of xrange: the latter exists on Python 2 only.
        for i in range(1, n):
            p[0] = i
            j0 = 0  # add "dummy" worker 0
            dist, pre = [float("inf")] * m, [-1] * m
            done = [False] * (m + 1)
            while True:  # dijkstra
                done[j0] = True
                i0, j1, delta = p[j0], None, float("inf")
                for j in range(1, m):
                    if done[j]:
                        continue
                    cur = a[i0 - 1][j - 1] - u[i0] - v[j]
                    if cur < dist[j]:
                        dist[j], pre[j] = cur, j0
                    if dist[j] < delta:
                        delta, j1 = dist[j], j
                for j in range(m):
                    if done[j]:
                        u[p[j]] += delta
                        v[j] -= delta
                    else:
                        dist[j] -= delta
                j0 = j1
                if not p[j0]:
                    break
            while j0:  # update alternating path
                j1 = pre[j0]
                p[j0], j0 = p[j1], j1
        for j in range(1, m):
            if p[j]:
                ans[p[j] - 1] = j - 1
        return -v[0], ans  # min cost

    size = 2 * numSlots
    costs = [[-((nums[i] if i < len(nums) else 0) & (1 + x // 2))
              for x in range(size)]
             for i in range(size)]
    return -hungarian(costs)[0]
def matchIds(bboxes, predbboxes):
    """Match boxes to predicted boxes by the distance between their centres.

    The centre of a box is the mean of columns 0:2 and 2:4. The assignment
    minimising the summed Euclidean centre distance is computed with
    ``hungarian``.

    Returns:
        ``(bbox_ids, values)``: the matched row indices of ``bboxes`` and
        column 4 of the predicted box matched to each of them.
    """
    centres = (bboxes[:, 0:2] + bboxes[:, 2:4]) / 2
    pred_centres = (predbboxes[:, 0:2] + predbboxes[:, 2:4]) / 2

    # Pairwise centre offsets, shape (n_boxes, n_pred, 2).
    offsets = centres[:, None, :] - pred_centres[None, :, :]
    m = np.zeros((bboxes.shape[0], predbboxes.shape[0]))
    m[...] = np.sqrt(np.sum(offsets**2, axis=2))

    bbox_ids, pred_ids = hungarian(m)
    return bbox_ids, predbboxes[pred_ids, 4]
def evaluator(gtr,
              det_full,
              parts_name={
                  0: 'Tail',
                  1: 'Head',
                  2: 'Torax',
                  3: 'Rant',
                  4: 'Lant'
              },
              threshold=50,
              scale=1):
    """Compare detections against ground truth, frame by frame.

    For every image id present in both inputs, ground-truth parts are matched
    to detected parts and ground-truth mappings to detected mappings with
    ``hungarian``; error statistics and per-part scores are derived from the
    two assignments.

    Args:
        gtr: Dict with an ``'annotations'`` list; each entry provides
            ``'image_id'`` and a flat ``'keypoints'`` list.
        det_full: Dict keyed by image path (the image id is the integer file
            stem), each value holding ``'detections'`` and ``'mapping'``. A
            ``'runningtime'`` key is skipped.
        parts_name: Part index -> part name; its length sets the number of
            parts. The default dict is never mutated here.
        threshold: Forwarded to the cost-matrix builders; a part counts as
            found below ``threshold``, a mapping below ``2 * threshold``.
        scale: Factor applied to detections and mapping coordinates.

    Returns:
        ``(evaluations, retrieval)``: per-image metric dicts, and a dict with
        the per-image ``dt_idx`` plus the parsed ``ground``,
        ``ground_mappings``, ``detections`` and ``mappings``.
    """
    nparts = len(parts_name)

    # Ground truth as an array
    ground = {}
    ground_mappings = {}
    for fr in gtr['annotations']:
        ground[fr['image_id']] = []
        ground_mappings[fr['image_id']] = []
    for fr in gtr['annotations']:
        keypoints = fr['keypoints']
        if nparts == 3:
            # NOTE(review): the second endpoint is keypoints[4:6] but the length
            # is measured to keypoints[2:4] -- kept as found, confirm intent.
            ground_mappings[fr['image_id']].append([
                keypoints[:2], keypoints[4:6],
                distance(keypoints[:2], keypoints[2:4])
            ])
        else:
            ground_mappings[fr['image_id']].append([
                keypoints[:2], keypoints[2:4],
                distance(keypoints[:2], keypoints[2:4])
            ])
        for k in range(0, nparts * 2 - 1, 2):
            ground[fr['image_id']].append([keypoints[k], keypoints[k + 1]])

    # Detection keys as in ground truth
    detections = {}
    mappings = {}
    for key in det_full.keys():
        if key == 'runningtime':
            continue
        image_id = int(key.split("/")[-1].split(".")[0])
        detections[image_id] = np.array(det_full[key]['detections']) * scale
        mappings[image_id] = [[
            np.array([t[1][0], t[0][0]]) * scale,
            np.array([t[1][1], t[0][1]]) * scale, t[2], t[3]
        ] for t in det_full[key]['mapping']]

    # Evaluations
    evaluations = {}
    retrieval = {}
    for k in ground.keys():
        if k not in detections:
            continue
        evaluations[k] = {}
        retrieval[k] = {}
        gt = ground[k]
        gt_map = ground_mappings[k]
        dt_map = mappings[k]
        dt = detections[k]

        cm = cost_matrix(gt, dt, threshold)
        gt_idx, dt_idx = hungarian(cm)
        cm_map = cost_matrix_mappings(gt_map, dt_map, threshold)
        gmap_idx, dmap_idx = hungarian(cm_map)
        assignments_cost = cm[gt_idx, dt_idx]
        assignments_cost_map = cm_map[gmap_idx, dmap_idx]

        item = evaluations[k]
        item['gt_total_parts'] = len(gt)
        item['gt_total_maps'] = len(gt_map)
        item['dt_total_parts'] = len(dt)
        item['dt_total_maps'] = len(dt_map)
        item['gt_individuals'] = len(gt) // nparts
        item['dt_individuals'] = len(dt) // nparts

        part_errors = cm[gt_idx[:len(gt)], dt_idx[:len(gt)]]
        item['cumulative_error'] = part_errors.sum()
        item['total_avg_error'] = part_errors.mean()
        item['total_std_error'] = part_errors.std()

        # The mapping assignment indexes cm_map; reading cm with these indices
        # (as was done before) reported unrelated part costs.
        map_errors = cm_map[gmap_idx[:len(gt_map)], dmap_idx[:len(gt_map)]]
        item['cumulative_error_map'] = map_errors.sum()
        item['avg_error_map'] = map_errors.mean()
        item['std_error_map'] = map_errors.std()

        retrieval[k]['dt_idx'] = dt_idx

        # Ground-truth parts are stored individual after individual, so part i
        # of every individual sits at positions i, i + nparts, ...
        for i in range(nparts):
            temp = np.array([
                1 if assignments_cost[j] < threshold else 0
                for j in range(i, len(gt), nparts)
            ])
            item[parts_name[i] + ' score'] = 100 * temp.sum() / item['gt_individuals']
            item[parts_name[i] + ' total'] = temp.sum()

        temp = np.array([
            1 if assignments_cost_map[j] < threshold * 2 else 0
            for j in range(len(gt_map))
        ])
        item['matching_score'] = 100 * temp.sum() / len(gt_map)
        item['matching_total'] = temp.sum()

    retrieval['ground'] = ground
    retrieval['ground_mappings'] = ground_mappings
    retrieval['detections'] = detections
    retrieval['mappings'] = mappings
    return evaluations, retrieval
def recognise(self, frameNumber, detections, bbox, verbose=False):
    """
    Update the tracker with the measurements of one frame.

    Measurements below ``self.minConfidence`` are discarded, a pairwise
    distance matrix between measurements and existing tracks is built, and
    the optimal pairing is found through the Hungarian algorithm.

    Input:
        frameNumber: The current frame number (Int)
        detections: List of cv2.keyPoints (the detections) found in the current frame.
                    Low-confidence entries are deleted from this list in place.
        bbox: List of dicts containing bounding boxes associated with the detected keypoints.
              Low-confidence entries are deleted from this list in place.
        verbose: Whether to print information or not.

    Output:
        None. ``self.detections``, ``self.tracks``, ``self.oldTracks`` and
        ``self.trackCount`` are updated in place.
    """
    # Walk backwards so that deleting an entry keeps the remaining indices valid.
    for idx in range(len(bbox) - 1, -1, -1):
        if bbox[idx]["confidence"] < self.minConfidence:
            del bbox[idx]
            del detections[idx]
    self.detections = detections

    # Update tracking according to matches
    numNew = len(self.detections)
    numOld = len(self.tracks)
    if verbose:
        print("New detections: ", numNew)
        print("Existing tracks: ", numOld)

    # Every track ages by one frame; a successful match below resets it.
    for t in self.tracks:
        if verbose:
            print("ID {} - Kill Count {}".format(t.id, t.killCount))
        t.killCount += 1

    # Construct cost matrix
    costM = self.pairwiseDistance(detections, self.tracks)
    row_ind, col_ind = hungarian(costM)
    matches = list(zip(row_ind, col_ind))

    killedTracks = []
    for mRow, pCol in matches:
        # Cost below the ghost threshold: the measurement extends the existing tracklet.
        if costM[mRow][pCol] < self.ghostThreshold:
            track = self.tracks[pCol]
            track.pos.append(np.array(detections[mRow].pt))
            track.bbox.append(self.convertBBoxtoList(bbox[mRow]))
            track.M = self.matrixInverse(bbox[mRow]["cov"])
            track.mean = bbox[mRow]["mean"]
            track.frame.append(frameNumber)
            track.killCount = 0
            continue

        # Cost at or above the ghost threshold: either start a new track or
        # consider killing an old one.
        if mRow < numNew:
            # The row is an actual detection (mRow < numNew): create a new track.
            newTrack = Track()
            newTrack.pos.append(np.array(detections[mRow].pt))
            newTrack.bbox.append(self.convertBBoxtoList(bbox[mRow]))
            newTrack.M = self.matrixInverse(bbox[mRow]["cov"])
            newTrack.mean = bbox[mRow]["mean"]
            newTrack.frame.append(frameNumber)
            newTrack.id = self.trackCount
            self.trackCount += 1
            self.tracks.append(newTrack)
            if verbose:
                print("Num tracks: {}".format(len(self.tracks)))
        elif numOld > numNew and pCol < numOld:
            # The row is a dummy detection (mRow >= numNew), there are more
            # tracks than detections and the column is a real track: retire
            # the track once it has been unmatched for too long.
            if self.tracks[pCol].killCount > self.maxKillCount:
                killedTracks.append(pCol)
                if verbose:
                    print("Num tracks: {}".format(len(self.tracks)))

    # Pop from the highest index down so earlier pops do not shift later ones.
    for pCol in sorted(killedTracks, reverse=True):
        self.oldTracks.append(self.tracks.pop(pCol))

    del costM
    if verbose:
        print()
def gen_assignment_hungarian(trees_a, trees_b):
    """Pair the elements of ``trees_a`` and ``trees_b`` at minimum total cost.

    Entry ``[i][j]`` of the cost matrix is ``trees_b[j].cost(trees_a[i])``.

    Returns:
        List of ``(tree_a, tree_b)`` tuples, one per assigned pair.
    """
    cost_rows = []
    for tree_a in trees_a:
        cost_rows.append([tree_b.cost(tree_a) for tree_b in trees_b])
    cost = np.array(cost_rows)

    row_ind, col_ind = hungarian(cost)
    return [(trees_a[r], trees_b[c]) for r, c in zip(row_ind, col_ind)]
def _nms_parts(detections, key, part):
    """Return the detections of ``part`` in frame ``key`` after non-maximum suppression."""
    boxes = dets2boxes(detections[key]['parts'][part], size=20)
    return boxes2dets(non_max_suppression_slow(boxes, 0.6)[::-1])


def hungarian_tracking(detections_path,
                       cost,
                       output='',
                       part=str(2),
                       max_detections=70):
    """
    This function executes the hungarian algorithm. It is expecting to receive an instance of detections. Please check documentation for data structure.
    It will also consider the maximum distance and it outputs in a new file with the data structure explained in documentation.

    Inputs:
        - detections_path: Path to find detections file
        - cost : Maximum distance allowed
        - output: Optional, if '', will use the same path as where the detections are.
        - part : Over what part perform the tracking. By default thorax or '2'
        - max_detections : Number of detection slots per frame in the track table
          (previously hard-coded to 70).

    Three JSON files are written next to ``output``, prefixed with
    ``track_nms_v2_``, ``id_nms_track_`` and ``track_nms_``. Nothing is returned.
    """
    if output == '':
        output = detections_path

    detections = read_json(detections_path)
    keylist = list(detections.keys())

    # tracks[f][j] holds the track id of detection j in frame f (0 = no track).
    tracks = np.zeros((len(keylist), max_detections))

    # Every detection of the first frame opens a track.
    parts = _nms_parts(detections, keylist[0], part)
    track_id = {}
    for j in range(len(parts)):
        tracks[0][j] = j + 1
        track_id[j + 1] = {
            'init_frame': 0,
            'cost': [0],
            'positions': [parts[j]]
        }

    for i in range(len(keylist) - 1):
        parts = _nms_parts(detections, keylist[i], part)
        parts_next = _nms_parts(detections, keylist[i + 1], part)

        cmap = cost_matrix_tracks(parts, parts_next, cost)
        _, idx = hungarian(cmap)

        for j in range(len(parts)):
            if cmap[j, idx[j]] < cost:
                if tracks[i][j] == 0:
                    # create new track for detection j at frame i
                    tracks[i][j] = len(track_id.keys()) + 1
                    k = int(tracks[i][j])  # id of the track
                    track_id[k] = {
                        'init_frame': i,
                        'cost': [0],
                        'positions': [parts[j]]
                    }
                # Carry the id over to the matched detection of the next frame.
                tracks[i + 1][idx[j]] = tracks[i][j]
                matched = track_id[int(tracks[i + 1][idx[j]])]
                matched['cost'].append(cmap[j, idx[j]])
                matched['positions'].append(parts_next[idx[j]])

    for k in track_id.keys():
        track_id[k]['mean'] = np.array(track_id[k]['cost']).mean()

    new_tracks = {}
    new_tracks['frames'] = {}
    new_tracks['data'] = {}
    for f in range(len(keylist)):
        # NOTE(review): `det` is the raw detection list while `tracks` was
        # filled with indices into the NMS-filtered list -- confirm both
        # orderings agree.
        det = detections[keylist[f]]['parts'][part]
        mapping = detections[keylist[f]]['mapping']
        new_tracks['frames'][f] = []
        for i in range(len(det)):
            if int(tracks[f][i]) == 0:
                continue
            angle = find_angle(det[i], mapping)
            new_tracks['frames'][f].append(tracks[f][i])
            new_tracks['data'][tracks[f][i]] = {
                'frame': f,
                'id': tracks[f][i],
                'location': det[i],
                'init_frame': track_id[int(tracks[f][i])]['init_frame'],
                'positions': track_id[int(tracks[f][i])]['positions'],
                'angle': angle
            }

    print('saving trackings')
    folder = '/'.join(output.split('/')[:-1])
    filename = output.split('/')[-1]

    output1 = os.path.join(folder, 'track_nms_v2_' + filename)
    with open(output1, 'w') as outfile:
        json.dump(new_tracks, outfile, cls=NumpyEncoder)

    output2 = os.path.join(folder, 'id_nms_track_' + filename)
    with open(output2, 'w') as outfile:
        json.dump(track_id, outfile, cls=NumpyEncoder)

    output3 = os.path.join(folder, 'track_nms_' + filename)
    with open(output3, 'w') as outfile:
        json.dump(tracks, outfile, cls=NumpyEncoder)
def calc_frechet_mean(D, r, k, verbose):
    # computes the weighted frechet mean of D with weights r[.][k]
    # inputs: diagrams D, membership values r, centre index k, verbose
    # returns: weighted frechet mean y, optimal pairings x
    # note: reseeds the global `random` generator with 0 on every call
    n = len(D)
    m = len(D[0])

    def match_all(centre):
        # optimal pairing of `centre` with every diagram in D
        return [hungarian(calc_cost_matrix(centre, D[j])) for j in range(n)]

    # initialise to random diagram in D
    random.seed(0)
    M_update = D[random.randint(0, n - 1)]

    # first run to find matching
    matching = match_all(M_update)

    # loop until the matching stops changing
    counter2 = 0
    while True:
        counter2 += 1

        # update matched points
        x = np.zeros((n, m, 2))
        for j in range(n):
            assigned_cols = matching[j][1]
            for i in range(m):
                x[j][i] = D[j][assigned_cols[i]]

        # generate y to return
        y = np.zeros((m, 2))
        for i in range(m):
            # accumulate the squared weights of the off-diagonal points
            # (a first coordinate of -1 marks a diagonal point)
            r2_od = 0
            r2x_od = [0, 0]
            for j in range(n):
                if x[j][i][0] != -1:
                    weight = r[j][k]**2
                    r2_od += weight
                    r2x_od[0] += weight * x[j][i][0]
                    r2x_od[1] += weight * x[j][i][1]

            # if all points are diagonals then y[i] is a diagonal
            if r2_od == 0:
                y[i] = [-1, -1]
                continue

            # calculate w and w_\Delta
            w = [r2x_od[0] / r2_od, r2x_od[1] / r2_od]
            w_delta = [(w[0] + w[1]) / 2, (w[0] + w[1]) / 2]

            r2_d = 0
            r2_w_delta = [0, 0]
            for j in range(n):
                if x[j][i][0] == -1:
                    weight = r[j][k]**2
                    r2_d += weight
                    r2_w_delta[0] += weight * w_delta[0]
                    r2_w_delta[1] += weight * w_delta[1]

            # calculate weighted mean
            y[i][0] = (r2x_od[0] + r2_w_delta[0]) / (r2_od + r2_d)
            y[i][1] = (r2x_od[1] + r2_w_delta[1]) / (r2_od + r2_d)

        old_matching, matching = matching, match_all(y)

        comparison = (np.array(matching) == np.array(old_matching))
        if comparison.all():
            if verbose:
                print(" Frechet iterations for M_" + str(k) + ": " + str(counter2))
            return y, x
def fit(self, data: pd.DataFrame, seed: Optional[int] = None):
    """Estimate a causal order of the columns of ``data`` (ICA-based LiNGAM).

    Args:
        data: Observations, one column per variable.
        seed: ``random_state`` forwarded to ``FastICA``.

    Returns:
        Array ``K`` of column indices in the estimated causal order, i.e. the
        permutation for which ``B'[K][:, K]`` is as close as possible to
        strictly lower triangular.
    """
    # Given a d-dimensional random vector x and its (d,n) observed data matrix X,
    # apply an ICA algorithm to obtain an estimate of A.
    d = len(data.columns)
    B = FastICA(random_state=seed).fit(data).components_

    # Find the unique permutation of the rows of W = A^-1 that yields a matrix W'
    # without any zeros on the main diagonal. The permutation is sought by minimizing
    # sum_i (1/|W'_ii|). This minimization problem is the classical linear assignment
    # problem, and here the Hungarian algorithm (Kuhn, 1955) is used.
    _, K = hungarian(1 / np.abs(B))
    # Row i was assigned column K[i], so it has to land at position K[i]
    # (W'[K[i]] = W[i]); that is a take with the inverse permutation, argsort(K).
    # Taking with K itself only works when the permutation is its own inverse.
    B = B.take(np.argsort(K), 0)

    # Divide each row of W' by its corresponding diagonal element in order to
    # yield a new matrix W'' with a diagonal consisting entirely of 1s.
    B /= B.diagonal()[..., None]

    # Compute an estimate B' of B by using B' = I - W''.
    B = np.identity(d) - B

    # Finally, to estimate a causal order k(i), determine the permutation K,
    # applied to both the rows and the columns of B', that brings it as close
    # as possible to a strictly lower triangular structure.
    K = None
    if d < 8:
        # For a small number of variables, i.e., fewer than 8, the lower triangularity
        # of B' can be measured by using the sum of squared bij in its upper triangular
        # section sum_i<=j (b'_ij^2). In addition, an exhaustive search over all possible
        # permutations is feasible and is hence performed.
        vmin = np.inf
        for p in permutations(range(d)):
            # Permute rows AND columns together, as the d >= 8 branch does when
            # it drops a row and its column at the same time.
            score = np.sum(np.square(np.triu(B[np.ix_(p, p)])))
            if score < vmin:
                vmin = score
                K = p
        K = np.array(K)
    else:
        # For higher-dimensional data, the following approximate algorithm is used,
        # which sets small absolute valued elements in B' to zero, and whereby it can be
        # determined whether it is possible to permute the resulting matrix to become
        # strictly lower triangular:
        # (a) Set the d(d+1)/2 smallest (in absolute value) elements of B' to zero.
        i = round(d * (d + 1) / 2)
        pmin = np.argsort(np.abs(np.ravel(B)))
        B.flat[pmin[:i]] = 0
        # (b) Repeat
        while K is None:
            # i. Determine whether B' can be permuted to become strictly lower triangular.
            # If this is possible, stop and return the permuted B'.
            K, A, L = np.zeros(d, int), np.arange(d), B
            while len(A) > 0:
                # Find a row where all elements are zero, if any.
                j = np.where(np.sum(np.abs(L), axis=1) == 0)
                # If there is no row with zero elements, exit.
                if len(j[0]) == 0:
                    K = None
                    break
                # Select the first row with zero elements.
                j = j[0][0]
                # Add original index to permutation matrix.
                K[d - len(A)] = A[j]
                A = np.delete(A, j)
                # Remove selected row and columns.
                mask = np.delete(np.arange(len(L)), j)
                L = L[mask][:, mask]
            # ii. In addition, set the next smallest (in absolute value) element of B' to zero.
            if K is None:
                B.flat[pmin[i]] = 0
                i += 1
    return K
def _run_algorithm(self, m_matrix, m_obs_matrix, window, tags_by_popularity, brute_force_size=10): """Runs the generalized count attack using the brute force method and Hoefding bounds. Returns a dictionary that maps tag_ids to their assigned keywords (query_predictions_for_each_tag) Returns 0 instead if there is a global inconsistency""" def count_disambiguations(tags0, kws0, candidate_keywords_per_tag, n_kws, n_tags): """Receives an initial (fixed) assignment of tags to keywords and computes how many disambiguations are solved current_map is a dictionary mapping tag to sets of candidate keywords""" current_map = candidate_keywords_per_tag.copy() for tag_id, kw_id in zip(tags0, kws0): current_map[tag_id] = [kw_id] ambiguous_tags = [ tag_id for tag_id, candidates in current_map.items() if len(candidates) > 1 ] known_tags = [ tag_id for tag_id, candidates in current_map.items() if len(candidates) == 1 ] fresh_pass = False while not fresh_pass and len(ambiguous_tags) > 1: fresh_pass = True for tag_id in ambiguous_tags: current_candidates = current_map[tag_id] new_candidates = [] for candidate_kw_id in current_candidates: check = [ m_matrix[current_map[known_tag] [0]][candidate_kw_id] - window <= m_obs_matrix[known_tag][tag_id] <= m_matrix[current_map[known_tag] [0]][candidate_kw_id] + window for known_tag in known_tags ] if all(check): new_candidates.append(candidate_kw_id) if len(current_candidates) > len(new_candidates): fresh_pass = False current_map[tag_id] = new_candidates # print(" Removed {:d} candidates".format(len(current_candidates) - len(new_candidates))) for tag_id in ambiguous_tags: if len(current_map[tag_id]) == 1: ambiguous_tags.remove(tag_id) known_tags.append(tag_id) # print(" tag {:d} is disambiguated".format(tag_id)) for tag_id_others in ambiguous_tags: if tag_id != tag_id_others and current_map[tag_id][ 0] in current_map[tag_id_others]: current_map[tag_id_others].remove( current_map[tag_id][0]) elif len(current_map[tag_id]) == 0: # print(" 
Inconsistency!") return 0, None # Inconsistency n_disambiguations = len(known_tags) cost_matrix = np.ones((n_kws, n_tags)) for i_tag in range(n_tags): for candidate_kw_id in current_map[i_tag]: cost_matrix[candidate_kw_id, i_tag] = 0 row_ind, col_ind = hungarian(cost_matrix) if cost_matrix[row_ind, col_ind].sum() > 0: # print(" There was no consistent matching!") return 0, None query_predictions_for_each_tag = {} for tag, keyword in zip(col_ind, row_ind): query_predictions_for_each_tag[tag] = keyword # print(" This matching has {:d} disambiguations, returning...".format(n_disambiguations)) return n_disambiguations, query_predictions_for_each_tag assert len(tags_by_popularity) >= brute_force_size # Build candidate sets per tag candidate_keywords_per_tag = {} for tag_id in range(self.n_tags): kw_list = [ kw_id for kw_id in range(self.n_keywords) if m_matrix[kw_id, kw_id] - window <= m_obs_matrix[ tag_id, tag_id] <= m_matrix[kw_id, kw_id] + window ] if len(kw_list) == 0: # print(" tag_{:d} had zero candidates, aborting...".format(tag_id)) return None candidate_keywords_per_tag[tag_id] = kw_list # print("LIST OF CANDIDATE KEYWORDS") # for tag_id in range(self.n_tags): # print("{:d}: len={:d}".format(tag_id, len(candidate_keywords_per_tag[tag_id]))) # Select brute-force sets to test candidate_sets_chosen = [ candidate_keywords_per_tag[tag_id] for tag_id in tags_by_popularity[:brute_force_size] ] aux_combinations = list(itertools.product(*candidate_sets_chosen)) all_combinations_to_test = [ combination for combination in aux_combinations if len(combination) == len(set(combination)) ] if len(all_combinations_to_test) == 0: return None # print("There are {:d} combinations to test".format(len(all_combinations_to_test))) # Compute number of disambiguations in each of those sets test_results = [ count_disambiguations(tags_by_popularity[:brute_force_size], combination, candidate_keywords_per_tag, self.n_keywords, self.n_tags) for combination in all_combinations_to_test ] 
test_results.sort(key=lambda x: x[0], reverse=True) # Choose output: if test_results[0][ 1] is not None: # If one of these brute-forced matchings was feasible: # print("Found consistent mapping with {:d} disambiguated queries".format(test_results[0][0])) return test_results[0][1] else: # Ensure there is at least one feasible assignment with volumes cost_matrix = np.ones((self.n_keywords, self.n_tags)) for i_tag in range(self.n_tags): for candidate_kw_id in candidate_keywords_per_tag[i_tag]: cost_matrix[candidate_kw_id, i_tag] = 0 row_ind, col_ind = hungarian(cost_matrix) if cost_matrix[row_ind, col_ind].sum() > 0: # print("Could not find any consistent mapping at all...") return None else: # print("Could not find any mapping consistent with co-occurences, returning one that is consistent with volumes...") feasible_assignment = {} for tag, keyword in zip(col_ind, row_ind): feasible_assignment[tag] = keyword return feasible_assignment