def connected_components_diff(game, player):
    """
    Score a position as the player's component value minus the opponent's.

    A union-find structure is created with ``game.width * game.height`` as its
    argument. Every blank space is unioned with each of its neighbors, then
    each player's location is unioned with its own neighbors. The score of a
    side is whatever ``uf.components(location)`` reports for that side's
    location, converted to ``float``.

    :param game: board object providing ``width``, ``height``,
        ``get_blank_spaces()``, ``get_player_location()`` and ``get_opponent()``
    :param player: the player whose point of view is scored
    :return: player's value minus the opponent's value, as a float
    """
    board_cells = game.width * game.height
    uf = UnionFind(board_cells)

    # Link every blank space to the cells around it.
    for cell in game.get_blank_spaces():
        for adjacent in neighbors(game, cell):
            uf.union(cell, adjacent)

    own_spot = game.get_player_location(player)
    rival_spot = game.get_player_location(game.get_opponent(player))

    # Attach each player's square to its surroundings (player first, then opponent).
    for spot in (own_spot, rival_spot):
        for adjacent in neighbors(game, spot):
            uf.union(adjacent, spot)

    own_value = float(uf.components(own_spot))
    rival_value = float(uf.components(rival_spot))
    return own_value - rival_value
def UnionFindCommunity(self, G):
    """Group the nodes of ``G`` by connectivity and store the groups on the graph.

    A union-find structure is seeded with ``G.nodes()`` and the two endpoints
    of every edge in ``G.edges()`` are merged. The resulting components are
    collected into a list and handed to ``self.addGNodesAttr`` under the
    label ``"Union find"``.

    :param G: graph object providing ``nodes()`` and ``edges()``
    :return: None
    """
    uf = UnionFind(G.nodes())
    for u, v in G.edges():
        uf.union(u, v)

    groups = list(uf.components())
    self.addGNodesAttr(G, groups, "Union find")
def trip_roster_merged(trip_roster_file, colname_file, trip_chain,
                       park_pair_file, gas_pair_file):
    """Split a trip roster into unmatched trips and O/D rows for matched trips.

    The roster is read without a header; its column names come from the header
    of ``colname_file``. Matched trip pairs are read from ``park_pair_file``
    and ``gas_pair_file`` and concatenated.

    * ``trip_chain == False``: every matched pair becomes one output row and
      ``TripId`` is ``EndTripId_StopId_StartTripId``.
    * otherwise: pairs are merged transitively with ``UnionFind`` and one row
      is produced per connected component; ``TripId`` is
      ``EndTripId_StartTripId``.

    :param trip_roster_file: headerless CSV of trips (must provide ``TripId``,
        ``StartLocLat``, ``StartLocLon``, ``EndLocLat``, ``EndLocLon`` once the
        names from ``colname_file`` are applied)
    :param colname_file: CSV whose header row supplies the roster column names
    :param trip_chain: ``False`` for plain pairs, anything else for chaining
    :param park_pair_file: CSV of parking-matched trip pairs
    :param gas_pair_file: CSV of gas-stop-matched trip pairs
    :return: ``(trip_unmatched, trip_od)`` - roster rows whose ``TripId`` is in
        no matched pair, and the origin/destination frame for matched trips
    """
    col_names = pd.read_csv(colname_file)
    trip_roster = pd.read_csv(trip_roster_file, header=None,
                              names=col_names.columns)

    # Equality (not truthiness) is kept on purpose: None or any other value
    # that is not equal to False selects the chaining branch, as before.
    if trip_chain == False:  # noqa: E712
        park_pairs = pd.read_csv(park_pair_file)
        park_pairs.columns = ['TripId', 'StopId', 'EndTripId', 'StartTripId']
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        matched_trip_pair = pd.concat([
            park_pairs[['EndTripId', 'StartTripId', 'StopId']],
            pd.read_csv(gas_pair_file),
        ])
        trip_pair_id = ['EndTripId', 'StopId', 'StartTripId']
        new_pair_df = matched_trip_pair
    else:
        # usecols does not guarantee column order, so select and rename by
        # name instead of overwriting .columns positionally.
        park_pairs = pd.read_csv(park_pair_file, usecols=['end', 'start'])
        park_pairs = park_pairs[['end', 'start']].rename(
            columns={'end': 'EndTripId', 'start': 'StartTripId'})
        matched_trip_pair = pd.concat([
            park_pairs,
            pd.read_csv(gas_pair_file, usecols=['EndTripId', 'StartTripId']),
        ])
        trip_pair_id = ['EndTripId', 'StartTripId']

        # trip chaining
        start_time = time.time()
        uf = UnionFind(list(set(matched_trip_pair.values.flatten())))
        for _, row in matched_trip_pair.iterrows():
            uf.union(row['EndTripId'], row['StartTripId'])
        result = uf.components()
        print('Trip chaining takes %s secs for %s trip pairs.' %
              (time.time() - start_time, len(matched_trip_pair)))

        # NOTE(review): the chain's two representative trips are the first and
        # last element of each component as iterated. If components are sets,
        # that order is arbitrary and need not match the chronological head
        # and tail of the chain - confirm against UnionFind.components().
        new_pair = []
        for component in result:
            members = list(component)
            new_pair.append([members[0], members[-1]])
        new_pair_df = pd.DataFrame(new_pair,
                                   columns=['EndTripId', 'StartTripId'])

    matched_ids = matched_trip_pair[['EndTripId', 'StartTripId']].values.flatten()
    trip_unmatched = trip_roster.loc[~trip_roster['TripId'].isin(matched_ids)]

    # create od file for matched trips / trip chain: the origin comes from the
    # EndTripId trip's start location, the destination from the StartTripId
    # trip's end location.
    origin = trip_roster[['TripId', 'StartLocLat', 'StartLocLon']].rename(
        columns={'TripId': 'EndTripId'})
    destination = trip_roster[['TripId', 'EndLocLat', 'EndLocLon']].rename(
        columns={'TripId': 'StartTripId'})
    trip_od = new_pair_df.merge(origin, how='left', sort=False)
    trip_od = trip_od.merge(destination, how='left', sort=False)

    # Build the composite id positionally; str() lets non-string ids through
    # and a plain list also works when there are no matched trips at all.
    trip_od['TripId'] = [
        '_'.join(str(part) for part in parts)
        for parts in trip_od[trip_pair_id].itertuples(index=False, name=None)
    ]
    return trip_unmatched, trip_od
def cluster(edges, start_at):
    """Merge nodes along the cheapest edges until four components remain.

    Nodes ``1..500`` are registered in a fresh union-find structure. Edges are
    visited in ascending order of their fourth field (index 3); positions 1
    and 2 of each edge are the two endpoints. Before each edge is handled the
    number of components is checked, and processing stops as soon as it equals
    4. Endpoints that are not yet connected are merged. Finally
    ``find_minimal`` is called with the structure and the original ``edges``.

    :param edges: iterable of 4-field edge records ``(_, u, v, w)``
    :param start_at: not used by this function
    :return: None
    """
    uf = UnionFind()
    for node in range(1, 501):
        uf.add(node)

    by_cost = sorted(edges, key=lambda edge: edge[3])
    for _, u, v, _cost in by_cost:
        # Stop once the target number of clusters has been reached.
        if len(list(uf.components())) == 4:
            break
        if uf.connected(u, v):
            continue
        uf.union(u, v)

    find_minimal(uf, edges)
def cluster(nodes):
    """Merge nodes that differ by one of the masks from ``one_two_away()``.

    Every node is registered in a fresh union-find structure. For each node
    ``v``, the candidates ``v ^ mask`` are formed for every mask in the first
    and then the second collection returned by ``one_two_away()``. A candidate
    that is present in ``nodes`` and not yet connected to ``v`` is merged with
    it, and the merge is reported on stdout. The final number of components
    is printed.

    :param nodes: collection of values supporting ``^`` and membership tests
    :return: None
    """
    uf = UnionFind()
    one_diff, two_diff = one_two_away()

    for node in nodes:
        uf.add(node)

    for v in nodes:
        # squash all the nodes at the listed distances into this one
        candidates = [v ^ mask for mask in one_diff]
        candidates.extend(v ^ mask for mask in two_diff)
        for d in candidates:
            if d not in nodes:
                continue
            if uf.connected(v, d):
                continue
            uf.union(v, d)
            print("smashed", v, d)

    print(len(list(uf.components())))
def sample_config(config, eta, N, no_colors, sites, param_name, curr_params, uf=None, cluster_constraints=None):
    """Sample a color configuration given the auxiliary variables in ``eta``.

    Reads the module-level names ``only_averages``, ``prt`` (verbose printing),
    ``S``, ``bell_dict`` and ``session`` in addition to its arguments.

    Steps visible in this function:

    1. When ``uf`` is None and ``only_averages`` is false, a ``UnionFind`` over
       ``sites`` is built from ``eta[1]``: an entry equal to 0 joins site
       ``(i, j)`` with ``(i + 1, j)`` for ``e == 0`` or with ``(i, j + 1)`` for
       ``e == 1``; entries equal to -1 are skipped. For each resulting cluster
       the smallest value of ``eta[2]`` over its sites is recorded in
       ``cluster_constraints``, and ``cls`` labels every site with its cluster
       number.
    2. If ``curr_params['alpha'] == 0`` and ``curr_params['gamma'] > 0``, the
       number of colors ``exact_k`` is drawn using ``P_exact_cols``; then a
       partition of the clusters is drawn (via ``bell_dict`` / ``session``)
       and each cluster is painted into ``config``. With ``only_averages`` set,
       an arbitrary partition is used instead and the function returns early.
    3. Otherwise ``brute_force_sample`` produces ``config``.

    :param config: array indexed as ``config[row, col]``; written in place in
        the first case, replaced by the helper's result in the second
    :param eta: indexable; ``eta[0]`` is used as the maximum number of colors,
        ``eta[1]`` as the edge array and ``eta[2]`` as the site array
    :param N: side length used for ``N * N`` and the final reshape when
        ``only_averages`` is set
    :param no_colors: total number of available colors
    :param sites: elements passed to ``UnionFind``
    :param param_name: not used in this function
    :param curr_params: mapping with at least ``'alpha'`` and ``'gamma'``
    :param uf: existing union-find structure, or None to build one
    :param cluster_constraints: existing constraints mapping, replaced when
        the clusters are rebuilt
    :return: ``(config, uf, cluster_constraints)``, or ``(config, None, None)``
        on the ``only_averages`` early return
    """
    if uf == None and not only_averages:
        '''Generate clusters from the assigned bonds (eta_edge)'''
        uf = UnionFind(sites)
        eta_edges = eta[1]
        for i in range(eta_edges.shape[0]):
            for j in range(eta_edges.shape[1]):
                for e in range(eta_edges.shape[2]):
                    # -1 entries are skipped; only 0 entries join two sites.
                    if eta_edges[i, j, e] == -1:
                        continue
                    if eta_edges[i, j, e] == 0:
                        if e == 0:
                            uf.union(site2str((i, j)), site2str((i + 1, j)))
                        elif e == 1:
                            uf.union(site2str((i, j)), site2str((i, j + 1)))
        '''For each cluster, find the site with strongest constraint (smallest eta_site) and assign that eta_site to the entire cluster '''
        eta_sites = eta[2]
        cluster_constraints = {}
        cl_n = 0
        cls = np.zeros((eta_sites.shape[0], eta_sites.shape[1]), dtype=np.uint16)  # up to 255x255 box
        for cluster in uf.components():
            cl_n += 1
            min_constraint = no_colors
            cluster_root = '-1,-1'
            for site_str in cluster:
                site = str2site(site_str)
                # "<=" means a later site with an equal constraint replaces the root.
                if eta_sites[site[0], site[1]] <= min_constraint:
                    min_constraint = eta_sites[site[0], site[1]]
                    cluster_root = site_str
                cls[site[0], site[1]] = cl_n
            # NOTE(review): the constraint is stored under two keys per cluster
            # (the root site string and the integer cluster number). The coloring
            # loop further down iterates over every key and passes it to
            # uf.component() - confirm that the integer keys are intended there.
            cluster_constraints[cluster_root] = min_constraint
            cluster_constraints[cl_n] = min_constraint
        if prt: print('clusters formed by bonds (eta_edge):')
        if prt: print(cls)
    # Case with no field and gamma > 0
    if (curr_params['alpha'] == 0 and curr_params['gamma'] > 0):
        if prt: print('Case with no field and gamma > 0')
        '''Choose exactly how many colors to use in the configuration'''
        max_colors = eta[0]  # eta_lambda
        prob_k = []
        if only_averages:
            no_cl = N * N
        else:
            no_cl = len(uf.components())
        for k in range(1, max_colors + 1):
            prob_k.append(P_exact_cols(no_colors, k, no_cl, S))
        if prt: print('prob_k:', prob_k)
        # Normalize so the weights can be used as probabilities.
        prob_k = [pk / sum(prob_k) for pk in prob_k]
        prob_k = np.array(prob_k)
        if prt: print('prob_k:', prob_k)
        exact_k = np.random.choice((np.arange(1, max_colors + 1)), p=prob_k)
        if prt: print('exact number of colors to use in configuration:')
        if prt: print(exact_k)
        if only_averages:
            '''Only compute average in case with zero field and no interaction, but with gamma > 0, use an arbitray partition (avoid computing Bell polynomials) '''
            chosen_colors = range(1, exact_k + 1)
            if exact_k == no_colors:
                chosen_partition = [exact_k]
            else:
                chosen_partition = [exact_k - 1] + [0] * max(0, no_colors - exact_k - 1) + [1]
            # Expand the count vector: block size (i + 1) repeated chosen_partition[i] times.
            part = []
            for i in range(len(chosen_partition)):
                part += ([i + 1] * chosen_partition[i])
            chosen_partition = part
            if prt: print('chosen partition:')
            if prt: print(chosen_partition)
            color_arr = []
            for i in range(len(chosen_partition)):
                color_arr += [chosen_colors[i]] * chosen_partition[i]
            # The reshape requires the block sizes to add up to N * N.
            config = np.array(color_arr).reshape((N, N))
            return config, None, None
        '''Sample a partition of the no. of clusters into k blocks using Bell polynomials'''
        partition_dict = {}
        # bell_dict caches the result per (no_cl, exact_k) so session.evaluate
        # runs only once for each pair.
        if ((no_cl, exact_k) in bell_dict):
            partition_dict = bell_dict[(no_cl, exact_k)]
            if prt: print("Bell found", (no_cl, exact_k))
        else:
            session.evaluate("subs = Array[x," + str(no_cl - exact_k + 1) + "]")
            partition_dict = session.evaluate("Association@CoefficientRules[BellY[" + str(no_cl) + ", " + str(exact_k) + ", subs], subs]")
            bell_dict[(no_cl, exact_k)] = partition_dict
            if prt: print("Bell computed", (no_cl, exact_k))
        if prt: print("partition_dict", partition_dict)
        partitions = []
        partition_p = []
        for partition in partition_dict:
            partitions.append(partition)
            partition_p.append(partition_dict[partition])
        if prt: print('partition_p:', partition_p)
        partition_p = [pk / sum(partition_p) for pk in partition_p]
        # NOTE(review): "parition_p" (sic) is assigned here but never read in this
        # function; the plain list partition_p is what np.random.choice receives.
        parition_p = np.array(partition_p)
        if prt: print('partition_p:', partition_p)
        chosen_partition = partitions[np.random.choice((np.arange(0, len(partitions))), p=partition_p)]
        if prt: print('chosen partition:')
        if prt: print(chosen_partition)
        # Transform to actual partition
        part = []
        for i in range(len(chosen_partition)):
            part += ([i + 1] * chosen_partition[i])
        chosen_partition = part
        if prt: print('chosen partition:')
        if prt: print(chosen_partition)
        '''Color each block in the partition randomly without replacement'''
        # Choose the colors to be used
        chosen_colors = np.random.choice((np.arange(1, no_colors + 1)), len(chosen_partition), replace=False)
        # Choose a random permutation of the given word
        color_arr = []
        for i in range(len(chosen_partition)):
            color_arr += [chosen_colors[i]] * chosen_partition[i]
        color_arr = np.array(color_arr)
        color_arr = np.random.permutation(color_arr)
        if prt: print('colors for clusters:')
        if prt: print(color_arr)
        '''Color each cluster with the assigned color'''
        i = 0
        for root in cluster_constraints:
            cluster_color = color_arr[i]
            for site_str in uf.component(root):
                site = str2site(site_str)
                config[site[0], site[1]] = cluster_color
            i += 1
    # Case with field and gamma > 0 or case with gamma = 0
    else:
        '''Randomly sample a color for each cluster'''
        if prt: print('Case with field and gamma > 0 or case with gamma = 0')
        # NOTE(review): cls is only assigned in the cluster-building branch at the
        # top; when the caller supplies uf it is not bound inside this function -
        # confirm it is available from an enclosing scope.
        config = brute_force_sample(cluster_constraints, cls, uf, config)
    return config, uf, cluster_constraints
def remove_duplicates_instance_to_mask(mask, class_ids, score, PX_TH=20, SC_TH=0.3):
    """Collapse near-duplicate instance masks and convert the rest to one mask.

    Pairwise IoU is computed on a 5x-subsampled copy of ``mask``. Two instances
    of the same class whose IoU exceeds 0.8 are put in the same union-find
    group. Instances that take part in no such pair are kept as they are; from
    every group of two or more instances only the highest-scoring member is
    kept. Surviving instances must have more than ``PX_TH`` pixels and a score
    above ``SC_TH``; they are ordered by ascending score, passed through
    ``remove_disconnected`` and handed to ``instance_to_mask``.

    :param mask: array indexed as ``mask[row, col, instance]``
    :param class_ids: per-instance class ids (indexable by a list of indices)
    :param score: per-instance scores (indexable by a list of indices)
    :param PX_TH: minimum pixel count (exclusive) for an instance to be kept
    :param SC_TH: minimum score (exclusive) for an instance to be kept
    :return: the pair returned by ``instance_to_mask``, or ``(None, None)``
        when no instance passes both thresholds
    """
    n_instances = mask.shape[2]
    coarse = mask[::5, ::5, :]
    iou_matrix = compute_iou_masksets_partial(coarse, coarse)

    # Group instances of the same class that overlap heavily.
    uf = UnionFind(list(range(n_instances)))
    duplicated = []
    for a in range(n_instances):
        for b in range(n_instances):
            if a != b and iou_matrix[a, b] > 0.8 and class_ids[a] == class_ids[b]:
                uf.union(a, b)
                duplicated.extend((a, b))
    duplicated = np.unique(duplicated)

    # Instances never involved in a duplicate pair pass through untouched.
    keep = [idx for idx in range(iou_matrix.shape[0]) if idx not in duplicated]
    mask_instance_new = mask[:, :, keep]
    class_ids_new = list(class_ids[keep])
    score_new = list(score[keep])

    merged_sets = [group for group in uf.components() if len(group) >= 2]
    mask_instance_merged = np.zeros(
        [mask.shape[0], mask.shape[1], len(merged_sets)], dtype=bool)
    for slot, group in enumerate(merged_sets):
        members = list(group)
        member_scores = [score[m] for m in members]
        # The best-scoring member represents the whole group.
        winner = np.argmax(np.array(member_scores))
        mask_instance_merged[:, :, slot] = mask[:, :, members[winner]]
        class_ids_new.append(class_ids[members[winner]])
        score_new.append(member_scores[winner])

    mask_instance_new = np.dstack((mask_instance_new, mask_instance_merged))
    class_ids_pred = np.array(class_ids_new)
    scores_pred = np.array(score_new)

    n_px_per_instance = np.sum(mask_instance_new, axis=(0, 1))
    big_enough = n_px_per_instance > PX_TH
    confident = scores_pred > SC_TH
    instance_keep = np.where(np.logical_and(big_enough, confident))[0]
    if len(instance_keep) == 0:
        return None, None

    # Ascending score order among the survivors.
    instance_reorder = instance_keep[np.argsort(scores_pred[instance_keep])]
    score_reorder = scores_pred[instance_reorder]
    class_ids_reorder = class_ids_pred[instance_reorder]
    mask_reorder = remove_disconnected(mask_instance_new[:, :, instance_reorder])

    merged_mask, instance_score = instance_to_mask(mask_reorder,
                                                   class_ids_reorder,
                                                   score_reorder,
                                                   order_by_score=False)
    return merged_mask, instance_score
class GMM:
    """Gaussian mixture fitted by EM over "chunklets" of must-linked points.

    Points joined (transitively) by must-link pairs form a chunklet, and every
    chunklet receives a single responsibility vector over the clusters.
    Variances are diagonal: ``sigma[j]`` holds one variance per feature.

    Relies on names provided by the surrounding module: ``UnionFind``, ``mn``
    (used as ``mn.pdf(x, mean, cov)``; presumably
    ``scipy.stats.multivariate_normal`` - confirm) and ``silhouette_score``.
    """

    def __init__(self, n_clusters, n_steps, eps=1e-20, tol=1e-2):
        """
        :param n_clusters: number of mixture components
        :param n_steps: maximum number of EM iterations
        :param eps: small constant added inside logs and to denominators
        :param tol: stop when the tracked log-likelihood changes by less than
            this between two consecutive iterations
        """
        self.n_clusters = n_clusters
        self.n_steps = n_steps
        self.eps = eps
        self.tol = tol

    def _initialize(self):
        """
        Initializes self.alpha, self.mu, self.sigma, self.chunklet_w
        """
        self.alpha = np.ones((self.n_clusters)) / self.n_clusters
        picks = np.random.choice(np.arange(self.n), self.n_clusters)
        # Cast to float so later mean updates are not truncated when X holds
        # integers (mu would otherwise inherit X's dtype).
        self.mu = self.X[picks].astype(float)
        # Using diagonal variance
        self.sigma = np.ones((self.n_clusters, self.d))
        self.chunklet_w = np.zeros((self.n_chunklets, self.n_clusters))

    def _transitive_closure(self):
        """Build the chunklets (connected components of the must-link pairs)
        and cache their sizes and per-chunklet feature means."""
        self.uf = UnionFind(np.arange(self.n))
        for link in self.ml:
            self.uf.union(link[0], link[1])

        members = [np.array(list(c)) for c in self.uf.components()]
        # Chunklets usually differ in length; an explicit object array avoids
        # the ragged-array ValueError that np.array(...) raises on
        # NumPy >= 1.24. Elements are assigned one by one so equal-length
        # chunklets are not stacked into a 2-D array either.
        self.chunklets = np.empty(len(members), dtype=object)
        for idx, chunk in enumerate(members):
            self.chunklets[idx] = chunk
        self.n_chunklets = self.chunklets.shape[0]

        self.chunklet_shapes = np.array([c.shape[0] for c in self.chunklets])
        self.chunklet_shapes = self.chunklet_shapes.reshape(-1, 1)
        self.chunklet_means = np.array(
            [np.mean(self.X[c], axis=0) for c in self.chunklets])
        assert self.chunklet_means.shape == (self.n_chunklets, self.d)

    def fit(self, X, ml):
        """Run EM on ``X`` with must-link pairs ``ml``.

        :param X: 2-D array of shape ``(n, d)``
        :param ml: must-link pairs; each item is indexed as ``link[0]``,
            ``link[1]`` and must provide ``copy()`` as a whole
        :return: None; per-step results are stored in ``self.scores`` and
            ``self.lls`` and progress is printed
        """
        self.n = X.shape[0]
        self.d = X.shape[1]
        self.X = X.copy()
        self.ml = ml.copy()
        self._transitive_closure()
        self._initialize()
        self.scores = []
        self.lls = []
        for step in range(self.n_steps):
            self.e_step()
            self.m_step()
            self.scores.append(self.score())
            self.lls.append(self.ll)
            print(f"Step {step+1} :: LL {self.ll} :: Score {self.scores[-1]}")
            if len(self.lls) >= 2 and np.abs(self.lls[-1] - self.lls[-2]) < self.tol:
                print("Converged")
                break

    def get_labels(self):
        """Return one integer cluster label per point: every point takes the
        arg-max cluster of the chunklet it belongs to."""
        # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
        chunk_labels = np.argmax(self.chunklet_w, axis=1).astype(int)
        labels = np.zeros(self.n, dtype=int)
        for i, chunk in enumerate(self.chunklets):
            labels[chunk] = chunk_labels[i]
        return labels

    def llhood(self):
        """Return the responsibility-weighted log-likelihood for the current
        parameters and the current ``chunklet_w``."""
        ll = 0
        for i, chunklet in enumerate(self.chunklets):
            for j in range(self.n_clusters):
                point_pdf = mn.pdf(
                    self.X[chunklet], self.mu[j], np.diag(self.sigma[j]))
                weight = self.chunklet_w[i, j]
                ll += np.sum(np.log(point_pdf + self.eps), axis=0) * weight
                ll += np.log(self.alpha[j] + self.eps) * weight
        return ll

    def e_step(self):
        """Update ``chunklet_w`` and set ``self.ll``.

        ``self.ll`` is accumulated with the responsibilities from *before*
        this update, i.e. each weight is read before it is overwritten.
        """
        self.ll = 0
        for i, chunklet in enumerate(self.chunklets):
            denominator = 0
            for j in range(self.n_clusters):
                point_pdf = mn.pdf(
                    self.X[chunklet], self.mu[j], np.diag(self.sigma[j]))
                weight = self.chunklet_w[i, j]
                self.ll += np.sum(np.log(point_pdf + self.eps), axis=0) * weight
                self.ll += np.log(self.alpha[j] + self.eps) * weight
                # A chunklet's likelihood under cluster j is the product over
                # its points, times the mixing weight.
                joint = np.prod(point_pdf, axis=0) * self.alpha[j]
                denominator += joint
                self.chunklet_w[i, j] = joint
            self.chunklet_w[i, :] /= (denominator + self.eps)

    def m_step(self):
        """Re-estimate ``alpha``, ``mu`` and ``sigma`` from ``chunklet_w``."""
        self.alpha = self.chunklet_w.sum(axis=0) / self.n_chunklets
        for j in range(self.n_clusters):
            # Responsibility of cluster j times chunklet size, per chunklet.
            numfrac = self.chunklet_w[:, j, np.newaxis] * self.chunklet_shapes
            den = np.sum(numfrac, axis=0, keepdims=True)
            self.mu[j] = np.sum(self.chunklet_means * numfrac, axis=0) / den

            diff_sq = (self.X - self.mu[j])**2
            temp_sigma = np.zeros((1, self.d))
            for i in range(self.n_chunklets):
                # mean squared deviation inside chunklet i, weighted by numfrac
                signew = np.sum(diff_sq[self.chunklets[i]], axis=0, keepdims=True)
                signew /= self.chunklet_shapes[i]
                temp_sigma += signew * numfrac[i]
            self.sigma[j] = temp_sigma / den

    def score(self):
        """Return ``silhouette_score`` of ``X`` under the current labels."""
        labels = self.get_labels()
        return silhouette_score(self.X, labels)
def gen_model(dataset_name, ignore_wo_url=False, ignore_replies=False):
    """Group tweets that share a URL or are linked by a reply.

    Every tweet is expanded into ids of the form ``{tweet_id}_{i}`` (one per
    expanded URL, or ``{tweet_id}_0`` when it has none and ``ignore_wo_url``
    is false). Each id is then paired with its URL and, unless
    ``ignore_replies`` is set, with the matching id of the tweet it replies
    to. The pairs are merged with union-find.

    :param dataset_name: passed to ``load_data``
    :param ignore_wo_url: skip tweets that have no expanded URLs
    :param ignore_replies: do not link tweets to the tweets they reply to
    :return: dict with ``'components'`` (the union-find components) and
        ``'event_data'`` (the loaded tweets)
    """
    event_data, missing_urls_amount = load_data(dataset_name)

    # ----------------------------------------------------------------------
    # Build the set of tweet_ids. For a given tweet t:
    #   - without urls: add {t.id}_0
    #   - for each url_i in t: add {t.id}_{i}
    #   - for each url_i in t: add {t.reply_id}_{i}
    # ----------------------------------------------------------------------
    tweet_ids = set()
    logging.info("create list of tweet_ids")
    for tweet_id, tweet in tqdm(event_data.items(), total=len(event_data)):
        if tweet.expanded_urls:
            own_ids = [f'{tweet_id}_{k}'
                       for k, _ in enumerate(tweet.expanded_urls.values())]
        elif not ignore_wo_url:
            own_ids = [f'{tweet_id}_0']
        else:
            own_ids = []
        tweet_ids.update(own_ids)

        # The replied-to tweet is only expanded when this tweet added ids itself.
        if not own_ids or tweet.reply_id == 'NULL':
            continue
        if tweet.reply_id in event_data and not ignore_replies:
            tweet_ids.update(
                f'{tweet.reply_id}_{k}'
                for k, _ in enumerate(tweet.expanded_urls.values()))

    # ----------------------------------------------------------------------
    # Emit one pair per (tweet_id, url) and per (tweet_id, replied tweet_id).
    # ----------------------------------------------------------------------
    logging.info(
        "create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'"
    )
    replies_amount = 0
    retweets_amount = 0
    quotes_amount = 0
    missing_replies_amount = 0
    pairs = []
    for tweet_id in tweet_ids:
        parts = tweet_id.split('_')
        source_id = parts[0]
        i = int(parts[1])
        tweet = event_data[source_id]

        url = tweet.expanded_urls.get(i)
        if url:
            pairs.append((tweet_id, url))

        # retweets ARE considered, since they are exact text copies of the
        # retweeted tweet
        if tweet.retweet_id != 'NULL':
            retweets_amount += 1
        if tweet.quote_id != 'NULL':
            quotes_amount += 1

        if tweet.reply_id == 'NULL':
            continue
        replies_amount += 1
        if tweet.reply_id not in event_data:
            missing_replies_amount += 1
        elif not ignore_replies:
            # TODO: is this correct?
            pairs.append((tweet_id, f'{tweet.reply_id}_{i}'))

    logging.info(
        f'total pairs: {len(pairs)}, retweets: {retweets_amount}, quotes: {quotes_amount}, replies: {replies_amount} '
        f'(missing: {missing_replies_amount}, missing urls: {missing_urls_amount})'
    )

    # All keys must be of the same type (here: strings). unionfind vectorizes
    # its operations and casts everything in the array to one type, so with a
    # mix of integers and strings everything becomes a string and comparisons
    # fail when uf.components() is called.
    logging.info('applying union-find')
    uf = UnionFind()
    for left, right in pairs:
        uf.union(left, right)
    logging.info(f'total components: {len(uf.components())}')
    logging.info('\n')

    return {'components': uf.components(), 'event_data': event_data}