def ego_network_clustering(neighbors_fpath, clusters_fpath, max_related=300, num_cores=32):
    """Writes one sense cluster per line (word, cid, cluster, isas) for every
    ego network built from the nearest-neighbours graph at ``neighbors_fpath``."""
    global G
    global n

    G = CRSGraph(neighbors_fpath)

    with codecs.open(clusters_fpath, "w", "utf-8") as output, Pool(num_cores) as pool:
        output.write("word\tcid\tcluster\tisas\n")

        for i, ego_network in enumerate(pool.imap_unordered(get_ego_network, G.index)):
            if i % 1000 == 0:
                print(i, "ego networks processed")

            sense_num = 1
            for label, cluster in sorted(aggregate_clusters(ego_network).items(),
                                         key=lambda e: len(e[1]), reverse=True):
                # Each sense is written as "neighbour:weight, neighbour:weight, ...",
                # sorted by descending weight.
                output.write("{}\t{}\t{}\t\n".format(
                    ego_network.name,
                    sense_num,
                    ", ".join(["{}:{:.4f}".format(n, w) for w, n in sorted(
                        [(ego_network.nodes[c_node]["weight"] / WEIGHT_COEF, c_node)
                         for c_node in cluster],
                        reverse=True)])))
                sense_num += 1

    print("Clusters:", clusters_fpath)
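# A minimal sketch (not part of the original code) of reading back the TSV that
# ego_network_clustering writes; the file name 'clusters.tsv' is hypothetical.
import csv

with open('clusters.tsv', encoding='utf-8') as f:
    reader = csv.DictReader(f, delimiter='\t')  # columns: word, cid, cluster, isas
    for row in reader:
        senses = [item.rsplit(':', 1) for item in row['cluster'].split(', ') if item]
        print(row['word'], row['cid'], len(senses))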
def main():
    """Entry point for the Chinese Whispers command-line interface."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--weighting', choices=WEIGHTING.keys(), default='lin')
    parser.add_argument('--delimiter', default='\t')
    parser.add_argument('--iterations', type=int, default=20)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--version', action='version', version='Chinese Whispers v' + version)
    parser.add_argument('edges', type=argparse.FileType('r', encoding='UTF-8'))
    args = parser.parse_args()

    lines = (line.rstrip() for line in args.edges)

    # noinspection PyPep8Naming
    G = nx.parse_edgelist(lines, delimiter=args.delimiter, comments='\n',
                          data=[('weight', float)])

    chinese_whispers(G, args.weighting, args.iterations, args.seed)

    for label, elements in aggregate_clusters(G).items():
        label = str(label)
        length = str(len(elements))
        elements = ', '.join(elements)
        print('\t'.join((label, length, elements)))
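# A hedged sketch (not from the original module) of what `main` consumes and
# produces: a delimiter-separated "source, target, weight" edge list, clustered
# with Chinese Whispers. `_cli_demo` and the literal edges are made up for
# illustration only.
def _cli_demo():
    import networkx as nx
    from chinese_whispers import chinese_whispers, aggregate_clusters

    G = nx.Graph()
    G.add_weighted_edges_from([('java', 'ruby', 3.0), ('java', 'perl', 2.0),
                               ('ruby', 'perl', 2.5), ('cobra', 'viper', 4.0)])
    chinese_whispers(G, weighting='lin', iterations=20, seed=0)
    for label, elements in aggregate_clusters(G).items():
        print('\t'.join((str(label), str(len(elements)), ', '.join(elements))))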
def create_g_cluster(self, word_pos):
    words = self.top_k(word_pos)[1:]

    if self.cluster_type < 4:
        pairs = self.gen_pairs(words)
        G = nx.Graph()
        G.add_weighted_edges_from(pairs)

        if self.cluster_type == 3:
            # Keep only the largest connected component. Note that
            # connected_component_subgraphs was removed in NetworkX 2.4; on newer
            # versions use G.subgraph(max(nx.connected_components(G), key=len)).
            G = max(nx.connected_component_subgraphs(G), key=len)
            print('len_strip(G)', len(G))

    if self.cluster_type == 1:
        from networkx.algorithms.community import greedy_modularity_communities
        clusters = list(greedy_modularity_communities(G))
    elif self.cluster_type == 2:
        from chinese_whispers import chinese_whispers, aggregate_clusters
        chinese_whispers(G, iterations=20, weighting='log', seed=13)  # top, nolog, log
        clusters = aggregate_clusters(G).values()
    elif self.cluster_type == 3:
        from networkx.algorithms.community import asyn_fluidc
        if self.is_k_depends_g:
            clusters = list(asyn_fluidc(G, k=self.k - int((self.k - 8) * ((200 - len(G)) / 100))))
        else:
            clusters = list(asyn_fluidc(G, k=min(self.k, len(G))))
    elif self.cluster_type == 4:
        from collections import defaultdict
        from sklearn.cluster import KMeans

        X = [sg.emb(_) for _ in words[1:]]
        clusters = defaultdict(list)
        kmeans = KMeans(n_clusters=self.k, random_state=13)
        assigned_clusters = kmeans.fit_predict(X)
        for cl, w in zip(assigned_clusters, words):
            clusters[cl].append(w)
        clusters = list(clusters.values())
    elif self.cluster_type == 5:
        from collections import defaultdict
        from sklearn.cluster import DBSCAN

        X = [sg.emb(_) for _ in words[1:]]
        clusters = defaultdict(list)
        dbscan = DBSCAN(metric='l2', eps=self.min_dist_dbscan, min_samples=self.min_clust)
        assigned_clusters = dbscan.fit_predict(X)
        for cl, w in zip(assigned_clusters, words):
            clusters[cl].append(w)
        clusters = list(clusters.values())
    else:
        raise Exception('no cluster type', self.cluster_type)

    if self.debug:
        print('Cluster ID\tCluster Elements\n')
        for i, cluster in enumerate(sorted(clusters, key=lambda e: len(e), reverse=True)):
            print('{}\t{}\n'.format(i, cluster))
        print(word_pos, 'clusters', len(clusters))

    return clusters
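# Standalone sketch of the cluster_type == 2 branch above (Chinese Whispers on a
# word-similarity graph); the (word, word, similarity) pairs are toy data made up
# for illustration, not output of self.gen_pairs.
def _cw_branch_demo():
    import networkx as nx
    from chinese_whispers import chinese_whispers, aggregate_clusters

    pairs = [('bank', 'river', 0.7), ('bank', 'shore', 0.6), ('river', 'shore', 0.8),
             ('bank', 'money', 0.9), ('money', 'credit', 0.8)]
    G = nx.Graph()
    G.add_weighted_edges_from(pairs)
    chinese_whispers(G, iterations=20, weighting='log', seed=13)
    for label, cluster in aggregate_clusters(G).items():
        print(label, sorted(cluster))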
def get_cluster_lines(G, nodes):
    lines = []
    labels_clusters = sorted(aggregate_clusters(G).items(), key=lambda e: len(e[1]), reverse=True)
    for label, cluster in labels_clusters:
        scored_words = []
        for word in cluster:
            scored_words.append((nodes[word], word))
        keyword = sorted(scored_words, reverse=True)[0][1]

        lines.append("{}\t{}\t{}\t{}\n".format(G.name, label, keyword, ", ".join(cluster)))
    return lines
def _get_cluster_lines_(graph, nodes):
    """Formats each cluster of ``graph`` as a tab-separated line:
    graph name, cluster label, keyword, and the cluster members."""
    lines = []
    labels_clusters = sorted(aggregate_clusters(graph).items(), key=lambda e: len(e[1]), reverse=True)
    for label, cluster in labels_clusters:
        scored_words = []
        for word in cluster:
            scored_words.append((nodes[word], word))
        keyword = sorted(scored_words, reverse=True)[0][1]
        lines.append("{}\t{}\t{}\t{}\n".format(graph.name, label, keyword, ", ".join(cluster)))
    return lines
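# Minimal usage sketch for _get_cluster_lines_ on a toy graph (illustrative only):
# the graph, its name, and the degree-based scores are assumptions, not part of
# the original code.
def _cluster_lines_demo():
    import networkx as nx
    from chinese_whispers import chinese_whispers

    toy = nx.Graph(name='python')
    toy.add_weighted_edges_from([('java', 'ruby', 1.0), ('java', 'perl', 0.8),
                                 ('ruby', 'perl', 0.9), ('cobra', 'viper', 1.0)])
    chinese_whispers(toy, weighting='top', iterations=20, seed=42)

    scores = {word: toy.degree(word) for word in toy}  # score used to pick the keyword
    for line in _get_cluster_lines_(toy, scores):
        print(line, end='')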
# Fragment: the surrounding loops that fill ``words`` (a Counter of neighbour
# distances per ``source`` triple) are not shown.
        words[index2triple[j]] = float(d)

    for target, distance in words.most_common(args.neighbors):
        G.add_edge(source, target, weight=distance)
        maximal_distance = distance if distance > maximal_distance else maximal_distance

# Convert the accumulated distances into similarities: the largest distance
# becomes weight 1 and closer pairs receive larger weights.
for _, _, d in G.edges(data=True):
    d['weight'] = maximal_distance / d['weight']

if args.pickle is not None:
    import pickle
    pickle.dump(list(G.edges(data=True)), args.pickle, protocol=3)
    sys.exit(0)

chinese_whispers(G, weighting='top', iterations=20)

clusters = aggregate_clusters(G)

for label, cluster in sorted(clusters.items(), key=lambda e: len(e[1]), reverse=True):
    print('# Cluster %d\n' % label)

    subjects = {subject for subject, _, _ in cluster}
    predicates = {predicate for _, predicate, _ in cluster}
    objects = {object for _, _, object in cluster}

    print('Predicates: %s' % ', '.join(predicates))
    print('Subjects: %s' % ', '.join(subjects))
    print('Objects: %s\n' % ', '.join(objects))
def apply_distributional_semantics(nx_graph, taxonomy, domain, mode, exclude_parent, exclude_family, new_nodes=[]):
    # Load the pre-trained vectors
    print('Loading embeddings...')
    poincare_w2v, own_w2v = load_vectors()
    print('Loaded.')

    print('\n\nApplying distributional semantics...')
    output_dir = 'out'
    g_improved = nx_graph.copy()

    if mode == 'ds':
        print('\nReattaching new nodes...')
        g_cluster = create_children_clusters(own_w2v, g_improved)
        count = 0
        for node in new_nodes:
            max_score = 0
            max_score_node = ''
            for p_node, graph in g_cluster.items():
                gc = chinese_whispers(graph, weighting='top', iterations=60)
                for _, family in aggregate_clusters(gc).items():
                    score = calculate_similarity(poincare_w2v, own_w2v, p_node, family, node,
                                                 exclude_parent, exclude_family)
                    if score > max_score:
                        max_score = score
                        max_score_node = p_node
            if max_score_node == '':
                count += 1
            g_improved.add_edge(max_score_node, node)
        print('Done.')
        print(count)
    # elif mode == 'root':
    #     root = domain.split('_')[0]
    #     for node in new_nodes:
    #         g_improved.add_edge(root, node)

    # Tune the result
    g_improved = tune_result(g_improved)
    print('Tuned.')

    # Save the results after each iteration and display the F1 score
    output_path = save_result(g_improved, taxonomy)

    # Prune and clean the generated taxonomy
    pruned_output = graph_pruning(output_path, output_dir, domain)

    # Display the F1 score for the generated taxonomy
    scores = calculate_f1_score(pruned_output, output_dir, domain)

    # Write the scores of each iteration in a CSV file
    with open(os.path.join(output_dir, os.path.basename(taxonomy)) + '-iter-records.csv', 'w') as f:
        f.write('precision,recall,f1,f_m\n')
        f.write('{precision},{recall},{f1},{f_m}\n'.format(
            precision=scores['precision'], recall=scores['recall'],
            f1=scores['f1'], f_m=scores['f_m']))