Example #1
def ego_network_clustering(neighbors_fpath,
                           clusters_fpath,
                           max_related=300,
                           num_cores=32):
    # Keep the graph in module-level globals so that forked Pool workers
    # (running get_ego_network) can read it without per-task pickling.
    global G
    global n
    G = CRSGraph(neighbors_fpath)

    with codecs.open(clusters_fpath, "w",
                     "utf-8") as output, Pool(num_cores) as pool:
        output.write("word\tcid\tcluster\tisas\n")

        for i, ego_network in enumerate(
                pool.imap_unordered(get_ego_network, G.index)):
            if i % 1000 == 0:
                print(i, "ego networks processed")
            sense_num = 1
            for label, cluster in sorted(
                    aggregate_clusters(ego_network).items(),
                    key=lambda e: len(e[1]),
                    reverse=True):
                # Sort cluster nodes by weight (descending) and render them as
                # "node:weight" pairs.
                scored_nodes = sorted(
                    ((ego_network.nodes[c_node]["weight"] / WEIGHT_COEF, c_node)
                     for c_node in cluster),
                    reverse=True)
                cluster_str = ", ".join(
                    "{}:{:.4f}".format(node, weight) for weight, node in scored_nodes)
                output.write("{}\t{}\t{}\t\n".format(
                    ego_network.name, sense_num, cluster_str))
                sense_num += 1
    print("Clusters:", clusters_fpath)
Example #2
def main():
    """Entry point for the Chinese Whispers command-line interface."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--weighting', choices=WEIGHTING.keys(), default='lin')
    parser.add_argument('--delimiter', default='\t')
    parser.add_argument('--iterations', type=int, default=20)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--version',
                        action='version',
                        version='Chinese Whispers v' + version)
    parser.add_argument('edges', type=argparse.FileType('r', encoding='UTF-8'))
    args = parser.parse_args()

    lines = (line.rstrip() for line in args.edges)

    # noinspection PyPep8Naming
    G = nx.parse_edgelist(lines,
                          delimiter=args.delimiter,
                          comments='\n',
                          data=[('weight', float)])

    chinese_whispers(G, args.weighting, args.iterations, args.seed)

    for label, elements in aggregate_clusters(G).items():
        label = str(label)
        length = str(len(elements))
        elements = ', '.join(elements)
        print('\t'.join((label, length, elements)))
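The positional edges argument above is a plain tab-separated edge list, one node-node-weight triple per line, exactly as the nx.parse_edgelist call expects. A hedged sketch of driving the same pipeline programmatically, with made-up edge data:

import networkx as nx
from chinese_whispers import chinese_whispers, aggregate_clusters

# Same format the CLI reads from the `edges` file.
lines = [
    "apple\tbanana\t1.0",
    "banana\tcherry\t2.0",
    "table\tchair\t3.0",
]

G = nx.parse_edgelist(lines, delimiter="\t", comments="\n",
                      data=[("weight", float)])
chinese_whispers(G, "lin", iterations=20, seed=0)

for label, elements in aggregate_clusters(G).items():
    print(label, len(elements), ", ".join(elements))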
Example #3
    def create_g_cluster(self, word_pos):
        words = self.top_k(word_pos)[1:]

        if self.cluster_type < 4:
            pairs = self.gen_pairs(words)
            G = nx.Graph()
            G.add_weighted_edges_from(pairs)

        if self.cluster_type == 3:
            # Keep only the largest connected component
            # (connected_component_subgraphs was removed in networkx 2.4).
            G = G.subgraph(max(nx.connected_components(G), key=len)).copy()
            print('len_strip(G)', len(G))

        if self.cluster_type == 1:
            from networkx.algorithms.community import greedy_modularity_communities
            clusters = list(greedy_modularity_communities(G))
        elif self.cluster_type == 2:
            from chinese_whispers import chinese_whispers, aggregate_clusters
            chinese_whispers(G, iterations=20, weighting='log', seed=13)  # top, nolog, log
            clusters = aggregate_clusters(G).values()
        elif self.cluster_type == 3:
            from networkx.algorithms.community import asyn_fluidc
            if self.is_k_depends_g:
                clusters = list(asyn_fluidc(G, k=self.k - int((self.k - 8) * ((200 - len(G)) / 100))))
            else:
                clusters = list(asyn_fluidc(G, k=min(self.k, len(G))))
        elif self.cluster_type == 4:
            from collections import defaultdict
            from sklearn.cluster import KMeans

            # Embed every candidate word so labels line up with the zip below.
            X = [sg.emb(w) for w in words]
            clusters = defaultdict(list)

            kmeans = KMeans(n_clusters=self.k, random_state=13)
            assigned_clusters = kmeans.fit_predict(X)

            for cl, w in zip(assigned_clusters, words):
                clusters[cl].append(w)
            clusters = list(clusters.values())
        elif self.cluster_type == 5:
            from collections import defaultdict
            from sklearn.cluster import DBSCAN

            # Embed every candidate word so labels line up with the zip below.
            X = [sg.emb(w) for w in words]
            clusters = defaultdict(list)

            dbscan = DBSCAN(metric='l2', eps=self.min_dist_dbscan, min_samples=self.min_clust)
            assigned_clusters = dbscan.fit_predict(X)

            for cl, w in zip(assigned_clusters, words):
                clusters[cl].append(w)
            clusters = list(clusters.values())
        else:
            raise ValueError('unknown cluster type: {}'.format(self.cluster_type))

        if self.debug:
            print('Cluster ID\tCluster Elements\n')
            for i, cluster in enumerate(sorted(clusters, key=len, reverse=True)):
                print('{}\t{}\n'.format(i, cluster))
        print(word_pos, 'clusters', len(clusters))

        return clusters
Example #4
def get_cluster_lines(G, nodes):
    lines = []
    labels_clusters = sorted(aggregate_clusters(G).items(), key=lambda e: len(e[1]), reverse=True)
    for label, cluster in labels_clusters:
        scored_words = []
        for word in cluster:
            scored_words.append((nodes[word], word))
        # The highest-scored word in the cluster becomes its keyword.
        keyword = max(scored_words)[1]

        lines.append("{}\t{}\t{}\t{}\n".format(G.name, label, keyword, ", ".join(cluster)))

    return lines
Example #5
    def _get_cluster_lines_(graph, nodes):
        """Writes clusters into a csv-file line."""
        lines = []
        labels_clusters = sorted(aggregate_clusters(graph).items(),
                                 key=lambda e: len(e[1]),
                                 reverse=True)
        for label, cluster in labels_clusters:
            scored_words = []
            for word in cluster:
                scored_words.append((nodes[word], word))
            keyword = sorted(scored_words, reverse=True)[0][1]

            lines.append("{}\t{}\t{}\t{}\n".format(graph.name, label, keyword,
                                                   ", ".join(cluster)))
        return lines
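Both helpers above assume that nodes maps each word to a numeric score (the highest-scored word becomes the cluster keyword) and that the graph carries a name attribute. A hedged usage sketch for Example #4's get_cluster_lines, with invented words, scores, and edges:

import networkx as nx
from chinese_whispers import chinese_whispers

# Invented scores and similarity edges for neighbours of the word "bank".
nodes = {"river": 3.0, "shore": 2.0, "money": 5.0, "credit": 4.0}

G = nx.Graph(name="bank")
G.add_weighted_edges_from([
    ("river", "shore", 1.0),
    ("money", "credit", 1.0),
])

chinese_whispers(G, weighting="top", iterations=20)

# get_cluster_lines is assumed to be in scope, defined as in Example #4.
for line in get_cluster_lines(G, nodes):
    print(line, end="")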
Example #6
                words[index2triple[j]] = float(d)

        for target, distance in words.most_common(args.neighbors):
            G.add_edge(source, target, weight=distance)
            maximal_distance = max(maximal_distance, distance)

# Invert the distances so that closer (smaller-distance) pairs get larger weights.
for _, _, d in G.edges(data=True):
    d['weight'] = maximal_distance / d['weight']

if args.pickle is not None:
    import pickle

    pickle.dump(list(G.edges(data=True)), args.pickle, protocol=3)
    sys.exit(0)

chinese_whispers(G, weighting='top', iterations=20)
clusters = aggregate_clusters(G)

for label, cluster in sorted(clusters.items(),
                             key=lambda e: len(e[1]),
                             reverse=True):
    print('# Cluster %d\n' % label)

    subjects = {subject for subject, _, _ in cluster}
    predicates = {predicate for _, predicate, _ in cluster}
    objects = {obj for _, _, obj in cluster}

    print('Predicates: %s' % ', '.join(predicates))
    print('Subjects: %s' % ', '.join(subjects))
    print('Objects: %s\n' % ', '.join(objects))
Example #7
def apply_distributional_semantics(nx_graph,
                                   taxonomy,
                                   domain,
                                   mode,
                                   exclude_parent,
                                   exclude_family,
                                   new_nodes=None):
    # Avoid the mutable default argument pitfall.
    if new_nodes is None:
        new_nodes = []

    # Load the pre-trained vectors
    print('Loading embeddings...')
    poincare_w2v, own_w2v = load_vectors()
    print('Loaded.')

    print('\n\nApplying distributional semantics...')
    output_dir = 'out'
    g_improved = nx_graph.copy()

    if mode == 'ds':
        print('\nReattaching new nodes...')
        g_cluster = create_children_clusters(own_w2v, g_improved)
        count = 0
        for node in new_nodes:
            max_score = 0
            max_score_node = ''
            for p_node, graph in g_cluster.items():
                gc = chinese_whispers(graph, weighting='top', iterations=60)
                for _, family in aggregate_clusters(gc).items():
                    score = calculate_similarity(poincare_w2v, own_w2v, p_node,
                                                 family, node, exclude_parent,
                                                 exclude_family)
                    if score > max_score:
                        max_score = score
                        max_score_node = p_node
            # Count the nodes for which no attachment candidate was found.
            if max_score_node == '':
                count += 1
            g_improved.add_edge(max_score_node, node)
        print('Done.')
        print(count)
    # elif mode == 'root':
    #     root = domain.split('_')[0]
    #     for node in new_nodes:
    #         g_improved.add_edge(root, node)

    # Tune the result
    g_improved = tune_result(g_improved)
    print('Tuned.')

    # Save the results after each iteration and display the F1 score
    output_path = save_result(g_improved, taxonomy)

    # Prune and clean the generated taxonomy
    pruned_output = graph_pruning(output_path, output_dir, domain)

    # Display the F1 score for the generated taxonomy
    scores = calculate_f1_score(pruned_output, output_dir, domain)

    # Write the scores of each iteration in a CSV file
    with open(
            os.path.join(output_dir, os.path.basename(taxonomy)) +
            '-iter-records.csv', 'w') as f:
        f.write('precision,recall,f1,f_m\n')
        f.write('{precision},{recall},{f1},{f_m}\n'.format(
            precision=scores['precision'],
            recall=scores['recall'],
            f1=scores['f1'],
            f_m=scores['f_m']))