Code Example #1
def main():
    parser = argparse.ArgumentParser(
        description=
        'creates a JSON docid->doctitle mapping file from a binary gensim document metadata file'
    )
    parser.add_argument(
        '--metadata',
        type=argparse.FileType('r'),
        help='path to input document metadata file (.metadata.cpickle)',
        required=True)
    parser.add_argument(
        '--titles',
        type=argparse.FileType('w'),
        help='path to output docid->doctitle mapping file (.json)',
        required=True)

    args = parser.parse_args()
    input_metadata_path = args.metadata.name
    output_titles_path = args.titles.name

    logger.info('loading metadata from {}'.format(input_metadata_path))
    with smart_open(input_metadata_path, "rb") as input_metadata_file:
        metadata = pickle.load(input_metadata_file)
    titles = metadata
    logger.info('saving metadata titles')
    save_data_to_json(titles, output_titles_path)
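The helper save_data_to_json used above is not part of this listing. A minimal sketch of what it might look like, assuming the output path is either a plain .json file or a bz2-compressed .json.bz2 file (the extensions used throughout the other examples):

import bz2
import json


def save_data_to_json(data, output_path):
    # hypothetical helper (not from the original project): dump data as JSON,
    # compressing transparently when the target path ends in ".bz2"
    if output_path.endswith('.bz2'):
        with bz2.open(output_path, 'wt', encoding='utf-8') as output_file:
            json.dump(data, output_file)
    else:
        with open(output_path, 'w', encoding='utf-8') as output_file:
            json.dump(data, output_file)
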
Code Example #2
def main():
    parser = argparse.ArgumentParser(
        description=
        'maps a given partitioning (clustering/communities) file with document labels and a given metadata file with document titles to a doctitle->partitionlabel file'
    )
    parser.add_argument(
        '--partitions',
        type=argparse.FileType('r'),
        help=
        'path to input .json.bz2 partitioning file (communities: JSON-dict / clustering: JSON-list)',
        required=True)
    parser.add_argument('--titles',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 titles file',
                        required=True)
    parser.add_argument(
        '--title-partitions',
        type=argparse.FileType('w'),
        help='path to output doctitle->partitionlabel .json file',
        required=True)

    args = parser.parse_args()
    input_partitions_path = args.partitions.name
    input_titles_path = args.titles.name
    output_title_partitions_path = args.title_partitions.name

    logger.info('running with:\n{}'.format(
        pformat({
            'input_partitions_path': input_partitions_path,
            'input_titles_path': input_titles_path,
            'output_title_partitions_path': output_title_partitions_path
        })))

    # load titles and partitioning
    titles = load_titles(input_titles_path)
    partitions = load_communities(input_partitions_path)

    # build the title->partition-label mapping
    if isinstance(partitions, dict):
        # for graph communities the partitioning is a dict: derive the doc id from the document's graph label (e.g. "d123") and look up the corresponding doc title
        title_partitions = {
            titles[doc_id[1:]]: comm_label
            for doc_id, comm_label in partitions.items()
        }
    else:
        # for a clustering the partitioning is a list: treat the index of each cluster label as the doc id and look up the corresponding doc title
        title_partitions = {
            titles[str(doc_id)]: comm_label
            for doc_id, comm_label in enumerate(partitions) if comm_label >= 0
        }
    logger.info('generated {} title_partitions'.format(len(title_partitions)))
    logger.debug('title_partitions \n{}'.format(title_partitions))

    # save the title->partition-label mapping
    logger.info('saving title communities')
    save_data_to_json(title_partitions, output_title_partitions_path)
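load_titles and load_communities are likewise project helpers not shown here. A minimal sketch, assuming both inputs are bz2-compressed JSON files as the help texts suggest:

import bz2
import json


def load_titles(titles_path):
    # hypothetical loader: read the .json.bz2 docid->doctitle mapping
    with bz2.open(titles_path, 'rt', encoding='utf-8') as titles_file:
        return json.load(titles_file)


def load_communities(partitions_path):
    # hypothetical loader: read the .json.bz2 partitioning file;
    # yields a dict for communities or a list for a clustering
    with bz2.open(partitions_path, 'rt', encoding='utf-8') as partitions_file:
        return json.load(partitions_file)
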
Code Example #3
def main():
    parser = argparse.ArgumentParser(description='clusters documents of a given document-topics-file by their topics')
    parser.add_argument('--document-topics', type=argparse.FileType('r'), help='path to input document-topic-file (.npz)', required=True)
    parser.add_argument('--cluster-labels', type=argparse.FileType('w'), help='path to output JSON cluster labels file', required=True)
    cluster_methods = {
        'kmeans': 'kmeans algorithm with kmeans++',
        'aggl-ward': 'hierarchical agglomerative ward clustering',
        'aggl-avg': 'hierarchical agglomerative average clustering',
        'aggl-avg-cos': 'hierarchical agglomerative average clustering with cosine distance',
    }
    parser.add_argument('--cluster-method', choices=cluster_methods, help='clustering algorithm: ' + str(cluster_methods), required=True)
    parser.add_argument('--num-clusters', type=int, help='number of clusters to create', required=True)
    
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    output_cluster_labels_path = args.cluster_labels.name
    cluster_method = args.cluster_method
    num_clusters = args.num_clusters
    
    logger.info('running with:\n{}'.format(pformat({'input_document_topics_path':input_document_topics_path, 'output_cluster_labels_path':output_cluster_labels_path, 'cluster_method':cluster_method, 'num_clusters':num_clusters})))
           
    # load the document-topic matrix
    logger.info('loading dense document-topics from {}'.format(input_document_topics_path))
    document_topics = load_npz(input_document_topics_path)
    logger.info('loaded document-topics-matrix of shape {}'.format(document_topics.shape))
    logger.debug('document-topics-matrix \n{}'.format(document_topics))
    
    # get the clustering model for cluster_method, num_clusters
    num_docs, num_topics = document_topics.shape
    logger.info('clustering on {} documents, {} topics'.format(num_docs, num_topics))
    cluster_model = get_cluster_model(cluster_method, num_clusters)
    logger.info('clustering model:\n{}'.format(cluster_model))
    
    # run the cluster analysis
    cluster_labels = cluster_model.fit_predict(document_topics)
    logger.info('{} labels'.format(len(cluster_labels)))
    logger.debug(cluster_labels)
    logger.info('{} different labels'.format(len(np.unique(cluster_labels))))
    logger.info('{} noise labels'.format((cluster_labels < 0).sum()))
    
    # save the labels
    logger.info('saving cluster labels')
    save_data_to_json(cluster_labels.tolist(), output_cluster_labels_path)
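get_cluster_model is not shown. One plausible sketch maps the --cluster-method choices to scikit-learn estimators; this is an assumption, the project may configure the models differently (and scikit-learn releases before 1.2 spell the cosine option affinity= instead of metric=):

from sklearn.cluster import AgglomerativeClustering, KMeans


def get_cluster_model(cluster_method, num_clusters):
    # hypothetical factory matching the --cluster-method choices above
    if cluster_method == 'kmeans':
        return KMeans(n_clusters=num_clusters, init='k-means++')
    if cluster_method == 'aggl-ward':
        return AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
    if cluster_method == 'aggl-avg':
        return AgglomerativeClustering(n_clusters=num_clusters, linkage='average')
    if cluster_method == 'aggl-avg-cos':
        # cosine distance requires a non-ward linkage
        return AgglomerativeClustering(n_clusters=num_clusters,
                                       linkage='average', metric='cosine')
    raise ValueError('unknown cluster method: {}'.format(cluster_method))
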
Code Example #4
def main():
    parser = argparse.ArgumentParser(description='creates a file of clusterings: clusters are sorted descending by size, cluster elements are sorted by distance to cluster centroid')    
    parser.add_argument('--document-topics', type=argparse.FileType('r'), help='path to input document-topic-file (.npz)', required=True)
    parser.add_argument('--cluster-labels', type=argparse.FileType('r'), help='path to input .json.bz2 clustering file', required=True)
    parser.add_argument('--titles', type=argparse.FileType('r'), help='path to input .json.bz2 titles file', required=True)  
    parser.add_argument('--centrality-data', type=argparse.FileType('w'), help='path to output .json cluster->centrality_data file', required=True)
    parser.add_argument('--max-docs-per-clus', type=int, help='maximum number of most central documents kept per cluster', required=True)
    parser.add_argument('--metric', help='dissimilarity metric to cluster centroids (must be accepted by scipy.spatial.distance.cdist)', required=True)
    
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    input_cluster_labels_path = args.cluster_labels.name
    input_titles_path = args.titles.name
    output_centrality_data_path = args.centrality_data.name
    max_docs_per_clus = args.max_docs_per_clus
    metric = args.metric
    
    logger.info('running with:\n{}'.format(pformat({'input_document_topics_path':input_document_topics_path, 'input_cluster_labels_path':input_cluster_labels_path, 'input_titles_path':input_titles_path, 'output_centrality_data_path':output_centrality_data_path, 'max_docs_per_clus':max_docs_per_clus, 'metric':metric})))
        
    document_topics = load_document_topics(input_document_topics_path)
    cluster_labels = load_communities(input_cluster_labels_path)
    document_titles = load_titles(input_titles_path)
        
    clusters = get_clusters_from_labels(cluster_labels)    
    logger.info('computing {}-centralities of {} documents in {} communities'.format(metric, len(cluster_labels), len(clusters)))
    centrality_data = {}
    for clus_id, cluster in enumerate(clusters):
        max_doc_ids, centralities = get_top_central_cluster_docs(cluster, document_topics, max_docs_per_clus, metric)
        logger.debug('max doc ids {}'.format(max_doc_ids))
        logger.debug('max doc centralities {}'.format(centralities))
        max_doc_titles = get_document_titles(max_doc_ids, document_titles)
        logger.debug('max titles: {}'.format(max_doc_titles))
        centrality_data_of_cluster = {
            'size': len(cluster),
            'titles': max_doc_titles, 
            'centralities': centralities
        }
        centrality_data[clus_id] = centrality_data_of_cluster
    
    logger.info('saving cluster centrality data (titles,centralities) of {} clusters'.format(len(centrality_data)))
    save_data_to_json(centrality_data, output_centrality_data_path)
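get_clusters_from_labels and get_top_central_cluster_docs are project helpers as well. A minimal sketch, assuming document_topics is a dense numpy array and the ranking keeps the documents closest to their cluster centroid under the chosen scipy cdist metric:

from collections import defaultdict

import numpy as np
from scipy.spatial.distance import cdist


def get_clusters_from_labels(cluster_labels):
    # hypothetical grouping: collect document ids per non-noise cluster label
    clusters = defaultdict(list)
    for doc_id, label in enumerate(cluster_labels):
        if label >= 0:
            clusters[label].append(doc_id)
    return [clusters[label] for label in sorted(clusters)]


def get_top_central_cluster_docs(cluster, document_topics, max_docs, metric):
    # hypothetical ranking: keep the max_docs documents with the smallest
    # dissimilarity to the cluster centroid
    doc_ids = np.asarray(cluster)
    cluster_vectors = document_topics[doc_ids]
    centroid = cluster_vectors.mean(axis=0, keepdims=True)
    dissimilarities = cdist(cluster_vectors, centroid, metric=metric).ravel()
    order = np.argsort(dissimilarities)[:max_docs]
    return doc_ids[order].tolist(), dissimilarities[order].tolist()
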
Code Example #5
def main():
    parser = argparse.ArgumentParser(
        description=
        'calculates the most central documents of each community and writes their centrality data (titles, centralities) to a JSON file (exactly min(#nodes of community, J) titles are saved per community)'
    )
    parser.add_argument('--coauth-graph',
                        type=argparse.FileType('r'),
                        help='path to input pickled, gzipped graph file',
                        required=True)
    parser.add_argument('--communities',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 communities file',
                        required=True)
    parser.add_argument('--titles',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 titles file',
                        required=True)
    parser.add_argument(
        '--centrality-data',
        type=argparse.FileType('w'),
        help='path to output .json community->centrality_data file',
        required=True)
    centrality_measures = {
        'degree': degree,
        'strength': strength,
        'betweenness': betweenness,
        'closeness': closeness,
        'weighted_betweenness': weighted_betweenness,
        'weighted_closeness': weighted_closeness
    }
    parser.add_argument('--centrality-measure',
                        choices=centrality_measures,
                        help='centrality measure',
                        required=True)
    parser.add_argument(
        '--max-docs-per-comm',
        type=int,
        help='maximum number of most central nodes kept per community',
        required=True)

    args = parser.parse_args()
    input_coauth_graph_path = args.coauth_graph.name
    input_communities_path = args.communities.name
    input_titles_path = args.titles.name
    output_centrality_data_path = args.centrality_data.name
    centrality_measure = args.centrality_measure
    max_docs_per_comm = args.max_docs_per_comm

    logger.info('running with:\n{}'.format(
        pformat({
            'input_coauth_graph_path': input_coauth_graph_path,
            'input_communities_path': input_communities_path,
            'input_titles_path': input_titles_path,
            'output_centrality_data_path': output_centrality_data_path,
            'centrality_measure': centrality_measure,
            'max_docs_per_comm': max_docs_per_comm
        })))

    logger.info('loading graph from {}'.format(input_coauth_graph_path))
    coauth_graph = Graph.Read_Picklez(input_coauth_graph_path)
    log_igraph(coauth_graph)

    communities = load_communities(input_communities_path)
    titles = load_titles(input_titles_path)

    # remove nodes that do not appear in the stored community structure (e.g. because they are not part of the giant community)
    logger.info('removing nodes of graph without community labels')
    node_names = coauth_graph.vs['name']
    node_names_of_communities = communities.keys()
    node_names_not_in_communities = set(node_names) - set(
        node_names_of_communities)
    coauth_graph.delete_vertices(node_names_not_in_communities)
    logger.info('graph stats after removing')
    log_igraph(coauth_graph)

    logger.info('creating vertex clustering of community labels')
    node_labels = [communities[name] for name in coauth_graph.vs['name']]
    community_structure = VertexClustering(coauth_graph,
                                           membership=node_labels)
    logger.debug('created vertex clustering {}'.format(community_structure))

    logger.info(
        'computing {}-centralities of {} documents in {} communities'.format(
            centrality_measure, community_structure.n,
            len(community_structure)))
    centrality_function = centrality_measures[centrality_measure]
    centrality_data = {}
    for comm_id in range(len(community_structure)):
        comm_subgraph = community_structure.subgraph(comm_id)
        max_node_names_centralities = get_top_nodes_of_communities(
            comm_subgraph, max_docs_per_comm, centrality_function)
        logger.debug(
            'max_node_names_weights {}'.format(max_node_names_centralities))
        max_node_names, centralities = zip(*max_node_names_centralities)
        max_doc_titles = get_document_titles_of_node_names(
            max_node_names, titles)
        logger.debug('max titles: {}'.format(max_doc_titles))
        centrality_data_of_community = {
            'size': comm_subgraph.vcount(),
            'titles': max_doc_titles,
            'centralities': centralities
        }
        centrality_data[comm_id] = centrality_data_of_community

    logger.info(
        'saving community centrality data (titles,centralities) of {} communities'
        .format(len(centrality_data)))
    save_data_to_json(centrality_data, output_centrality_data_path)
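The centrality functions referenced in centrality_measures and the helper get_top_nodes_of_communities are not included in the listing. A sketch of two of them plus the ranking step, assuming python-igraph and edge weights stored in the 'weight' attribute:

def degree(graph):
    # hypothetical centrality helper: unweighted degree per vertex
    return graph.degree()


def strength(graph):
    # hypothetical centrality helper: weighted degree (vertex strength)
    return graph.strength(weights='weight')


def get_top_nodes_of_communities(comm_subgraph, max_docs_per_comm,
                                 centrality_function):
    # hypothetical ranking: pair every vertex name with its centrality score
    # and keep the max_docs_per_comm highest-scoring vertices
    centralities = centrality_function(comm_subgraph)
    named = zip(comm_subgraph.vs['name'], centralities)
    return sorted(named, key=lambda pair: pair[1],
                  reverse=True)[:max_docs_per_comm]
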
Code Example #6
def main():
    parser = argparse.ArgumentParser(
        description='detects communities in a weighted co-authorship-network')
    parser.add_argument('--coauth-graph',
                        type=argparse.FileType('r'),
                        help='path to input pickled, gzipped graph file',
                        required=True)
    parser.add_argument('--communities',
                        type=argparse.FileType('w'),
                        help='path to output .json communities file',
                        required=True)
    methods = {
        'greedy': 'fast greedy detection',
        'louvain': 'louvain detection'
    }
    parser.add_argument('--method',
                        choices=methods,
                        help='community detection method: ' + str(methods),
                        required=True)
    consider_only_communities = {
        'giant':
        'consider only subgraph of largest connected component in community detection',
        'non-singleton': 'consider only components with at least 2 nodes'
    }
    parser.add_argument(
        '--consider-only-communities',
        choices=consider_only_communities,
        help='consider only specific components; options: {}'.format(
            consider_only_communities))

    args = parser.parse_args()
    input_coauth_graph_path = args.coauth_graph.name
    output_communities_path = args.communities.name
    consider_only_communities = args.consider_only_communities
    method = args.method

    logger.info('running with:\n{}'.format(
        pformat({
            'input_coauth_graph_path': input_coauth_graph_path,
            'output_communities_path': output_communities_path,
            'consider_only_communities': consider_only_communities,
            'method': method
        })))

    # load the co-authorship graph
    coauth_graph = Graph.Read_Picklez(input_coauth_graph_path)
    logger.info('read co-authorship graph')
    log_igraph(coauth_graph)

    if consider_only_communities is not None:
        if consider_only_communities == 'giant':
            # consider only the giant component
            logger.info(
                'using largest connected component instead of actual graph'
            )
            coauth_graph = coauth_graph.components().giant()
        elif consider_only_communities == 'non-singleton':
            # remove nodes in single-node communities, i.e. nodes without edges
            logger.info('using only non-singleton communities')
            node_degrees = coauth_graph.degree(coauth_graph.vs)
            singleton_nodes = [
                n for n, deg in enumerate(node_degrees) if deg == 0
            ]
            coauth_graph.delete_vertices(singleton_nodes)
        logger.info('new network:')
        log_igraph(coauth_graph)

    # run community detection with the chosen method
    logger.info('running {} community detection'.format(method))
    if method == 'greedy':
        dendrogram = coauth_graph.community_fastgreedy(weights='weight')
        communities = dendrogram.as_clustering()
    elif method == 'louvain':
        communities = coauth_graph.community_multilevel(weights='weight')
    log_communities(communities, coauth_graph)

    # save the communities as a JSON dictionary {graph label: community label}
    node_names = coauth_graph.vs['name']
    node_community_labels = communities.membership
    name_labeling = dict(zip(node_names, node_community_labels))
    logger.info('saving community labels')
    save_data_to_json(name_labeling, output_communities_path)
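log_communities is another helper not shown; a minimal sketch that logs size and modularity statistics of an igraph VertexClustering (an assumption about what the original reports):

import logging

logger = logging.getLogger(__name__)


def log_communities(communities, graph):
    # hypothetical logging helper: basic statistics of the detected
    # community structure
    sizes = communities.sizes()
    logger.info('found {} communities on {} nodes'.format(len(sizes), graph.vcount()))
    logger.info('largest community size: {}'.format(max(sizes)))
    logger.info('modularity: {}'.format(communities.modularity))
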
Code Example #7
def main():
    parser = argparse.ArgumentParser(
        description=
        'creates a mapping document title -> list of authors who contributed to this document (in the pruned affiliation network)'
    )
    parser.add_argument(
        '--bipart-graph',
        type=argparse.FileType('r'),
        help=
        'path to input pickled networkx bipart graph file (.graph/.graph.bz2)',
        required=True)
    parser.add_argument(
        '--id2author',
        type=argparse.FileType('r'),
        help='path to input .txt.bz2 authorid->authorname mapping file',
        required=True)
    parser.add_argument(
        '--titles',
        type=argparse.FileType('r'),
        help='path to input .json.bz2 documentid->document title mapping file',
        required=True)
    parser.add_argument(
        '--title2authornames',
        type=argparse.FileType('w'),
        help='path to output .json doctitle->authornames mapping file',
        required=True)

    args = parser.parse_args()
    input_bipart_graph_path = args.bipart_graph.name
    input_id2author_path = args.id2author.name
    input_titles_path = args.titles.name
    output_title2authornames_path = args.title2authornames.name

    logger.info(
        'reading bipartite graph from {}'.format(input_bipart_graph_path))
    bipart_graph = nx.read_gpickle(input_bipart_graph_path)
    log_nwx(bipart_graph)

    logger.info('loading id2author from {}'.format(input_id2author_path))
    id2author = Dictionary.load_from_text(input_id2author_path)
    logger.info('loaded id2author of size {}'.format(len(id2author)))

    titles = load_titles(input_titles_path)

    logger.info('generating doctitle->authornames mapping')
    title2authorname = defaultdict(list)
    doc_nodes, _ = get_bipartite_nodes(bipart_graph)
    for doc_node in doc_nodes:
        doc_id = doc_node[1:]
        doc_name = titles[doc_id]
        for author_node in bipart_graph[doc_node]:
            author_id = int(author_node[1:])
            author_name = id2author[author_id]
            title2authorname[doc_name].append(author_name)
    num_doctitles = len(title2authorname)
    num_authornames = sum(
        len(authornames) for authornames in title2authorname.values())
    logger.info(
        'generated doctitle->authornames mapping: {} keys, {} entries'.format(
            num_doctitles, num_authornames))

    logger.info('sorting doctitle->authornames mapping')
    title2authorname = dict(sorted(title2authorname.items()))
    for authornames in title2authorname.values():
        authornames.sort()

    logger.info('saving doctitle->authornames mapping')
    save_data_to_json(title2authorname, output_title2authornames_path)
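get_bipartite_nodes is assumed to split the node set of the bipartite graph into document nodes and author nodes. The other examples suggest document labels like "d123"; treating every remaining node as an author node is a guess. A minimal sketch under that assumption:

def get_bipartite_nodes(bipart_graph):
    # hypothetical split (assumption): document nodes are labelled "d<docid>",
    # all other nodes are treated as author nodes (e.g. "a<authorid>")
    doc_nodes = [node for node in bipart_graph.nodes() if node.startswith('d')]
    author_nodes = [node for node in bipart_graph.nodes() if not node.startswith('d')]
    return doc_nodes, author_nodes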