def main():
    """Run the centrality analysis on equidistant communities of the giant component.

    Reads the command line, loads the pickled graph and replaces it by its
    giant component, builds a ``VertexClustering`` from the stored community
    labels, keeps the communities with at least ``J`` nodes, picks ``K`` of
    them at equidistant ranks of the size-sorted list and hands them to
    ``find_max_nodes_per_community`` once per centrality measure.

    Raises:
        KeyError: if a node name of the giant component has no entry in the
            loaded communities mapping.
    """
    parser = argparse.ArgumentParser(description='calculated various centrality-related stats (only the giant component of the graph considered!')
    parser.add_argument('--coauth-graph', type=argparse.FileType('r'), help='path to input pickled, gzipped graph file', required=True)
    parser.add_argument('--communities', type=argparse.FileType('r'), help='path to input .json.bz2 communities file', required=True)
    parser.add_argument('--titles', type=argparse.FileType('r'), help='path to input .json.bz2 titles file', required=True)
    parser.add_argument('--K', type=int, help='number of considered, equaldistand communites 0,floor(1*(N-1)/(K-1)),...,N-1', required=True)
    parser.add_argument('--J', type=int, help='maxiumum number of highest considered nodes per community', required=True)

    args = parser.parse_args()
    # argparse.FileType has already opened the files, but only their paths
    # are used below; close the handles instead of leaking them.
    for handle in (args.coauth_graph, args.communities, args.titles):
        handle.close()
    input_coauth_graph_path = args.coauth_graph.name
    input_communities_path = args.communities.name
    input_titles_path = args.titles.name
    K = args.K
    J = args.J

    logger.info('running with:\n{}'.format(pformat({'input_coauth_graph_path':input_coauth_graph_path, 'input_communities_path':input_communities_path, 'input_titles_path':input_titles_path, 'K':K, 'J':J})))

    logger.info('loading graph from {}'.format(input_coauth_graph_path))
    coauth_graph = Graph.Read_Picklez(input_coauth_graph_path)
    logger.info('using largest connected component of largest size instead actual graph')
    coauth_graph = coauth_graph.components().giant()
    log_igraph(coauth_graph)

    communities = load_communities(input_communities_path)
    titles = load_titles(input_titles_path)

    logger.info('creating vertex clustering of community labels')
    node_labels = [communities[name] for name in coauth_graph.vs['name']]
    community_structure = VertexClustering(coauth_graph, membership=node_labels)
    logger.debug('created vertex clustering {}'.format(community_structure))

    # (community id, size) pairs, largest community first
    community_sizes = list(enumerate(community_structure.sizes()))
    community_sizes.sort(key=lambda t:t[1], reverse=True)
    logger.debug('community sizes, sorted descending\n{}'.format(community_sizes))

    logger.info('filtering to communities of at least {} nodes'.format(J))
    community_sizes = [(commid,size) for commid,size in community_sizes if size >= J]
    logger.info('filtered to {} communities'.format(len(community_sizes)))

    N = len(community_sizes)
    logger.info('calculating considered communities number of communites N={}, considering K={} equidistant communities'.format(N, K))
    if N == 0 or K <= 0:
        # nothing survived the size filter (or nothing was requested):
        # there is no valid index to pick
        community_indices = []
    elif K == 1:
        # the equidistant formula divides by K-1; a single requested
        # community is the first (= largest) one of the sorted list
        community_indices = [0]
    else:
        # ranks 0, floor((N-1)/(K-1)), ..., N-1; integer floor division is
        # exact and equals math.floor of the true quotient for these
        # non-negative operands
        community_indices = [k*(N-1)//(K-1) for k in range(K)]
    logger.info('considering indices {}'.format(community_indices))
    considered_communities = [community_sizes[i] for i in community_indices]
    logger.info('considering communities (id,size): {}'.format(considered_communities))

    # same measures, same order as before: one call per centrality function
    centrality_functions = (degree, strength, betweenness, weighted_betweenness, closeness, weighted_closeness)
    for centrality_function in centrality_functions:
        find_max_nodes_per_community(community_structure, considered_communities, titles, J, centrality_function)
Beispiel #2
0
def main():
    """Plot the size distribution of the connected components of a graph.

    Reads the command line, loads the pickled graph, computes the size of
    every connected component and logs the frequency of each size. Sizes
    above the value returned by ``get_quantile`` for ``--quantile-order``
    are then dropped, and the remaining (size, count) pairs are passed to
    ``bar_plot`` together with the output image path.
    """
    parser = argparse.ArgumentParser(
        description=
        'plots the connected components size distribution of a given graph')
    parser.add_argument('--graph',
                        type=argparse.FileType('r'),
                        help='path to input pickled .cpickle.gz graph file ',
                        required=True)
    parser.add_argument('--img',
                        type=argparse.FileType('w'),
                        help='path of output img file',
                        required=True)
    parser.add_argument('--quantile-order',
                        type=float,
                        help='quantile of histrograms to consider',
                        required=True)

    args = parser.parse_args()
    # argparse.FileType has already opened both files, but only their paths
    # are used below; close the handles instead of leaking them (the image
    # path is handed to bar_plot by name further down).
    args.graph.close()
    args.img.close()
    input_graph_path = args.graph.name
    output_img_path = args.img.name
    quantile_order = args.quantile_order

    logger.info('running with:\n{}'.format(
        pformat({
            'input_graph_path': input_graph_path,
            'output_img_path': output_img_path,
            'quantile_order': quantile_order
        })))

    graph = Graph.Read_Picklez(input_graph_path)
    logger.info('loaded graph')
    log_igraph(graph)

    logger.info('calculating connected components')
    components = graph.components()
    logger.debug(components)
    # one entry per connected component: its number of nodes
    components_sizes = np.array([len(comp) for comp in components])
    logger.info('max component size {}'.format(components_sizes.max()))

    logger.info(
        'component size distribution of {}-quantile'.format(quantile_order))
    # the logged frequency table still covers ALL components
    sizes, sizes_counts = np.unique(components_sizes, return_counts=True)
    logger.info('frequencies:\n{}'.format(np.vstack((sizes, sizes_counts)).T))

    # only components up to the quantile threshold are plotted
    quantile = get_quantile(components_sizes, quantile_order)
    components_sizes = components_sizes[components_sizes <= quantile]
    sizes, sizes_counts = np.unique(components_sizes, return_counts=True)
    xlabel = 'Knotenanzahl Zusammenhangskomponente'
    ylabel = 'Häufigkeit'
    bar_plot(sizes, sizes_counts, output_img_path, xlabel, ylabel)
Beispiel #3
0
def main():
    """Convert a pickled weighted NetworkX graph into a pickled igraph graph.

    Reads the command line, loads the NetworkX graph, maps every node to a
    consecutive integer id, builds an undirected igraph ``Graph`` that
    carries the original node objects in the ``name`` vertex attribute and
    the edge weights in the ``weight`` edge attribute, and writes it with
    ``write_picklez``.
    """
    parser = argparse.ArgumentParser(
        description=
        'converts a weighted pickled networkx Graph to a pickled igraph graph')
    parser.add_argument(
        '--nwx',
        type=argparse.FileType('r'),
        help='path to input pickled networkx graph file (.graph/.graph.bz2)',
        required=True)
    parser.add_argument(
        '--igraph',
        type=argparse.FileType('w'),
        help='path to output pickled gzipped igraph file (.graph.gz)',
        required=True)

    args = parser.parse_args()
    # argparse.FileType has already opened both files, but only their paths
    # are used below; close the handles instead of leaking them.
    args.nwx.close()
    args.igraph.close()
    input_nwx_path = args.nwx.name
    output_igraph_path = args.igraph.name

    logger.info('running with:\n{}'.format(
        pformat({
            'input_nwx_path': input_nwx_path,
            'output_igraph_path': output_igraph_path
        })))

    # load the weighted NetworkX graph
    # NOTE(review): nx.read_gpickle unpickles the file (only use trusted
    # input) and was removed in NetworkX 3.0 - confirm the pinned version.
    logger.info('reading networkx graph from {}'.format(input_nwx_path))
    nwx_graph = nx.read_gpickle(input_nwx_path)
    log_nwx(nwx_graph)

    # build the weighted igraph graph
    logger.info('converting read networkx graph to igraph graph')
    nodes = list(nwx_graph.nodes())
    node_indices = {node: index for index, node in enumerate(nodes)}
    # (n1, n2, weight) triples; the weight is None for edges that carry no
    # 'weight' attribute
    weighted_edges = list(nwx_graph.edges(data='weight'))
    # built as two plain lists so that a graph without any edge yields two
    # empty lists instead of failing while unpacking zip(*...)
    edges = [(node_indices[n1], node_indices[n2])
             for n1, n2, _ in weighted_edges]
    weights = [weight for _, _, weight in weighted_edges]
    igraph_graph = Graph(n=len(nodes),
                         edges=edges,
                         directed=False,
                         vertex_attrs={'name': nodes},
                         edge_attrs={'weight': weights})

    log_igraph(igraph_graph)

    # store the igraph graph
    logger.info('writing graph to {}'.format(output_igraph_path))
    igraph_graph.write_picklez(fname=output_igraph_path)
Beispiel #4
0
def main():
    """Collect the most central documents of every community and save them.

    Reads the command line, loads the pickled graph, removes every vertex
    whose name has no community label, builds a ``VertexClustering`` from
    the labels and, for each community, asks
    ``get_top_nodes_of_communities`` for at most ``--max-docs-per-comm``
    nodes ranked by the selected centrality measure. The size, document
    titles and centralities of each community are gathered in a dict keyed
    by community id and passed to ``save_data_to_json``.
    """
    parser = argparse.ArgumentParser(
        description=
        'calculated the most central documents of each community and writes their centrality data (titles,centralities) to a JSON file (exactly min(#nodes of community,J) titles are save per community)'
    )
    parser.add_argument('--coauth-graph',
                        type=argparse.FileType('r'),
                        help='path to input pickled, gzipped graph file',
                        required=True)
    parser.add_argument('--communities',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 communities file',
                        required=True)
    parser.add_argument('--titles',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 titles file',
                        required=True)
    parser.add_argument(
        '--centrality-data',
        type=argparse.FileType('w'),
        help='path to output .json community->centrality_data file',
        required=True)
    # command line name -> centrality function
    centrality_measures = {
        'degree': degree,
        'strength': strength,
        'betweenness': betweenness,
        'closeness': closeness,
        'weighted_betweenness': weighted_betweenness,
        'weighted_closeness': weighted_closeness
    }
    parser.add_argument('--centrality-measure',
                        choices=centrality_measures,
                        help='centrality measure',
                        required=True)
    parser.add_argument(
        '--max-docs-per-comm',
        type=int,
        help='maxiumum number of highest considered nodes per community',
        required=True)

    args = parser.parse_args()
    # argparse.FileType has already opened the files, but only their paths
    # are used below; close the handles instead of leaking them.
    for handle in (args.coauth_graph, args.communities, args.titles,
                   args.centrality_data):
        handle.close()
    input_coauth_graph_path = args.coauth_graph.name
    input_communities_path = args.communities.name
    input_titles_path = args.titles.name
    output_centrality_data_path = args.centrality_data.name
    centrality_measure = args.centrality_measure
    max_docs_per_comm = args.max_docs_per_comm

    logger.info('running with:\n{}'.format(
        pformat({
            'input_coauth_graph_path': input_coauth_graph_path,
            'input_communities_path': input_communities_path,
            'input_titles_path': input_titles_path,
            'output_centrality_data_path': output_centrality_data_path,
            'centrality_measure': centrality_measure,
            'max_docs_per_comm': max_docs_per_comm
        })))

    logger.info('loading graph from {}'.format(input_coauth_graph_path))
    coauth_graph = Graph.Read_Picklez(input_coauth_graph_path)
    log_igraph(coauth_graph)

    communities = load_communities(input_communities_path)
    titles = load_titles(input_titles_path)

    # remove nodes that do not appear in the stored community structure
    # (e.g. because they are not part of the giant component)
    # NOTE(review): the vertices are addressed by name here; presumably the
    # names are strings - integer names would be read as vertex indices by
    # igraph. Verify against the graph builder.
    logger.info('removing nodes of graph without community labels')
    node_names = coauth_graph.vs['name']
    node_names_not_in_communities = set(node_names).difference(communities)
    coauth_graph.delete_vertices(node_names_not_in_communities)
    logger.info('graph stats after removing')
    log_igraph(coauth_graph)

    logger.info('creating vertex clustering of community labels')
    node_labels = [communities[name] for name in coauth_graph.vs['name']]
    community_structure = VertexClustering(coauth_graph,
                                           membership=node_labels)
    logger.debug('created vertex clustering {}'.format(community_structure))

    logger.info(
        'computing {}-centralities of {} documents in {} communities'.format(
            centrality_measure, community_structure.n,
            len(community_structure)))
    centrality_function = centrality_measures[centrality_measure]
    centrality_data = {}
    for comm_id in range(len(community_structure)):
        comm_subgraph = community_structure.subgraph(comm_id)
        max_node_names_centralities = get_top_nodes_of_communities(
            comm_subgraph, max_docs_per_comm, centrality_function)
        logger.debug(
            'max_node_names_weights {}'.format(max_node_names_centralities))
        # split the (name, centrality) pairs into two parallel tuples; a
        # community without any node (possible when labels are not
        # contiguous) yields no pairs and must not break the unpacking
        columns = tuple(zip(*max_node_names_centralities))
        max_node_names, centralities = columns if columns else ((), ())
        max_doc_titles = get_document_titles_of_node_names(
            max_node_names, titles)
        logger.debug('max titles: {}'.format(max_doc_titles))
        centrality_data[comm_id] = {
            'size': comm_subgraph.vcount(),
            'titles': max_doc_titles,
            'centralities': centralities
        }

    logger.info(
        'saving community centrality data (titles,centralities) of {} communities'
        .format(len(centrality_data)))
    save_data_to_json(centrality_data, output_centrality_data_path)
Beispiel #5
0
def main():
    """Detect communities in a weighted co-authorship graph and save them.

    Reads the command line, loads the pickled graph, optionally restricts
    it to the giant component (``giant``) or drops all vertices of degree 0
    (``non-singleton``), runs the selected detection method using the
    ``weight`` edge attribute, and passes a ``{node name: community label}``
    dict to ``save_data_to_json``.
    """
    parser = argparse.ArgumentParser(
        description='detects communities in a weighted co-authorship-network')
    parser.add_argument('--coauth-graph',
                        type=argparse.FileType('r'),
                        help='path to input pickled, gzipped graph file',
                        required=True)
    parser.add_argument('--communities',
                        type=argparse.FileType('w'),
                        help='path to output .json communities file',
                        required=True)
    methods = {
        'greedy': 'fast greedy detection',
        'louvain': 'louvain detection'
    }
    parser.add_argument('--method',
                        choices=methods,
                        help='community detection method: ' + str(methods),
                        required=True)
    # kept separate from the parsed option value so the dict of choices is
    # not silently replaced by a string further down
    component_filters = {
        'giant':
        'consider only subgraph of largest connected component in community detection',
        'non-singleton': 'consider only components with of least 2 nodes'
    }
    parser.add_argument(
        '--consider-only-communities',
        choices=component_filters,
        help='consider only specific components; options: {}'.format(
            component_filters))

    args = parser.parse_args()
    # argparse.FileType has already opened both files, but only their paths
    # are used below; close the handles instead of leaking them.
    args.coauth_graph.close()
    args.communities.close()
    input_coauth_graph_path = args.coauth_graph.name
    output_communities_path = args.communities.name
    consider_only_communities = args.consider_only_communities
    method = args.method

    logger.info('running with:\n{}'.format(
        pformat({
            'input_coauth_graph_path': input_coauth_graph_path,
            'output_communities_path': output_communities_path,
            'consider_only_communities': consider_only_communities,
            'method': method
        })))

    # load the co-authorship graph
    coauth_graph = Graph.Read_Picklez(input_coauth_graph_path)
    logger.info('read co-authorship graph')
    log_igraph(coauth_graph)

    # the option is not required, so it is None when no filter was requested
    if consider_only_communities is not None:
        if consider_only_communities == 'giant':
            # consider the giant component only
            logger.info(
                'using largest connected component of largest size instead actual graph'
            )
            coauth_graph = coauth_graph.components().giant()
        elif consider_only_communities == 'non-singleton':
            # remove the nodes of 1-node components, i.e. nodes without edges
            logger.info('using only non-singleton communities')
            node_degrees = coauth_graph.degree(coauth_graph.vs)
            singleton_nodes = [
                n for n, deg in enumerate(node_degrees) if deg == 0
            ]
            coauth_graph.delete_vertices(singleton_nodes)
        logger.info('new network:')
        log_igraph(coauth_graph)

    # run the community detection selected by `method`; argparse restricts
    # it to the keys of `methods`, so one of the two branches always runs
    logger.info('running {} community detection'.format(method))
    if method == 'greedy':
        dendogram = coauth_graph.community_fastgreedy(weights='weight')
        communities = dendogram.as_clustering()
    elif method == 'louvain':
        communities = coauth_graph.community_multilevel(weights='weight')
    log_communities(communities, coauth_graph)

    # store the communities as a dict {graph label: community label}
    node_names = coauth_graph.vs['name']
    node_community_labels = communities.membership
    name_labeling = dict(zip(node_names, node_community_labels))
    logger.info('saving community labels')
    save_data_to_json(name_labeling, output_communities_path)