def main():
    """CLI entry point: convert a pickled gensim document metadata file into a
    JSON docid->doctitle mapping file."""
    parser = argparse.ArgumentParser(
        description=
        'creates from a binary gensim document metadata file a JSON docid->doctitle mapping file'
    )
    parser.add_argument(
        '--metadata',
        type=argparse.FileType('r'),
        help='path to input document metadata file (.metadata.cpickle)',
        required=True)
    parser.add_argument(
        '--titles',
        type=argparse.FileType('w'),
        help='path to output docid->doctitle mapping file (.json)',
        required=True)
    args = parser.parse_args()
    metadata_path = args.metadata.name
    titles_path = args.titles.name

    # unpickle the metadata mapping; smart_open handles possibly compressed files
    logger.info('loading metadata from {}'.format(metadata_path))
    with smart_open(metadata_path, "rb") as metadata_file:
        doc_titles = pickle.load(metadata_file)

    logger.info('saving metadata titles')
    save_data_to_json(doc_titles, titles_path)
def main():
    """CLI entry point: join a partitioning file (clustering or communities) with a
    titles file into a doctitle->partitionlabel mapping."""
    parser = argparse.ArgumentParser(
        description=
        'maps a given partitioning (clustering/communities) file with document labels and a given metadata file with document titles to a doctitle->partitionlabel file'
    )
    parser.add_argument(
        '--partitions',
        type=argparse.FileType('r'),
        help=
        'path to input .json.bz2 partitioning file (communities: JSON-dict / clustering: JSON-list)',
        required=True)
    parser.add_argument('--titles',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 titles file',
                        required=True)
    parser.add_argument(
        '--title-partitions',
        type=argparse.FileType('w'),
        help='path to output doctitle->partitionlabel .json file',
        required=True)
    args = parser.parse_args()
    partitions_path = args.partitions.name
    titles_path = args.titles.name
    title_partitions_path = args.title_partitions.name

    logger.info('running with:\n{}'.format(
        pformat({
            'input_partititions_path': partitions_path,
            'input_titles_path': titles_path,
            'output_title_partitions_path': title_partitions_path
        })))

    # load titles and partitioning
    titles = load_titles(titles_path)
    partitions = load_communities(partitions_path)

    # build the title->partition-label mapping
    title_partitions = {}
    if isinstance(partitions, dict):
        # graph communities come as a dict: derive the doc ID from the graph label
        # of the document (e.g. "d123") and look up the corresponding doc title
        for doc_label, partition_label in partitions.items():
            title_partitions[titles[doc_label[1:]]] = partition_label
    else:
        # a clustering comes as a list: the index of each cluster label is the doc ID;
        # negative labels (noise) are skipped
        for doc_index, partition_label in enumerate(partitions):
            if partition_label >= 0:
                title_partitions[titles[str(doc_index)]] = partition_label
    logger.info('generated {} title_partitions'.format(len(title_partitions)))
    logger.debug('title_partitions \n{}'.format(title_partitions))

    # save the title->partition-label mapping
    logger.info('saving title communities')
    save_data_to_json(title_partitions, title_partitions_path)
def main():
    """CLI entry point: cluster the documents of a document-topics matrix by their
    topic distributions and write the resulting cluster labels to a JSON file."""
    parser = argparse.ArgumentParser(description='clusters documents of a given document-topics-file by their topics')
    parser.add_argument('--document-topics', type=argparse.FileType('r'), help='path to input document-topic-file (.npz)', required=True)
    parser.add_argument('--cluster-labels', type=argparse.FileType('w'), help='path to output JSON cluster labels file', required=True)
    cluster_methods = {
        'kmeans': 'kmeans algorithm with kmeans++',
        'aggl-ward': 'hierarchical agglomerative ward clustering',
        'aggl-avg': 'hierarchical agglomerative average clustering',
        'aggl-avg-cos': 'hierarchical agglomerative average clustering with cosine distance',
    }
    # fix: the return value of add_argument was bound to an unused local ("cm = ...")
    parser.add_argument('--cluster-method', choices=cluster_methods, help='clustering algorithm: ' + str(cluster_methods), required=True)
    parser.add_argument('--num-clusters', type=int, help='number of clusters to create', required=True)
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    output_cluster_labels_path = args.cluster_labels.name
    cluster_method = args.cluster_method
    num_clusters = args.num_clusters
    logger.info('running with:\n{}'.format(pformat({'input_document_topics_path':input_document_topics_path, 'output_cluster_labels_path':output_cluster_labels_path, 'cluster_method':cluster_method, 'num_clusters':num_clusters})))

    # load the document-topic matrix
    logger.info('loading dense document-topics from {}'.format(input_document_topics_path))
    document_topics = load_npz(input_document_topics_path)
    logger.info('loaded document-topics-matrix of shape {}'.format(document_topics.shape))
    logger.debug('document-topics-matrix \n{}'.format(document_topics))

    # build the clustering model for cluster_method, num_clusters
    num_docs, num_topics = document_topics.shape
    logger.info('clustering on {} documents, {} topics'.format(num_docs, num_topics))
    cluster_model = get_cluster_model(cluster_method, num_clusters)
    logger.info('clustering model:\n{}'.format(cluster_model))

    # run the cluster analysis; negative labels denote noise documents
    cluster_labels = cluster_model.fit_predict(document_topics)
    logger.info('{} labels'.format(len(cluster_labels)))
    logger.debug(cluster_labels)
    logger.info('{} different labels'.format(len(np.unique(cluster_labels))))
    logger.info('{} noise labels'.format((cluster_labels < 0).sum()))

    # save the labels
    logger.info('saving cluster labels')
    save_data_to_json(cluster_labels.tolist(), output_cluster_labels_path)
def main():
    """CLI entry point: for each cluster, compute the documents closest to the cluster
    centroid and write their titles and centralities to a JSON file."""
    parser = argparse.ArgumentParser(description='creates a file of clusterings: clusters are sorted descending by size, cluster elements are sorted by distance to cluster centroid')
    parser.add_argument('--document-topics', type=argparse.FileType('r'), help='path to input document-topic-file (.npz)', required=True)
    parser.add_argument('--cluster-labels', type=argparse.FileType('r'), help='path to input .json.bz2 clustering file', required=True)
    parser.add_argument('--titles', type=argparse.FileType('r'), help='path to input .json.bz2 titles file', required=True)
    parser.add_argument('--centrality-data', type=argparse.FileType('w'), help='path to output .json cluster->centrality_data file', required=True)
    # fix: typos in user-facing help texts ("maxiumum", "calced ... muse be allowd")
    parser.add_argument('--max-docs-per-clus', type=int, help='maximum number of highest considered nodes per cluster', required=True)
    parser.add_argument('--metric', help='calculated dissimilarity to centroids (must be allowed by cdist of scipy)', required=True)
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    input_cluster_labels_path = args.cluster_labels.name
    input_titles_path = args.titles.name
    output_centrality_data_path = args.centrality_data.name
    max_docs_per_clus = args.max_docs_per_clus
    metric = args.metric
    logger.info('running with:\n{}'.format(pformat({'input_document_topics_path':input_document_topics_path, 'input_cluster_labels_path':input_cluster_labels_path, 'input_titles_path':input_titles_path, 'output_centrality_data_path':output_centrality_data_path, 'max_docs_per_clus':max_docs_per_clus, 'metric':metric})))

    document_topics = load_document_topics(input_document_topics_path)
    cluster_labels = load_communities(input_cluster_labels_path)
    document_titles = load_titles(input_titles_path)

    clusters = get_clusters_from_labels(cluster_labels)
    logger.info('computing {}-centralities of {} documents in {} communities'.format(metric, len(cluster_labels), len(clusters)))
    centrality_data = {}
    for clus_id, cluster in enumerate(clusters):
        # top documents of this cluster by closeness to the cluster centroid
        max_doc_ids, centralities = get_top_central_cluster_docs(cluster, document_topics, max_docs_per_clus, metric)
        logger.debug('max doc ids {}'.format(max_doc_ids))
        logger.debug('max doc centralities {}'.format(centralities))
        max_doc_titles = get_document_titles(max_doc_ids, document_titles)
        logger.debug('max titles: {}'.format(max_doc_titles))
        centrality_data_of_cluster = {
            'size': len(cluster),
            'titles': max_doc_titles,
            'centralities': centralities
        }
        centrality_data[clus_id] = centrality_data_of_cluster

    logger.info('saving cluster centrality data (titles,centralities) of {} clusters'.format(len(centrality_data)))
    save_data_to_json(centrality_data, output_centrality_data_path)
def main():
    """CLI entry point: compute the most central documents of each community of a
    co-authorship graph and write their centrality data (titles, centralities) to JSON."""
    # fix: description typos ("calculated" -> "calculates", "are save" -> "are saved")
    parser = argparse.ArgumentParser(
        description=
        'calculates the most central documents of each community and writes their centrality data (titles,centralities) to a JSON file (exactly min(#nodes of community,J) titles are saved per community)'
    )
    # fix: --coauth-graph is an input file (FileType('r')); help text wrongly said "output"
    parser.add_argument('--coauth-graph', type=argparse.FileType('r'), help='path to input pickled, gzipped graph file', required=True)
    parser.add_argument('--communities', type=argparse.FileType('r'), help='path to input .json.bz2 communities file', required=True)
    parser.add_argument('--titles', type=argparse.FileType('r'), help='path to input .json.bz2 titles file', required=True)
    parser.add_argument(
        '--centrality-data',
        type=argparse.FileType('w'),
        help='path to output .json community->centrality_data file',
        required=True)
    # maps the CLI choice to the centrality function defined elsewhere in this module
    centrality_measures = {
        'degree': degree,
        'strength': strength,
        'betweenness': betweenness,
        'closeness': closeness,
        'weighted_betweenness': weighted_betweenness,
        'weighted_closeness': weighted_closeness
    }
    parser.add_argument('--centrality-measure', choices=centrality_measures, help='centrality measure', required=True)
    # fix: help typo "maxiumum" -> "maximum"
    parser.add_argument(
        '--max-docs-per-comm',
        type=int,
        help='maximum number of highest considered nodes per community',
        required=True)
    args = parser.parse_args()
    input_coauth_graph_path = args.coauth_graph.name
    input_communities_path = args.communities.name
    input_titles_path = args.titles.name
    output_centrality_data_path = args.centrality_data.name
    centrality_measure = args.centrality_measure
    max_docs_per_comm = args.max_docs_per_comm
    logger.info('running with:\n{}'.format(
        pformat({
            'input_coauth_graph_path': input_coauth_graph_path,
            'input_communities_path': input_communities_path,
            'input_titles_path': input_titles_path,
            'output_centrality_data_path': output_centrality_data_path,
            'centrality_measure': centrality_measure,
            'max_docs_per_comm': max_docs_per_comm
        })))

    logger.info('loading graph from {}'.format(input_coauth_graph_path))
    coauth_graph = Graph.Read_Picklez(input_coauth_graph_path)
    log_igraph(coauth_graph)

    communities = load_communities(input_communities_path)
    titles = load_titles(input_titles_path)

    # remove nodes that do not appear in the stored community structure
    # (e.g. because they are not part of the giant community)
    logger.info('removing nodes of graph without community labels')
    node_names = coauth_graph.vs['name']
    node_names_of_communities = communities.keys()
    node_names_not_in_communities = set(node_names) - set(
        node_names_of_communities)
    coauth_graph.delete_vertices(node_names_not_in_communities)
    logger.info('graph stats after removing')
    log_igraph(coauth_graph)

    logger.info('creating vertex clustering of community labels')
    node_labels = [communities[name] for name in coauth_graph.vs['name']]
    community_structure = VertexClustering(coauth_graph, membership=node_labels)
    logger.debug('created vertex clustering {}'.format(community_structure))

    logger.info(
        'computing {}-centralities of {} documents in {} communities'.format(
            centrality_measure, community_structure.n,
            len(community_structure)))
    centrality_function = centrality_measures[centrality_measure]
    centrality_data = {}
    for comm_id in range(len(community_structure)):
        # evaluate each community on its induced subgraph
        comm_subgraph = community_structure.subgraph(comm_id)
        max_node_names_centralities = get_top_nodes_of_communities(
            comm_subgraph, max_docs_per_comm, centrality_function)
        logger.debug(
            'max_node_names_weights {}'.format(max_node_names_centralities))
        max_node_names, centralities = zip(*max_node_names_centralities)
        max_doc_titles = get_document_titles_of_node_names(
            max_node_names, titles)
        logger.debug('max titles: {}'.format(max_doc_titles))
        centrality_data_of_community = {
            'size': comm_subgraph.vcount(),
            'titles': max_doc_titles,
            'centralities': centralities
        }
        centrality_data[comm_id] = centrality_data_of_community

    logger.info(
        'saving community centrality data (titles,centralities) of {} communities'
        .format(len(centrality_data)))
    save_data_to_json(centrality_data, output_centrality_data_path)
def main():
    """CLI entry point: detect communities in a weighted co-authorship network and
    save them as a JSON dict {graph label: community label}."""
    parser = argparse.ArgumentParser(
        description='detects communities in a weighted co-authorship-network')
    # fix: --coauth-graph is an input file (FileType('r')); help text wrongly said "output"
    parser.add_argument('--coauth-graph', type=argparse.FileType('r'), help='path to input pickled, gzipped graph file', required=True)
    parser.add_argument('--communities', type=argparse.FileType('w'), help='path to output .json communities file', required=True)
    methods = {
        'greedy': 'fast greedy detection',
        'louvain': 'louvain detection'
    }
    parser.add_argument('--method', choices=methods, help='community detection method: ' + str(methods), required=True)
    # fix: help typo "with of least 2 nodes" -> "with at least 2 nodes";
    # also renamed the choices dict so it no longer shadows the parsed value below
    component_filter_choices = {
        'giant': 'consider only subgraph of largest connected component in community detection',
        'non-singleton': 'consider only components with at least 2 nodes'
    }
    parser.add_argument(
        '--consider-only-communities',
        choices=component_filter_choices,
        help='consider only specific components; options: {}'.format(
            component_filter_choices))
    args = parser.parse_args()
    input_coauth_graph_path = args.coauth_graph.name
    output_communities_path = args.communities.name
    consider_only_communities = args.consider_only_communities
    method = args.method
    logger.info('running with:\n{}'.format(
        pformat({
            'input_coauth_graph_path': input_coauth_graph_path,
            'output_communities_path': output_communities_path,
            'consider_only_communities': consider_only_communities,
            'method': method
        })))

    # load the co-authorship graph
    coauth_graph = Graph.Read_Picklez(input_coauth_graph_path)
    logger.info('read co-authorship graph')
    log_igraph(coauth_graph)

    if consider_only_communities is not None:
        if consider_only_communities == 'giant':
            # consider only the giant component
            logger.info(
                'using largest connected component of largest size instead actual graph'
            )
            coauth_graph = coauth_graph.components().giant()
        elif consider_only_communities == 'non-singleton':
            # remove nodes of 1-node communities, i.e. nodes without edges
            logger.info('using only non-singleton communities')
            node_degrees = coauth_graph.degree(coauth_graph.vs)
            singleton_nodes = [
                n for n, deg in enumerate(node_degrees) if deg == 0
            ]
            coauth_graph.delete_vertices(singleton_nodes)
        logger.info('new network:')
        log_igraph(coauth_graph)

    # run community detection with the chosen method
    logger.info('running {} community detection'.format(method))
    if method == 'greedy':
        dendrogram = coauth_graph.community_fastgreedy(weights='weight')
        communities = dendrogram.as_clustering()
    elif method == 'louvain':
        communities = coauth_graph.community_multilevel(weights='weight')
    log_communities(communities, coauth_graph)

    # save communities as a JSON dict {graph label: community label}
    node_names = coauth_graph.vs['name']
    node_community_labels = communities.membership
    name_labeling = dict(zip(node_names, node_community_labels))
    logger.info('saving community labels')
    save_data_to_json(name_labeling, output_communities_path)
def main():
    """CLI entry point: build a doctitle -> list-of-author-names mapping from a
    bipartite document-author graph and save it as JSON (keys and lists sorted)."""
    parser = argparse.ArgumentParser(
        description=
        'creates a mapping document title -> list of authors who contributed to this document (in the pruned affiliation network)'
    )
    parser.add_argument(
        '--bipart-graph',
        type=argparse.FileType('r'),
        help=
        'path to input pickled networkx bipart graph file (.graph/.graph.bz2)',
        required=True)
    parser.add_argument(
        '--id2author',
        type=argparse.FileType('r'),
        help='path to input .txt.bz2 authorid->authorname mapping file',
        required=True)
    parser.add_argument(
        '--titles',
        type=argparse.FileType('r'),
        help='path to input .json.bz2 documentid->document title mapping file',
        required=True)
    parser.add_argument(
        '--title2authornames',
        type=argparse.FileType('w'),
        help='path to output .json doctitle->authnames mapping file',
        required=True)
    args = parser.parse_args()
    bipart_graph_path = args.bipart_graph.name
    id2author_path = args.id2author.name
    titles_path = args.titles.name
    title2authornames_path = args.title2authornames.name

    logger.info(
        'reading bipartite graph from {}'.format(bipart_graph_path))
    bipart_graph = nx.read_gpickle(bipart_graph_path)
    log_nwx(bipart_graph)
    logger.info('loading id2author from {}'.format(id2author_path))
    id2author = Dictionary.load_from_text(id2author_path)
    logger.info('loaded id2author of size {}'.format(len(id2author)))
    titles = load_titles(titles_path)

    logger.info('generating doctitle->authornames mapping')
    title2authorname = defaultdict(list)
    doc_nodes, _ = get_bipartite_nodes(bipart_graph)
    for doc_node in doc_nodes:
        # node labels carry a 1-char type prefix; the remainder is the doc/author ID
        doc_title = titles[doc_node[1:]]
        for author_node in bipart_graph[doc_node]:
            title2authorname[doc_title].append(id2author[int(author_node[1:])])
    num_doctitles = len(title2authorname)
    num_authornames = sum(map(len, title2authorname.values()))
    logger.info(
        'generated doctitle->authornames mapping: {} keys, {} entries'.format(
            num_doctitles, num_authornames))

    # sort keys and, per document, the author name lists for stable output
    logger.info('sorting doctitle->authornames mapping')
    title2authorname = dict(sorted(title2authorname.items()))
    for authornames in title2authorname.values():
        authornames.sort()

    logger.info('saving doctitle->authornames mapping')
    save_data_to_json(title2authorname, title2authornames_path)