def cd_cluster(config, source_folder, target_folder):
    source_filename = filename_for_pp_config(
        **{
            **config,
            "seed": None,
            "markov_time": None,
            "number_of_modules": None,
            "consensus": None,
            "method": None,
        },
        file_ext=source_file_ext,
    )
    g = nx.read_gpickle(f"{source_folder}/{source_filename}")
    g = compile_source_graph(g, config["method"])

    if not config["consensus"]:
        clustering, D = cluster(g, config, return_tree=True)
        tree_path = (
            target_folder
            + "/"
            + filename_for_pp_config(**config, file_ext=".gpickle.gz")
        )
        nx.write_gpickle(D, tree_path)
    else:
        clustering = consensus_clustering(g, config)

    clustering = missings_nodes_as_additional_clusters(clustering)
    target_filename = filename_for_pp_config(**config, file_ext=target_file_ext)
    write_community_json(clustering, f"{target_folder}/{target_filename}")


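# Usage sketch (hedged): `cd_cluster` expects a config dict whose keys match the
# parameters of `filename_for_pp_config`. The keys below are the ones used in
# this module; the concrete values and folder paths are hypothetical and only
# illustrate the expected call pattern.
#
#     example_config = dict(
#         snapshot="2019-01-01",
#         pp_ratio=1,
#         pp_decay=1.0,
#         pp_merge=0,
#         pp_co_occurrence=0,
#         pp_co_occurrence_type="paragraph",
#         seed=1,
#         markov_time=1.0,
#         number_of_modules=100,
#         consensus=0,
#         method="louvain",
#     )
#     cd_cluster(example_config, "data/preprocessed", "data/clustered")

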
def cd_cluster_evolution_inspection_prepare(
    overwrite,
    cluster_mapping_configs,
    source_folder,
    crossreference_graph_folder,
    target_folder,
):
    ensure_exists(target_folder)
    configs = get_configs(cluster_mapping_configs)

    existing_files = set(list_dir(target_folder, ".htm"))
    if not overwrite:
        configs = [
            config
            for config in configs
            if filename_for_pp_config(snapshot="all", **config, file_ext=".htm")
            not in existing_files
        ]

    if configs:
        # Load the cross-reference hierarchy graphs once and keep them in a
        # module-level variable so later steps can access them.
        global cd_cluster_evolution_inspection_graphs
        cd_cluster_evolution_inspection_graphs = {
            f[: -len(".gpickle.gz")]: hierarchy_graph(
                nx.read_gpickle(os.path.join(crossreference_graph_folder, f))
            )
            for f in list_dir(crossreference_graph_folder, ".gpickle.gz")
        }

    return configs


def cd_cluster_prepare(overwrite, snapshots, pp_configs, source_folder, target_folder):
    ensure_exists(target_folder)
    items = get_configs_for_snapshots(snapshots, pp_configs)

    # Check if source graphs exist
    existing_source_files = set(list_dir(source_folder, source_file_ext))
    required_source_files = {
        filename_for_pp_config(
            **{
                **item,
                "seed": None,
                "markov_time": None,
                "number_of_modules": None,
                "consensus": None,
                "method": None,
            },
            file_ext=source_file_ext,
        )
        for item in items
    }
    check_for_missing_files(
        required_source_files, existing_source_files, "preprocessed graphs"
    )

    if not overwrite:
        existing_files = list_dir(target_folder, target_file_ext)
        items = get_no_overwrite_items(items, target_file_ext, existing_files)

    return items


def get_configs_no_overwrite(configs, existing_files):
    configs = [
        config
        for config in configs
        if filename_for_pp_config(snapshot="all", **config, file_ext=".gpickle.gz")
        not in existing_files
    ]
    return configs


def cd_preprocessing(config, source_folder, target_folder, decision_network_path):
    source_path = f"{source_folder}/seqitems/{config['snapshot']}.gpickle.gz"
    graph_target_path = (
        f"{target_folder}/{filename_for_pp_config(**config, file_ext=target_file_ext)}"
    )
    missing_nodes_target_path = os.path.join(
        target_folder,
        filename_for_pp_config(**config, file_ext="_missing_co_occurr_nodes.csv"),
    )

    seq_decay_func = decay_function(config["pp_decay"])

    G = nx.read_gpickle(source_path)

    # Remove authority edges
    G.remove_edges_from(
        [
            (u, v, k)
            for u, v, k, d in G.edges(keys=True, data="edge_type")
            if d == "authority"
        ]
    )

    mqG, nodes_mapping = quotient_graph_with_merge(
        G, merge_threshold=config["pp_merge"]
    )
    smqG = sequence_graph(
        mqG, seq_decay_func=seq_decay_func, seq_ref_ratio=config["pp_ratio"]
    )
    check_missing_edges(mqG, smqG)

    if config["pp_co_occurrence"] != 0:
        missing_nodes = add_co_occurrences(
            config, smqG, G, nodes_mapping, decision_network_path
        )
        pd.DataFrame(
            list(missing_nodes.items()), columns=["missing_node", "count"]
        ).sort_values("count", ascending=False).to_csv(
            missing_nodes_target_path, index=False
        )

    if config["pp_co_occurrence"] == -1:
        edges_to_remove = [
            (u, v, k)
            for u, v, k, t in smqG.edges(keys=True, data="edge_type")
            if t == "reference"
        ]
        smqG.remove_edges_from(edges_to_remove)

    nx.write_gpickle(smqG, graph_target_path)


def cd_cluster_evolution_inspection(config, dataset, source_folder, target_folder):
    global cd_cluster_evolution_inspection_graphs
    source_filename_base = filename_for_pp_config(snapshot="all", **config, file_ext="")
    G = nx.read_gpickle(
        os.path.join(source_folder, source_filename_base + ".gpickle.gz")
    )
    families = cluster_families(G, 0.15)
    destination = f"{target_folder}/{source_filename_base}.htm"
    generate_inspection(G, families, destination)


def get_clustering_result(
    cluster_path, dataset, graph_type, path_prefix="", regulations=False
):
    """
    Read a clustering result and the corresponding graph.

    :param cluster_path: path of the cdlib.readwrite.write_community_json output
    :param dataset: 'de' or 'us'
    :param graph_type: 'clustering' for the rolled-up (preprocessed) graph;
        other options are 'seqitems' and 'subseqitems'
    """
    filename_base = os.path.splitext(os.path.split(cluster_path)[-1])[0]
    snapshot = filename_base.split("_")[0]

    if graph_type == "clustering":
        config = get_config_from_filename(filename_base)
        graph_filename = filename_for_pp_config(
            **simplify_config_for_preprocessed_graph(config)
        )
        graph_path = path_prefix + (
            (
                US_REG_CD_PREPROCESSED_GRAPH_PATH
                if regulations
                else US_CD_PREPROCESSED_GRAPH_PATH
            )
            if dataset.lower() == "us"
            else (
                DE_REG_CD_PREPROCESSED_GRAPH_PATH
                if regulations
                else DE_CD_PREPROCESSED_GRAPH_PATH
            )
        )
        graph_path += f"/{graph_filename}"
        G = nx.read_gpickle(graph_path)
    elif graph_type in ["seqitems", "subseqitems"]:
        graph_path = path_prefix + (
            (
                US_REG_CROSSREFERENCE_GRAPH_PATH
                if regulations
                else US_CROSSREFERENCE_GRAPH_PATH
            )
            if dataset.lower() == "us"
            else (
                DE_REG_CROSSREFERENCE_GRAPH_PATH
                if regulations
                else DE_CROSSREFERENCE_GRAPH_PATH
            )
        )
        graph_path += f"/{graph_type}/{snapshot}.gpickle.gz"
        G = nx.read_gpickle(graph_path)
    else:
        raise Exception(f"graph_type {graph_type} not allowed")

    clustering = readwrite.read_community_json(
        path_prefix
        + (
            (US_REG_CD_CLUSTER_PATH if regulations else US_CD_CLUSTER_PATH)
            if dataset.lower() == "us"
            else (DE_REG_CD_CLUSTER_PATH if regulations else DE_CD_CLUSTER_PATH)
        )
        + "/"
        + os.path.split(cluster_path)[-1]
    )
    clustering.graph = G
    add_communities_to_graph(clustering)
    return clustering


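# Usage sketch (hedged): the path and dataset below are hypothetical; they only
# illustrate how a clustering written by `write_community_json` can be reloaded
# together with its rolled-up graph. The returned cdlib clustering exposes the
# node ids of each community via `clustering.communities`.
#
#     clustering = get_clustering_result(
#         "data/clustered/some_clustering.json",
#         dataset="us",
#         graph_type="clustering",
#     )
#     for idx, community_nodes in enumerate(clustering.communities):
#         print(idx, len(community_nodes))

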
def test_filename_for_pp_config(self):
    self.assertEqual(
        filename_for_pp_config(*config_dict_to_list(self.config)),
        self.filename,
    )

    other_config = self.config.copy()
    other_config["method"] = "louvain"
    self.assertEqual(
        filename_for_pp_config(*config_dict_to_list(other_config)),
        self.other_filename,
    )

    for attr in [
        "pp_co_occurrence",
        "method",
        "number_of_modules",
        "markov_time",
        "seed",
        "consensus",
    ]:
        other_config[attr] = None
    self.assertEqual(
        filename_for_pp_config(*config_dict_to_list(other_config)),
        self.simple_filename,
    )


def get_config_clustering_files(config, source_folder):
    """
    Get all clustering files for a given config (multiple snapshots to be mapped).

    :return: filenames, snapshots
    """
    existing_clustering = set(list_dir(source_folder, ".json"))
    config_filename_part = filename_for_pp_config(
        snapshot="", **config, file_ext=".json"
    )
    config_clustering_files = sorted(
        [x for x in existing_clustering if x.endswith(config_filename_part)]
    )
    snapshots = sorted(
        [
            config_clustering_file.split("_")[0]
            for config_clustering_file in config_clustering_files
        ]
    )
    return config_clustering_files, snapshots


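# Usage sketch (hedged): `config` is assumed to be a config dict accepted by
# `filename_for_pp_config` (see the keys used throughout this module), and the
# folder path is hypothetical. The helper returns the clustering files of all
# snapshots for that config, together with the snapshot identifiers parsed from
# the leading filename component.
#
#     files, snapshots = get_config_clustering_files(config, "data/clustered")
#     print(snapshots)  # snapshot identifiers covered by this config
#     print(files)      # one clustering JSON file per snapshot

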
def cd_cluster_texts_prepare(
    overwrite, snapshots, pp_configs, source_folder, target_folder
):
    ensure_exists(target_folder)
    items = get_configs_for_snapshots(snapshots, pp_configs)

    # Check if source graphs exist
    existing_source_files = set(list_dir(source_folder, source_file_ext))
    required_source_files = {
        filename_for_pp_config(**item, file_ext=source_file_ext) for item in items
    }
    check_for_missing_files(required_source_files, existing_source_files, "clustering")

    if not overwrite:
        existing_files = os.listdir(target_folder)
        items = get_no_overwrite_items(items, "", existing_files)

    return items


def cd_cluster_texts(
    config,
    dataset,
    source_folder,
    target_folder,
    reference_parsed_folders,
    regulations,
):
    source_filename_base = filename_for_pp_config(**config, file_ext="")
    clustering = get_clustering_result(
        f"{source_folder}/{source_filename_base}{source_file_ext}",
        dataset,
        graph_type="clustering",
        regulations=regulations,
    )

    result_path = ensure_exists(f"{target_folder}/{source_filename_base}")

    reference_parsed_files = {
        os.path.splitext(f)[0]: f
        for reference_parsed_folder in reference_parsed_folders
        for f in list_dir(reference_parsed_folder, ".xml")
    }
    # Normalize four-component file keys by keeping the first two and the last
    # component.
    reference_parsed_files = {
        (
            "_".join(k.split("_")[:2] + k.split("_")[-1:])
            if len(k.split("_")) == 4
            else k
        ): f
        for k, f in reference_parsed_files.items()
    }
    # Ensure the key normalization did not merge distinct files.
    assert len(
        [
            file
            for reference_parsed_folder in reference_parsed_folders
            for file in list_dir(reference_parsed_folder, ".xml")
        ]
    ) == len(reference_parsed_files)

    for idx, community_nodes in enumerate(clustering.communities):
        community_text = get_community_text(
            community_nodes, reference_parsed_folders, reference_parsed_files
        )
        write_community_text(result_path, idx, community_text)


def get_no_overwrite_items(items, target_file_ext, existing_files):
    return [
        item
        for item in items
        if filename_for_pp_config(**item, file_ext=target_file_ext)
        not in existing_files
    ]


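# Usage sketch (hedged): `get_no_overwrite_items` drops every config whose target
# file already exists, so finished work is only redone when overwriting is
# requested. The extension and folder below are hypothetical.
#
#     existing = set(list_dir(target_folder, ".json"))
#     items = get_no_overwrite_items(items, ".json", existing)

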
def add_co_occurrences(config, G, G_orig, nodes_mapping, decision_network_path):
    C = get_decision_network(decision_network_path)

    cooccurrence_weight = (
        config["pp_co_occurrence"] if config["pp_co_occurrence"] > 0 else 1
    )

    if config["pp_co_occurrence_type"] == "decision":
        merge_decisions = True
    elif config["pp_co_occurrence_type"] == "paragraph":
        merge_decisions = False
    else:
        raise Exception(f"{config['pp_co_occurrence_type']} is not a valid option")

    C_merged = quotient_decision_graph(
        C, merge_decisions=merge_decisions, merge_statutes=False
    )

    # Map citekeys to the corresponding graph nodes.
    nodes_citekey_mapping = {
        v: k for k, v in nx.get_node_attributes(G, "citekey").items() if v
    }
    for k, v in nodes_mapping.items():
        if "citekey" in G_orig.nodes[k]:
            citekey = G_orig.nodes[k]["citekey"]
            simplified_citekey = simplify_citekey(citekey)
            if (
                simplified_citekey in nodes_citekey_mapping
                and nodes_citekey_mapping[simplified_citekey] != v
            ):
                print(
                    "Conflict:",
                    simplified_citekey,
                    nodes_citekey_mapping[simplified_citekey],
                    v,
                )
            nodes_citekey_mapping[simplified_citekey] = v

    # Count how often pairs of nodes are cited by the same decision.
    missing_nodes = Counter()
    co_occurrence_edges = Counter()
    for node in C_merged.nodes:
        if C_merged.nodes[node]["bipartite"] == "decision":
            edges = [
                (u, v, d["weight"])
                for u, v, d in C_merged.edges(node, data=True)
                if d["edge_type"] == "reference"
            ]
            simplified_citekeys = [simplify_citekey(v) for u, v, w in edges]
            targets = {
                nodes_citekey_mapping[v]
                for v in simplified_citekeys
                if v in nodes_citekey_mapping
            }
            targets_missing = {
                v for v in simplified_citekeys if v not in nodes_citekey_mapping
            }
            missing_nodes.update(targets_missing)

            targets_combinations = list(itertools.combinations(targets, 2))
            for target_a, target_b in targets_combinations:
                co_occurrence_edges[(target_a, target_b)] += 1

    # Add co-occurrence edges in both directions.
    G.add_edges_from(
        [
            (
                u,
                v,
                dict(weight=cnt * cooccurrence_weight, edge_type="cooccurrence"),
            )
            for (u, v), cnt in co_occurrence_edges.items()
        ]
    )
    G.add_edges_from(
        [
            (
                v,
                u,
                dict(
                    weight=cnt * cooccurrence_weight,
                    edge_type="cooccurrence",
                    reverse=True,
                ),
            )
            for (u, v), cnt in co_occurrence_edges.items()
        ]
    )

    if config["pp_co_occurrence"] == -2:
        # Rescale co-occurrence edge weights so that their sum equals the sum of
        # the cross-reference edge weights.
        total_weight_cooccurrence = sum(
            G.edges[u, v, k]["weight"]
            for u, v, k, edge_type in G.edges(keys=True, data="edge_type")
            if edge_type == "cooccurrence"
        )
        total_weight_reference = sum(
            G.edges[u, v, k]["weight"]
            for u, v, k, edge_type in G.edges(keys=True, data="edge_type")
            if edge_type == "reference"
        )
        cooccurrence_factor = total_weight_reference / total_weight_cooccurrence
        print(
            "cooccurrence_factor",
            cooccurrence_factor,
            "for",
            filename_for_pp_config(**config, file_ext=""),
        )
        for u, v, k, edge_type in G.edges(keys=True, data="edge_type"):
            if edge_type == "cooccurrence":
                G.edges[u, v, k]["weight"] *= cooccurrence_factor

        # Recompute the totals after rescaling (kept for inspection/debugging).
        total_weight_cooccurrence = sum(
            G.edges[u, v, k]["weight"]
            for u, v, k, edge_type in G.edges(keys=True, data="edge_type")
            if edge_type == "cooccurrence"
        )
        total_weight_reference = sum(
            G.edges[u, v, k]["weight"]
            for u, v, k, edge_type in G.edges(keys=True, data="edge_type")
            if edge_type == "reference"
        )

    return missing_nodes