def cd_cluster(config, source_folder, target_folder):
    # The clustering parameters do not appear in the preprocessed graph's
    # filename, so they are blanked out before building the source filename.
    source_filename = filename_for_pp_config(
        **{
            **config,
            "seed": None,
            "markov_time": None,
            "number_of_modules": None,
            "consensus": None,
            "method": None,
        },
        file_ext=source_file_ext,
    )
    g = nx.read_gpickle(f"{source_folder}/{source_filename}")

    g = compile_source_graph(g, config["method"])

    if not config["consensus"]:
        clustering, D = cluster(g, config, return_tree=True)

        tree_path = (target_folder + "/" +
                     filename_for_pp_config(**config, file_ext=".gpickle.gz"))
        nx.write_gpickle(D, tree_path)

    else:
        clustering = consensus_clustering(g, config)
        target_filename = filename_for_pp_config(**config,
                                                  file_ext=target_file_ext)

    clustering = missings_nodes_as_additional_clusters(clustering)

    target_filename = filename_for_pp_config(**config,
                                             file_ext=target_file_ext)
    write_community_json(clustering, f"{target_folder}/{target_filename}")
def cd_cluster_evolution_inspection_prepare(
    overwrite,
    cluster_mapping_configs,
    source_folder,
    crossreference_graph_folder,
    target_folder,
):
    ensure_exists(target_folder)

    configs = get_configs(cluster_mapping_configs)

    existing_files = set(list_dir(target_folder, ".htm"))
    if not overwrite:
        configs = [
            config for config in configs
            if filename_for_pp_config(snapshot="all",
                                      **config,
                                      file_ext=".htm") not in existing_files
        ]
    if configs:
        # Load the hierarchy graphs once and keep them in a module-level
        # global so they can be reused by the inspection step.
        global cd_cluster_evolution_inspection_graphs
        cd_cluster_evolution_inspection_graphs = {
            f[:-len(".gpickle.gz")]: hierarchy_graph(
                nx.read_gpickle(os.path.join(crossreference_graph_folder, f)))
            for f in list_dir(crossreference_graph_folder, ".gpickle.gz")
        }

    return configs
def cd_cluster_prepare(overwrite, snapshots, pp_configs, source_folder,
                       target_folder):
    ensure_exists(target_folder)
    items = get_configs_for_snapshots(snapshots, pp_configs)

    # Check if source graphs exist
    existing_source_files = set(list_dir(source_folder, source_file_ext))
    required_source_files = {
        filename_for_pp_config(
            **{
                **item,
                "seed": None,
                "markov_time": None,
                "number_of_modules": None,
                "consensus": None,
                "method": None,
            },
            file_ext=source_file_ext,
        )
        for item in items
    }
    check_for_missing_files(required_source_files, existing_source_files,
                            "preprocessed graphs")

    if not overwrite:
        existing_files = list_dir(target_folder, target_file_ext)
        items = get_no_overwrite_items(items, target_file_ext, existing_files)

    return items
def get_configs_no_overwrite(configs, existing_files):
    configs = [
        config for config in configs
        if filename_for_pp_config(snapshot="all",
                                  **config,
                                  file_ext=".gpickle.gz") not in existing_files
    ]
    return configs
def cd_preprocessing(config, source_folder, target_folder, decision_network_path):
    source_path = f"{source_folder}/seqitems/{config['snapshot']}.gpickle.gz"
    graph_target_path = (
        f"{target_folder}/{filename_for_pp_config(**config, file_ext=target_file_ext)}"
    )
    missing_nodes_target_path = os.path.join(
        target_folder,
        filename_for_pp_config(**config, file_ext="_missing_co_occurr_nodes.csv"),
    )

    seq_decay_func = decay_function(config["pp_decay"])

    G = nx.read_gpickle(source_path)

    # Remove authority edges
    G.remove_edges_from(
        [
            (u, v, k)
            for u, v, k, d in G.edges(keys=True, data="edge_type")
            if d == "authority"
        ]
    )

    mqG, nodes_mapping = quotient_graph_with_merge(
        G, merge_threshold=config["pp_merge"]
    )

    smqG = sequence_graph(
        mqG, seq_decay_func=seq_decay_func, seq_ref_ratio=config["pp_ratio"]
    )

    check_missing_edges(mqG, smqG)

    if config["pp_co_occurrence"] != 0:
        missing_nodes = add_co_occurrences(
            config, smqG, G, nodes_mapping, decision_network_path
        )

        pd.DataFrame(
            list(missing_nodes.items()), columns=["missing_node", "count"]
        ).sort_values("count", ascending=False).to_csv(
            missing_nodes_target_path, index=False
        )

    if config["pp_co_occurrence"] == -1:
        edges_to_remove = [
            (u, v, k)
            for u, v, k, t in smqG.edges(keys=True, data="edge_type")
            if t == "reference"
        ]
        smqG.remove_edges_from(edges_to_remove)

    nx.write_gpickle(smqG, graph_target_path)
def cd_cluster_evolution_inspection(config, dataset, source_folder,
                                    target_folder):
    global cd_cluster_evolution_inspection_graphs
    source_filename_base = filename_for_pp_config(snapshot="all",
                                                  **config,
                                                  file_ext="")

    G = nx.read_gpickle(
        os.path.join(source_folder, source_filename_base + ".gpickle.gz"))

    families = cluster_families(G, 0.15)
    destination = f"{target_folder}/{source_filename_base}.htm"
    generate_inspection(G, families, destination)
def get_clustering_result(cluster_path,
                          dataset,
                          graph_type,
                          path_prefix="",
                          regulations=False):
    """
    read the clustering result and the respective graph.
    ::param cluster_path: path of the cdlib.readwrite.write_community_json output
    ::param dataset: 'de' or 'us'
    ::param graph_type: 'clustering' for the rolled up graph.
        Other options: subseqitems, seqitems
    """

    filename_base = os.path.splitext(os.path.split(cluster_path)[-1])[0]
    snapshot = filename_base.split("_")[0]

    if graph_type == "clustering":
        config = get_config_from_filename(filename_base)
        graph_filename = filename_for_pp_config(
            **simplify_config_for_preprocessed_graph(config))
        graph_path = path_prefix + (
            (US_REG_CD_PREPROCESSED_GRAPH_PATH if regulations else
             US_CD_PREPROCESSED_GRAPH_PATH) if dataset.lower() == "us" else
            (DE_REG_CD_PREPROCESSED_GRAPH_PATH
             if regulations else DE_CD_PREPROCESSED_GRAPH_PATH))
        graph_path += f"/{graph_filename}"
        G = nx.read_gpickle(graph_path)
    elif graph_type in ["seqitems", "subseqitems"]:
        graph_path = path_prefix + (
            (US_REG_CROSSREFERENCE_GRAPH_PATH if regulations else
             US_CROSSREFERENCE_GRAPH_PATH) if dataset.lower() == "us" else
            (DE_REG_CROSSREFERENCE_GRAPH_PATH
             if regulations else DE_CROSSREFERENCE_GRAPH_PATH))

        graph_path += f"/{graph_type}/{snapshot}.gpickle.gz"
        G = nx.read_gpickle(graph_path)

    else:
        raise Exception(f"graph_type {graph_type} not allowed")

    cluster_folder = path_prefix + (
        (US_REG_CD_CLUSTER_PATH if regulations else US_CD_CLUSTER_PATH)
        if dataset.lower() == "us"
        else (DE_REG_CD_CLUSTER_PATH if regulations else DE_CD_CLUSTER_PATH)
    )
    clustering = readwrite.read_community_json(
        cluster_folder + "/" + os.path.split(cluster_path)[-1]
    )
    clustering.graph = G

    add_communities_to_graph(clustering)

    return clustering
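# A minimal usage sketch (illustrative only): how get_clustering_result might be
# called for the "seqitems" graph of one snapshot. The cluster filename below is
# hypothetical; any file written by cd_cluster via write_community_json works.
def _example_print_community_sizes():
    clustering = get_clustering_result(
        "2010-01-01_example_config.json",  # hypothetical cluster file name
        dataset="us",
        graph_type="seqitems",
    )
    for idx, community in enumerate(clustering.communities):
        print(idx, len(community))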
def test_filename_for_pp_config(self):
    self.assertEqual(
        filename_for_pp_config(*config_dict_to_list(self.config)),
        self.filename,
    )
    other_config = self.config.copy()
    other_config["method"] = "louvain"
    self.assertEqual(
        filename_for_pp_config(*config_dict_to_list(other_config)),
        self.other_filename,
    )
    for attr in [
        "pp_co_occurrence",
        "method",
        "number_of_modules",
        "markov_time",
        "seed",
        "consensus",
    ]:
        other_config[attr] = None
    self.assertEqual(
        filename_for_pp_config(*config_dict_to_list(other_config)),
        self.simple_filename,
    )
def get_config_clustering_files(config, source_folder):
    """
    get all clusterings for a given config. (Multiple snapshots to be mapped)
    ::return filenames, snapshots
    """
    existing_clustering = set(list_dir(source_folder, ".json"))
    config_filename_part = filename_for_pp_config(snapshot="",
                                                  **config,
                                                  file_ext=".json")
    config_clustering_files = sorted(
        [x for x in existing_clustering if x.endswith(config_filename_part)])
    snapshots = sorted([
        config_clustering_file.split("_")[0]
        for config_clustering_file in config_clustering_files
    ])
    return config_clustering_files, snapshots
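# A minimal usage sketch (illustrative only): list the per-snapshot clusterings
# that exist for one configuration. The folder name below is hypothetical.
def _example_list_config_clusterings(config):
    cluster_folder = "../data/clusters"  # hypothetical clustering output folder
    files, snapshots = get_config_clustering_files(config, cluster_folder)
    for snapshot, filename in zip(snapshots, files):
        print(snapshot, filename)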
def cd_cluster_texts_prepare(overwrite, snapshots, pp_configs, source_folder,
                             target_folder):
    ensure_exists(target_folder)
    items = get_configs_for_snapshots(snapshots, pp_configs)

    # Check if source graphs exist
    existing_source_files = set(list_dir(source_folder, source_file_ext))
    required_source_files = {
        filename_for_pp_config(**item, file_ext=source_file_ext)
        for item in items
    }
    check_for_missing_files(required_source_files, existing_source_files,
                            "clustering")

    if not overwrite:
        existing_files = os.listdir(target_folder)
        items = get_no_overwrite_items(items, "", existing_files)

    return items
def cd_cluster_texts(
    config,
    dataset,
    source_folder,
    target_folder,
    reference_parsed_folders,
    regulations,
):
    source_filename_base = filename_for_pp_config(**config, file_ext="")

    clustering = get_clustering_result(
        f"{source_folder}/{source_filename_base}{source_file_ext}",
        dataset,
        graph_type="clustering",
        regulations=regulations,
    )
    result_path = ensure_exists(f"{target_folder}/{source_filename_base}")

    reference_parsed_files = {
        os.path.splitext(f)[0]: f
        for reference_parsed_folder in reference_parsed_folders
        for f in list_dir(reference_parsed_folder, ".xml")
    }
    reference_parsed_files = {
        # Keys that split into four "_"-separated parts are shortened to the
        # first two parts plus the last one.
        ("_".join(k.split("_")[:2] +
                  k.split("_")[-1:]) if len(k.split("_")) == 4 else k): f
        for k, f in reference_parsed_files.items()
    }
    # The renaming must not merge two distinct files.
    assert len([
        file for reference_parsed_folder in reference_parsed_folders
        for file in list_dir(reference_parsed_folder, ".xml")
    ]) == len(reference_parsed_files)

    for idx, community_nodes in enumerate(clustering.communities):
        community_text = get_community_text(community_nodes,
                                            reference_parsed_folders,
                                            reference_parsed_files)
        write_community_text(result_path, idx, community_text)
def get_no_overwrite_items(items, target_file_ext, existing_files):
    return [
        item for item in items if filename_for_pp_config(
            **item, file_ext=target_file_ext) not in existing_files
    ]
def add_co_occurrences(config, G, G_orig, nodes_mapping, decision_network_path):
    C = get_decision_network(decision_network_path)
    # Positive pp_co_occurrence values are used directly as the edge weight;
    # the special values -1 and -2 get a provisional weight of 1 here and are
    # handled separately (in cd_preprocessing and in the rescaling below).
    cooccurrence_weight = (
        config["pp_co_occurrence"] if config["pp_co_occurrence"] > 0 else 1
    )

    if config["pp_co_occurrence_type"] == "decision":
        merge_decisions = True
    elif config["pp_co_occurrence_type"] == "paragraph":
        merge_decisions = False
    else:
        raise Exception(f"{config['pp_co_occurrence_type']} is not a valid option")

    C_merged = quotient_decision_graph(
        C, merge_decisions=merge_decisions, merge_statutes=False
    )

    nodes_citekey_mapping = {
        v: k for k, v in nx.get_node_attributes(G, "citekey").items() if v
    }
    for k, v in nodes_mapping.items():
        if "citekey" in G_orig.nodes[k]:
            citekey = G_orig.nodes[k]["citekey"]
            simplified_citekey = simplify_citekey(citekey)
            if (
                simplified_citekey in nodes_citekey_mapping
                and nodes_citekey_mapping[simplified_citekey] != v
            ):
                print(
                    "Conflict:",
                    simplified_citekey,
                    nodes_citekey_mapping[simplified_citekey],
                    v,
                )
            nodes_citekey_mapping[simplified_citekey] = v

    missing_nodes = Counter()

    co_occurrence_edges = Counter()
    for node in C_merged.nodes:
        if C_merged.nodes[node]["bipartite"] == "decision":
            edges = [
                (u, v, d["weight"])
                for u, v, d in C_merged.edges(node, data=True)
                if d["edge_type"] == "reference"
            ]

            simplified_citekeys = [simplify_citekey(v) for u, v, w in edges]

            targets = {
                nodes_citekey_mapping[v]
                for v in simplified_citekeys
                if v in nodes_citekey_mapping
            }
            targets_missing = {
                v for v in simplified_citekeys if v not in nodes_citekey_mapping
            }
            missing_nodes.update(targets_missing)

            targets_combinations = list(itertools.combinations(targets, 2))
            for target_a, target_b in targets_combinations:
                co_occurrence_edges[(target_a, target_b)] += 1

    # Add each co-occurrence edge in both directions; the reverse direction is
    # marked with reverse=True.
    G.add_edges_from(
        [
            (
                u,
                v,
                dict(weight=cnt * cooccurrence_weight, edge_type="cooccurrence"),
            )
            for (u, v), cnt in co_occurrence_edges.items()
        ]
    )
    G.add_edges_from(
        [
            (
                v,
                u,
                dict(
                    weight=cnt * cooccurrence_weight,
                    edge_type="cooccurrence",
                    reverse=True,
                ),
            )
            for (u, v), cnt in co_occurrence_edges.items()
        ]
    )

    if config["pp_co_occurrence"] == -2:
        # Rescale the co-occurrence edge weights so that their total equals
        # the total weight of the cross-reference edges
        total_weight_cooccurrence = sum(
            G.edges[u, v, k]["weight"]
            for u, v, k, edge_type in G.edges(keys=True, data="edge_type")
            if edge_type == "cooccurrence"
        )
        total_weight_reference = sum(
            G.edges[u, v, k]["weight"]
            for u, v, k, edge_type in G.edges(keys=True, data="edge_type")
            if edge_type == "reference"
        )

        cooccurrence_factor = total_weight_reference / total_weight_cooccurrence
        print(
            "cooccurrence_factor",
            cooccurrence_factor,
            "for",
            filename_for_pp_config(**config, file_ext=""),
        )

        for u, v, k, edge_type in G.edges(keys=True, data="edge_type"):
            if edge_type == "cooccurrence":
                G.edges[u, v, k]["weight"] *= cooccurrence_factor

        # Recompute both totals so that the effect of the rescaling can be
        # verified.
        total_weight_cooccurrence = sum(
            G.edges[u, v, k]["weight"]
            for u, v, k, edge_type in G.edges(keys=True, data="edge_type")
            if edge_type == "cooccurrence"
        )
        total_weight_reference = sum(
            G.edges[u, v, k]["weight"]
            for u, v, k, edge_type in G.edges(keys=True, data="edge_type")
            if edge_type == "reference"
        )

    return missing_nodes
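# The pp_co_occurrence == -2 branch above balances the two edge types with a
# single scaling factor. A self-contained toy sketch of the same idea
# (illustrative only, not part of the pipeline):
import networkx as nx


def _balance_cooccurrence_weights(G):
    # Sum the weights per edge type, then scale the co-occurrence edges so
    # that both types carry the same total weight.
    totals = {"reference": 0.0, "cooccurrence": 0.0}
    for u, v, k, d in G.edges(keys=True, data=True):
        totals[d["edge_type"]] += d["weight"]
    factor = totals["reference"] / totals["cooccurrence"]
    for u, v, k in G.edges(keys=True):
        if G.edges[u, v, k]["edge_type"] == "cooccurrence":
            G.edges[u, v, k]["weight"] *= factor
    return factor


_toy = nx.MultiDiGraph()
_toy.add_edge("a", "b", weight=3.0, edge_type="reference")
_toy.add_edge("a", "c", weight=1.0, edge_type="cooccurrence")
_toy.add_edge("c", "a", weight=1.0, edge_type="cooccurrence")
print(_balance_cooccurrence_weights(_toy))  # 1.5: co-occurrence total becomes 3.0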