def cd_cluster_evolution_inspection_prepare(
    overwrite,
    cluster_mapping_configs,
    source_folder,
    crossreference_graph_folder,
    target_folder,
):
    ensure_exists(target_folder)

    configs = get_configs(cluster_mapping_configs)

    existing_files = set(list_dir(target_folder, ".htm"))
    if not overwrite:
        configs = [
            config for config in configs
            if filename_for_pp_config(snapshot="all",
                                      **config,
                                      file_ext=".htm") not in existing_files
        ]
    if configs:
        # Stored in a module-level global, presumably so that worker
        # processes spawned for the inspection step can reuse the graphs.
        global cd_cluster_evolution_inspection_graphs
        cd_cluster_evolution_inspection_graphs = {
            f[:-len(".gpickle.gz")]: hierarchy_graph(
                nx.read_gpickle(os.path.join(crossreference_graph_folder, f)))
            for f in list_dir(crossreference_graph_folder, ".gpickle.gz")
        }

    return configs
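Most of the prepare helpers in these examples share the same skeleton: create the target folder, enumerate work items, and, unless overwrite is set, drop items whose output already exists. A minimal sketch of that pattern, assuming only ensure_exists and list_dir from quantlaw.utils.files; the item naming and file extension are hypothetical placeholders.

from quantlaw.utils.files import ensure_exists, list_dir


def example_prepare(overwrite, items, target_folder, file_ext=".pickle"):
    ensure_exists(target_folder)  # create the output folder if missing
    if not overwrite:
        # Keep only items whose output file does not exist yet
        existing_files = set(list_dir(target_folder, file_ext))
        items = [i for i in items if f"{i}{file_ext}" not in existing_files]
    return items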
def cd_preprocessing_prepare(
    overwrite, snapshots, pp_configs, source_folder, target_folder
):
    ensure_exists(target_folder)
    items = [
        dict(
            snapshot=snapshot,
            pp_ratio=pp_ratio,
            pp_decay=pp_decay,
            pp_merge=pp_merge,
            pp_co_occurrence=pp_co_occurrence,
            pp_co_occurrence_type=pp_co_occurrence_type,
        )
        for snapshot in snapshots
        for pp_ratio in pp_configs["pp_ratios"]
        for pp_decay in pp_configs["pp_decays"]
        for pp_merge in pp_configs["pp_merges"]
        for pp_co_occurrence in pp_configs["pp_co_occurrences"]
        for pp_co_occurrence_type in pp_configs["pp_co_occurrence_types"]
    ]

    # Check if source graphs exist
    existing_source_files = set(list_dir(f"{source_folder}/seqitems", ".gpickle.gz"))
    required_source_files = {f"{snapshot}.gpickle.gz" for snapshot in snapshots}
    check_for_missing_files(required_source_files, existing_source_files, "graphs")

    if not overwrite:
        existing_files = list_dir(target_folder, target_file_ext)
        items = get_no_overwrite_items(items, target_file_ext, existing_files)

    return items
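The nested comprehension above is just a Cartesian product over the pp_configs parameter lists. An equivalent construction with itertools.product, as a sketch assuming the same pp_configs keys as above:

from itertools import product


def build_preprocessing_items(snapshots, pp_configs):
    # One dict per combination of snapshot and preprocessing parameters
    keys = ("pp_ratio", "pp_decay", "pp_merge",
            "pp_co_occurrence", "pp_co_occurrence_type")
    value_lists = (pp_configs["pp_ratios"], pp_configs["pp_decays"],
                   pp_configs["pp_merges"], pp_configs["pp_co_occurrences"],
                   pp_configs["pp_co_occurrence_types"])
    return [
        dict(snapshot=snapshot, **dict(zip(keys, combo)))
        for snapshot in snapshots
        for combo in product(*value_lists)
    ]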
Example #3
def cd_cluster_evolution_mappings_prepare(overwrite, cluster_mapping_configs,
                                          source_folder, target_folder,
                                          snapshots):
    ensure_exists(target_folder)

    subseqitems_snapshots = [
        f[:-len(".edges.csv.gz")]  # strip the full multi-part suffix
        for f in list_dir(source_folder, ".edges.csv.gz")
    ]

    if snapshots:
        subseqitems_snapshots = [
            s for s in subseqitems_snapshots if s in snapshots
        ]

    # get configs
    mappings = [
        dict(
            pp_merge=pp_merge,
            snapshot=subseqitems_snapshot,
        ) for pp_merge in cluster_mapping_configs["pp_merges"]
        for subseqitems_snapshot in subseqitems_snapshots
    ]

    existing_files = set(list_dir(target_folder, ".pickle"))
    if not overwrite:
        mappings = [
            mapping for mapping in mappings
            if filename_for_mapping(mapping) not in existing_files
        ]

    return sorted(mappings, key=str)
def cd_cluster_evolution_graph_prepare(
    overwrite,
    cluster_mapping_configs,
    source_folder,
    snapshot_mapping_folder,
    subseqitem_mapping_folder,
    target_folder,
):
    ensure_exists(target_folder)
    configs = get_configs(cluster_mapping_configs)

    # Check if clusterings exist
    for config in configs:
        config_clustering_files, snapshots = get_config_clustering_files(
            config, source_folder)

        mapping_files = list_dir(snapshot_mapping_folder, ".json")
        check_mapping_files(mapping_files, snapshots, config, ".json")

        mapping_files = list_dir(subseqitem_mapping_folder, ".pickle")
        check_mapping_files(mapping_files, snapshots, config, ".pickle")

    existing_files = set(list_dir(target_folder, ".gpickle.gz"))
    if not overwrite:
        configs = get_configs_no_overwrite(configs, existing_files)

    return configs
def cd_cluster_prepare(overwrite, snapshots, pp_configs, source_folder,
                       target_folder):
    ensure_exists(target_folder)
    items = get_configs_for_snapshots(snapshots, pp_configs)

    # Check if source graphs exist
    existing_source_files = set(list_dir(source_folder, source_file_ext))
    required_source_files = {
        filename_for_pp_config(
            **{
                **item,
                "seed": None,
                "markov_time": None,
                "number_of_modules": None,
                "consensus": None,
                "method": None,
            },
            file_ext=source_file_ext,
        )
        for item in items
    }
    check_for_missing_files(required_source_files, existing_source_files,
                            "preprocessed graphs")

    if not overwrite:
        existing_files = list_dir(target_folder, target_file_ext)
        items = get_no_overwrite_items(items, target_file_ext, existing_files)

    return items
Example #6
def reference_parse_areas(regulations):
    # Module-level global, presumably read by find_references in the
    # worker processes spawned below.
    global law_names
    law_names = load_law_names_compiled(regulations)
    ensure_exists(DE_DECISIONS_REFERENCE_AREAS)
    ensure_exists(DE_DECISIONS_REFERENCE_PARSED_XML)
    decisions = list_dir(DE_DECISIONS_HIERARCHY, ".xml")
    with multiprocessing.Pool() as p:
        p.map(find_references, decisions)
    def get_items(self, overwrite, snapshots) -> list:
        ensure_exists(self.destination)
        items = snapshots
        if not overwrite:
            existing_files = list_dir(self.destination, ".pickle")
            items = list(
                filter(lambda x: (x + ".pickle") not in existing_files, items))
        return items
Example #8
    def finish_execution(self, results):
        logs = list(itertools.chain.from_iterable(results))
        ensure_exists(
            US_REG_HELPERS_PATH if self.regulations else US_HELPERS_PATH)
        log_path = (US_REG_REFERENCE_AREAS_LOG_PATH
                    if self.regulations else US_REFERENCE_AREAS_LOG_PATH)
        with open(log_path, mode="w") as f:
            f.write("\n".join(sorted(logs, key=lambda x: x.lower())))
    def get_items(self, snapshots) -> list:
        ensure_exists(DE_REG_CROSSREFERENCE_LOOKUP_PATH
                      if self.regulations else DE_CROSSREFERENCE_LOOKUP_PATH)
        files = []
        law_names_data = load_law_names(self.regulations)
        for snapshot in snapshots:
            files.append(
                (snapshot, get_snapshot_law_list(snapshot, law_names_data)))
        return files
    def get_items(self, overwrite, snapshots) -> list:
        ensure_exists(DE_REG_AUTHORITY_EDGELIST_PATH)

        if not overwrite:
            existing_files = os.listdir(DE_REG_AUTHORITY_EDGELIST_PATH)
            snapshots = list(
                filter(lambda f: get_filename(f) not in existing_files,
                       snapshots))

        return snapshots
Example #11
    def get_items(self, overwrite) -> list:
        ensure_exists(self.destination)
        files = list_dir(self.source, ".xml")

        if not overwrite:
            existing_files = list_dir(self.destination, ".gpickle")
            files = list(
                filter(lambda f: get_gpickle_filename(f) not in existing_files,
                       files))

        return files
Example #12
    def get_items(self, overwrite) -> list:
        src = US_REG_XML_PATH if self.regulations else US_XML_PATH
        dest = (US_REG_REFERENCE_AREAS_PATH
                if self.regulations else US_REFERENCE_AREAS_PATH)
        ensure_exists(dest)
        files = list_dir(src, ".xml")

        if not overwrite:
            existing_files = os.listdir(dest)
            files = list(filter(lambda f: f not in existing_files, files))

        return files
Example #13
    def get_items(self, overwrite, snapshots) -> list:
        target_folder = (DE_REG_CROSSREFERENCE_EDGELIST_PATH if
                         self.regulations else DE_CROSSREFERENCE_EDGELIST_PATH)
        ensure_exists(target_folder)

        if not overwrite:
            existing_files = os.listdir(target_folder)
            snapshots = list(
                filter(lambda f: get_filename(f) not in existing_files,
                       snapshots))

        return snapshots
def copy_selected_doknrs(selection_list, target_dir):
    ensure_exists(target_dir)
    for doknr in selection_list:
        version_filenames = [
            f for f in os.listdir(f"{JURIS_EXPORT_PATH}/{doknr}")
            if f.endswith(".xml")
        ]
        for version_filename in version_filenames:
            assert len(version_filename.split("_")) == 3
            shutil.copy(
                f"{JURIS_EXPORT_PATH}/{doknr}/{version_filename}",
                f"{target_dir}/{version_filename}",
            )
Example #15
def us_prepare_input():
    """
    moves source files into main dir and validate files roughly
    """

    ensure_exists(US_ORIGINAL_PATH)

    subfolders = [f.name for f in os.scandir(US_INPUT_PATH) if f.is_dir()]

    # Filter by filename pattern; compiled once outside the loop
    pattern = re.compile(r"(\d+)usc(\d+)(a)?\.html?", flags=re.IGNORECASE)

    for subfolder in subfolders:
        for item in os.listdir(f"{US_INPUT_PATH}/{subfolder}"):
            match = pattern.fullmatch(item)
            if not match:
                continue

            new_name = f'{match[2]}{"1" if match[3] else "0"}_{match[1]}.htm'

            # Prevent overwriting files
            if os.path.exists(f"{US_ORIGINAL_PATH}/{new_name}"):
                print(f"{US_ORIGINAL_PATH}/{new_name} already exists")
            else:
                shutil.copy(
                    f"{US_INPUT_PATH}/{subfolder}/{item}",
                    f"{US_ORIGINAL_PATH}/{new_name}",
                )

    files = os.listdir(US_ORIGINAL_PATH)
    files = [f for f in files if f.endswith(".htm")]
    pattern = re.compile(r"(\d+)_(\d+)\.htm")
    years = {}
    for file in files:
        match = pattern.fullmatch(file)
        year = match[2]
        title = match[1]
        years.setdefault(year, []).append(title)

    for idx in list(years.keys()):
        years[idx] = sorted(years[idx])

    print(f"{len(files)} files found")
    print(f"{len(years)} years found")

    for year in sorted(years.keys()):
        titles = years[year]
        print(f"{year}: n={len(titles)}, max='{max(titles)}'")
    def get_items(self, overwrite, snapshots) -> list:
        ensure_exists(self.dest)
        if not snapshots:
            snapshots = sorted(
                set([
                    os.path.splitext(x)[0]
                    for x in list_dir(self.lookup, ".csv")
                ]))

        if not overwrite:
            existing_files = os.listdir(self.dest)
            snapshots = list(
                filter(lambda f: get_filename(f) not in existing_files,
                       snapshots))

        return snapshots
    def get_items(self, overwrite, snapshots) -> list:
        ensure_exists(self.destination)
        items = sorted(list_dir(self.source, ".pickle"))
        items = [i[:-len(".pickle")] for i in items]

        # Create mappings to draw the edges
        mappings = list(zip(items[:-self.interval], items[self.interval:]))

        if snapshots:
            mappings = list(filter(lambda f: f[0] in snapshots, mappings))

        if not overwrite:
            existing_files = list_dir(self.destination, ".json")
            mappings = list(
                filter(lambda x: mapping_filename(x) not in existing_files,
                       mappings))

        return mappings
Example #18
    def get_items(self, overwrite) -> list:
        src = (DE_REG_REFERENCE_AREAS_PATH
               if self.regulations else DE_REFERENCE_AREAS_PATH)
        dest = (DE_REG_REFERENCE_PARSED_PATH
                if self.regulations else DE_REFERENCE_PARSED_PATH)

        ensure_exists(dest)
        files = list_dir(src, ".xml")

        if not overwrite:
            existing_files = os.listdir(dest)
            files = list(filter(lambda f: f not in existing_files, files))

        copy_xml_schema_to_data_folder()

        return files
Example #19
def cd_cluster_texts_prepare(overwrite, snapshots, pp_configs, source_folder,
                             target_folder):
    ensure_exists(target_folder)
    items = get_configs_for_snapshots(snapshots, pp_configs)

    # Check if source graphs exist
    existing_source_files = set(list_dir(source_folder, source_file_ext))
    required_source_files = {
        filename_for_pp_config(**item, file_ext=source_file_ext)
        for item in items
    }
    check_for_missing_files(required_source_files, existing_source_files,
                            "clustering")

    if not overwrite:
        existing_files = os.listdir(target_folder)
        items = get_no_overwrite_items(items, "", existing_files)

    return items
Example #20
    def get_items(self, overwrite) -> list:
        src = DE_REG_ORIGINAL_PATH if self.regulations else DE_ORIGINAL_PATH
        dest = DE_REG_XML_PATH if self.regulations else DE_XML_PATH

        ensure_exists(dest)
        files = list_dir(src, ".xml")

        if not overwrite:
            existing_files = list_dir(dest, ".xml")

            # Remove cite_key
            converted_existing_files = [
                f.split("_")[0] + "_" + "_".join(f.split("_")[2:])
                for f in existing_files
            ]
            files = list(
                filter(lambda f: f not in converted_existing_files, files))

        return sorted(files)
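The cite_key removal above drops the second underscore-separated component of each existing filename; a quick sketch of the string mechanics, using a hypothetical filename:

f = "BJNR000010001_citekey_1990-01-01_2019-01-01.xml"  # hypothetical name
converted = f.split("_")[0] + "_" + "_".join(f.split("_")[2:])
assert converted == "BJNR000010001_1990-01-01_2019-01-01.xml"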
Example #21
    def get_items(self, overwrite, snapshots) -> list:
        dest = (US_REG_CROSSREFERENCE_LOOKUP_PATH
                if self.regulations else US_CROSSREFERENCE_LOOKUP_PATH)
        ensure_exists(dest)

        # If snapshots not set, create list of all years
        if not snapshots:
            snapshots = sorted(
                set([
                    x.split(".")[0].split("_")[-1]
                    for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml")
                ]))

        if not overwrite:
            existing_files = os.listdir(dest)
            snapshots = list(
                filter(lambda f: get_filename(f) not in existing_files,
                       snapshots))

        return snapshots
Example #22
    def get_items(self, overwrite) -> list:
        # Create target folder
        ensure_exists(US_XML_PATH)

        # Get source files
        files = list_dir(US_ORIGINAL_PATH, ".htm")

        # Filter appendices
        pattern = re.compile(r"\d+0_\d+\.htm")
        html_files = list(filter(pattern.fullmatch, files))

        # Prevent file overwrite
        if not overwrite:
            existing_files = list_dir(US_XML_PATH, ".xml")
            existing_files_sources = list(
                map(lambda x: x.replace(".xml", ".htm"), existing_files))

            html_files = list(
                filter(lambda f: f not in existing_files_sources, html_files))

        return html_files
Example #23
def download():
    ensure_exists(DE_DECISIONS_TEMP_DATA_PATH)
    toc = requests.get(
        "https://www.rechtsprechung-im-internet.de/rii-toc.xml").text
    with open(DE_DECISIONS_DOWNLOAD_TOC, "w") as f:
        f.write(toc)

    with open(DE_DECISIONS_DOWNLOAD_TOC) as f:
        toc = f.read()
    soup = BeautifulSoup(toc, "lxml-xml")

    ensure_exists(DE_DECISIONS_DOWNLOAD_ZIP)
    items = [i.link.text for i in soup.findAll("item")]
    with Pool(4) as p:
        p.map(download_item, items)

    ensure_exists(DE_DECISIONS_DOWNLOAD_XML)

    extracted = 0
    for filename in os.listdir(DE_DECISIONS_DOWNLOAD_ZIP):
        if os.path.splitext(filename)[1] == ".zip":
            with zipfile.ZipFile(
                    f"{DE_DECISIONS_DOWNLOAD_ZIP}/{filename}", "r") as zip_ref:
                zip_ref.extractall(DE_DECISIONS_DOWNLOAD_XML)
            extracted += 1
            print(f"\r{extracted} extracted", end="")
Example #24
def cd_cluster_texts(
    config,
    dataset,
    source_folder,
    target_folder,
    reference_parsed_folders,
    regulations,
):
    source_filename_base = filename_for_pp_config(**config, file_ext="")

    clustering = get_clustering_result(
        f"{source_folder}/{source_filename_base}{source_file_ext}",
        dataset,
        graph_type="clustering",
        regulations=regulations,
    )
    result_path = ensure_exists(f"{target_folder}/{source_filename_base}")

    reference_parsed_files = {
        os.path.splitext(f)[0]: f
        for reference_parsed_folder in reference_parsed_folders
        for f in list_dir(reference_parsed_folder, ".xml")
    }
    reference_parsed_files = {
        ("_".join(k.split("_")[:2] +
                  k.split("_")[-1:]) if len(k.split("_")) == 4 else k): f
        for k, f in reference_parsed_files.items()
    }
    assert len([
        file for reference_parsed_folder in reference_parsed_folders
        for file in list_dir(reference_parsed_folder, ".xml")
    ]) == len(reference_parsed_files)

    for idx, community_nodes in enumerate(clustering.communities):
        community_text = get_community_text(community_nodes,
                                            reference_parsed_folders,
                                            reference_parsed_files)
        write_community_text(result_path, idx, community_text)
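The key normalization above collapses four-part keys by dropping their third underscore-separated component; a quick sketch with a hypothetical key:

k = "part1_part2_part3_part4"  # hypothetical four-part key
normalized = "_".join(k.split("_")[:2] + k.split("_")[-1:])
assert normalized == "part1_part2_part4"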
def hierarchy():
    ensure_exists(DE_DECISIONS_HIERARCHY)
    decisions = list_dir(DE_DECISIONS_XML, ".xml")

    with multiprocessing.Pool() as p:
        p.map(extract_hierarchy, decisions)
    def get_items(self, overwrite, snapshots) -> list:
        ensure_exists(self.destination + "/seqitems")
        if not snapshots:
            snapshots = sorted(
                set(
                    [
                        os.path.splitext(x)[0]
                        for x in list_dir(self.edgelist_folder, ".csv")
                    ]
                )
            )

        if not overwrite:
            existing_files = list_dir(
                os.path.join(self.destination, "seqitems"), ".gpickle.gz"
            )
            snapshots = list(
                filter(
                    lambda year: f"{year}.gpickle.gz" not in existing_files, snapshots
                )
            )

        if not snapshots:
            return []

        if self.dataset == "us":
            files = []
            for snapshot in snapshots:
                statute_files = [
                    f"{self.source}/subseqitems/{x}"
                    for x in os.listdir(os.path.join(self.source, "subseqitems"))
                    if str(snapshot) in x
                ]
                regulation_files = (
                    [
                        f"{self.source_regulation}/subseqitems/{x}"
                        for x in os.listdir(
                            os.path.join(self.source_regulation, "subseqitems")
                        )
                        if str(snapshot) in x
                    ]
                    if self.regulations
                    else None
                )
                files.append(
                    (
                        snapshot,
                        statute_files,
                        regulation_files,
                    )
                )
        else:  # is DE
            files = []
            law_names_data = load_law_names(self.regulations)
            for snapshot in snapshots:
                graph_files = get_snapshot_law_list(snapshot, law_names_data)
                files.append(
                    (
                        snapshot,
                        [
                            f'{self.source}/subseqitems/{x.replace(".xml", ".gpickle")}'
                            for x in graph_files
                        ],
                        None,
                    )
                )

        return files
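Each work item returned above is a (snapshot, statute_files, regulation_files) tuple, where regulation_files is None unless regulations are included. A downstream worker would presumably unpack it along these lines (a sketch; execute_item is a hypothetical name):

def execute_item(item):
    snapshot, statute_files, regulation_files = item
    paths = list(statute_files)
    if regulation_files:
        paths += regulation_files
    print(snapshot, len(paths), "subseqitem graph files")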
def copy_xml_schema_to_data_folder():
    ensure_exists(DATA_PATH)
    shutil.copyfile("xml-schema.xsd", os.path.join(DATA_PATH,
                                                   "xml-schema.xsd"))
    shutil.copyfile("xml-styles.css", os.path.join(DATA_PATH,
                                                   "xml-styles.css"))
import os
import shutil
from multiprocessing.pool import Pool

import requests
from quantlaw.utils.files import ensure_exists

from statics import US_REG_INPUT_PATH

DOWNLOAD_BASE_URL = "https://www.govinfo.gov/bulkdata/CFR/{}/CFR-{}.zip"


def download(year):
    zip_path = f"{US_REG_INPUT_PATH}/{year}.zip"
    if not os.path.exists(zip_path):
        print("loading", year)
        r = requests.get(DOWNLOAD_BASE_URL.format(year, year), stream=True)
        if r.status_code == 200:
            with open(zip_path, "wb") as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
            print("downloaded", year)


if __name__ == "__main__":

    ensure_exists(US_REG_INPUT_PATH)
    with Pool(4) as p:
        p.map(download, list(range(1996, 2020 + 1)))
def clean():
    ensure_exists(DE_DECISIONS_XML)
    decisions = list_dir(DE_DECISIONS_DOWNLOAD_XML, ".xml")
    with multiprocessing.Pool() as p:
        p.map(clean_decision, decisions)