Exemple #1
0
def remap_table_master_to_wp(cursor, table_name, wp_name):
    """
    Update primary key values from "master" fids to "WP" fids.

    For each row:
    - remap row exists for master_fid -> use wp_fid
    - remap does not exist for master_fid -> insert (master_fid, new wp_fid) where
      new wp_fid values continue after the largest wp_fid already present in the
      remap table (or start at 1000000 if the remap table is empty)

    cursor - sqlite3 cursor used for all queries; the remap table name is qualified
             with the "remap" schema, so that schema must be reachable from it
    table_name - unescaped name of the table whose primary key gets rewritten
    wp_name - name of the work package (selects which remap table is used)
    """
    remap_table_escaped = remap_table_name(table_name, wp_name)
    _create_remap_table_if_not_exists(cursor, remap_table_escaped)

    pkey_column = _table_pkey(cursor, table_name)
    table_name_escaped = escape_double_quotes(table_name)
    pkey_column_escaped = escape_double_quotes(pkey_column)

    # 1. find missing mapped ids
    # (join on the real primary key column - it is not necessarily called "fid")
    sql = (
        f"""SELECT {pkey_column_escaped} FROM {table_name_escaped} """
        f"""LEFT JOIN {remap_table_escaped} AS mapped ON {pkey_column_escaped} = mapped.master_fid """
        f"""WHERE mapped.wp_fid IS NULL"""
    )
    master_fids_missing = {row[0] for row in cursor.execute(sql)}

    # 2. insert missing mapped ids
    cursor.execute(f"""SELECT max(wp_fid) FROM {remap_table_escaped}""")
    max_wp_fid = cursor.fetchone()[0]
    # empty remap table so far -> start numbering WP fids at 1000000
    first_new_wp_fid = 1000000 if max_wp_fid is None else max_wp_fid + 1

    cursor.executemany(
        f"""INSERT INTO {remap_table_escaped} VALUES (?, ?)""",
        [(master_fid, first_new_wp_fid + offset)
         for offset, master_fid in enumerate(master_fids_missing)])

    # 3. remap master ids to WP ids
    # (after step 2 every row of the table has a matching remap row)
    sql = (
        f"""SELECT {pkey_column_escaped}, mapped.wp_fid FROM {table_name_escaped} """
        f"""LEFT JOIN {remap_table_escaped} AS mapped ON {pkey_column_escaped} = mapped.master_fid"""
    )
    mapping = [(row[0], row[1]) for row in cursor.execute(sql)]  # (master_fid, wp_fid)

    # hack to hopefully avoid possible pkey violations ... who would use negative ids? :-)
    # (all keys are negated first, so a newly assigned WP fid can not collide with
    # a master fid of a row that has not been rewritten yet)
    cursor.execute(
        f"""UPDATE {table_name_escaped} SET {pkey_column_escaped} = -{pkey_column_escaped};"""
    )

    cursor.executemany(
        f"""UPDATE {table_name_escaped} SET {pkey_column_escaped} = ? WHERE {pkey_column_escaped} = ?""",
        [(wp_fid, -master_fid) for master_fid, wp_fid in mapping])
def _assert_value_equals(gpkg_filename, table_name, fid, field_name,
                         expected_value):
    """Raises assertion error if value of a particular field of a given feature
    does not equal the expected value, or if the feature is missing altogether.

    The feature is looked up by the value of its "fid" column. The database
    connection is closed before the function returns or raises."""
    db = sqlite3.connect(gpkg_filename)
    try:
        c = db.cursor()
        field_name_escaped = escape_double_quotes(field_name)
        table_name_escaped = escape_double_quotes(table_name)
        c.execute(
            f"""SELECT {field_name_escaped} FROM {table_name_escaped} WHERE fid = ?""",
            (fid, ))
        row = c.fetchone()
    finally:
        # do not keep the file open (matters when the caller wants to move/delete it)
        db.close()

    if row is None:
        raise AssertionError(f"Missing row for fid {fid}")
    if not (row[0] == expected_value):
        raise AssertionError(
            f"Unexpected value of {field_name} for fid {fid}: "
            f"{row[0]!r} != {expected_value!r}")
Exemple #3
0
def remap_table_wp_to_master(cursor, table_name, wp_name, new_master_fid):
    """
    Update primary key values from "WP" fids to "master" fids.

    For each row:
    - remap row exists for wp_fid -> use master_fid
    - remap does not exist for wp_fid -> insert ([first unused master fid], wp_fid)

    cursor - sqlite3 cursor used for all queries; the remap table name is qualified
             with the "remap" schema, so that schema must be reachable from it
    table_name - unescaped name of the table whose primary key gets rewritten
    wp_name - name of the work package (selects which remap table is used)
    new_master_fid - first master fid to hand out to rows that have no mapping yet;
                     following unmapped rows get consecutive values (the counter is
                     local to this call - the caller's variable is not updated)
    """

    remap_table = remap_table_name(table_name, wp_name)
    _create_remap_table_if_not_exists(cursor, remap_table)

    pkey_column = _table_pkey(cursor, table_name)
    table_name_escaped = escape_double_quotes(table_name)
    pkey_column_escaped = escape_double_quotes(pkey_column)

    # 1. find missing mapped ids
    # (join on the real primary key column - it is not necessarily called "fid")
    sql = (
        f"""SELECT {pkey_column_escaped} FROM {table_name_escaped} """
        f"""LEFT JOIN {remap_table} AS mapped ON {pkey_column_escaped} = mapped.wp_fid """
        f"""WHERE mapped.master_fid IS NULL"""
    )
    wp_fids_missing = {row[0] for row in cursor.execute(sql)}

    # 2. insert missing mapped ids
    cursor.executemany(
        f"""INSERT INTO {remap_table} VALUES (?, ?)""",
        [(new_master_fid + offset, wp_fid)
         for offset, wp_fid in enumerate(wp_fids_missing)])

    # 3. remap WP ids to master ids
    # (after step 2 every row of the table has a matching remap row)
    sql = (
        f"""SELECT {pkey_column_escaped}, mapped.master_fid FROM {table_name_escaped} """
        f"""LEFT JOIN {remap_table} AS mapped ON {pkey_column_escaped} = mapped.wp_fid""")
    mapping = [(row[0], row[1]) for row in cursor.execute(sql)]  # (wp_fid, master_fid)

    # hack to hopefully avoid possible pkey violations ... who would use negative ids? :-)
    # (all keys are negated first, so a newly assigned master fid can not collide with
    # a WP fid of a row that has not been rewritten yet)
    cursor.execute(
        f"""UPDATE {table_name_escaped} SET {pkey_column_escaped} = -{pkey_column_escaped};"""
    )

    cursor.executemany(
        f"""UPDATE {table_name_escaped} SET {pkey_column_escaped} = ? WHERE {pkey_column_escaped} = ?""",
        [(master_fid, -wp_fid) for wp_fid, master_fid in mapping])
def _assert_row_exists(gpkg_filename, table_name, fid):
    """ Raises assertion error if given feature is NOT present in the table

    The feature is looked up by the value of its "fid" column. The database
    connection is closed before the function returns or raises. """
    db = sqlite3.connect(gpkg_filename)
    try:
        c = db.cursor()
        table_name_escaped = escape_double_quotes(table_name)
        c.execute(f"""SELECT count(*) FROM {table_name_escaped} WHERE fid = ?""",
                  (fid, ))
        row = c.fetchone()
    finally:
        # do not keep the file open (matters when the caller wants to move/delete it)
        db.close()

    if row[0] != 1:
        raise AssertionError(
            f"Row for fid {fid} is not present but it should be")
Exemple #5
0
def _list_existing_wp_names(base_dir):
    """ Returns names of work packages that have a GPKG in the base directory (master.gpkg is skipped) """
    wp_names = []
    if os.path.exists(base_dir):
        for path in Path(base_dir).iterdir():
            filename = path.name
            if filename == "master.gpkg":
                continue  # skip the master file - it's not a work package
            if filename.endswith(".gpkg"):
                wp_names.append(filename[:-5])  # strip the suffix
    return wp_names


def _next_master_fids(master_gpkg, wp_tables):
    """ Returns dict (table name -> max(fid) + 1, or 1 for an empty table) for the given master GPKG """
    db = sqlite3.connect(master_gpkg)
    try:
        c = db.cursor()
        new_master_fids = {}
        for wp_table in wp_tables:
            wp_table_name = wp_table.name
            wp_table_name_escaped = escape_double_quotes(wp_table_name)
            # NOTE(review): this query assumes the primary key column is called "fid"
            c.execute(f"""SELECT max(fid) FROM {wp_table_name_escaped};""")
            max_fid = c.fetchone()[0]
            new_master_fids[wp_table_name] = 1 if max_fid is None else max_fid + 1
        return new_master_fids
    finally:
        db.close()


def _open_gpkg_with_remap(gpkg_path, remap_db_path):
    """ Opens a GPKG with mod_spatialite loaded and the remap DB attached as "remap" (caller must close it) """
    db = sqlite3.connect(gpkg_path)
    try:
        db.enable_load_extension(True)  # for spatialite
        c = db.cursor()
        c.execute("SELECT load_extension('mod_spatialite');"
                  )  # TODO: how to deal with it?
        c.execute("ATTACH ? AS remap", (remap_db_path, ))
    except Exception:
        db.close()  # do not leak the connection if the setup fails
        raise
    return db


def make_work_packages(data_dir, wp_config):
    """
    This is the core part of the algorithm for merging and splitting data for work packages.
    It expects a data directory with layout of directories and files as described in the header
    of this file. The sub-directories used are:
    - "base" - non-modified GPKGs (and remap.db) from the last run
    - "input" - master.gpkg and GPKGs of existing work packages, possibly modified by users
    - "output" - deleted and re-created on every run, receives the new master.gpkg, remap.db,
      work package GPKGs and some diffs that are useful for debugging
    - "tmp" - deleted and re-created on every run, used for temporary files

    The first stage collects changes from the master DB and the work package DBs and
    combines them together, resolving any conflicts. At the end of the first stage we have
    updated master database. The second stage then re-creates individual work package DBs.

    data_dir - path to the data directory
    wp_config - configuration object: its "wp_tables" items need "name" and "filter_column_name"
                attributes, its "wp_names" items need "name" and "value" attributes
                (value is either str/int/float or a list of values)

    Raises ValueError if existence of base remap.db does not match existence of previously
    processed work packages, or if a work package value has an unsupported type.
    """

    base_dir = os.path.join(
        data_dir,
        "base")  # where the non-modified GPKGs from the last run should be
    input_dir = os.path.join(
        data_dir,
        "input")  # where the existing GPKG for each existing WP should be
    output_dir = os.path.join(
        data_dir, "output"
    )  # !!!! we are deleting this directory and recreating it every time!
    tmp_dir = os.path.join(
        data_dir, "tmp")  # for any temporary stuff (also deleted + recreated)

    for recreated_dir in (output_dir, tmp_dir):
        if os.path.exists(recreated_dir):
            shutil.rmtree(recreated_dir)
        os.makedirs(recreated_dir)

    # names of WPs that have been processed before (and we expect their GPKGs exist and may be modified)
    old_wp_names = _list_existing_wp_names(base_dir)
    print("existing WPs: " + str(old_wp_names))

    def _logger_callback(level, text_bytes):
        text = text_bytes.decode()  # convert bytes to str
        print("GEODIFF: ", text)

    geodiff = pygeodiff.GeoDiff()
    geodiff.set_maximum_logger_level(geodiff.LevelDebug)
    geodiff.set_logger_callback(_logger_callback)

    master_gpkg_base = os.path.join(
        base_dir, "master.gpkg")  # should not have been modified
    master_gpkg_input = os.path.join(
        input_dir, "master.gpkg")  # this could have been modified by users
    master_gpkg_output = os.path.join(output_dir,
                                      "master.gpkg")  # does not exist yet

    if os.path.exists(master_gpkg_base):
        # summarize changes that have happened in master (base master VS input master)
        # (this is not needed anywhere in the code, but may be useful for debugging)
        master_base_to_input = os.path.join(tmp_dir, "master-base-input.diff")
        master_base_to_input_json = os.path.join(tmp_dir,
                                                 "master-base-input.json")
        geodiff.create_changeset(master_gpkg_base, master_gpkg_input,
                                 master_base_to_input)
        geodiff.list_changes(master_base_to_input, master_base_to_input_json)

    # create new master_gpkg in the output directory
    shutil.copy(master_gpkg_input, master_gpkg_output)

    # copy "base" remapping DB to "output" where we may be adding some more entries
    remap_db_base = os.path.join(base_dir, "remap.db")
    remap_db_output = os.path.join(output_dir, "remap.db")
    if old_wp_names and not os.path.exists(remap_db_base):
        raise ValueError("remap.db should exist!")
    if not old_wp_names and os.path.exists(remap_db_base):
        raise ValueError("remap.db should not exist yet!")
    if os.path.exists(remap_db_base):
        shutil.copy(remap_db_base, remap_db_output)

    # STAGE 1: Bring the changes from WPs to master
    # (remap WP database + create changeset + rebase changeset)
    for wp_name in old_wp_names:
        print("WP " + wp_name)

        # get max. fids for tables (so that we know where to start when remapping)
        new_master_fids = _next_master_fids(master_gpkg_output,
                                            wp_config.wp_tables)

        # TODO: check whether the changes in the DB are allowed (matching the deciding column)

        wp_gpkg_base_wp_fids = os.path.join(
            base_dir,
            wp_name + ".gpkg")  # should not have been modified by user
        wp_gpkg_input_wp_fids = os.path.join(
            input_dir, wp_name + ".gpkg")  # may have been modified by user

        # copies in the tmp directory - these get their fids remapped
        wp_gpkg_base = os.path.join(tmp_dir, wp_name + "-base.gpkg")
        wp_gpkg_input = os.path.join(tmp_dir, wp_name + "-input.gpkg")
        shutil.copy(wp_gpkg_base_wp_fids, wp_gpkg_base)
        shutil.copy(wp_gpkg_input_wp_fids, wp_gpkg_input)

        # re-map local fids of the WP gpkg to master fids (based on previously created mapping DB)
        for gpkg_to_remap in (wp_gpkg_base, wp_gpkg_input):
            db = _open_gpkg_with_remap(gpkg_to_remap, remap_db_output)
            try:
                c = db.cursor()
                c.execute("BEGIN")
                for wp_table in wp_config.wp_tables:
                    remap_table_wp_to_master(c, wp_table.name, wp_name,
                                             new_master_fids[wp_table.name])
                c.execute("COMMIT")
            finally:
                # close explicitly so that geodiff below does not work with a file we still hold open
                db.close()

        wp_changeset_base_input = os.path.join(tmp_dir,
                                               wp_name + "-base-input.diff")
        wp_changeset_base_input_json = os.path.join(
            tmp_dir, wp_name + "-base-input.json")
        wp_changeset_conflicts = os.path.join(tmp_dir,
                                              wp_name + "-conflicts.json")

        # create changeset using pygeodiff using wp_gpkg_base + wp_gpkg_input
        # print("--- create changeset")
        geodiff.create_changeset(wp_gpkg_base, wp_gpkg_input,
                                 wp_changeset_base_input)

        # summarize changes that have happened in the WP (base WP VS input WP)
        # (this is not needed anywhere in the code, but may be useful for debugging)
        geodiff.list_changes(wp_changeset_base_input,
                             wp_changeset_base_input_json)

        # TODO: the following code (copy DB + rebase + copy DB) is a bit stupid...
        # we should use GEODIFF_createRebasedChangesetEx and then just apply rebased changeset
        # but this function is not (yet) available in pygeodiff

        # create tmp_master_with_wp
        # print("--- copy + apply changeset")
        tmp_master_with_wp = os.path.join(tmp_dir,
                                          "master-" + wp_name + ".gpkg")
        shutil.copy(master_gpkg_base, tmp_master_with_wp)
        geodiff.apply_changeset(tmp_master_with_wp, wp_changeset_base_input)

        # rebase changeset - to resolve conflicts, for example:
        # - WP1 deleted a row that WP2 also wants to delete
        # - WP1 updated a row that WP2 also updated
        # - WP1 updated a row that WP2 deleted
        # - WP1 deleted a row that WP2 updated
        # - WP1 inserted a row with FID that WP2 also wants to insert -- this should not happen
        #   because remapping should assign unique master FIDs
        # print("--- rebase")
        geodiff.rebase(master_gpkg_base, master_gpkg_output,
                       tmp_master_with_wp, wp_changeset_conflicts)

        # the tmp_master_with_wp now contains stuff from output master and WP changes on top of that
        # let's overwrite the output master with this addition :-O
        # print("--- copy 2")
        shutil.copy(tmp_master_with_wp, master_gpkg_output)

    # summarize changes that have happened in WPs (input master VS output master)
    # (this is not needed anywhere in the code, but may be useful for debugging)
    master_input_to_output = os.path.join(output_dir,
                                          "master-input-output.diff")
    master_input_to_output_json = os.path.join(output_dir,
                                               "master-input-output.json")
    geodiff.create_changeset(master_gpkg_input, master_gpkg_output,
                             master_input_to_output)
    geodiff.list_changes(master_input_to_output, master_input_to_output_json)

    if os.path.exists(master_gpkg_base):
        # summarize all the changes that have happened since last run (collated master changes + wp changes)
        # (this is not needed anywhere in the code, but may be useful for debugging)
        master_base_to_output = os.path.join(output_dir,
                                             "master-base-output.diff")
        master_base_to_output_json = os.path.join(output_dir,
                                                  "master-base-output.json")
        geodiff.create_changeset(master_gpkg_base, master_gpkg_output,
                                 master_base_to_output)
        geodiff.list_changes(master_base_to_output, master_base_to_output_json)

    # STAGE 2: Regenerate WP databases
    # (make "new" WP database + filter database based on WP + remap DB)

    for wp in wp_config.wp_names:
        wp_name, wp_value = wp.name, wp.value
        wp_gpkg_input = os.path.join(input_dir, wp_name +
                                     ".gpkg")  # may have been modified by user
        wp_gpkg_output = os.path.join(output_dir,
                                      wp_name + ".gpkg")  # does not exist yet
        wp_changeset_input_to_output = os.path.join(
            output_dir, wp_name + "-input-output.diff")
        wp_changeset_input_to_output_json = os.path.join(
            output_dir, wp_name + "-input-output.json")

        # start from a copy of the master
        shutil.copy(master_gpkg_output, wp_gpkg_output)

        # filter out data that does not belong to the WP
        # and remap fids in the DB from master to WP-local fids
        db = _open_gpkg_with_remap(wp_gpkg_output, remap_db_output)
        try:
            c = db.cursor()
            c.execute("BEGIN")
            for wp_table in wp_config.wp_tables:
                wp_table_name = wp_table.name
                wp_table_name_escaped = escape_double_quotes(wp_table_name)
                wp_filter_column = wp_table.filter_column_name
                wp_filter_column_escaped = escape_double_quotes(
                    wp_filter_column)
                # rows with NULL in the filter column would not be matched by the
                # "!=" / "not in" conditions below, so they are removed separately
                c.execute(
                    f"""delete from {wp_table_name_escaped} where {wp_filter_column_escaped} IS NULL"""
                )
                if isinstance(wp_value, (str, int, float)):
                    c.execute(
                        f"""delete from {wp_table_name_escaped} where {wp_filter_column_escaped} != ?""",
                        (wp_value, ))
                elif isinstance(wp_value, list):
                    values_str = ",".join(["?"] * len(wp_value))
                    c.execute(
                        f"""delete from {wp_table_name_escaped} where {wp_filter_column_escaped} not in ({values_str})""",
                        wp_value,
                    )
                else:
                    # we may want to support some custom SQL at some point too
                    raise ValueError("what?")
                remap_table_master_to_wp(c, wp_table.name, wp_name)
            # TODO: drop tables that are not listed at all (?)
            c.execute("COMMIT")

            # run VACUUM to purge anything that does not belong to the WP data
            c.execute("VACUUM")
        finally:
            # close explicitly so that geodiff below does not work with a file we still hold open
            db.close()

        # get changeset between the one received from WP and newly created GPKG
        if os.path.exists(wp_gpkg_input):
            geodiff.create_changeset(wp_gpkg_input, wp_gpkg_output,
                                     wp_changeset_input_to_output)
            geodiff.list_changes(wp_changeset_input_to_output,
                                 wp_changeset_input_to_output_json)
        else:
            # first time this WP is created...
            pass  # TODO: what to do?
Exemple #6
0
def remap_table_name(table_name, wp_name):
    """ Builds the name of the fid mapping table for the given table and work package.

    The unescaped name is "<table_name>_<wp_name>"; the returned string has that name
    escaped with escape_double_quotes() and prefixed with the "remap" schema. """
    escaped_name = escape_double_quotes("{}_{}".format(table_name, wp_name))
    return f'"remap".{escaped_name}'