def query_osm(changeset_ids: list, changeset_results):
    """Get data from changesetId."""
    id_string = ",".join(map(str, changeset_ids))

    url = OSM_API_LINK + f"changesets?changesets={id_string}"
    response = retry_get(url)
    if response.status_code != 200:
        err = f"osm request failed: {response.status_code}"
        logger.warning(f"{err}")
        logger.warning(response.json())
        raise CustomError(err)
    tree = ElementTree.fromstring(response.content)

    for changeset in tree.iter("changeset"):
        changeset_id = changeset.attrib["id"]
        username = remove_troublesome_chars(changeset.attrib["user"])
        userid = changeset.attrib["uid"]
        comment = created_by = None
        for tag in changeset.iter("tag"):
            if tag.attrib["k"] == "comment":
                comment = tag.attrib["v"]
            if tag.attrib["k"] == "created_by":
                created_by = tag.attrib["v"]

        changeset_results[int(changeset_id)] = {
            "username": username,
            "userid": userid,
            "comment": remove_troublesome_chars(comment),
            "editor": remove_troublesome_chars(created_by),
        }
    return changeset_results
def update_groups_table(project_id: str):
    """Remove duplicates in 'project_types_specifics' attribute in groups table."""

    logger.info(f"Start process for project: '{project_id}'")
    p_con = auth.postgresDB()

    query = """
        UPDATE groups
        SET project_type_specifics = project_type_specifics::jsonb
            #- '{projectId}'
            #- '{id}'
            #- '{requiredCount}'
            #- '{finishedCount}'
            #- '{neededCount}'
            #- '{reportCount}'
            #- '{distributedCount}'
        WHERE project_id = %(project_id)s
    """
    try:
        p_con.query(query, {"project_id": project_id})
        logger.info(f"Updated tasks table for project '{project_id}'.")
    except Exception as e:
        sentry.capture_exception(e)
        sentry.capture_message(
            f"Could NOT update groups table for project '{project_id}'.")
        logger.exception(e)
        logger.warning(
            f"Could NOT update groups table for project '{project_id}'.")
def remove_timestamps(uid):
    """Remove timestamps from user contributions in Firebase."""
    fb_db = firebaseDB()
    try:
        ref = fb_db.reference(f"v2/users/{uid}/contributions")
        user_contributions = ref.get()

        if not user_contributions:
            logger.info(f"user {uid} has no contributions in firebase.")
        else:
            for project_id in user_contributions.keys():
                for key in user_contributions[project_id].keys():
                    if key == "taskContributionCount":
                        continue
                    user_contributions[project_id][key] = True

            ref.update(user_contributions)
            logger.info(f"updated user contributions for user {uid}")
    except ValueError:
        logger.warning(
            f"could not remove timestamps for user {uid} in firebase.")
def query_osmcha(changeset_ids: list, changeset_results):
    """Get data from changesetId."""
    id_string = ",".join(map(str, changeset_ids))

    url = OSMCHA_API_LINK + f"changesets/?ids={id_string}"
    response = retry_get(url, to_osmcha=True)
    if response.status_code != 200:
        err = f"osmcha request failed: {response.status_code}"
        logger.warning(f"{err}")
        logger.warning(response.json())
        raise CustomError(err)
    response = response.json()
    for feature in response["features"]:
        changeset_results[int(feature["id"])] = {
            "username":
            remove_troublesome_chars(feature["properties"]["user"]),
            "userid": feature["properties"]["uid"],
            "comment":
            remove_troublesome_chars(feature["properties"]["comment"]),
            "editor":
            remove_troublesome_chars(feature["properties"]["editor"]),
        }

    return changeset_results
def ohsome(request: dict, area: str, properties=None) -> dict:
    """Request data from Ohsome API."""
    url = OHSOME_API_LINK + request["endpoint"]
    data = {"bpolys": area, "filter": request["filter"]}
    if properties:
        data["properties"] = properties
    logger.info("Target: " + url)
    logger.info("Filter: " + request["filter"])
    response = requests.post(url, data=data)
    if response.status_code != 200:
        err = f"ohsome request failed: {response.status_code}"
        logger.warning(
            f"{err} - check for errors in filter or geometries - {request['filter']}"
        )
        logger.warning(response.json())
        raise CustomError(err)
    else:
        logger.info("Query succesfull.")

    response = response.json()

    if properties:
        response = remove_noise_and_add_user_info(response)
    return response
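# A minimal usage sketch for ohsome(). The endpoint and filter below are
# illustrative assumptions (valid ohsome API values, but not taken from this
# module); the real request dicts are defined by the callers.
def _example_ohsome_building_query(area_geojson: str) -> dict:
    """Request building polygons for the given area, including their tags."""
    request = {
        "endpoint": "elements/geometry",
        "filter": "building=* and geometry:polygon",
    }
    return ohsome(request, area_geojson, properties="tags")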
def results_to_file(results, projectId):
    """
    Writes results to an in-memory file like object
    formatted as a csv using the buffer module (StringIO).
    This can be then used by the COPY statement of Postgres
    for a more efficient import of many results into the Postgres
    instance.
    Parameters
    ----------
    results: dict
        The results as retrieved from the Firebase Realtime Database instance.
    projectId: str
        The id of the project the results belong to.
    Returns
    -------
    results_file: io.StringIO
        The results in a StringIO buffer.
    """
    # If csv file is a file object, it should be opened with newline=''

    results_file = io.StringIO("")

    w = csv.writer(results_file, delimiter="\t", quotechar="'")

    logger.info(f"Got %s groups for project {projectId} to transfer" %
                len(results.items()))
    for groupId, users in results.items():
        for userId, user_results in users.items():

            # check if all attributes are set,
            # if not don't transfer the results for this group
            try:
                start_time = user_results["startTime"]
                end_time = user_results["endTime"]
                task_results = user_results["results"]
            except KeyError as e:
                sentry.capture_exception(e)
                sentry.capture_message(
                    f"at least one missing attribute for: "
                    f"{projectId}/{groupId}/{userId}, will skip this one")
                logger.exception(e)
                logger.warning(
                    f"at least one missing attribute for: "
                    f"{projectId}/{groupId}/{userId}, will skip this one")
                continue

            start_time = dateutil.parser.parse(start_time)
            end_time = dateutil.parser.parse(end_time)
            timestamp = end_time

            if isinstance(task_results, dict):
                for taskId, result in task_results.items():
                    w.writerow([
                        projectId,
                        groupId,
                        userId,
                        taskId,
                        timestamp,
                        start_time,
                        end_time,
                        result,
                    ])
            elif isinstance(task_results, list):
                # TODO: optimize for performance
                # (make sure data from firebase is always a dict)
                # if a key is an integer, firebase will return a list;
                # if the first key (list index) is 5,
                # list indices 0-4 will have the value None
                for taskId, result in enumerate(task_results):
                    if result is None:
                        continue
                    else:
                        w.writerow([
                            projectId,
                            groupId,
                            userId,
                            taskId,
                            timestamp,
                            start_time,
                            end_time,
                            result,
                        ])
            else:
                raise TypeError(
                    f"unexpected results type: {type(task_results)}")

    results_file.seek(0)
    return results_file
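# The buffer produced by results_to_file() is meant to be consumed by the
# Postgres COPY machinery. A minimal sketch of how this could look with
# psycopg2 (the real logic lives in save_results_to_postgres; the table and
# column names here are illustrative assumptions, not taken from this module):
def _copy_results_sketch(pg_connection, results_file):
    """Stream the tab-separated buffer into a temporary results table."""
    with pg_connection.cursor() as cursor:
        cursor.copy_from(
            results_file,
            "results_temp",  # hypothetical target table
            sep="\t",
            columns=(
                "project_id",
                "group_id",
                "user_id",
                "task_id",
                "timestamp",
                "start_time",
                "end_time",
                "result",
            ),
        )
    pg_connection.commit()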
    def validate_geometries(self):
        raw_input_file = (
            f"{DATA_PATH}/"
            f"input_geometries/raw_input_{self.projectId}.geojson")
        valid_input_file = (
            f"{DATA_PATH}/"
            f"input_geometries/valid_input_{self.projectId}.geojson")

        if not os.path.isdir("{}/input_geometries".format(DATA_PATH)):
            os.mkdir("{}/input_geometries".format(DATA_PATH))

        # download file from given url
        url = self.inputGeometries
        urllib.request.urlretrieve(url, raw_input_file)
        logger.info(f"{self.projectId}"
                    f" - __init__ - "
                    f"downloaded input geometries from url and saved as file: "
                    f"{raw_input_file}")
        self.inputGeometries = raw_input_file

        # open the raw input file and get layer
        driver = ogr.GetDriverByName("GeoJSON")
        datasource = driver.Open(raw_input_file, 0)
        try:
            layer = datasource.GetLayer()
            LayerDefn = layer.GetLayerDefn()
        except AttributeError:
            raise CustomError("Value error in input geometries file")

        # create layer for valid_input_file to store all valid geometries
        outDriver = ogr.GetDriverByName("GeoJSON")
        # Remove output geojson if it already exists
        if os.path.exists(valid_input_file):
            outDriver.DeleteDataSource(valid_input_file)
        outDataSource = outDriver.CreateDataSource(valid_input_file)
        outLayer = outDataSource.CreateLayer("geometries",
                                             geom_type=ogr.wkbMultiPolygon)
        for i in range(0, LayerDefn.GetFieldCount()):
            fieldDefn = LayerDefn.GetFieldDefn(i)
            outLayer.CreateField(fieldDefn)
        outLayerDefn = outLayer.GetLayerDefn()

        # check if raw_input_file layer is empty
        if layer.GetFeatureCount() < 1:
            err = "empty file. No geometries provided"
            # TODO: How to use logger and exceptions?
            logger.warning(f"{self.projectId} - check_input_geometry - {err}")
            raise Exception(err)

        # get geometry as wkt
        # get the bounding box/ extent of the layer
        extent = layer.GetExtent()
        # Create a Polygon from the extent tuple
        ring = ogr.Geometry(ogr.wkbLinearRing)
        ring.AddPoint(extent[0], extent[2])
        ring.AddPoint(extent[1], extent[2])
        ring.AddPoint(extent[1], extent[3])
        ring.AddPoint(extent[0], extent[3])
        ring.AddPoint(extent[0], extent[2])
        poly = ogr.Geometry(ogr.wkbPolygon)
        poly.AddGeometry(ring)
        wkt_geometry = poly.ExportToWkt()

        # check if the input geometry is a valid polygon
        for feature in layer:
            feat_geom = feature.GetGeometryRef()
            geom_name = feat_geom.GetGeometryName()
            fid = feature.GetFID()
            if not feat_geom.IsValid():
                layer.DeleteFeature(fid)
                logger.warning(f"{self.projectId}"
                               f" - check_input_geometries - "
                               f"deleted invalid feature {fid}")

            # we accept only POLYGON or MULTIPOLYGON geometries
            elif geom_name != "POLYGON" and geom_name != "MULTIPOLYGON":
                layer.DeleteFeature(fid)
                logger.warning(f"{self.projectId}"
                               f" - check_input_geometries - "
                               f"deleted non polygon feature {fid}")

            else:
                # Create output Feature
                outFeature = ogr.Feature(outLayerDefn)
                # Add field values from input Layer
                for i in range(0, outLayerDefn.GetFieldCount()):
                    outFeature.SetField(
                        outLayerDefn.GetFieldDefn(i).GetNameRef(),
                        feature.GetField(i))
                outFeature.SetGeometry(feat_geom)
                outLayer.CreateFeature(outFeature)
                outFeature = None

        # check if layer is empty
        if layer.GetFeatureCount() < 1:
            err = "no geometries left after checking validity and geometry type."
            logger.warning(f"{self.projectId} - check_input_geometry - {err}")
            raise Exception(err)

        del datasource
        del outDataSource
        del layer

        self.validInputGeometries = valid_input_file

        logger.info(f"{self.projectId}"
                    f" - check_input_geometry - "
                    f"filtered correct input geometries and created file: "
                    f"{valid_input_file}")
        return wkt_geometry
    def validate_geometries(self):
        raw_input_file = (f"{DATA_PATH}/input_geometries/"
                          f"raw_input_{self.projectId}.geojson")
        # check if a 'data' folder exists and create one if not
        if not os.path.isdir("{}/input_geometries".format(DATA_PATH)):
            os.mkdir("{}/input_geometries".format(DATA_PATH))

        # write string to geom file
        with open(raw_input_file, "w") as geom_file:
            json.dump(self.geometry, geom_file)

        driver = ogr.GetDriverByName("GeoJSON")
        datasource = driver.Open(raw_input_file, 0)

        try:
            layer = datasource.GetLayer()
        except AttributeError:
            logger.warning(f"{self.projectId}"
                           f" - validate geometry - "
                           f"Could not get layer for datasource")
            raise CustomError("could not get layer for datasource")

        # check if layer is empty
        if layer.GetFeatureCount() < 1:
            logger.warning(f"{self.projectId}"
                           f" - validate geometry - "
                           f"Empty file. "
                           f"No geometry is provided.")
            raise CustomError("Empty file. ")

        # check if more than 1 geometry is provided
        elif layer.GetFeatureCount() > MAX_INPUT_GEOMETRIES:
            logger.warning(
                f"{self.projectId}"
                f" - validate geometry - "
                f"Input file contains more than {MAX_INPUT_GEOMETRIES} geometries. "
                f"Make sure to provide less than {MAX_INPUT_GEOMETRIES} geometries."
            )
            raise CustomError(
                f"Input file contains more than {MAX_INPUT_GEOMETRIES} geometries. "
            )

        project_area = 0
        geometry_collection = ogr.Geometry(ogr.wkbMultiPolygon)
        # check if the input geometry is a valid polygon
        for feature in layer:
            feat_geom = feature.GetGeometryRef()
            geom_name = feat_geom.GetGeometryName()
            # add geometry to geometry collection
            if geom_name == "MULTIPOLYGON":
                for singlepart_polygon in feat_geom:
                    geometry_collection.AddGeometry(singlepart_polygon)
            if geom_name == "POLYGON":
                geometry_collection.AddGeometry(feat_geom)
            if not feat_geom.IsValid():
                logger.warning(f"{self.projectId}"
                               f" - validate geometry - "
                               f"Geometry is not valid: {geom_name}. "
                               f"Tested with IsValid() ogr method. "
                               f"Probably self-intersections.")
                raise CustomError(f"Geometry is not valid: {geom_name}. ")

            # we accept only POLYGON or MULTIPOLYGON geometries
            if geom_name != "POLYGON" and geom_name != "MULTIPOLYGON":
                logger.warning(f"{self.projectId}"
                               f" - validate geometry - "
                               f"Invalid geometry type: {geom_name}. "
                               f'Please provide "POLYGON" or "MULTIPOLYGON"')
                raise CustomError(f"Invalid geometry type: {geom_name}. ")

            # check the size of the project and make sure it is smaller
            # than the maximum allowed area; for this we transform the
            # geometry into the Mollweide projection (ESRI:54009)
            source = feat_geom.GetSpatialReference()
            target = osr.SpatialReference()
            target.ImportFromProj4(
                "+proj=moll +lon_0=0 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs"
            )

            transform = osr.CoordinateTransformation(source, target)
            feat_geom.Transform(transform)
            project_area += feat_geom.GetArea() / 1000000  # in square kilometers

        # calculate max area based on zoom level
        # for zoom level 18 this will be 5000 square kilometers
        # max zoom level is 22
        if self.zoomLevel > 22:
            raise CustomError(
                f"zoom level is too large (max: 22): {self.zoomLevel}.")

        max_area = (23 - int(self.zoomLevel)) ** 2 * 200

        if project_area > max_area:
            logger.warning(
                f"{self.projectId}"
                f" - validate geometry - "
                f"Project is too large: {project_area} sqkm. "
                f"Please split your project into smaller sub-projects and resubmit."
            )
            raise CustomError(
                f"Project is too large: {project_area} sqkm. "
                f"Max area for zoom level {self.zoomLevel} = {max_area} sqkm")

        del datasource
        del layer

        self.validInputGeometries = raw_input_file
        logger.info(f"{self.projectId}"
                    f" - validate geometry - "
                    f"input geometry is correct.")

        dissolved_geometry = geometry_collection.UnionCascaded()
        wkt_geometry_collection = dissolved_geometry.ExportToWkt()

        return wkt_geometry_collection
def remove_noise_and_add_user_info(json: dict) -> dict:
    """Delete unwanted information from properties."""
    logger.info("starting filtering and adding extra info")
    batch_size = 100

    # remove noise
    changeset_results = {}

    missing_rows = {
        "@changesetId": 0,
        "@lastEdit": 0,
        "@osmId": 0,
        "@version": 0,
    }

    for feature in json["features"]:
        new_properties = {}
        for attribute in missing_rows.keys():
            try:
                key = attribute.replace("@", "")
                new_properties[key] = feature["properties"][attribute]
            except KeyError:
                missing_rows[attribute] += 1
        changeset_results[new_properties["changesetId"]] = None
        feature["properties"] = new_properties

    # add info
    len_osm = len(changeset_results.keys())
    batches = int(len(changeset_results.keys()) / batch_size) + 1
    logger.info(
        f"""{len_osm} changesets will be queried in roughly {batches} batches from osmCHA"""  # noqa E501
    )

    chunk_list = chunks(list(changeset_results.keys()), batch_size)
    for i, subset in enumerate(chunk_list):
        changeset_results = query_osmcha(subset, changeset_results)
        progress = round(100 * ((i + 1) / len(chunk_list)), 1)
        logger.info(f"finished query {i+1}/{len(chunk_list)}, {progress}")

    missing_ids = [i for i, v in changeset_results.items() if v is None]
    chunk_list = chunks(missing_ids, batch_size)
    batches = int(len(missing_ids) / batch_size) + 1
    logger.info(
        f"""{len(missing_ids)} changesets were missing from osmCHA and are now queried via the OSM API in {batches} batches"""  # noqa E501
    )
    for i, subset in enumerate(chunk_list):
        changeset_results = query_osm(subset, changeset_results)
        progress = round(100 * ((i + 1) / len(chunk_list)), 1)
        logger.info(f"finished query {i+1}/{len(chunk_list)}, {progress}")

    for feature in json["features"]:
        changeset = changeset_results[int(
            feature["properties"]["changesetId"])]
        for attribute_name in ["username", "comment", "editor", "userid"]:
            if attribute_name == "userid":
                feature["properties"][attribute_name] = int(
                    changeset[attribute_name])
            else:
                feature["properties"][attribute_name] = changeset[
                    attribute_name]

    logger.info("finished filtering and adding extra info")
    if any(x > 0 for x in missing_rows.values()):
        logger.warning(f"features missing values:\n{missing_rows}")

    return json
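# remove_noise_and_add_user_info() relies on a chunks() helper defined
# elsewhere in this package. Because len() is called on its return value, it
# presumably returns a list of slices; a minimal sketch of such a helper:
def _chunks_sketch(items: list, batch_size: int) -> list:
    """Split a list into consecutive slices of at most batch_size items."""
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]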
def transfer_results_for_project(project_id, results, filter_mode: bool = False):
    """Transfer the results for a specific project.
    Save results into an in-memory file.
    Copy the results to postgres.
    Delete results in firebase.
    We are NOT using Firebase transaction functions here anymore.
    They have caused problems in situations where a lot of mappers are
    uploading results to Firebase at the same time. Basically, this is
    due to the behaviour of the Firebase transaction function:
        "If another client writes to this location
        before the new value is successfully saved,
        the update function is called again with the new current value,
        and the write will be retried."
    (source: https://firebase.google.com/docs/reference/admin/python/firebase_admin.db#firebase_admin.db.Reference.transaction)  # noqa
    Using Firebase transactions on the group level
    has turned out to be too slow when using "normal" queries,
    i.e. without threading. Threading should be avoided here
    as well, so as not to run into unforeseen errors.
    For more details see issue #478.
    """

    if results is None:
        logger.info(f"{project_id}: No results in Firebase")
        return
    else:
        # First we check for new users in Firebase.
        # The user_id is used as a key in the postgres database for the results
        # and thus users need to be inserted before results get inserted.
        results_user_id_list = get_user_ids_from_results(results)
        update_data.update_user_data(results_user_id_list)

    try:
        # Results are dumped into an in-memory file.
        # This allows us to use the COPY statement to insert many
        # results at relatively high speed.
        results_file = results_to_file(results, project_id)
        truncate_temp_results()
        save_results_to_postgres(results_file, project_id, filter_mode=filter_mode)
    except psycopg2.errors.ForeignKeyViolation as e:

        sentry.capture_exception(e)
        sentry.capture_message(
            "could not transfer results to postgres due to ForeignKeyViolation: "
            f"{project_id}; filter_mode={filter_mode}"
        )
        logger.exception(e)
        logger.warning(
            "could not transfer results to postgres due to ForeignKeyViolation: "
            f"{project_id}; filter_mode={filter_mode}"
        )

        # There is an exception where additional invalid tasks are in a group.
        # If that happens we arrive here and call this function again with
        # filter_mode=True, which could solve the issue in save_results_to_postgres.
        # If it does not solve the issue we arrive here again, but
        # since filter_mode is already True, we will not try to transfer results again.
        if not filter_mode:
            transfer_results_for_project(project_id, results, filter_mode=True)
    except Exception as e:
        sentry.capture_exception(e)
        sentry.capture_message(f"could not transfer results to postgres: {project_id}")
        logger.exception(e)
        logger.warning(f"could not transfer results to postgres: {project_id}")
    else:
        # It is important here that we first insert results into postgres
        # and then delete these results from Firebase.
        # In case something goes wrong during the insert, results in Firebase
        # will not get deleted.
        delete_results_from_firebase(project_id, results)
        logger.info(f"{project_id}: Transferred results to postgres")