Ejemplo n.º 1
0
    def _handle_esri_errors(self, response, error_message):
        """Validate an ESRI REST response and return its decoded JSON body.

        A response with HTTP status 200 can still carry a JSON ``error``
        object, so both the status code and the payload are checked.

        Args:
            response: Response object exposing ``status_code``, ``text``,
                ``request.url`` and ``json()``.
            error_message: Context text included in any error that is raised.

        Returns:
            The decoded JSON payload.

        Raises:
            EsriDownloadError: If the HTTP status is not 200, or the payload
                contains a truthy ``error`` entry.
            ValueError: If the body cannot be decoded as JSON (logged, then
                re-raised unchanged).
        """
        if response.status_code != 200:
            raise EsriDownloadError('{}: {} HTTP {} {}'.format(
                response.request.url,
                error_message,
                response.status_code,
                response.text,
            ))

        try:
            data = response.json()
        except ValueError:
            # JSON decoding failures are ValueError subclasses; log the raw
            # body for diagnosis and let the caller decide what to do.
            self._logger.error(
                "Could not parse response from %s as JSON:\n\n%s",
                response.request.url,
                response.text,
            )
            raise

        error = data.get('error')
        if error:
            # 'message' and 'details' may be missing from the error object,
            # and detail entries are not guaranteed to be strings.
            details = error.get('details') or []
            raise EsriDownloadError("{}: {} {}".format(
                error_message,
                error.get('message', ''),
                ', '.join(str(detail) for detail in details),
            ))

        return data
Ejemplo n.º 2
0
    def test_download_handles_no_count(self):
        """ ESRI Caching Will Handle A Server Without returnCountOnly Support

        ``get_metadata`` is patched to return no fields and
        ``get_feature_count`` is patched to raise, then the download is
        expected to fail with an ``EsriDownloadError`` whose message is
        checked.
        """
        task = EsriRestDownloadTask('us-fl-palmbeach')
        source_config = SourceConfig(
            {
                "schema": 2,
                "layers": {
                    "addresses": [{
                        "name": "default",
                        "conform": {
                            "number": "num",
                            "street": "str"
                        }
                    }]
                }
            }, "addresses", "default")

        with patch('esridump.EsriDumper.get_metadata') as metadata_patch:
            metadata_patch.return_value = {'fields': []}
            with patch(
                    'esridump.EsriDumper.get_feature_count') as feature_patch:
                feature_patch.side_effect = EsriDownloadError(
                    "Server doesn't support returnCountOnly")
                with self.assertRaises(EsriDownloadError) as e:
                    task.download(['http://example.com/'], self.workdir,
                                  source_config)

                # This is the expected exception at this point. The check
                # must live outside the assertRaises block (statements after
                # the raising call never run) and must read the captured
                # exception from ``e.exception``.
                self.assertEqual(
                    str(e.exception),
                    "Could not find object ID field name for deduplication"
                )
Ejemplo n.º 3
0
    def test_download_handles_no_count(self):
        """ ESRI Caching Will Handle A Server Without returnCountOnly Support

        ``get_metadata`` is patched to return no fields and
        ``get_feature_count`` is patched to raise, then the download is
        expected to fail with an ``EsriDownloadError`` whose message is
        checked.
        """
        task = EsriRestDownloadTask('us-fl-palmbeach')

        with patch('esridump.EsriDumper.get_metadata') as metadata_patch:
            metadata_patch.return_value = {'fields': []}
            with patch('esridump.EsriDumper.get_feature_count') as feature_patch:
                feature_patch.side_effect = EsriDownloadError("Server doesn't support returnCountOnly")
                with self.assertRaises(EsriDownloadError) as e:
                    task.download(['http://example.com/'], self.workdir)

                # This is the expected exception at this point. The check
                # must live outside the assertRaises block (statements after
                # the raising call never run) and must read the captured
                # exception from ``e.exception``.
                self.assertEqual(str(e.exception), "Could not find object ID field name for deduplication")
Ejemplo n.º 4
0
 def _get_layer_oids(self):
     """Ask the layer's ``/query`` endpoint for all of its object IDs.

     Returns:
         The non-empty ``objectIds`` value from the response.

     Raises:
         EsriDownloadError: If the response carries no object IDs.
     """
     # '1=1' matches every row, so nothing is filtered out.
     id_query = self._build_query_args(
         dict(where='1=1', returnIdsOnly='true', f='json'))
     endpoint = self._build_url('/query')
     request_headers = self._build_headers()
     reply = self._request(
         'GET', endpoint, params=id_query, headers=request_headers)
     payload = self._handle_esri_errors(
         reply, "Could not retrieve object IDs")

     object_ids = payload.get('objectIds')
     if object_ids:
         return object_ids
     raise EsriDownloadError("Server doesn't support returnIdsOnly")
Ejemplo n.º 5
0
 def get_feature_count(self):
     """Return the number of features the layer reports.

     Issues a ``returnCountOnly`` query against the layer's ``/query``
     endpoint.

     Returns:
         The ``count`` value from the server response. ``0`` is returned
         as-is for a layer that reports no features.

     Raises:
         EsriDownloadError: If the response has no ``count`` entry. Errors
             raised by ``_handle_esri_errors`` propagate unchanged.
     """
     query_args = self._build_query_args({
         'where': '1=1',
         'returnCountOnly': 'true',
         'f': 'json',
     })
     headers = self._build_headers()
     url = self._build_url('/query')
     response = self._request('GET', url, params=query_args, headers=headers)
     count_json = self._handle_esri_errors(response, "Could not retrieve row count")
     count = count_json.get('count')
     # Compare against None rather than truthiness: a count of 0 is a valid
     # answer and must not be reported as missing returnCountOnly support.
     if count is None:
         raise EsriDownloadError("Server doesn't support returnCountOnly")
     return count
Ejemplo n.º 6
0
    def _get_layer_min_max(self, oid_field_name):
        """Find the min and max values for the OID field.

        Requests min/max statistics on ``oid_field_name`` and then checks,
        with a ``returnIdsOnly`` query, that both values come back as object
        IDs from the server.

        Args:
            oid_field_name: Name of the layer's object ID field.

        Returns:
            A ``(min_value, max_value)`` tuple.

        Raises:
            EsriDownloadError: If the statistics response has no usable
                values, or the verification query does not return both IDs.
        """
        query_args = self._build_query_args({
            'f': 'json',
            'outFields': '',
            'outStatistics': json.dumps([
                dict(statisticType='min', onStatisticField=oid_field_name, outStatisticFieldName='THE_MIN'),
                dict(statisticType='max', onStatisticField=oid_field_name, outStatisticFieldName='THE_MAX'),
            ], separators=(',', ':'))
        })
        headers = self._build_headers()
        url = self._build_url('/query')
        response = self._request('GET', url, params=query_args, headers=headers)
        metadata = self._handle_esri_errors(response, "Could not retrieve min/max oid values")

        # Some servers (specifically version 10.11, it seems) will respond with SQL statements
        # for the attribute names rather than the requested field names, so pick the min and max
        # deliberately rather than relying on the names.
        features = metadata.get('features') or []
        attributes = features[0].get('attributes') if features else None
        min_max_values = list((attributes or {}).values())
        if not min_max_values or any(value is None for value in min_max_values):
            # Missing or null statistics are reported as a download error
            # instead of an IndexError/KeyError/TypeError.
            raise EsriDownloadError('Server returned invalid min/max')
        min_value = min(min_max_values)
        max_value = max(min_max_values)

        # Verify the reported extremes by asking for exactly those two IDs;
        # the headers and URL from the first request are reused.
        query_args = self._build_query_args({
            'where': '{} = {} OR {} = {}'.format(
                oid_field_name,
                min_value,
                oid_field_name,
                max_value
            ),
            'returnIdsOnly': 'true',
            'f': 'json',
        })
        response = self._request('GET', url, params=query_args, headers=headers)
        oid_data = self._handle_esri_errors(response, "Could not check min/max values")
        object_ids = (oid_data or {}).get('objectIds')
        if not object_ids or min_value not in object_ids or max_value not in object_ids:
            raise EsriDownloadError('Server returned invalid min/max')
        return (min_value, max_value)
 def _get_layer_oids(self):
     """Return the object IDs reported by a ``returnIdsOnly`` query.

     Raises:
         EsriDownloadError: If the response has no (or empty)
             ``objectIds``.
     """
     # "1=1" selects every row of the layer.
     everything = {"where": "1=1", "returnIdsOnly": "true", "f": "json"}
     query_args = self._build_query_args(everything)
     target = self._build_url("/query")
     extra_headers = self._build_headers()

     raw = self._request(
         "GET", target, params=query_args, headers=extra_headers)
     found = self._handle_esri_errors(
         raw, "Could not retrieve object IDs").get("objectIds")

     if not found:
         raise EsriDownloadError("Server doesn't support returnIdsOnly")
     return found
 def get_feature_count(self):
     """Return the number of features the layer reports.

     Issues a ``returnCountOnly`` query against the layer's ``/query``
     endpoint.

     Returns:
         The ``count`` value from the server response. ``0`` is returned
         as-is for a layer that reports no features.

     Raises:
         EsriDownloadError: If the response has no ``count`` entry. Errors
             raised by ``_handle_esri_errors`` propagate unchanged.
     """
     query_args = self._build_query_args({
         "where": "1=1",
         "returnCountOnly": "true",
         "f": "json",
     })
     headers = self._build_headers()
     url = self._build_url("/query")
     response = self._request("GET",
                              url,
                              params=query_args,
                              headers=headers)
     count_json = self._handle_esri_errors(response,
                                           "Could not retrieve row count")
     count = count_json.get("count")
     # Compare against None rather than truthiness: a count of 0 is a valid
     # answer and must not be reported as missing returnCountOnly support.
     if count is None:
         raise EsriDownloadError("Server doesn't support returnCountOnly")
     return count
Ejemplo n.º 9
0
    def __iter__(self):
        """Yield every feature of the layer, converted with ``esri2geojson``.

        The layer is split into pages using the first strategy that applies:

        1. ``resultOffset``/``resultRecordCount`` paging, when a feature
           count is available and the metadata reports pagination support.
        2. ``where`` clauses over object ID ranges built from min/max
           statistics, when the metadata reports ``supportsStatistics``.
        3. ``where`` clauses over ranges of the enumerated object IDs.
        4. Envelope queries via ``_scrape_an_envelope``, skipping object IDs
           that were already yielded.

        Raises:
            EsriDownloadError: If no object ID field name can be found for
                the ``where``-based strategies, or a page request fails.
        """
        query_fields = self._fields
        metadata = self.get_metadata()
        page_size = min(1000, metadata.get('maxRecordCount', 500))

        def build_page_args(where, **paging):
            # Paging keys are inserted first, ahead of the shared keys.
            # ``query_fields`` is read when the helper is called, so the
            # "all fields" fallback below is honoured.
            args = dict(paging)
            args.update({
                'where': where,
                'geometryPrecision': self._precision,
                'returnGeometry': self._request_geometry,
                'outSR': self._outSR,
                'outFields': ','.join(query_fields or ['*']),
                'f': 'json',
            })
            return self._build_query_args(args)

        row_count = None

        try:
            row_count = self.get_feature_count()
        except EsriDownloadError:
            self._logger.info("Source does not support feature count")

        page_args = []

        # 'advancedQueryCapabilities' may be absent or may lack the
        # 'supportsPagination' key, so both lookups are defensive.
        advanced = metadata.get('advancedQueryCapabilities') or {}
        supports_pagination = (metadata.get('supportsPagination')
                               or advanced.get('supportsPagination'))

        if row_count is not None and supports_pagination:
            # If the layer supports pagination, we can use resultOffset/resultRecordCount to paginate

            # There's a bug where some servers won't handle these queries in combination with a list of
            # fields specified. We'll make a single, 1 row query here to check if the server supports this
            # and switch to querying for all fields if specifying the fields fails.
            if query_fields and not self.can_handle_pagination(query_fields):
                self._logger.info(
                    "Source does not support pagination with fields specified, so querying for all fields."
                )
                query_fields = None

            for offset in range(self._startWith, row_count, page_size):
                page_args.append(build_page_args(
                    '1=1',
                    resultOffset=offset,
                    resultRecordCount=page_size,
                ))
            self._logger.info("Built %s requests using resultOffset method",
                              len(page_args))
        else:
            # If not, we can still use the `where` argument to paginate

            use_oids = True
            oid_field_name = self._find_oid_field_name(metadata)

            if not oid_field_name:
                raise EsriDownloadError(
                    "Could not find object ID field name for deduplication")

            if metadata.get('supportsStatistics'):
                # If the layer supports statistics, we can request maximum and minimum object ID
                # to help build the pages
                try:
                    (oid_min,
                     oid_max) = self._get_layer_min_max(oid_field_name)
                except EsriDownloadError:
                    self._logger.exception(
                        "Finding max/min from statistics failed. Trying OID enumeration."
                    )
                else:
                    for page_min in range(oid_min - 1, oid_max, page_size):
                        page_max = min(page_min + page_size, oid_max)
                        page_args.append(build_page_args(
                            '{} > {} AND {} <= {}'.format(
                                oid_field_name,
                                page_min,
                                oid_field_name,
                                page_max,
                            )))
                    self._logger.info(
                        "Built %s requests using OID where clause method",
                        len(page_args))

                    # The statistics method worked, so there is no need to
                    # fall through to enumerating all object IDs.
                    use_oids = False

            if use_oids:
                # If the layer does not support statistics, we can request
                # all the individual IDs and page through them one chunk at
                # a time.
                try:
                    oids = sorted(map(int, self._get_layer_oids()))
                except EsriDownloadError:
                    self._logger.info("Falling back to geo queries")
                    # Use geospatial queries when none of the ID-based methods will work
                    bounds = metadata['extent']
                    saved = set()

                    for feature in self._scrape_an_envelope(
                            bounds, self._outSR, page_size):
                        oid = feature['attributes'].get(oid_field_name)
                        if oid in saved:
                            continue

                        yield esri2geojson(feature)

                        saved.add(oid)

                    return

                for i in range(0, len(oids), page_size):
                    oid_chunk = oids[i:i + page_size]
                    page_args.append(build_page_args(
                        '{} >= {} AND {} <= {}'.format(
                            oid_field_name,
                            oid_chunk[0],
                            oid_field_name,
                            oid_chunk[-1],
                        )))
                self._logger.info(
                    "Built %s requests using OID enumeration method",
                    len(page_args))

        query_url = self._build_url('/query')
        headers = self._build_headers()
        for query_args in page_args:
            try:
                response = self._request('POST',
                                         query_url,
                                         headers=headers,
                                         data=query_args)
                data = self._handle_esri_errors(
                    response, "Could not retrieve this chunk of objects")
            except EsriDownloadError:
                # Already a descriptive download error (e.g. a server-side
                # error payload); do not relabel it as a connection failure.
                raise
            except socket.timeout as e:
                raise EsriDownloadError("Timeout when connecting to URL", e)
            except ValueError as e:
                raise EsriDownloadError("Could not parse JSON", e)
            except Exception as e:
                raise EsriDownloadError("Could not connect to URL", e)

            # _handle_esri_errors has already raised for any 'error' entry,
            # so only the features remain to be handled. A response without
            # a 'features' list is treated as an empty page.
            for feature in data.get('features') or []:
                yield esri2geojson(feature)
    def _get_layer_min_max(self, oid_field_name):
        """Find the min and max values for the OID field.

        Requests min/max statistics on ``oid_field_name`` and then checks,
        with a ``returnIdsOnly`` query, that both values come back as object
        IDs from the server.

        Args:
            oid_field_name: Name of the layer's object ID field.

        Returns:
            A ``(min_value, max_value)`` tuple.

        Raises:
            EsriDownloadError: If the statistics response has no usable
                values, or the verification query does not return both IDs.
        """
        statistics = [
            dict(
                statisticType="min",
                onStatisticField=oid_field_name,
                outStatisticFieldName="THE_MIN",
            ),
            dict(
                statisticType="max",
                onStatisticField=oid_field_name,
                outStatisticFieldName="THE_MAX",
            ),
        ]
        query_args = self._build_query_args({
            "f": "json",
            "outFields": "",
            "outStatistics": json.dumps(statistics, separators=(",", ":")),
        })
        headers = self._build_headers()
        url = self._build_url("/query")
        response = self._request("GET",
                                 url,
                                 params=query_args,
                                 headers=headers)
        metadata = self._handle_esri_errors(
            response, "Could not retrieve min/max oid values")

        # Some servers (specifically version 10.11, it seems) will respond with SQL statements
        # for the attribute names rather than the requested field names, so pick the min and max
        # deliberately rather than relying on the names.
        features = metadata.get("features") or []
        attributes = features[0].get("attributes") if features else None
        min_max_values = list((attributes or {}).values())
        if not min_max_values or any(value is None
                                     for value in min_max_values):
            # Missing or null statistics are reported as a download error
            # instead of an IndexError/KeyError/TypeError.
            raise EsriDownloadError("Server returned invalid min/max")
        min_value = min(min_max_values)
        max_value = max(min_max_values)

        # Verify the reported extremes by asking for exactly those two IDs;
        # the headers and URL from the first request are reused.
        query_args = self._build_query_args({
            "where":
            "{} = {} OR {} = {}".format(oid_field_name, min_value,
                                        oid_field_name, max_value),
            "returnIdsOnly":
            "true",
            "f":
            "json",
        })
        response = self._request("GET",
                                 url,
                                 params=query_args,
                                 headers=headers)
        oid_data = self._handle_esri_errors(response,
                                            "Could not check min/max values")
        object_ids = (oid_data or {}).get("objectIds")
        if (not object_ids or min_value not in object_ids
                or max_value not in object_ids):
            raise EsriDownloadError("Server returned invalid min/max")
        return (min_value, max_value)