Example #1
def fetch_resource(url, tmpfile, api_key):
    response = requests.get(
        url,
        headers={"Authorization": api_key},
        timeout=DOWNLOAD_TIMEOUT,
        verify=SSL_VERIFY,
        stream=True,  # just gets the headers for now
    )
    response.raise_for_status()

    cl = response.headers.get("content-length")
    try:
        if cl and int(cl) > MAX_CONTENT_LENGTH:
            raise util.JobError(
                "Resource too large to download: {cl} > max ({max_cl}).".
                format(cl=cl, max_cl=MAX_CONTENT_LENGTH))
    except ValueError:
        pass

    length = 0
    for chunk in response.iter_content(CHUNK_SIZE):
        length += len(chunk)
        if length > MAX_CONTENT_LENGTH:
            raise util.JobError(
                "Resource too large to process: {cl} > max ({max_cl}).".format(
                    cl=length, max_cl=MAX_CONTENT_LENGTH))
        tmpfile.write(chunk)

    tmpfile.seek(0)
Example #2
def scan(task_id, payload):
    logger = init_logger(task_id, payload)
    logger.info(f"Starting job {task_id}")

    validate_payload(payload)

    data = payload["metadata"]
    ckan_url = data["ckan_url"]
    resource_id = data["resource_id"]
    api_key = payload.get("api_key")

    scan_result = scan_resource(logger, ckan_url, api_key, resource_id)

    response = {
        "status_code": scan_result.returncode,
        "description": scan_result.stdout.decode("utf-8"),
    }
    if scan_result.returncode not in STATUSES:
        raise util.JobError(json.dumps(response))
    response["status_text"] = STATUSES[scan_result.returncode]
    if scan_result.returncode == 2:
        raise util.JobError(json.dumps(response))
    logger.info(
        f"Completed scanning resource {resource_id}. Submitting result")
    return response
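STATUSES is not shown in these snippets; from the way scan() uses it, it maps the scanner's process return code to a status label, with code 2 treated as a failure. A hedged sketch, assuming a clamscan-style convention (0 = clean, 1 = infected, 2 = error); the real mapping may differ:

# Hypothetical mapping only, assuming clamscan-style return codes.
STATUSES = {
    0: "File is clean",
    1: "File is infected",
    2: "Scan failed",
}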
Example #3
def convert(file, logger):

    file.seek(0)
    outfile = tempfile.TemporaryFile()
    wrapper_file = codecs.getwriter('utf-8')(outfile)
    events = map(floaten, ijson.parse(file))
    features = ijson.common.items(events, 'features.item')
    writer = False

    for feature in features:
        try:
            if not writer:
                fieldnames = list(feature['properties'].keys())
                writer = csv.DictWriter(wrapper_file,
                                        fieldnames=fieldnames,
                                        lineterminator=os.linesep)
                writer.writeheader()
            row = feature['properties']
            writer.writerow(row)

        except KeyError as e:
            logger.exception(e)
            raise util.JobError(
                "GeoJSON feature must have a 'properties' field.")
        except ValueError as e:
            logger.exception(e)
            raise util.JobError(
                "Each GeoJSON feature must have the same properties in order to convert to table. "
            )

    if not outfile.tell():
        raise util.JobError("No valid features found in the GeoJSON")

    outfile.seek(0)
    return outfile
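convert() expects a GeoJSON FeatureCollection in which every feature carries the same 'properties' keys; those keys become the CSV header and each feature's properties become one row. An illustrative input shape (hypothetical data, shown as a dict even though the function reads it from a file object via ijson):

# Hypothetical GeoJSON document that would convert cleanly to a two-row CSV.
feature_collection = {
    "type": "FeatureCollection",
    "features": [
        {"type": "Feature",
         "geometry": {"type": "Point", "coordinates": [151.2, -33.9]},
         "properties": {"name": "Site A", "value": 1}},
        {"type": "Feature",
         "geometry": {"type": "Point", "coordinates": [150.9, -34.4]},
         "properties": {"name": "Site B", "value": 2}},
    ],
}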
Example #4
def scan_resource(logger, ckan_url, api_key, resource_id):
    try:
        resource = ckan_action("resource_show", ckan_url, api_key,
                               {"id": resource_id})
    except util.JobError:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource = ckan_action("resource_show", ckan_url, api_key,
                               {"id": resource_id})

    url_type = resource.get("url_type")
    if url_type != "upload":
        raise util.JobError(
            f"Only resources of type 'upload' can be scanned. Received '{str(url_type)}'"
        )

    url = resource.get("url")
    scheme = urlsplit(url).scheme
    if scheme not in ("http", "https", "ftp"):
        raise util.JobError(
            "Only http, https, and ftp resources may be fetched.")

    logger.info(f"Fetching from {url}")
    with tempfile.NamedTemporaryFile() as tmp:
        try:
            fetch_resource(url, tmp, api_key)
        except RequestException as e:
            raise util.JobError(str(e))
        logger.info(f"Scanning {tmp.name}")
        try:
            scan_result = scan_file(tmp.name)
        except (subprocess.SubprocessError, subprocess.TimeoutExpired) as e:
            raise util.JobError(str(e))

    return scan_result
Example #5
def validate_payload(payload):
    if "metadata" not in payload:
        raise util.JobError("Metadata missing")

    metadata = payload["metadata"]

    if "resource_id" not in metadata:
        raise util.JobError("No id provided.")
    if "ckan_url" not in metadata:
        raise util.JobError("No ckan_url provided.")
    if not payload.get("api_key"):
        raise util.JobError("No CKAN API key provided")
Example #6
def validate_input(input):
    # Especially validate metadata, which is provided by the user
    if not 'metadata' in input:
        raise util.JobError('Metadata missing')

    data = input['metadata']

    if not 'resource_id' in data:
        raise util.JobError('No id provided.')
    if not 'ckan_url' in data:
        raise util.JobError('No ckan_url provided.')
    if not input.get('api_key'):
        raise util.JobError('No CKAN API key provided')
Example #7
def datastore_resource_exists(resource_id, api_key, ckan_url):
    try:
        search_url = get_url('datastore_search', ckan_url)
        response = requests.post(search_url,
                                 verify=SSL_VERIFY,
                                 params={
                                     'id': resource_id,
                                     'limit': 0
                                 },
                                 headers={
                                     'Content-Type': 'application/json',
                                     'Authorization': api_key
                                 })
        if response.status_code == 404:
            return False
        elif response.status_code == 200:
            return response.json().get('result', {'fields': []})
        else:
            raise HTTPError(
                'Error getting datastore resource.',
                response.status_code,
                search_url,
                response,
            )
    except requests.exceptions.RequestException as e:
        raise util.JobError(
            'Error getting datastore resource ({!s}).'.format(e))
Example #8
def datastore_resource_exists(resource_id, api_key, ckan_url):
    try:
        search_url = get_url('datastore_search', ckan_url)
        response = requests.post(search_url,
                                 params={'id': resource_id,
                                         'limit': 0},
                                 headers={'Content-Type': 'application/json',
                                          'Authorization': api_key}
                                 )
        if response.status_code == 404:
            return False
        elif response.status_code == 200:
            return True
        else:
            raise util.JobError('Error getting datastore resource.')
    except requests.exceptions.RequestException:
        raise util.JobError('Error getting datastore resource.')
Example #9
def echo_raw(task_id, input_):
    if input_['data'].startswith('>'):
        raise util.JobError('Do not start message with >')

    def raw():
        for x in sorted(input_['data']):
            yield x

    return raw
Example #10
def datastore_resource_exists(resource_id, api_key, ckan_url):
    try:
        search_url = get_url('datastore_search', ckan_url)
        response = requests.post(search_url,
                                 params={'id': resource_id,
                                         'limit': 0},
                                 headers={'Content-Type': 'application/x-www-form-urlencoded',
                                          'Authorization': api_key}
                                 )
        if response.status_code == 404:
            logging.debug('Resource not found in db, creating')
            return False
        elif response.status_code == 200:
            logging.debug('Resource exists in db')
            return True
        else:
            raise util.JobError('Error getting datastore resource.')
    except requests.exceptions.RequestException:
        raise util.JobError('Error getting datastore resource.')
Example #11
    def download_file(resource, file_format):
        tmpname = None
        if 'SHP' == file_format:
            tmpname = '{0}.{1}'.format(uuid.uuid1(), 'shp.zip')
        elif 'KML' == file_format:
            tmpname = '{0}.{1}'.format(uuid.uuid1(), 'kml')
        elif 'KMZ' == file_format:
            tmpname = '{0}.{1}'.format(uuid.uuid1(), 'kml.zip')
        elif 'GRID' == file_format:
            tmpname = '{0}.{1}'.format(uuid.uuid1(), 'zip')

        if tmpname is None:
            raise util.JobError("Failed to recognize file format extension {0}".format(file_format))

        logger.info('Fetching from: {0}'.format(resource.get('url')))

        try:
            request = urllib2.Request(resource.get('url'))

            if resource.get('url_type') == 'upload':
                request.add_header('Authorization', data['api_key'])

            response = urllib2.urlopen(request, timeout=DOWNLOAD_TIMEOUT)
        except urllib2.HTTPError as e:
            raise HTTPError(
                "SpatialIngestor received a bad HTTP response when trying to download "
                "the data file", status_code=e.code,
                request_url=resource.get('url'), response=e.read())

        except urllib2.URLError as e:
            if isinstance(e.reason, socket.timeout):
                raise util.JobError('Connection timed out after %ss' %
                                    DOWNLOAD_TIMEOUT)
            else:
                raise HTTPError(
                    message=str(e.reason), status_code=None,
                    request_url=resource.get('url'), response=None)

        try:
            with open(os.path.join(tempdir, tmpname), 'wb') as out_file:
                out_file.write(response.read())
        except Exception as e:
            raise util.JobError("Failed to copy file to {0} with exception {1}".format(os.path.join(tempdir, tmpname), str(e)))
Example #12
def delete_datastore_resource(resource_id, api_key, ckan_url):
    try:
        delete_url = get_url('datastore_delete', ckan_url)
        response = requests.post(delete_url,
                                 data=json.dumps({'id': resource_id,
                                                  'force': True}),
                                 headers={'Content-Type': 'application/json',
                                          'Authorization': api_key}
                                 )
        check_response(response, delete_url, 'CKAN',
                       good_status=(201, 200, 404), ignore_no_success=True)
    except requests.exceptions.RequestException:
        raise util.JobError('Deleting existing datastore failed.')
Example #13
def get_spatial_input_format(resource):
    check_string = resource.get('__extras', {}).get('format', resource.get('format', resource.get('url', ''))).upper()

    if any([check_string.endswith(x) for x in ["SHP", "SHAPEFILE"]]):
        return 'SHP'
    elif check_string.endswith("KML"):
        return 'KML'
    elif check_string.endswith("KMZ"):
        return 'KMZ'
    elif check_string.endswith("GRID"):
        return 'GRID'
    else:
        raise util.JobError("Failed to determine spatial file type for {0}".format(resource.get('url', '')))
Example #14
def validate_input(input):
    # Especially validate metadata, which is provided by the user
    if not 'metadata' in input:
        raise util.JobError('Metadata missing')
    if not 'api_key' in input:
        raise util.JobError('CKAN API key missing')

    required_metadata_keys = {
        'resource_id',
        'ckan_url',
        'postgis',
        'geoserver',
        'geoserver_public_url',
        'target_spatial_formats'
    }

    missing_metadata_keys = required_metadata_keys - set(input['metadata'].keys())

    if missing_metadata_keys:
        raise util.JobError('Missing metadata keys: {0}'.format(missing_metadata_keys))

    required_db_metadata_keys = {
        'db_host',
        'db_name',
        'db_user',
        'db_pass'
    }

    missing_db_metadata_keys = required_db_metadata_keys - set(input['metadata']['postgis'].keys())

    if missing_db_metadata_keys:
        raise util.JobError('Missing DB metadata keys: {0}'.format(missing_db_metadata_keys))

    required_geoserver_metadata_keys = required_db_metadata_keys

    missing_geoserver_metadata_keys = required_geoserver_metadata_keys - set(input['metadata']['geoserver'].keys())

    if missing_geoserver_metadata_keys:
        raise util.JobError('Missing Geoserver metadata keys: {0}'.format(missing_geoserver_metadata_keys))
Example #15
def check_response(response,
                   request_url,
                   who,
                   good_status=(201, 200),
                   ignore_no_success=False):
    """
    Checks the response and raises exceptions if something went terribly wrong

    :param who: A short name that indicates where the error occurred
                (for example "CKAN")
    :param good_status: Status codes that should not raise an exception

    """
    if not response.status_code:
        raise util.JobError(
            '{who} bad response with no status code at: {url}'.format(
                who=who, url=request_url))

    message = '{who} bad response. Status code: {code} {reason}. At: {url}. Response: {resp}'
    try:
        if not response.status_code in good_status:
            json_response = response.json()
            if not ignore_no_success or json_response.get('success'):
                raise util.JobError(
                    message.format(who=who,
                                   code=response.status_code,
                                   reason=response.reason,
                                   url=request_url,
                                   resp=pprint.pformat(json_response)))
    except ValueError:
        raise util.JobError(
            message.format(who=who,
                           code=response.status_code,
                           reason=response.reason,
                           url=request_url,
                           resp=response.text[:200]))
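A hedged usage sketch, mirroring how the datastore_delete example above calls it (the URL and API key are placeholders, not real endpoints):

# Illustrative only: POST to a CKAN action endpoint, then let check_response
# raise util.JobError for anything outside good_status.
url = "https://ckan.example.org/api/3/action/datastore_create"
response = requests.post(
    url,
    data=json.dumps({"resource_id": "example-resource-id"}),
    headers={"Content-Type": "application/json",
             "Authorization": "example-api-key"},
)
check_response(response, url, "CKAN")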
Example #16
def get_db_cursor(data):
    db_port = None
    if data['postgis'].get('db_port', '') != '':
        db_port = data['postgis']['db_port']

    try:
        connection = psycopg2.connect(dbname=data['postgis']['db_name'],
                                      user=data['postgis']['db_user'],
                                      password=data['postgis']['db_pass'],
                                      host=data['postgis']['db_host'],
                                      port=db_port)
        connection.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        return connection.cursor(), connection
    except Exception as e:
        raise util.JobError("Failed to connect with PostGIS with error {0}".format(str(e)))
Example #17
def ckan_action(action, ckan_url, api_key, payload):
    url = get_url(action, ckan_url)
    try:
        r = requests.post(
            url,
            verify=SSL_VERIFY,
            data=json.dumps(payload),
            headers={
                "Content-Type": "application/json",
                "Authorization": api_key
            },
        )
        r.raise_for_status()
    except RequestException as e:
        raise util.JobError(f"{str(e)} with payload {json.dumps(payload)}")

    return r.json()["result"]
Example #18
def push_to_datastore(task_id, input, dry_run=False):
    '''Download and parse a resource and push its data into CKAN's DataStore.

    An asynchronous job that gets a resource from CKAN, downloads the
    resource's data file and, if the data file has changed since last time,
    parses the data and posts it into CKAN's DataStore.

    :param dry_run: Fetch and parse the data file but don't actually post the
        data to the DataStore, instead return the data headers and rows that
        would have been posted.
    :type dry_run: boolean

    '''
    handler = util.StoringHandler(task_id, input)
    logger = logging.getLogger(task_id)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']

    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource = get_resource(resource_id, ckan_url, api_key)
    except util.JobError as e:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource = get_resource(resource_id, ckan_url, api_key)

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Dump files are managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise util.JobError(
            'Only http, https, and ftp resources may be fetched.'
        )

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    headers = {}
    if resource.get('url_type') == 'upload':
        # If this is an uploaded file to CKAN, authenticate the request,
        # otherwise we won't get the file from private resources
        headers['Authorization'] = api_key
    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            verify=SSL_VERIFY,
            stream=True,  # just gets the headers for now
        )
        response.raise_for_status()

        cl = response.headers.get('content-length')
        try:
            if cl and int(cl) > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to download: {cl} > max ({max_cl}).'
                    .format(cl=cl, max_cl=MAX_CONTENT_LENGTH))
        except ValueError:
            pass

        tmp = tempfile.TemporaryFile()
        length = 0
        m = hashlib.md5()
        for chunk in response.iter_content(CHUNK_SIZE):
            length += len(chunk)
            if length > MAX_CONTENT_LENGTH:
                raise util.JobError(
                    'Resource too large to process: {cl} > max ({max_cl}).'
                    .format(cl=length, max_cl=MAX_CONTENT_LENGTH))
            tmp.write(chunk)
            m.update(chunk)

        ct = response.headers.get('content-type', '').split(';', 1)[0]

    except requests.HTTPError as e:
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file", status_code=e.response.status_code,
            request_url=url, response=e.response.content)
    except requests.RequestException as e:
        raise HTTPError(
            message=str(e), status_code=None,
            request_url=url, response=None)

    file_hash = m.hexdigest()
    tmp.seek(0)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        logger.info("The file hash hasn't changed: {hash}.".format(
            hash=file_hash))
        return

    resource['hash'] = file_hash

    try:
        table_set = messytables.any_tableset(tmp, mimetype=ct, extension=ct)
    except messytables.ReadError as e:
        # try again with format
        tmp.seek(0)
        try:
            format = resource.get('format')
            table_set = messytables.any_tableset(tmp, mimetype=format, extension=format)
        except:
            raise util.JobError(e)

    get_row_set = web.app.config.get('GET_ROW_SET',
                                     lambda table_set: table_set.tables.pop())
    row_set = get_row_set(table_set)
    offset, headers = messytables.headers_guess(row_set.sample)

    existing = datastore_resource_exists(resource_id, api_key, ckan_url)
    existing_info = None
    if existing:
        existing_info = dict((f['id'], f['info'])
            for f in existing.get('fields', []) if 'info' in f)

    # Some headers might have been converted from strings to floats and such.
    headers = [str(header) for header in headers]

    row_set.register_processor(messytables.headers_processor(headers))
    row_set.register_processor(messytables.offset_processor(offset + 1))
    types = messytables.type_guess(row_set.sample, types=TYPES, strict=True)

    # override with types user requested
    if existing_info:
        types = [{
            'text': messytables.StringType(),
            'numeric': messytables.DecimalType(),
            'timestamp': messytables.DateUtilType(),
            }.get(existing_info.get(h, {}).get('type_override'), t)
            for t, h in zip(types, headers)]

    row_set.register_processor(messytables.types_processor(types))

    headers = [header.strip() for header in headers if header.strip()]
    headers_set = set(headers)

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                column_name = cell.column.strip()
                if column_name not in headers_set:
                    continue
                if isinstance(cell.value, str):
                    try:
                        data_row[column_name] = cell.value.encode('latin-1').decode('utf-8')
                    except (UnicodeDecodeError, UnicodeEncodeError):
                        data_row[column_name] = cell.value
                else:
                    data_row[column_name] = cell.value
            yield data_row
    result = row_iterator()

    '''
    Delete existing datastore resource before proceeding. Otherwise
    'datastore_create' will append to the existing datastore. And if
    the fields have significantly changed, it may also fail.
    '''
    if existing:
        logger.info('Deleting "{res_id}" from datastore.'.format(
            res_id=resource_id))
        delete_datastore_resource(resource_id, api_key, ckan_url)

    headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
                     for field in zip(headers, types)]

    # Maintain data dictionaries from matching column names
    if existing_info:
        for h in headers_dicts:
            if h['id'] in existing_info:
                h['info'] = existing_info[h['id']]
                # create columns with types user requested
                type_override = existing_info[h['id']].get('type_override')
                if type_override in list(_TYPE_MAPPING.values()):
                    h['type'] = type_override

    logger.info('Determined headers and types: {headers}'.format(
        headers=headers_dicts))

    if dry_run:
        return headers_dicts, result

    count = 0
    for i, chunk in enumerate(chunky(result, 250)):
        records, is_it_the_last_chunk = chunk
        count += len(records)
        logger.info('Saving chunk {number} {is_last}'.format(
            number=i, is_last='(last)' if is_it_the_last_chunk else ''))
        send_resource_to_datastore(resource, headers_dicts, records,
                                   is_it_the_last_chunk, api_key, ckan_url)

    logger.info('Successfully pushed {n} entries to "{res_id}".'.format(
        n=count, res_id=resource_id))

    if data.get('set_url_type', False):
        update_resource(resource, api_key, ckan_url)
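chunky() is not defined in these snippets; from the way push_to_datastore unpacks it, it yields (records, is_last_chunk) pairs of up to 250 rows each. A minimal sketch of such a helper, offered as an assumption rather than the project's actual implementation:

import itertools

def chunky(iterable, n):
    # Illustrative sketch: yield (chunk, is_last_chunk) pairs of up to n items.
    it = iter(iterable)
    chunk = list(itertools.islice(it, n))
    while chunk:
        next_chunk = list(itertools.islice(it, n))
        yield chunk, not next_chunk
        chunk = next_chunk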
Example #19
def failing(task_id, input_):
    time.sleep(0.1)
    raise util.JobError('failed')
Example #20
def example(task_id, input_):
    if 'time' not in input_['data']:
        raise util.JobError('time not in input')

    time.sleep(input_['data']['time'])
    return 'Slept for ' + str(input_['data']['time']) + ' seconds.'
Example #21
def echo(task_id, input_):
    if input_['data'].startswith('>'):
        raise util.JobError('Do not start message with >')
    if input_['data'].startswith('#'):
        raise Exception('Something went totally wrong')
    return '>' + input_['data']
Example #22
    tempdir = tempfile.mkdtemp()

    try:

        native_crs = "EPSG:4326"
        unzip_dir = None

        base_filepath = download_file(parent_resource, input_format)

        # Do we need to unzip?
        if input_format in ["KMZ", "SHP", "GRID"]:
            try:
                zpf = zipfile.ZipFile(base_filepath)
                unzip_dir = unzip_file(zpf, base_filepath)
            except:
                raise util.JobError("{0} is not a valid zip file".format(base_filepath))

            # Flatten the zip file
            for root, dirs, files in os.walk(unzip_dir):
                for sub_dir in dirs:
                    from_dir = os.path.join(root, sub_dir)
                    for f in getfiles(from_dir):
                        filename = f.split('/')[-1]
                        if os.path.isfile(os.path.join(unzip_dir, filename)):
                            filename = f.replace(from_dir, "", 1).replace("/", "_")
                        shutil.copy(f, os.path.join(unzip_dir, filename))
                    shutil.rmtree(from_dir)

            for f in os.listdir(unzip_dir):
                if f.lower().endswith(".kml"):
                    kml_file = os.path.join(unzip_dir, f)
Example #23
        if resource.get('url_type') == 'upload':
            # If this is an uploaded file to CKAN, authenticate the request,
            # otherwise we won't get the file from private resources
            request.add_header('Authorization', api_key)

        response = urllib2.urlopen(request, timeout=DOWNLOAD_TIMEOUT)
    except urllib2.HTTPError as e:
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file",
            status_code=e.code,
            request_url=resource.get('url'),
            response=e.read())
    except urllib2.URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise util.JobError('Connection timed out after %ss' %
                                DOWNLOAD_TIMEOUT)
        else:
            raise HTTPError(message=str(e.reason),
                            status_code=None,
                            request_url=resource.get('url'),
                            response=None)

    cl = response.info().getheader('content-length')
    if cl and int(cl) > MAX_CONTENT_LENGTH:
        raise util.JobError(
            'Resource too large to download: {cl} > max ({max_cl}).'.format(
                cl=cl, max_cl=MAX_CONTENT_LENGTH))

    ct = response.info().getheader('content-type').split(';', 1)[0]

    file_content = response.read()
Example #24
        resource = get_resource(resource_id, ckan_url, api_key)
    except util.JobError as e:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource = get_resource(resource_id, ckan_url, api_key)

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Dump files are managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlparse.urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise util.JobError(
            'Only http, https, and ftp resources may be fetched.')

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    headers = {}
    if resource.get('url_type') == 'upload':
        # If this is an uploaded file to CKAN, authenticate the request,
        # otherwise we won't get the file from private resources
        headers['Authorization'] = api_key
    try:
        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            verify=SSL_VERIFY,
            stream=True,  # just gets the headers for now
Example #25
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource = get_resource(resource_id, ckan_url, api_key)

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Dump files are managed with the Datastore API')
        return

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(resource.get('url')))
    try:
        request = urllib2.Request(resource.get('url'))

        if request.get_type().lower() not in ('http', 'https', 'ftp'):
            raise util.JobError(
                'Only http, https, and ftp resources may be fetched.')

        if resource.get('url_type') == 'upload':
            # If this is an uploaded file to CKAN, authenticate the request,
            # otherwise we won't get the file from private resources
            request.add_header('Authorization', api_key)

        response = urllib2.urlopen(request, timeout=DOWNLOAD_TIMEOUT)
    except urllib2.HTTPError as e:
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file",
            status_code=e.code,
            request_url=resource.get('url'),
            response=e.read())
    except urllib2.URLError as e:
Example #26
def echo(task_id, input):
    if input['data'].startswith('>'):
        raise util.JobError('do not start message with >')
    if input['data'].startswith('#'):
        raise Exception('serious exception')
    return '>' + input['data']
Example #27
def spatial_ingest(task_id, input):
    handler = util.StoringHandler(task_id, input)
    logger = logging.getLogger(task_id)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']
    data['api_key'] = input['api_key']

    logger.info('Retrieving resource information')

    resource = ckan_command('resource_show', {'id': data['resource_id']}, data)

    logger.info('Retrieving package information')

    package = ckan_command('package_show', {'id': resource['package_id']}, data)

    logger.info('Purging any legacy spatial ingestor assets')

    # Make sure there are no legacy resources or artifacts
    purge_legacy_spatial(data, package, logger)

    # Get package data again in case another thread deleted some legacy resources
    package = ckan_command('package_show', {'id': resource['package_id']}, data)

    # We have an ingestible resource that has been updated, passing all blacklist checks
    # and we have potential resources for creation.
    logger.info('Setting up PostGIS table for spatial assets')

    table_name = setup_spatial_table(data, resource)

    # Determine input format
    logger.info('Determining input format for resource')

    input_format = get_spatial_input_format(resource)

    # Ingest into DB and exit if this fails for whatever reason
    logger.info('Ingesting spatial asset into PostGIS DB')

    native_crs = db_upload(data, resource, input_format, table_name, logger)

    # Create Geoserver assets for PostGIS table
    logger.info('Creating Geoserver assets for PostGIS table')

    workspace, layer, bbox_obj = geoserver_transfer(data, package, input_format, native_crs, table_name, logger)

    # Figure out if any target formats are available to be expanded into.
    # I.e. if a resource of a target format already exists and is _not_
    # last modified by the spatial ingestor user, we do not add/update the
    # resource for that format.
    expansion_formats = get_spatial_upload_formats(data, package, input_format)
    if not expansion_formats:
        raise util.JobError("Package {0} has no available formats to expand into".format(package['name']))

    logger.info("Creating CKAN resources for new Geoserver assets")

    num_update = create_or_update_resources(data, package, resource, bbox_obj, expansion_formats, layer, workspace,
                                            logger)

    logger.info("{0} resources successfully created/updated".format(num_update))