Ejemplo n.º 1
0
    def test_download_file(self, url):
        """Download a resource twice and check the reported size tracks the URL.

        The first download must produce a ``saved_file`` that exists on disk.
        The URL is then switched from ``content=test`` to ``content=test2``
        and the second download must report ``size == len("test2")``.
        Each archived file is removed once it has been checked.
        """
        resource = self._test_resource(url)

        first = download(self.fake_context, resource)
        saved_path = first["saved_file"]
        assert saved_path
        assert os.path.exists(saved_path)
        _remove_archived_file(first.get("saved_file"))

        # Point the resource at different content; the reported size must follow.
        changed_url = url.replace("content=test", "content=test2")
        resource["url"] = changed_url
        second = download(self.fake_context, resource)
        assert_equal(second["size"], len("test2"))

        _remove_archived_file(second.get("saved_file"))
Ejemplo n.º 2
0
    def test_download_file(self, url):
        """Check that ``download`` archives a file and refreshes ``size``.

        Downloads the resource built from ``url``, verifies the archived file
        exists, then swaps the URL content from ``test`` to ``test2`` and
        verifies the second result's ``size`` equals ``len('test2')``.
        """
        resource = self._test_resource(url)

        initial = download(self.fake_context, resource)
        archived = initial['saved_file']
        assert archived
        assert os.path.exists(archived)
        _remove_archived_file(initial.get('saved_file'))

        # Change what the URL serves and confirm the size is recomputed.
        resource['url'] = url.replace('content=test', 'content=test2')
        updated = download(self.fake_context, resource)
        expected_size = len('test2')
        assert_equal(updated['size'], expected_size)

        _remove_archived_file(updated.get('saved_file'))
Ejemplo n.º 3
0
    def test_download_file(self, url):
        """Exercise ``download`` twice against the same resource.

        First pass: the result must name a ``saved_file`` that exists.
        Second pass (URL content changed to ``test2``): the result's ``size``
        must equal ``len('test2')``. Archived files are cleaned up after each
        pass.
        """
        res = self._test_resource(url)

        outcome = download(self.fake_context, res)
        path_on_disk = outcome['saved_file']
        assert path_on_disk
        assert os.path.exists(path_on_disk)
        _remove_archived_file(outcome.get('saved_file'))

        # Serve different content from the same resource; size must be updated.
        new_url = url.replace('content=test', 'content=test2')
        res['url'] = new_url
        outcome = download(self.fake_context, res)
        assert_equal(outcome['size'], len('test2'))

        _remove_archived_file(outcome.get('saved_file'))
Ejemplo n.º 4
0
    def test_download_file(self, client):
        """Download from the ``client`` server and check ``size`` follows content.

        ``client`` is concatenated with a query string asking for a 200
        response with body ``test``. After the first download is verified on
        disk, the body is switched to ``test2`` and the second download must
        report ``size == len('test2')``.
        """
        query = '/?status=200&content=test&content-type=csv'
        url = client + query
        resource = self._test_resource(url)

        first = download(self.fake_context, resource)
        saved_path = first['saved_file']
        assert saved_path
        assert os.path.exists(saved_path)
        _remove_archived_file(first.get('saved_file'))

        # Same resource, different content: the size must be recomputed.
        resource['url'] = url.replace('content=test', 'content=test2')
        second = download(self.fake_context, resource)
        expected_size = len('test2')
        assert second['size'] == expected_size

        _remove_archived_file(second.get('saved_file'))
Ejemplo n.º 5
0
    def test_download_file(self, url):
        """Download a resource twice and check ``resource['size']`` is refreshed.

        The first download must produce a ``saved_file`` that exists on disk.
        After the URL content is changed to ``test2``, the second download is
        expected to have written the new size (as a unicode string) back into
        the ``resource`` dict.
        """
        # NOTE(review): self.fake_resource is modified in place here; if it is
        # shared between tests, a copy may be safer -- confirm with the fixture.
        resource = self.fake_resource
        resource['url'] = url

        result = download(self.fake_context, resource)

        assert result['saved_file']
        assert os.path.exists(result['saved_file'])
        self._remove_archived_file(result.get('saved_file'))

        # Modify the resource and check that the resource size gets updated
        resource['url'] = url.replace('content=test', 'content=test2')
        result = download(self.fake_context, resource)
        assert resource['size'] == unicode(len('test2')), resource['size']

        self._remove_archived_file(result.get('saved_file'))
Ejemplo n.º 6
0
    def test_download_file(self, url):
        """Check that ``download`` archives a file and updates ``resource['size']``.

        Runs two downloads of the same resource dict: the first verifies the
        archived file exists, the second (after the URL content is changed to
        ``test2``) verifies that ``resource['size']`` holds the new length as
        a unicode string.
        """
        # NOTE(review): this mutates self.fake_resource directly; verify that
        # no other test relies on its original 'url'.
        resource = self.fake_resource
        resource['url'] = url

        first = download(self.fake_context, resource)

        assert first['saved_file']
        assert os.path.exists(first['saved_file'])
        self._remove_archived_file(first.get('saved_file'))

        # Modify the resource and check that the resource size gets updated
        resource['url'] = url.replace('content=test', 'content=test2')
        second = download(self.fake_context, resource)
        assert resource['size'] == unicode(len('test2')), resource['size']

        self._remove_archived_file(second.get('saved_file'))
Ejemplo n.º 7
0
    def test_head_unsupported(self, url):
        """Check that downloading ``url`` yields a ``saved_file``.

        Historical note from the original author: this mattered more when the
        downloader issued HEAD requests; servers that answer HEAD badly are
        no longer an issue.
        """
        target = self._test_resource(url)

        # NOTE(review): the server behind `url` presumably answers HEAD with
        # 405 and only serves GET -- confirm against the test fixture.
        outcome = download(self.fake_context, target)
        assert outcome['saved_file']
Ejemplo n.º 8
0
    def test_head_unsupported(self, url):
        """Download ``url`` and require a truthy ``saved_file`` in the result.

        According to the original comments, the test dates from when HEAD
        requests were made before the GET; badly behaved HEAD responses are
        no longer a concern.
        """
        res = self._test_resource(url)

        # NOTE(review): a 405 on HEAD followed by a working GET is what the
        # original comment describes -- verify the fixture still does that.
        downloaded = download(self.fake_context, res)
        assert downloaded['saved_file']
Ejemplo n.º 9
0
    def test_head_unsupported(self, client):
        """Download from a ``method=get`` URL and require a ``saved_file``.

        The URL is ``client`` plus a query string requesting status 200,
        ``method=get``, body ``test`` and content type ``csv``. Per the
        original comments, this mattered more when HEAD requests were issued
        first; servers that respond badly to HEAD are no longer an issue.
        """
        query = '/?status=200&method=get&content=test&content-type=csv'
        url = client + query
        target = self._test_resource(url)

        # NOTE(review): `method=get` presumably makes the server reject HEAD
        # with 405 while still serving GET -- confirm against the fixture.
        outcome = download(self.fake_context, target)
        assert outcome['saved_file']
Ejemplo n.º 10
0
def _datastorer_upload(context, resource, logger):
    """Download ``resource``, retrying failed attempts up to five times.

    :param context: passed through to ``download``.
    :param resource: passed through to ``download``.
    :param logger: receives an error message before each retry.
    :raises ChooseNotToDownload: re-raised immediately, never retried.
    :raises Exception: whatever the last attempt raised, once all
        ``max_retries`` attempts have failed.
    """
    from ckanext.archiver.tasks import ChooseNotToDownload
    from time import sleep

    max_retries = 5
    for attempt in range(max_retries):
        try:
            result = download(context, resource, data_formats=DATA_FORMATS)
        except ChooseNotToDownload:
            raise
        except Exception as e:
            # The old check (`i < max_retries`) was always true inside
            # range(max_retries), so the final failure was swallowed and
            # `result` was left unbound. Re-raise on the last attempt instead.
            if attempt >= max_retries - 1:
                raise
            logger.error("Error while performing download: %r. Retrying...", e)
            # Linear back-off: 0s, 5s, 10s, 15s between attempts.
            sleep(5 * attempt)
        else:
            break
Ejemplo n.º 11
0
def _datastorer_upload(context, resource, logger):
    """Download a resource, parse it as a table and load it into the DataStore.

    Steps, in order: download the file, guess headers and column types from
    the first table, delete any existing datastore for the resource, upload
    the rows in chunks of 100 via ``datastore_create``, then post the
    resource to ``resource_update``.

    :param context: dict read for ``site_url`` and ``apikey``.
    :param resource: resource dict; ``id`` and ``format`` are read, and it is
        updated in place with ``webstore_url`` and ``webstore_last_updated``
        before being posted to ``resource_update``.
    :param logger: used for progress and error messages.
    :raises DatastorerException: if deleting the existing datastore fails, or
        if ``resource_update`` answers with a status other than 200/201.
    """
    result = download(context, resource, data_formats=DATA_FORMATS)

    # Keep only the media type, dropping parameters such as "; charset=...".
    content_type = result['headers'].get('content-type', '').split(';', 1)[0]

    # row_set.dicts() is consumed incrementally by the upload loop below, so
    # the handle stays open until that loop finishes. The `with` block then
    # closes it (also on errors); previously it was never closed.
    with open(result['saved_file'], 'rb') as f:
        table_sets = any_tableset(f,
                                  mimetype=content_type,
                                  extension=resource['format'].lower())

        # only first sheet in xls for time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        logger.info('Header offset: {0}.'.format(offset))

        guessed_types = type_guess(
            row_set.sample,
            [
                messytables.types.StringType,
                messytables.types.IntegerType,
                messytables.types.FloatType,
                messytables.types.DecimalType,
                messytables.types.DateUtilType,
            ],
            strict=True)
        logger.info('Guessed types: {0}'.format(guessed_types))
        row_set.register_processor(types_processor(guessed_types, strict=True))
        row_set.register_processor(stringify_processor())

        ckan_url = context['site_url'].rstrip('/')

        datastore_create_request_url = '%s/api/action/datastore_create' % (
            ckan_url)

        guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

        def send_request(data):
            """Post one chunk of records to ``datastore_create``."""
            request = {
                'resource_id': resource['id'],
                'fields': [
                    dict(id=name, type=typename)
                    for name, typename in zip(headers, guessed_type_names)
                ],
                'force': True,
                'records': data,
            }
            response = requests.post(
                datastore_create_request_url,
                data=json.dumps(request),
                headers={
                    'Content-Type': 'application/json',
                    'Authorization': context['apikey']
                },
            )
            check_response_and_retry(response, datastore_create_request_url,
                                     logger)

        # Delete any existing data before proceeding. Otherwise
        # 'datastore_create' will append to the existing datastore. And if the
        # fields have significantly changed, it may also fail.
        try:
            logger.info(
                'Deleting existing datastore (it may not exist): {0}.'.format(
                    resource['id']))
            response = requests.post(
                '%s/api/action/datastore_delete' % (ckan_url),
                data=json.dumps({
                    'resource_id': resource['id'],
                    'force': True
                }),
                headers={
                    'Content-Type': 'application/json',
                    'Authorization': context['apikey']
                })
            if not response.status_code or response.status_code not in (200, 404):
                # skips 200 (OK) or 404 (datastore does not exist, no need to
                # delete it)
                logger.error('Deleting existing datastore failed: {0}'.format(
                    get_response_error(response)))
                raise DatastorerException("Deleting existing datastore failed.")
        except requests.exceptions.RequestException as e:
            logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
            raise DatastorerException("Deleting existing datastore failed.")

        logger.info('Creating: {0}.'.format(resource['id']))

        def chunky(iterable, n):
            """Yield lists of at most ``n`` dicts built from ``iterable``."""
            it = iter(iterable)
            while True:
                # A comprehension instead of itertools.imap, which exists
                # only on Python 2; the resulting list is the same.
                chunk = [dict(row) for row in itertools.islice(it, n)]
                if not chunk:
                    return
                yield chunk

        count = 0
        for data in chunky(row_set.dicts(), 100):
            count += len(data)
            send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    response = requests.post(ckan_request_url,
                             data=json.dumps(resource),
                             headers={
                                 'Content-Type': 'application/json',
                                 'Authorization': context['apikey']
                             })

    if response.status_code not in (201, 200):
        raise DatastorerException(
            'Ckan bad response code (%s). Response was %s' %
            (response.status_code, response.content))
Ejemplo n.º 12
0
def _datastorer_upload(context, resource, logger):
    """Download a (possibly zipped) resource and load it into the DataStore.

    Steps, in order: download the file, open it (through
    ``open_zipped_tableset`` when it is a ZIP archive), guess headers and
    column types from the first table, delete any existing datastore for the
    resource, upload the rows in chunks of 100 via ``datastore_create``, then
    post the resource to ``resource_update``.

    :param context: dict read for ``site_url`` and ``apikey``; an optional
        ``sample_size`` sets the table set's ``window`` (never below 1000).
    :param resource: resource dict; ``id`` and ``format`` are read, and it is
        updated in place with ``webstore_url`` and ``webstore_last_updated``
        before being posted to ``resource_update``.
    :param logger: used for progress and error messages.
    :raises DatastorerException: if deleting the existing datastore fails, or
        if ``resource_update`` answers with a status other than 200/201.
    """
    result = download(context, resource, data_formats=DATA_FORMATS)
    logger.info('Downloaded resource %r' %(resource))

    # The response content type is deliberately not looked at: any_tableset
    # is called with the extension only, so the previously computed (and
    # unused) `content_type` local has been dropped.
    extension = resource['format'].lower()

    # Keep a reference to the handle we opened ourselves: `fp` may be rebound
    # to the archive entry below, and row_set.dicts() is consumed
    # incrementally by the upload loop, so the handle is closed only once
    # that loop is done (or on error). Previously it was never closed.
    raw_fp = open(result['saved_file'], 'rb')
    try:
        fp = raw_fp
        if zipfile.is_zipfile(result['saved_file']):
            # NOTE(review): ownership of the returned `fp` is not visible
            # here; only the handle opened above is closed by this function.
            fp, zf = open_zipped_tableset(fp, extension=extension)
            logger.info('Opened entry %s from ZIP archive %s', zf, result['saved_file'])
        else:
            logger.info('Opened file %s' %(result['saved_file']))

        table_sets = any_tableset(fp, extension=extension)

        if 'sample_size' in context:
            table_sets.window = max(1000, int(context['sample_size']))
            logger.info('Using a sample window of %d', table_sets.window)

        # only first sheet in xls for time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        logger.info('Header offset: {0}.'.format(offset))

        guessed_types = type_guess(
            row_set.sample,
            [
                messytables.types.StringType,
                messytables.types.IntegerType,
                messytables.types.FloatType,
                messytables.types.DecimalType,
                messytables.types.DateUtilType
            ],
            strict=True
        )
        logger.info('Guessed types: {0}'.format(guessed_types))
        row_set.register_processor(types_processor(guessed_types, strict=True))
        row_set.register_processor(stringify_processor())

        ckan_url = context['site_url'].rstrip('/')

        datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

        guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

        def send_request(data):
            """Post one chunk of records to ``datastore_create``."""
            request = {'resource_id': resource['id'],
                       'fields': [dict(id=name, type=typename)
                                  for name, typename in zip(headers, guessed_type_names)],
                       'force': True,
                       'records': data}
            response = requests.post(datastore_create_request_url,
                                     data=json.dumps(request),
                                     headers={'Content-Type': 'application/json',
                                              'Authorization': context['apikey']},
                                     )
            check_response_and_retry(response, datastore_create_request_url, logger)

        # Delete any existing data before proceeding. Otherwise
        # 'datastore_create' will append to the existing datastore. And if the
        # fields have significantly changed, it may also fail.
        try:
            logger.info('Deleting existing datastore (it may not exist): {0}.'.format(resource['id']))
            response = requests.post('%s/api/action/datastore_delete' % (ckan_url),
                                     data=json.dumps({'resource_id': resource['id'], 'force': True}),
                                     headers={'Content-Type': 'application/json',
                                              'Authorization': context['apikey']}
                                     )
            if not response.status_code or response.status_code not in (200, 404):
                # skips 200 (OK) or 404 (datastore does not exist, no need to
                # delete it)
                logger.error('Deleting existing datastore failed: {0}'.format(get_response_error(response)))
                raise DatastorerException("Deleting existing datastore failed.")
        except requests.exceptions.RequestException as e:
            logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
            raise DatastorerException("Deleting existing datastore failed.")

        logger.info('Creating: {0}.'.format(resource['id']))

        def chunky(iterable, n):
            """Yield lists of at most ``n`` dicts built from ``iterable``."""
            it = iter(iterable)
            while True:
                # A comprehension instead of itertools.imap, which exists
                # only on Python 2; the resulting list is the same.
                chunk = [dict(row) for row in itertools.islice(it, n)]
                if not chunk:
                    return
                yield chunk

        count = 0
        for data in chunky(row_set.dicts(), 100):
            count += len(data)
            send_request(data)
    finally:
        raw_fp.close()

    logger.info("There should be {n} entries in {res_id}.".format(n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    response = requests.post(
        ckan_request_url,
        data=json.dumps(resource),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                                  (response.status_code, response.content))
Ejemplo n.º 13
0
def _datastorer_upload(context, resource):
    """Download a CSV/XLS resource and bulk-load its rows into the webstore.

    Rows are posted to ``<site_url>/api/data/<resource id>/_bulk`` as
    newline-separated JSON (an ``index`` action line followed by the row),
    after which the resource is updated through ``resource_update`` with the
    webstore URL and a timestamp.

    :param context: dict read for ``site_url`` and ``apikey``.
    :param resource: resource dict; ``id`` and ``format`` are read.
    :raises WebstorerError: if ``resource_update`` answers with a status
        other than 200/201.
    """
    excel_types = ['xls', 'application/ms-excel', 'application/xls', 'application/vnd.ms-excel']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')

    # row_set.dicts() is consumed by the upload loop below, so the handle
    # stays open until that loop finishes. The `with` block then closes it
    # (also on errors); previously it was never closed.
    with open(result['saved_file'], 'rb') as f:
        if content_type in excel_types or resource['format'] in excel_types:
            table_sets = XLSTableSet.from_fileobj(f)
        else:
            table_sets = CSVTableSet.from_fileobj(f)

        # only first sheet in xls for time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        types = guess_types(list(row_set.dicts(sample=True)))
        # NOTE(review): offset_processor is registered a second time here;
        # kept as in the original -- confirm this is intentional.
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(types_processor(types))

        ckan_url = context['site_url'].rstrip('/')

        webstore_request_url = '%s/api/data/%s/' % (ckan_url,
                                                    resource['id']
                                                    )

        def send_request(data):
            """Post the accumulated bulk lines, newline-terminated."""
            return requests.post(webstore_request_url + '_bulk',
                                 data="%s%s" % ("\n".join(data), "\n"),
                                 headers={'Content-Type': 'application/json',
                                          'Authorization': context['apikey']},
                                 )

        data = []
        for count, dict_ in enumerate(row_set.dicts()):
            data.append(json.dumps({"index": {"_id": count + 1}}))
            data.append(json.dumps(dict_))
            # Flushes on the very first row (count == 0) and then after
            # every 100th row.
            if (count % 100) == 0:
                response = send_request(data)
                check_response_and_retry(response, webstore_request_url + '_mapping')
                data[:] = []

        if data:
            # Fixed: the result used to be bound to a misspelled name
            # (`respose`), so the check below inspected the response of the
            # previous batch instead of this final one.
            response = send_request(data)
            check_response_and_retry(response, webstore_request_url + '_mapping')

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url,
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']},
        )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )
Ejemplo n.º 14
0
def webstorer_upload(context, data):
    """Download a CSV/XLS resource and upload all of its rows to the webstore.

    The rows are read into memory, posted in a single request to
    ``<webstore_url>/<username>/<resource id>/data``, and the resource is then
    updated through ``resource_update`` with the webstore URL and a timestamp.

    :param context: JSON string; the decoded dict is read for
        ``webstore_url``, ``username``, ``apikey`` and ``site_url``.
    :param data: JSON string holding the resource dict; ``id`` and ``format``
        are read.
    :raises WebstorerError: if the webstore already reports content for this
        resource, if the data upload does not answer 201, or if
        ``resource_update`` answers with a status other than 200/201.
    """
    context = json.loads(context)
    resource = json.loads(data)

    excel_types = ['xls', 'application/ms-excel', 'application/xls']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')

    # Every row is materialised inside the `with` block, so the handle can be
    # closed before any network request is made; previously it was never
    # closed.
    with open(result['saved_file'], 'rb') as f:
        if content_type in excel_types or resource['format'] in excel_types:
            table_sets = XLSTableSet.from_fileobj(f)
        else:
            table_sets = CSVTableSet.from_fileobj(f)

        # only first sheet in xls for time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        types = guess_types(list(row_set.dicts(sample=True)))
        # NOTE(review): offset_processor is registered a second time here;
        # kept as in the original -- confirm this is intentional.
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(types_processor(types))

        rows = list(row_set.dicts())

    webstore_url = context.get('webstore_url').rstrip('/')

    webstore_request_url = '%s/%s/%s' % (webstore_url,
                                         context['username'],
                                         resource['id']
                                         )
    # check if resource is already there.
    webstore_response = requests.get(webstore_request_url+'.json')
    check_response_and_retry(webstore_response, webstore_request_url+'.json')

    # should be an empty list as no tables should be there.
    if json.loads(webstore_response.content):
        raise WebstorerError('Webstore already has this resource')

    response = requests.post(webstore_request_url+'/data',
                             data=json.dumps(rows),
                             headers={'Content-Type': 'application/json',
                                      'Authorization': context['apikey']},
                             )
    check_response_and_retry(response, webstore_request_url+'.json')
    if response.status_code != 201:
        raise WebstorerError('Websore bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )

    ckan_url = context['site_url'].rstrip('/')
    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url+'/data',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']},
        )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )
Ejemplo n.º 15
0
def _datastorer_upload(context, resource, logger):
    """Download a CSV/TSV/XLS resource and load it into the CKAN DataStore.

    Steps, in order: download the file, pick an XLS or CSV table set (tab
    delimiter for TSV types, comma otherwise), guess headers and column types
    from the first table, upload the rows in chunks of 100 via
    ``datastore_create``, then post an update to ``resource_update``.

    :param context: dict read for ``site_url`` and ``apikey``.
    :param resource: resource dict; ``id``, ``format`` and ``url`` are read.
    :param logger: used for progress messages.
    :raises DatastorerException: if ``resource_update`` answers with a status
        other than 200/201.
    """
    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']

    result = download(context, resource, data_formats=DATA_FORMATS)

    # Keep only the media type, dropping parameters such as "; charset=...".
    content_type = result['headers'].get('content-type', '').split(';', 1)[0]

    # row_set.dicts() is consumed incrementally by the upload loop below, so
    # the handle stays open until that loop finishes. The `with` block then
    # closes it (also on errors); previously it was never closed.
    with open(result['saved_file'], 'rb') as f:
        if content_type in excel_types or resource['format'] in excel_types:
            table_sets = XLSTableSet.from_fileobj(f)
        else:
            is_tsv = (content_type in tsv_types or
                      resource['format'] in tsv_types)
            delimiter = '\t' if is_tsv else ','
            table_sets = CSVTableSet.from_fileobj(f, delimiter=delimiter)

        # only first sheet in xls for time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        logger.info('Header offset: {0}.'.format(offset))

        guessed_types = type_guess(
            row_set.sample,
            [
                messytables.types.StringType,
                messytables.types.IntegerType,
                messytables.types.FloatType,
                messytables.types.DecimalType,
                messytables.types.DateUtilType
            ],
            strict=True
        )
        logger.info('Guessed types: {0}'.format(guessed_types))
        row_set.register_processor(types_processor(guessed_types, strict=True))
        row_set.register_processor(stringify_processor())

        ckan_url = context['site_url'].rstrip('/')

        datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

        guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

        def send_request(data):
            """Post one chunk of records to ``datastore_create``."""
            request = {'resource_id': resource['id'],
                       'fields': [dict(id=name, type=typename)
                                  for name, typename in zip(headers, guessed_type_names)],
                       'records': data}
            response = requests.post(datastore_create_request_url,
                                     data=json.dumps(request),
                                     headers={'Content-Type': 'application/json',
                                              'Authorization': context['apikey']},
                                     )
            check_response_and_retry(response, datastore_create_request_url, logger)

        logger.info('Creating: {0}.'.format(resource['id']))

        def chunky(iterable, n):
            """Yield lists of at most ``n`` dicts built from ``iterable``."""
            it = iter(iterable)
            while True:
                # A comprehension instead of itertools.imap, which exists
                # only on Python 2; the resulting list is the same.
                chunk = [dict(row) for row in itertools.islice(it, n)]
                if not chunk:
                    return
                yield chunk

        count = 0
        for data in chunky(row_set.dicts(), 100):
            count += len(data)
            send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat(),
        'url': resource['url']
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                                  (response.status_code, response.content))