Example #1
def chinook_summaries(input_path, output_path='/pfs/out'):
    dp = datapackage.DataPackage(
        descriptor=os.path.join(input_path, 'datapackage.json'))

    dp_out = datapackage.DataPackage()
    dp_out.descriptor['name'] = 'chinook-summary'
    dp_out.descriptor['title'] = 'chinook-summary'
    dp_out.descriptor['description'] = 'Summary Stats from Chinook DB'
    dp_out.descriptor['x-visibility'] = 'PRIVATE'
    dp_out.descriptor['licenses'] = [{'name': 'Other'}]
    dp_out.descriptor['resources'] = []
    for r in dp.descriptor['resources']:
        print('Processing {} with format {}'.format(r['path'], r['format']))
        if r.get('format', '') == 'csv':
            print('Attempting stats for {} with format {}'.format(
                r['path'], r['format']))
            df = pd.read_csv(os.path.join(dp.base_path, r['path']))
            try:
                stats = df.describe()
                plot_name = os.path.basename('{}.txt'.format(r['name']))
                with open(os.path.join(output_path, plot_name), 'w') as f:
                    f.write(str(stats))
                dp_out.descriptor['resources'].append({
                    'name': plot_name,
                    'path': plot_name
                })
                print('Done generating stats for {} with format {}'.format(
                    r['path'], r['format']))
            except Exception:
                print('Failed to generate stats for {} with format {}'.format(
                    r['path'], r['format']))

    with open(os.path.join(output_path, 'datapackage.json'), 'w') as f:
        f.write(dp_out.to_json())
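
The excerpt above omits its imports. A minimal driver sketch, assuming the usual imports and a hypothetical Pachyderm-style input path '/pfs/chinook':

import os
import datapackage
import pandas as pd

# Summarize the package mounted at the hypothetical path /pfs/chinook;
# stats files and a new datapackage.json land in the default /pfs/out.
chinook_summaries('/pfs/chinook')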
Example #2
def towns_and_counties():
    towns = datapackage.DataPackage(
        'https://raw.githubusercontent.com/CT-Data-Collaborative/ct-town-list/master/datapackage.json'
    )
    counties = datapackage.DataPackage(
        'https://raw.githubusercontent.com/CT-Data-Collaborative/ct-county-list/master/datapackage.json'
    )
    town_name_list = [t['Town'] for t in towns.resources[0].data]
    county_name_list = [c['County'] for c in counties.resources[0].data]
    return town_name_list + county_name_list
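
A minimal usage sketch for the function above, assuming network access to the two CT-Data-Collaborative URLs:

import datapackage

# Print every Connecticut town and county name fetched from the remote packages.
for name in towns_and_counties():
    print(name)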
Example #3
    def test_datapackage_only_requires_some_fields_to_be_valid(self):
        invalid_datapackage = datapackage.DataPackage({})
        valid_datapackage = datapackage.DataPackage({
            'name': 'gdp',
            'resources': [{
                'name': 'the-resource',
                'path': 'http://example.com/some-data.csv'
            }]
        })

        converter.datapackage_to_dataset(valid_datapackage)
        nose.tools.assert_raises(KeyError, converter.datapackage_to_dataset,
                                 invalid_datapackage)
Example #4
    def handle_datapackage(self, datapackage, parameters, stats):
        '''Create or update a ckan dataset from datapackage and parameters'''

        # core dataset properties
        dataset = {
            'title': '',
            'version': '',
            'state': 'active',
            'url': '',
            'notes': '',
            'license_id': '',
            'author': '',
            'author_email': '',
            'maintainer': '',
            'maintainer_email': '',
            'owner_org': None,
            'private': False
        }

        dp = datapackage_lib.DataPackage(datapackage)
        dataset.update(converter.datapackage_to_dataset(dp))

        self.__dataset_resources = dataset.get('resources', [])
        if self.__dataset_resources:
            del dataset['resources']

        # Merge dataset-properties from parameters into dataset.
        dataset_props_from_params = parameters.get('dataset-properties')
        if dataset_props_from_params:
            dataset.update(dataset_props_from_params)

        package_create_url = '{}/package_create'.format(self.__base_endpoint)

        response = make_ckan_request(package_create_url,
                                     method='POST',
                                     json=dataset,
                                     api_key=self.__ckan_api_key)

        ckan_error = get_ckan_error(response)
        if ckan_error \
           and parameters.get('overwrite_existing') \
           and 'That URL is already in use.' in ckan_error.get('name', []):

            package_update_url = \
                '{}/package_update'.format(self.__base_endpoint)

            log.info('CKAN dataset with url already exists. '
                     'Attempting package_update.')
            response = make_ckan_request(package_update_url,
                                         method='POST',
                                         json=dataset,
                                         api_key=self.__ckan_api_key)
            ckan_error = get_ckan_error(response)

        if ckan_error:
            log.error('CKAN returned an error: ' + json.dumps(ckan_error))
            raise Exception(json.dumps(ckan_error))

        if response['success']:
            self.__dataset_id = response['result']['id']
Example #5
 def test_base_path_is_set_to_base_url_when_datapackage_is_in_url(self):
     base_url = 'http://someplace.com/data'
     url = '{base_url}/datapackage.json'.format(base_url=base_url)
     body = '{}'
     httpretty.register_uri(httpretty.GET, url, body=body)
     dp = datapackage.DataPackage(url)
     assert dp.base_path == base_url
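
A self-contained sketch of the same check, assuming the httpretty package and an older datapackage release whose DataPackage accepts a URL and exposes base_path:

import httpretty
import datapackage

@httpretty.activate
def check_base_path_for_remote_package():
    base_url = 'http://someplace.com/data'
    # Serve an empty descriptor at the remote datapackage.json URL.
    httpretty.register_uri(httpretty.GET, base_url + '/datapackage.json', body='{}')
    dp = datapackage.DataPackage(base_url + '/datapackage.json')
    assert dp.base_path == base_url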
Example #6
 def test_descriptor_apply_defaults_resource_tabular_schema(self):
     descriptor = {
         'resources': [{
             'name': 'name',
             'data': 'data',
             'profile': 'tabular-data-resource',
             'schema': {
                 'fields': [{'name': 'name'}],
             }
         }],
     }
     dp = datapackage.DataPackage(descriptor)
     assert descriptor == {
         'profile': 'data-package',
         'resources': [{
             'name': 'name',
             'data': 'data',
             'profile': 'tabular-data-resource',
             'encoding': 'utf-8',
             'schema': {
                 'fields': [{'name': 'name', 'type': 'string', 'format': 'default'}],
                 'missingValues': [''],
             }
         }],
     }
Example #7
 def test_attributes_can_be_set(self):
     descriptor = {
         'profile': 'data-package',
     }
     dp = datapackage.DataPackage(descriptor)
     dp.descriptor['title'] = 'bar'
     assert dp.to_dict() == {'profile': 'data-package', 'title': 'bar'}
Example #8
    def setUp(self):
        # GIVEN datapackage with one resource
        invalid_dp = datapackage.DataPackage({
            "name": "some-datapackage",
            "resources": [
                {
                    "path": "invalid.csv",
                    "schema": {
                        "fields": [
                            {
                                "name": "Price", "type": "number"
                            },
                            {
                                # The CSV's second column must be named "Year" to satisfy this schema
                                "name": "Year", "type": "date", "format": "%Y"
                            }
                        ]
                    }
                }
            ]
        })
        patch('dpm.main.exists', lambda *a: True).start()
        patch('dpm.main.DataPackage', lambda *a: invalid_dp).start()

        # AND the resource file violates the schema (column 2 should be named 'Year')
        with open('invalid.csv', 'w') as f:
            f.write(
                'Price,Ugh\n'  # 'Ugh' != 'Year'
                '1,1980'
            )
Example #9
 def test_attributes_arent_immutable(self):
     metadata = {
         'keywords': [],
     }
     dp = datapackage.DataPackage(metadata)
     dp.metadata['keywords'].append('foo')
     assert dp.to_dict() == {'keywords': ['foo']}
Example #10
    def setUp(self):
        # GIVEN datapackage that can be treated as valid by the dpm
        self.valid_dp = datapackage.DataPackage(
            {
                "name":
                "some-datapackage",
                "resources": [{
                    "name": "some-resource",
                    "path": "./data/some_data.csv",
                }]
            },
            default_base_path='.')
        patch('dpm.client.DataPackage', lambda *a: self.valid_dp).start()
        patch('dpm.client.exists', lambda *a: True).start()

        # AND the registry server that accepts any user
        responses.add(responses.POST,
                      'https://example.com/api/auth/token',
                      json={'token': 'blabla'},
                      status=200)
        # AND registry server accepts deletion of any datapackage
        responses.add(responses.DELETE,
                      'https://example.com/api/package/user/some-datapackage',
                      json={'message': 'OK'},
                      status=200)
        # AND registry server accepts purging of any datapackage
        responses.add(
            responses.DELETE,
            'https://example.com/api/package/user/some-datapackage/purge',
            json={'message': 'OK'},
            status=200)
Example #11
def create(reader, out_dir, log_level=None):
    logger = get_logger("createdp.create", level=log_level)
    top_level_dict = reader.conf[config.TOP_LEVEL]
    try:
        name = top_level_dict["name"]
    except KeyError:
        raise ValueError("'name' is a required property")
    if not NAME_PATTERN.match(name):
        raise ValueError("invalid name: %r" % (name,))
    dp = datapackage.DataPackage()
    for k, v in top_level_dict.items():
        dp.descriptor[k] = v
    dp.descriptor['resources'] = []
    mkdir_p(out_dir)
    logger.info("writing to '%s'", out_dir)
    for a in "objects", "links":
        out_bn = "%s.csv" % a
        out_fn = os.path.join(out_dir, out_bn)
        df = getattr(reader, a)
        df.to_csv(out_fn, index=False, quoting=csv.QUOTE_NONE)
        if a == "objects":
            name = cmso.OBJECTS_TABLE
            infer_kwargs = {"primary_key": cmso.OBJECT_ID}
        else:
            name = cmso.LINKS_TABLE
            infer_kwargs = {}
        schema = infer_from_df(df, **infer_kwargs)
        if a == "links":
            schema['foreignKeys'] = FOREIGN_KEYS
        res = {"name": name, "path": out_bn, "schema": schema}
        dp.descriptor['resources'].append(res)
    with open(os.path.join(out_dir, 'dp.json'), 'w') as f:
        f.write(to_json(dp) + '\n')
    return dp
Example #12
def load_datapackage_file(datapackage_path):
    dp = datapackage.DataPackage(datapackage_path)
    dpdict = dp.descriptor

    # Determine what sort of author object we have. CTData only publishes one
    # author per dataset, but a valid datapackage.json may carry a list of
    # authors, so accept both forms and take the first author from a list.
    author = dpdict['author']
    if isinstance(author, list):
        author = author[0]

    upload_object = {'name': dpdict['name'], 'title': dpdict['title'],
                     'maintainer': author['name'],
                     'maintainer_email': author['email'],
                     'owner_org': dpdict['sources'][0]['name']}
    try:
        dp.validate()
    except datapackage.exceptions.ValidationError as e: 
        if e.instance == dpdict['author']:
            pass
        else:
            raise e
    upload_object['extras'] = get_extras_object(dpdict)
    return dpdict, upload_object
Example #13
 def test_attributes_arent_immutable(self):
     descriptor = {
         'keywords': [],
     }
     dp = datapackage.DataPackage(descriptor)
     dp.descriptor['keywords'].append('foo')
     assert dp.to_dict() == {'keywords': ['foo']}
Example #14
    def initialize_datapackage(self, config):
        """Create a datapackage or return the existing one along with it's path"""

        datapkg_file_path = config.get('datapackage_file', '')
        if not datapkg_file_path or not os.path.isabs(datapkg_file_path):
            datapkg_file_path = os.path.join(self.workspace_path,
                                             'datapackage.json')

        datapkg_file_path = os.path.abspath(datapkg_file_path)
        if not os.path.exists(datapkg_file_path):
            with io.open(datapkg_file_path, mode='w+',
                         encoding='utf-8') as new_datapkg:
                default_datapkg = utilities.get_default_datapackage()
                for resource in default_datapkg.resources:
                    resource_path = config.get(resource.descriptor['name'],
                                               resource.descriptor['path'])
                    resource.descriptor['path'] = os.path.join(
                        config['data_dir'], resource_path)
                json_datapkg = json.dumps(default_datapkg.to_dict(), indent=4)
                new_datapkg.write(compat.str(json_datapkg))
                print(
                    ('A new "datapackage.json" file has been created at {0}. '
                     'Please review and update it.'.format(datapkg_file_path)))
                return default_datapkg
        else:
            datapackage_check = DataPackageChecker(config)
            datapackage_check.run()
            return datapackage.DataPackage(datapkg_file_path)
Example #15
 def validate(self, dp):
     if isinstance(dp, datapackage.DataPackage):
         if not is_tabular(dp):
             raise ValueError("data package must be a tabular data package")
     else:
         dp = datapackage.DataPackage(dp, schema="tabular")
     dp.validate()
     self.logger.debug("valid tabular data package")
     if len(dp.resources) < 2:
         self.__error("data package must have at least two resources")
     res_map = dict((_.descriptor['name'], _) for _ in dp.resources)
     try:
         objects = res_map[cmso.OBJECTS_TABLE]
     except KeyError:
         self.__error("objects table not found")
     else:
         self.validate_objects(objects.descriptor)
     try:
         links = res_map[cmso.LINKS_TABLE]
     except KeyError:
         self.__error("links table not found")
     else:
         self.validate_links(links.descriptor)
     try:
         tracks = res_map[cmso.TRACKS_TABLE]
     except KeyError:
         pass
     else:
         self.validate_tracks(tracks.descriptor)
     return dp
Example #16
    def test_should_raise_if_zipfile_raised_LargeZipFile(
            self, zipfile_mock, tmpfile):
        zipfile_mock.side_effect = zipfile.LargeZipFile()
        dp = datapackage.DataPackage({}, {})

        with pytest.raises(datapackage.exceptions.DataPackageException):
            dp.save(tmpfile)
Example #17
 def test_attributes_can_be_set(self):
     metadata = {
         'name': 'foo',
     }
     dp = datapackage.DataPackage(metadata)
     dp.metadata['title'] = 'bar'
     assert dp.to_dict() == {'name': 'foo', 'title': 'bar'}
Example #18
 def __init__(self, config, **kwargs):
     self.config = config
     self.remotes = self.config['remotes']
     self.branch = self.config['branch']
     self.data_dir = self.config['data_dir']
     self.result_file = os.path.join(self.data_dir,
                                     self.config['result_file'])
     self.run_file = os.path.join(self.data_dir, self.config['run_file'])
     self.source_file = os.path.join(self.data_dir,
                                     self.config['source_file'])
     self.performance_file = os.path.join(self.data_dir,
                                          self.config['performance_file'])
     self.publisher_file = os.path.join(self.data_dir,
                                        self.config['publisher_file'])
     self.cache_dir = self.config['cache_dir']
     self.data_key = self.config['goodtables']['arguments']['batch'][
         'data_key']
     datapkg_file_path = self.config.get('datapackage_file',
                                         'datapackage.json')
     if not os.path.isabs(datapkg_file_path):
         datapkg_file_path = os.path.join(os.path.dirname(self.data_dir),
                                          datapkg_file_path)
     try:
         self.datapackage = datapackage.DataPackage(datapkg_file_path)
     except datapackage.exceptions.DataPackageException as e:
         raise ValueError(
             ('A datapackage couldn\'t be created because of the '
              'following error: "{0}". Make sure the file is not '
              'empty and use "dq init" command.').format(e))
     self.all_scores = []
Example #19
 def test_init_accepts_filelike_object(self):
     metadata = {
         'foo': 'bar',
     }
     filelike_metadata = six.StringIO(json.dumps(metadata))
     dp = datapackage.DataPackage(filelike_metadata)
     assert dp.metadata == metadata
Example #20
def validate(resource=None):

    datapackage.validate('datapackage/gtex-v8-datapackage.json')
    gtex_package = datapackage.DataPackage(
        'datapackage/gtex-v8-datapackage.json', strict=True)
    if resource != 'all':
        r = gtex_package.get_resource(resource)
        print(r.name)
        try:
            t = r.read()
        except CastError as ce:
            print('Hit cast error')
            for err in ce.errors:
                print(err)
            print(ce)
        except Exception as inst:
            print('Hit generic exception')
            print(type(inst))
            print(inst.args)
            print(inst)
    else:
        for r in gtex_package.resources:
            print(r.name)
            try:
                t = r.read()
            except CastError as ce:
                print('Hit cast error')
                print(ce.errors)
                print(ce)
            except Exception as inst:
                print('Hit generic exception')
                print(type(inst))
                print(inst.args)
                print(inst)
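
A possible command-line driver for the function above (hypothetical; not part of the original module):

if __name__ == '__main__':
    import sys

    # Validate one named resource, or every resource when 'all' is given.
    validate(sys.argv[1] if len(sys.argv) > 1 else 'all')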
Example #21
    def test_schema_gets_from_registry_if_available(self, registry_class_mock):
        schema = {'foo': 'bar'}
        registry_mock = mock.MagicMock()
        registry_mock.get.return_value = schema
        registry_class_mock.return_value = registry_mock

        assert datapackage.DataPackage().schema.to_dict() == schema
Example #22
    def __init__(self, descriptor_file):

        self._datapackage = datapackage.DataPackage(descriptor_file)

        self.__descriptor_file = descriptor_file
        self.__base_path = os.path.dirname(
            os.path.abspath(self.__descriptor_file))

        # Index resources by name
        self.__resources = {r.descriptor['name']: r
                            for r in self._datapackage.resources}
        self.__tabular_resources = {k: sanitize_resource_schema(r)
                                    for (k, r) in self.__resources.items()
                                    if type(r) is TabularResource and
                                    r.descriptor['path'].startswith('data')}
        self.__invalid_schemas = []  # Resource names with invalid schemas

        # All formats
        self.raw_data = LazyLoadedDict.from_keys(
            self.__resources.keys(),
            self._load_raw_data,
            'bytes')

        # Tabular formats
        self.tables = LazyLoadedDict.from_keys(
            self.__tabular_resources.keys(),
            self._load_table,
            type_hint='list of rows')
        self.dataframes = LazyLoadedDict.from_keys(
            self.__tabular_resources.keys(),
            self._load_dataframe,
            type_hint='pandas.DataFrame')
Example #23
 def test_init_accepts_filelike_object(self):
     descriptor = {
         'profile': 'data-package',
     }
     filelike_descriptor = six.StringIO(json.dumps(descriptor))
     dp = datapackage.DataPackage(filelike_descriptor)
     assert dp.descriptor == descriptor
Example #24
 def test_open_resource_url(self, mocklib_urlopen):
     dpkg = datapackage.DataPackage("tests/test.dpkg_url/")
     list(
         dpkg.data
     )  # Force the iteration over the iterable returned from data property.
     mocklib_urlopen.assert_called_once_with(
         'http://example.com/country-codes.csv')
Example #25
 def test_descriptor_apply_defaults_resource_tabular_dialect(self):
     descriptor = {
         'resources': [{
             'name': 'name',
             'data': 'data',
             'profile': 'tabular-data-resource',
             'dialect': {
                 'delimiter': 'custom',
             }
         }],
     }
     dp = datapackage.DataPackage(descriptor)
     assert descriptor == {
         'profile': 'data-package',
         'resources': [{
             'name': 'name',
             'data': 'data',
             'profile': 'tabular-data-resource',
             'encoding': 'utf-8',
             'dialect': {
                 'delimiter': 'custom',
                 'doubleQuote': True,
                 'lineTerminator': '\r\n',
                 'quoteChar': '"',
                 'escapeChar': '\\',
                 'skipInitialSpace': True,
                 'header': True,
                 'caseSensitiveHeader': False,
             }
         }],
     }
Example #26
 def test_open_resource_local(self):
     dpkg = datapackage.DataPackage("tests/test.dpkg_local/")
     with mocklib.patch('io.open') as mocklib_open:
         list(
             dpkg.data
         )  # Force the iteration over the iterable returned from data property.
         mocklib_open.assert_called_once()
Example #27
 def test_init_raises_if_path_is_a_bad_json(self):
     bad_json = test_helpers.fixture_path('bad_json.json')
     with pytest.raises(datapackage.exceptions.DataPackageException) as excinfo:
         datapackage.DataPackage(bad_json)
     message = str(excinfo.value)
     assert 'Unable to parse JSON' in message
     assert 'line 2 column 5 (char 6)' in message
Example #28
 def test_open_resource_encoding(self):
     dpkg = datapackage.DataPackage("tests/test.dpkg_local/")
     rows = list(
         dpkg.data
     )  # Force the iteration over the iterable returned from data property.
     # And make sure we were able to get some utf-8 data out of there
     assert 'Alg\xe9rie' == rows[2]['name_fr']
Example #29
    def __call__(self):
        url = self.parameters['url']
        dep_prefix = 'dependency://'
        if url.startswith(dep_prefix):
            dependency = url[len(dep_prefix):].strip()
            url = get_dependency_datapackage_url(dependency)
            assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency
        resource = self.parameters['resource']
        stream = self.parameters.get('stream', True)
        name_matcher = ResourceMatcher(resource) if isinstance(resource,
                                                               str) else None
        resource_index = resource if isinstance(resource, int) else None

        selected_resources = []
        found = False
        dp = datapackage.DataPackage(url)
        dp = self.process_datapackage(dp)
        for i, orig_res in enumerate(dp.resources):
            if resource_index == i or \
                    (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))):
                found = True
                orig_res.descriptor[PROP_STREAMED_FROM] = orig_res.source
                self.dp['resources'].append(orig_res.descriptor)
                if tabular(orig_res.descriptor) and stream:
                    orig_res.descriptor[PROP_STREAMING] = True
                    selected_resources.append(orig_res.iter(keyed=True))
                else:
                    orig_res.descriptor[PROP_STREAMING] = False

        assert found, "Failed to find resource with index or name matching %r" % resource
        spew(self.dp, itertools.chain(self.res_iter, selected_resources))
Example #30
 def test_works_with_resources_with_relative_paths(self, tmpfile):
     path = test_helpers.fixture_path(
         'datapackage_with_foo.txt_resource.json')
     dp = datapackage.DataPackage(path)
     dp.save(tmpfile)
     with zipfile.ZipFile(tmpfile, 'r') as z:
         assert len(z.filelist) == 2