Example #1
def datapackage_creator(location, title, name, source_title, source_path):
    package = Package()

    package.descriptor['title'] = title
    package.descriptor['name'] = name

    package.descriptor['sources'] = [{}]
    package.descriptor['sources'][0]['title'] = source_title
    package.descriptor['sources'][0]['path'] = source_path

    package.descriptor['licences'] = [{}]
    package.descriptor['licences'][0]['name'] = 'odc-pddl'
    package.descriptor['licences'][0]['title'] = 'Open Data Commons Public Domain Dedication and Licence (PDDL)'
    package.descriptor['licences'][0]['path'] = 'http://opendatacommons.org/licenses/pddl/'

    package.commit()
    package.infer(location + '/data/*.csv')
    package_json = package.descriptor
    del package_json['profile']

    for resource in package_json['resources']:
        resource['path'] = resource['path'][len(location) + 1:]

    if package.valid:
        with open(location + '/datapackage.json', 'w') as data_file:
            json.dump(package_json, data_file, indent=4, sort_keys=True)
        return True
    else:
        print('DATAPACKAGE IS NOT VALID')
        return False
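A minimal way to exercise this helper, assuming Package and json are already imported and that the location contains a data/ folder with at least one CSV; all values below are placeholders:

# Hypothetical call; the location and metadata values are placeholders.
ok = datapackage_creator(
    location='/tmp/my-dataset',                  # must contain data/*.csv
    title='My Dataset',
    name='my-dataset',
    source_title='Example Source',
    source_path='https://example.org/source',
)
print(ok)  # True if the package validated and datapackage.json was written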
Example #2
def package_from_resources(resource_path, output_path, clean=True):
    """ Collects resource descriptors and merges them in a datapackage.json

    Parameters
    ----------
    resource_path: string
        Path to directory with resources (in .json format)
    output_path: string
        Root path of datapackage where the newly created datapckage.json is
        stored
    clean: boolean
        If true, resources will be deleted
    """
    p = Package()

    p.descriptor["profile"] = "tabular-data-package"
    p.commit()

    for f in os.listdir(resource_path):
        path = os.path.join(resource_path, f)

        r = Resource(path)

        p.add_resource(r.descriptor)

        p.commit()

        os.remove(path)

    if clean:
        os.rmdir(resource_path)

    p.save(os.path.join(output_path, "datapackage.json"))
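A usage sketch, assuming a resources/ directory of per-resource JSON descriptors and that os, Package and Resource are already imported; paths are illustrative:

package_from_resources(
    resource_path="resources",  # directory of *.json resource descriptors
    output_path=".",            # datapackage.json is written here
    clean=True,                 # also remove the emptied resources/ directory
)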
Example #3
def test_changing_resources_in_descriptor_changes_datapackage():
    descriptor = {
        'resources': [
            {'data': '万事开头难'}
        ]
    }
    package = Package(descriptor)
    package.descriptor['resources'][0]['name'] = 'saying'
    package.commit()
    assert package.descriptor['resources'][0]['name'] == 'saying'
Example #5
def _make_package(source, publisher, config):

    os.chdir(source)
    files = [f for f in os.listdir('data') if f.endswith('.csv')]
    package = Package({'publisher': publisher})

    for f in files:
        path = f"data/{f}"
        name = f.replace('.csv', '')
        schema = f"https://raw.githubusercontent.com/digital-land/alpha-data/master/schema/{name}-schema.json"
        resource = Resource({'path': path, 'schema': schema})
        package.add_resource(resource.descriptor)

    package.commit()
    package.infer()

    errors = False
    for r in package.resources:
        try:
            r.read(keyed=True)
            r.check_relations()
        except (CastError, RelationError) as e:
            print('Error in', os.path.join(source, r.descriptor['path']))
            print(e, e.errors)
            errors = True
    if not errors:
        package.save('datapackage.zip')
        print('saved datapackage.json to', source)

        s3 = boto3.client(
            's3',
            aws_access_key_id=config['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=config['AWS_SECRET_ACCESS_KEY'])

        bucket = 'developer-contributions-datapackages'
        key = f'{publisher}/{uuid.uuid4()}/datapackage.zip'
        s3.upload_file(f'{source}/datapackage.zip',
                       bucket,
                       key,
                       ExtraArgs={'ACL': 'public-read'})

        config = s3._client_config
        config.signature_version = botocore.UNSIGNED

        datapackage_url = boto3.resource(
            's3',
            config=config).meta.client.generate_presigned_url('get_object',
                                                              ExpiresIn=0,
                                                              Params={
                                                                  'Bucket':
                                                                  bucket,
                                                                  'Key': key
                                                              })

        return datapackage_url
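A hedged sketch of a call; the credentials and paths are placeholders, and the function also expects boto3, botocore, uuid, os and the datapackage imports plus a data/ folder of CSVs under source:

config = {
    'AWS_ACCESS_KEY_ID': '<access-key-id>',          # placeholder, not a real key
    'AWS_SECRET_ACCESS_KEY': '<secret-access-key>',  # placeholder
}
url = _make_package(source='/path/to/source', publisher='example-publisher', config=config)
if url:
    print('uploaded datapackage.zip, available at', url)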
Example #6
def test_can_add_resource_to_descriptor_in_place():
    resource = {
        'data': '万事开头难',
    }
    package = Package()
    resources = package.descriptor.get('resources', [])
    resources.append(resource)
    package.descriptor['resources'] = resources
    package.commit()
    assert len(package.resources) == 1
    assert package.resources[0].source == '万事开头难'
Example #8
 def process_datapackage(self, dp: Package):
     super().process_datapackage(dp)
     descriptor = dp.descriptor
     source: DataStream
     for source in self.sources:
         res1 = descriptor.pop('resources', [])
         res2 = source.dp.descriptor['resources']
         descriptor.update(source.dp.descriptor)
         descriptor['resources'] = res1 + res2
     dp.commit()
     return dp
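The same descriptor-merge idea, illustrated outside the processor with two plain in-memory packages (a standalone sketch, not the processor's actual invocation inside a dataflows pipeline):

from datapackage import Package

a = Package({'name': 'a', 'resources': [{'name': 'r1', 'data': [{'x': 1}]}]})
b = Package({'name': 'b', 'resources': [{'name': 'r2', 'data': [{'x': 2}]}]})

merged = dict(a.descriptor)              # start from the first descriptor
res1 = merged.pop('resources', [])
merged.update(b.descriptor)              # later sources win on metadata
merged['resources'] = res1 + b.descriptor['resources']

combined = Package(merged)
combined.commit()
print([r.name for r in combined.resources])  # ['r1', 'r2']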
Example #9
def test_can_remove_resource_from_descriptor_in_place():
    descriptor = {
        'resources': [
            {'data': '万事开头难'},
            {'data': 'All beginnings are hard'}
        ]
    }
    package = Package(descriptor)
    del package.descriptor['resources'][1]
    package.commit()
    assert len(package.resources) == 1
    assert package.resources[0].source == '万事开头难'
Example #11
 def convert_hdx_dataset(self, dataset_id, path):
     dataset = Dataset.read_from_hdx(dataset_id)
     package = Package({'id': dataset['id'], 'name': dataset['name'], 'title': dataset['title'],
                        'description': dataset['notes']})
     for hdx_resource in dataset.get_resources():
         name = hdx_resource['name'].lower().replace(' ', '_')
         package.add_resource({'name': name, 'path': hdx_resource['url'],
                               'format': hdx_resource['format'].lower(), 'title': hdx_resource['description']})
     try:
         package.infer()
     except tabulator.exceptions.FormatError:
         pass
     for frictionless_resource in package.descriptor['resources']:
         self.convert_hxl_url(frictionless_resource)
     package.commit()
     package.save(path)
Example #12
def update_package_descriptor():
    """
    """
    p = Package("datapackage.json")

    for f in os.listdir("resources"):
        path = os.path.join("resources", f)

        r = Resource(path)

        p.add_resource(r.descriptor)

        p.commit()

        os.remove(path)

    os.rmdir("resources")

    p.save("datapackage.json")
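This helper takes no arguments; a sketch of running it from the root of a datapackage that already has a datapackage.json and a resources/ directory of descriptors, with os, Package and Resource imported:

update_package_descriptor()  # rewrites ./datapackage.json and removes ./resources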
Example #13
def datapackage_creator(location, title, name, source_title, source_path):
    package = Package()

    package.descriptor['title'] = title
    package.descriptor['name'] = name

    package.descriptor['sources'] = [{}]
    package.descriptor['sources'][0]['title'] = source_title
    package.descriptor['sources'][0]['path'] = source_path

    package.descriptor['licences'] = [{}]
    package.descriptor['licences'][0]['name'] = 'odc-pddl'
    package.descriptor['licences'][0]['title'] = 'Open Data Commons Public Domain Dedication and Licence (PDDL)'
    package.descriptor['licences'][0]['path'] = 'http://opendatacommons.org/licenses/pddl/'

    package.commit()
    package.infer(location + '/data/*.csv')
    package_json = package.descriptor
    del package_json['profile']

    with open(location + '/datapackage.json', 'w') as data_file:
        json.dump(package_json, data_file, indent=4, sort_keys=True)
Example #14
def datapackage_creator(location, title, name, source_title, source_path):
    package = Package()

    package.descriptor['title'] = title
    package.descriptor['name'] = name

    package.descriptor['sources'] = [{}]
    package.descriptor['sources'][0]['title'] = source_title
    package.descriptor['sources'][0]['path'] = source_path

    package.descriptor['licences'] = [{}]
    package.descriptor['licences'][0]['name'] = 'odc-pddl'
    package.descriptor['licences'][0][
        'title'] = 'Open Data Commons Public Domain Dedication and Licence (PDDL)'
    package.descriptor['licences'][0][
        'path'] = 'http://opendatacommons.org/licenses/pddl/'

    package.commit()
    package.infer(location + '/data/*.csv')
    package_json = package.descriptor
    del package_json['profile']

    with open(location + '/datapackage.json', 'w') as data_file:
        json.dump(package_json, data_file, indent=4, sort_keys=True)
Example #15
def temporal_clustering(datapackage, n, path="/tmp", how="daily"):
    """ Creates a new datapackage by aggregating sequences inside the
    `sequence` folder of the specified datapackage by clustering `n` timesteps

    Parameters
    ----------
    datapackage: string
        String of meta data file datapackage.json
    n: integer
        Number of clusters
    path: string
        Path to directory where the aggregated datapackage is stored
    how: string
        How to cluster 'daily' or 'hourly'
    """
    if how == "weekly":
        raise NotImplementedError("Weekly clustering is not implemented!")

    p = Package(datapackage)

    cwd = os.getcwd()

    copied_package_name = (p.descriptor["name"] + "__temporal_cluster__" +
                           how + "_" + str(n))

    copy_path = os.path.join(path, p.descriptor["name"], copied_package_name)

    copied_root = copy_datapackage(datapackage,
                                   os.path.abspath(copy_path),
                                   subset="data")

    sequence_resources = [
        r for r in p.resources
        if re.match(r"^data/sequences/.*$", r.descriptor["path"])
    ]

    dfs = {
        r.name:
        pd.DataFrame(r.read(keyed="True")).set_index("timeindex").astype(float)
        for r in sequence_resources
    }
    sequences = pd.concat(dfs.values(), axis=1)

    if how == "daily":
        hoursPerPeriod = 24
    elif how == "hourly":
        hoursPerPeriod = 1
    elif how == "weekly":
        hoursPerPeriod = 24 * 7

    aggregation = tsam.TimeSeriesAggregation(
        sequences,
        noTypicalPeriods=n,
        rescaleClusterPeriods=False,
        hoursPerPeriod=hoursPerPeriod,
        clusterMethod="hierarchical",
    )

    cluster_weights = {
        aggregation.clusterCenterIndices[n]: w
        for n, w in aggregation.clusterPeriodNoOccur.items()
    }
    if how == "daily":
        temporal = pd.Series(
            {
                d: cluster_weights[d.dayofyear]
                for d in sequences.index
                if d.dayofyear in aggregation.clusterCenterIndices
            },
            name="weighting",
        )
        temporal.index.name = "timeindex"

    elif how == "hourly":
        temporal = pd.Series(
            {
                h: cluster_weights[sequences.index.get_loc(h)]
                for h in sequences.index if sequences.index.get_loc(h) in
                aggregation.clusterCenterIndices
            },
            name="weighting",
        )
        temporal.index.name = "timeindex"

    # write resources to the copied package (should not interfere with the
    # meta data, as columns are neither removed nor sorted when written)
    os.chdir(copied_root)
    for r in sequence_resources:
        write_sequences(r.name + ".csv",
                        dfs[r.name].loc[temporal.index],
                        replace=True)

    # write temporal information from clustering
    temporal.to_csv(
        "data/temporal.csv",
        header=True,
        sep=";",
        date_format="%Y-%m-%dT%H:%M:%SZ",
    )
    # add meta data for new temporal information
    r = Resource({"path": "data/temporal.csv"})
    r.infer()
    # TODO: Add meta-data description
    r.descriptor[
        "description"] = "Temporal selection based on hierarchical clustering..."

    # Update meta-data of copied package
    cp = Package("datapackage.json")
    cp.descriptor["name"] = copied_package_name
    cp.descriptor["resources"].append(r.descriptor)
    cp.commit()
    cp.save("datapackage.json")

    # set back to 'old' workdirectory
    os.chdir(cwd)

    return copied_root
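A usage sketch, assuming an oemof-tabular style datapackage with CSVs under data/sequences/; the datapackage path is a placeholder:

clustered_root = temporal_clustering(
    "path/to/datapackage.json",
    n=10,            # number of typical periods (clusters)
    path="/tmp",     # where the aggregated copy is written
    how="daily",
)
print(clustered_root)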
Example #16
    fields[i]["description"] = desc_list[i]

display(merops_pkg.descriptor["resources"][0]["schema"]["fields"])

# Add additional metadata to package.
merops_pkg.descriptor["keywords"] = ["peptide",
                                     "protein",
                                     "peptidase",
                                     "proteinase",
                                     "protease",
                                     "bioinformatics",
                                     "protein informatics",
                                     "MEROPS",
                                     "cleavage",
                                     "proteolysis"]

merops_pkg.descriptor["title"] = "Human peptidase families"
merops_pkg.descriptor["contributors"] = {"title": "JRMA Maasch",
                                         "role": "author"}
merops_pkg.descriptor["licenses"] = [{"name": "CC0-1.0",
                                      "title": "CC0 1.0",
                                      "path": "https://creativecommons.org/publicdomain/zero/1.0/"}]
merops_pkg.descriptor["description"] = "A dataset of human peptidase families as scraped from the MEROPS Peptidase Database in June 2020 (https://www.ebi.ac.uk/merops/index.shtml)."

# Display updated package.
display(merops_pkg.descriptor)

# Save data package.
merops_pkg.commit()
merops_pkg.save("merops_data_pkg.zip")
Example #17
def generate_package(path_to_package):
    """Creates a datapackage in folder ``path_to_package``

    [{'fields': 'REGION', 'reference': {'resource': 'REGION', 'fields': 'VALUE'}}]
    """

    datapath = os.path.join(path_to_package)
    package = Package(base_path=datapath)

    package.infer("data/*.csv")

    package.descriptor["licenses"] = [{
        "name":
        "CC-BY-4.0",
        "path":
        "https://creativecommons.org/licenses/by/4.0/",
        "title":
        "Creative Commons Attribution 4.0",
    }]

    package.descriptor["title"] = "The OSeMOSYS Simplicity Example Model"

    package.descriptor["name"] = "osemosys_model_simplicity"

    package.descriptor["contributors"] = [{
        "title": "Will Usher",
        "email": "*****@*****.**",
        "path": "http://www.kth.se/wusher",
        "role": "author",
    }]

    package.commit()

    config = read_packaged_file("config.yaml", "otoole.preprocess")

    new_resources = []
    for resource in package.resources:

        descriptor = resource.descriptor

        name = resource.name
        if config[name]["type"] == "param":

            indices = config[name]["indices"]
            logger.debug("Indices of %s are %s", name, indices)

            foreign_keys = []
            for index in indices:
                key = {
                    "fields": index,
                    "reference": {
                        "resource": index,
                        "fields": "VALUE"
                    },
                }
                foreign_keys.append(key)

            descriptor["schema"]["foreignKeys"] = foreign_keys
            descriptor["schema"]["primaryKey"] = indices
            descriptor["schema"]["missingValues"] = [""]

        new_resources.append(descriptor)

    package.descriptor["resources"] = new_resources
    package.commit()

    filepath = os.path.join(path_to_package, "datapackage.json")
    package.save(filepath)
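A brief usage sketch; the directory is a placeholder and must contain a data/ folder of CSVs matching otoole's packaged config.yaml:

generate_package("path/to/simplicity")  # writes path/to/simplicity/datapackage.json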
Example #18
class DataStreamProcessor:
    def __init__(self):
        self.stats = {}
        self.source = None
        self.datapackage = None
        self.position = None

    def __call__(self, source=None, position=None):
        if source is None:
            source = DataStream()
        self.source = source
        self.position = position
        return self

    def process_resource(self, resource: ResourceWrapper):
        for row in resource:
            yield self.process_row(row)

    def process_resources(self, resources):
        for res in resources:
            yield self.process_resource(res)

    def process_row(self, row):
        return row

    def process_datapackage(self, dp: Package):
        return dp

    def get_res(self, current_dp, name):
        ret = self.datapackage.get_resource(name)
        if ret is None:
            ret = current_dp.get_resource(name)
        assert ret is not None
        return ret

    def get_iterator(self, datastream):
        current_dp = datastream.dp
        res_iter_ = datastream.res_iter

        def func():
            res_iter = (ResourceWrapper(self.get_res(current_dp, rw.res.name),
                                        rw.it) for rw in res_iter_)
            res_iter = self.process_resources(res_iter)
            res_iter = (it if isinstance(it, ResourceWrapper) else
                        ResourceWrapper(res, it)
                        for res, it in itertools.zip_longest(
                            self.datapackage.resources, res_iter))
            return res_iter

        return func

    def _process(self):
        datastream = self.source._process()

        try:
            self.datapackage = Package(
                descriptor=copy.deepcopy(datastream.dp.descriptor))
            self.datapackage = self.process_datapackage(self.datapackage)
            self.datapackage.commit()

            return DataStream(self.datapackage,
                              LazyIterator(self.get_iterator(datastream)),
                              datastream.stats + [self.stats])
        except Exception as exception:
            self.raise_exception(exception)

    def raise_exception(self, cause):
        if not isinstance(cause, exceptions.ProcessorError):
            error = exceptions.ProcessorError(
                cause,
                processor_name=self.__class__.__name__,
                processor_object=self,
                processor_position=self.position)
            raise error from cause
        raise cause

    def safe_process(self, on_error=None):
        results = []
        try:
            ds = self._process()
            for res in ds.res_iter:
                if on_error is not None:
                    results.append(
                        list(schema_validator(res.res, res,
                                              on_error=on_error)))
                else:
                    collections.deque(res, maxlen=0)
        except UniqueKeyError as e:
            self.raise_exception(e)
        except CastError as e:
            for err in e.errors:
                logging.error('%s', err)
        except Exception as exception:
            self.raise_exception(exception)
        return ds, results

    def process(self):
        ds, _ = self.safe_process()
        return ds.dp, ds.merge_stats()

    def results(self, on_error=None):
        if on_error is None:
            on_error = raise_exception
        ds, results = self.safe_process(on_error=on_error)
        return results, ds.dp, ds.merge_stats()
Example #19
source = {
    'name': 'Rothamsted electronic archive (e-RA)',
    'web': 'http://www.era.rothamsted.ac.uk/Broadbalk'
}

package.descriptor['licenses'] = [licence]
package.descriptor['publishers'] = [publisher]
package.descriptor['maintainers'] = [maintainer]
package.descriptor['contributors'] = [contributor]
package.descriptor['sources'] = [source]

spatialCoverage = {
    '@type': 'Place',
    'geo': {
        '@type': 'GeoCoordinates',
        'latitude': '51.809450',
        'longitude': '-0.372898'
    }
}
package.descriptor['spatialCoverage'] = spatialCoverage
package.descriptor['latitude'] = '51.809450'
package.descriptor['longitude'] = '-0.372898'
package.descriptor['altitude'] = '130'
package.descriptor['startYear'] = '1968'
package.descriptor['endYear'] = '2018'

package.commit()
package.valid
print('done')
package.save('broadbalkWheatData.zip')
Example #20
class DataStreamProcessor:
    def __init__(self):
        self.stats = {}
        self.source = None
        self.datapackage = None

    def __call__(self, source=None):
        if source is None:
            source = DataStream()
        self.source = source
        return self

    def process_resource(self, resource: ResourceWrapper):
        for row in resource:
            yield self.process_row(row)

    def process_resources(self, resources):
        for res in resources:
            yield self.process_resource(res)

    def process_row(self, row):
        return row

    def process_datapackage(self, dp: Package):
        return dp

    def get_res(self, current_dp, name):
        ret = self.datapackage.get_resource(name)
        if ret is None:
            ret = current_dp.get_resource(name)
        assert ret is not None
        return ret

    def get_iterator(self, datastream):
        current_dp = datastream.dp
        res_iter_ = datastream.res_iter

        def func():
            res_iter = (ResourceWrapper(self.get_res(current_dp, rw.res.name),
                                        rw.it) for rw in res_iter_)
            res_iter = self.process_resources(res_iter)
            res_iter = (it if isinstance(it, ResourceWrapper) else
                        ResourceWrapper(res, it)
                        for res, it in itertools.zip_longest(
                            self.datapackage.resources, res_iter))
            return res_iter

        return func

    def _process(self):
        datastream = self.source._process()

        self.datapackage = Package(
            descriptor=copy.deepcopy(datastream.dp.descriptor))
        self.datapackage = self.process_datapackage(self.datapackage)
        self.datapackage.commit()

        return DataStream(self.datapackage,
                          LazyIterator(self.get_iterator(datastream)),
                          datastream.stats + [self.stats])

    def process(self):
        ds = self._process()
        try:
            for res in ds.res_iter:
                collections.deque(res, maxlen=0)
        except CastError as e:
            for err in e.errors:
                logging.error('%s', err)
        return ds.dp, ds.merge_stats()

    def results(self, on_error=None):
        ds = self._process()
        results = [
            list(schema_validator(res.res, res, on_error=on_error))
            for res in ds.res_iter
        ]
        return results, ds.dp, ds.merge_stats()
Example #21
def temporal_skip(datapackage, n, path="/tmp", name=None, *args):
    """ Creates a new datapackage by aggregating sequences inside the
    `sequence` folder of the specified datapackage by skipping `n` timesteps

    Parameters
    ----------
    datapackage: string
        String of meta data file datapackage.json
    n: integer
        Number of timesteps to skip
    path: string
        Path to directory where the aggregated datapackage is stored
    name: string
        Name of the new, aggregated datapackage. If not specified a name will
        be given
    """
    p = Package(datapackage)

    cwd = os.getcwd()

    if name is None:
        copied_package_name = (p.descriptor["name"] + "__temporal_skip__" +
                               str(n))
    else:
        copied_package_name = name

    copy_path = os.path.join(path, copied_package_name)

    copied_root = copy_datapackage(datapackage,
                                   os.path.abspath(copy_path),
                                   subset="data")

    sequence_resources = [
        r for r in p.resources
        if re.match(r"^data/sequences/.*$", r.descriptor["path"])
    ]

    dfs = {
        r.name:
        pd.DataFrame(r.read(keyed="True")).set_index("timeindex").astype(float)
        for r in sequence_resources
    }
    sequences = pd.concat(dfs.values(), axis=1)

    skip_sequences = sequences.loc[::n]

    temporal = pd.Series(data=n, index=skip_sequences.index, name="weighting")
    temporal.index.name = "timeindex"

    os.chdir(copied_root)

    for r in sequence_resources:
        write_sequences(r.name + ".csv",
                        dfs[r.name].loc[temporal.index],
                        replace=True)

    # write temporal information (weighting) for the kept timesteps
    temporal.to_csv(
        "data/temporal.csv",
        header=True,
        sep=";",
        date_format="%Y-%m-%dT%H:%M:%SZ",
    )
    # add meta data for new temporal information
    r = Resource({"path": "data/temporal.csv"})
    r.infer()

    r.descriptor[
        "description"] = "Temporal selection based on skipped timesteps. Skipped n={}".format(
            n)

    # Update meta-data of copied package
    cp = Package("datapackage.json")
    cp.descriptor["name"] = copied_package_name
    cp.descriptor["resources"].append(r.descriptor)
    cp.commit()
    cp.save("datapackage.json")

    # set back to 'old' workdirectory
    os.chdir(cwd)

    return copied_root
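A usage sketch with a placeholder path; here every 4th timestep of the source package is kept:

skipped_root = temporal_skip("path/to/datapackage.json", n=4, path="/tmp")
print(skipped_root)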
Example #22
def infer_metadata(
    package_name="default-name",
    keep_resources=False,
    foreign_keys={
        "bus": [
            "volatile",
            "dispatchable",
            "storage",
            "load",
            "reservoir",
            "shortage",
            "excess",
        ],
        "profile": ["load", "volatile", "ror"],
        "from_to_bus": ["connection", "line", "conversion"],
        "chp": ["backpressure", "extraction", "chp"],
    },
    path=None,
):
    """ Add basic meta data for a datapackage

    Parameters
    ----------
    package_name: string
        Name of the data package
    keep_resource: boolean
        Flag indicating of the resources meta data json-files should be kept
        after main datapackage.json is created. The reource meta data will
        be stored in the `resources` directory.
    foreign_keys: dict
        Dictionary with foreign key specification. Keys for dictionary are:
        'bus', 'profile', 'from_to_bus'. Values are list with
        strings with the name of the resources
    path: string
        Absoltue path to root-folder of the datapackage
    """
    current_path = os.getcwd()
    if path:
        print("Setting current work directory to {}".format(path))
        os.chdir(path)

    p = Package()
    p.descriptor["name"] = package_name
    p.descriptor["profile"] = "tabular-data-package"
    p.commit()
    if not os.path.exists("resources"):
        os.makedirs("resources")

    # create meta data resources for elements
    if not os.path.exists("data/elements"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/elements"):
            r = Resource({"path": os.path.join("data/elements", f)})
            r.infer()
            r.descriptor["schema"]["primaryKey"] = "name"

            if r.name in foreign_keys.get("bus", []):
                r.descriptor["schema"]["foreignKeys"] = [{
                    "fields": "bus",
                    "reference": {
                        "resource": "bus",
                        "fields": "name"
                    },
                }]

                if r.name in foreign_keys.get("profile", []):
                    r.descriptor["schema"]["foreignKeys"].append({
                        "fields": "profile",
                        "reference": {
                            "resource": r.name + "_profile"
                        },
                    })

            elif r.name in foreign_keys.get("from_to_bus", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "from_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                    {
                        "fields": "to_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                ]

            elif r.name in foreign_keys.get("chp", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "fuel_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                    {
                        "fields": "electricity_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                    {
                        "fields": "heat_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                ]

            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    # create meta data resources for sequences
    if not os.path.exists("data/sequences"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/sequences"):
            r = Resource({"path": os.path.join("data/sequences", f)})
            r.infer()
            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    p.commit()
    p.save("datapackage.json")

    if not keep_resources:
        shutil.rmtree("resources")

    os.chdir(current_path)
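A usage sketch with placeholder values; the given path is expected to contain data/elements/ and/or data/sequences/ CSV folders:

infer_metadata(
    package_name="my-energy-system",
    keep_resources=False,
    path="/path/to/package-root",
)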
Example #23
# write each tracked kpi to its own csv file
for kpid in kpis.keys():
  filename = 'data/kpis/' + kpis[kpid][-1] + '.csv'
  with open(filename, "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerows(data[kpid])

##
# Package KPIS (Subsets)
##
os.chdir(cwd + '/data/kpis')
kpiPackage = Package()
kpiPackage.infer('*.csv')
kpiPackage.descriptor['name'] = 'montreal-kpis'
kpiPackage.descriptor['license'] = 'https://creativecommons.org/publicdomain/zero/1.0/'
kpiPackage.commit()
kpiPackage.save(cwd + '/data/kpis/datapackage.json')
kpiPackage.save(cwd + '/data/kpis/datapackage.zip')

##
# Package Indicators (Master)
##
os.chdir(cwd + '/data/indicators')
indPackage = Package()
indPackage.basePath = cwd + '/data/indicators'
indPackage.infer(cwd + '/data/indicators/*.csv')
indPackage.descriptor['name'] = 'montreal-indicators'
indPackage.descriptor['license'] = 'https://creativecommons.org/publicdomain/zero/1.0/'
indPackage.commit()
indPackage.name = "montreal-city-indicators"
indPackage.save(cwd + '/data/indicators/datapackage.json')
Example #24
def build(config: Dict) -> Package:
    """Builds a datapackage.Datapackage object from a config dictionary.

    The configuration dictionary should contain the following keys:
    "metadata", "files".

    Information about the corresponding study can be placed in metadata.
    Example:
        {
            'metadata': {
                'name': 'ddionrails-study',
                'id': 'doi'
            }
        }
    The desired files to be included in the Tabular Data Package can be placed in 'files':
    Example:
        {
            'files': [
                'concepts.csv'
            ]
        }

    See: examples/example-config.yml

    The resulting Tabular Data Package is written to disk as 'datapackage.json' in
    the directory from which the command line tool is run.

    Args:
        config: The configuration of the Datapackage to be created.

    """

    if "metadata" not in config or "files" not in config:
        raise ValueError("Config must contain 'metadata' and 'files'")

    # Read the descriptor base dictionary from disk
    # and update it with values from the config file
    descriptor = read_yaml(DATAPACKAGE_BASE_FILE)
    descriptor["name"] = config["metadata"].get("name")
    descriptor["id"] = config["metadata"].get("id")
    descriptor["title"] = config["metadata"].get("title")
    # Remove empty keys from the dictionary
    descriptor = {key: value for key, value in descriptor.items() if value}

    # Create a Datapackage object from the descriptor dictionary
    package = Package(descriptor=descriptor)
    wanted_files = [file.split(".")[0] for file in config["files"]]
    for file in wanted_files:
        # If a filename ends with "_strict"
        # create the basic Tabular Data Resource first
        # then add the "stricter" rules from the "_strict" file
        if "_strict" in file:
            basic_file = file.replace("_strict", "")
            resource = read_tabular_data_resource(basic_file)
            strict_resource = read_tabular_data_resource(file)
            merge(resource, strict_resource)
        else:
            resource = read_tabular_data_resource(file)
        package.add_resource(resource)
    package.commit()
    if not package.valid:
        for error in package.errors:
            LOGGER.error(error)
    return package
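Based on the docstring above, a minimal config and call might look like the sketch below; the file names are illustrative, and read_yaml, DATAPACKAGE_BASE_FILE and read_tabular_data_resource are assumed to come from the surrounding module:

config = {
    "metadata": {"name": "ddionrails-study", "id": "doi", "title": "Example study"},
    "files": ["concepts.csv", "variables.csv"],
}
package = build(config)
if package.valid:
    package.save("datapackage.json")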
Example #25
from datapackage import Package

# Init
package = Package()

# Infer
package.infer('**/*.csv')
print(package.descriptor)

# Tweak
package.descriptor['resources'][1]['schema']['fields'][1]['type'] = 'year'
package.commit()
print(package.valid) # true

# Read
print(package.get_resource('population').read(keyed=True))
#[ { city: 'london', year: 2017, population: 8780000 },
#  { city: 'paris', year: 2017, population: 2240000 },
#  { city: 'rome', year: 2017, population: 2860000 } ]

# Save
package.save('tmp/datapackage.zip')

# Load
package = Package('tmp/datapackage.zip', base_path='tmp')
print(package.descriptor)
Example #26
def data_package_from_dataset(row: dict) -> Tuple[str, Package]:
    """
    Make a data package definition from a dataset row.
    """
    assert "dataset" == row["Type"], row
    uid: str = row["U ID"]

    # Initialise the data package from the CSV data.
    package = Package()
    csv_path = f"raw-csv/{uid}.csv"
    package.infer(csv_path)

    # Set a more readable name
    package.descriptor["name"] = derive_name(row)

    # Update standard descriptor fields from the row metadata.
    package.descriptor["title"] = row["Name"]
    package.descriptor["description"] = row["Description"]
    # Sources require a title: fall back to using the link for it, otherwise skip it.
    source_title: Optional[str] = (
        row["data_provided_by"]
        if row["data_provided_by"]
        else row["source_link"]
        if row["source_link"]
        else None
    )
    if source_title:
        package.descriptor["sources"] = [
            {
                "title": source_title,
                **({"path": row["source_link"]} if row["source_link"] else {}),
            }
        ]
    package.descriptor["contributors"] = [
        {
            "title": row["Owner"],
            # XXX: Ugly but compact.
            **({"email": row["Contact Email"]} if row["Contact Email"] else {}),
        }
    ]
    keywords: List[str] = row["Keywords"].split(",")
    package.descriptor["keywords"] = keywords
    # Example value: "09/22/2014 05:34:00 PM +0000"
    socrata_datetime_format = "%m/%d/%Y %I:%M:%S %p %z"
    created: dt.datetime = dt.datetime.strptime(
        row["Creation Date"], socrata_datetime_format
    )
    package.descriptor["created"] = created.isoformat()

    # XXX: Update non-standard descriptor fields from the row metadata.
    # (Prefix these with "x_" to flag non-standard status.)
    if row["License"]:
        # TODO: Use licenses field instead
        package.descriptor["x_license_name"] = row["License"]
    if row["Category"]:
        package.descriptor["x_category"] = row["Category"]

    success = package.commit()
    assert success, package.descriptor

    # Check descriptor, and return.
    descriptor: dict = package.descriptor
    assert "tabular-data-package" == descriptor["profile"], descriptor
    assert 1 == len(descriptor["resources"]), descriptor["resources"]
    [resource] = descriptor["resources"]
    assert csv_path == resource["path"], resource
    # assert False, descriptor
    assert package.valid, package.errors
    return (uid, package)
Example #27
from datapackage import Package

p = Package()

p.infer('*.csv')

p.descriptor['title'] = 'Openmod-Example'
p.descriptor['spatial'] = 'Random'
p.descriptor['sources'] = 'Created by hand'

p.commit()

p.save('datapackage.json')