Example #1
class iterable_loader(DataStreamProcessor):
    def __init__(self, iterable, name=None):
        super(iterable_loader, self).__init__()
        self.iterable = iterable
        self.name = name

    def handle_iterable(self):
        mode = None
        for x in self.iterable:
            if mode is None:
                assert isinstance(x, (dict, list))
                mode = dict if isinstance(x, dict) else list
            assert isinstance(x, mode)
            if mode == dict:
                yield x
            else:
                yield dict(zip(('col{}'.format(i) for i in range(len(x))), x))

    def process_datapackage(self, dp: Package):
        name = self.name
        if name is None:
            name = 'res_{}'.format(len(dp.resources) + 1)
        self.res = Resource(dict(name=name, path='{}.csv'.format(name)),
                            storage=iterable_storage(self.handle_iterable()))
        self.res.infer()
        dp.descriptor.setdefault('resources', []).append(self.res.descriptor)
        return dp

    def process_resources(self, resources):
        yield from super(iterable_loader, self).process_resources(resources)
        yield self.res.iter(keyed=True)
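# A minimal standalone sketch (not part of the class above) of how handle_iterable
# turns list-shaped rows into keyed dicts with synthetic column names col0, col1, ...
rows = [(1, 'a'), (2, 'b')]
keyed = [dict(zip(('col{}'.format(i) for i in range(len(x))), x)) for x in rows]
# -> [{'col0': 1, 'col1': 'a'}, {'col0': 2, 'col1': 'b'}]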
Example #2
def save_as_data_packages(row):
    """ save dataset from data.json as data package
        We will use this files as a queue to process later """
    # TODO check if ckanext-datapackager is useful for import
    # or export resources:
    # https://github.com/frictionlessdata/ckanext-datapackager

    package = Package()

    # TODO check this, I'm learning datapackages.
    resource = Resource({'data': row})
    resource.infer()  # adds "name": "inline"
    if not resource.valid:
        raise Exception('Invalid resource')

    encoded_identifier = encode_identifier(identifier=row['identifier'])

    # resource_path = os.path.join(path, f'{prefix}_{encoded_identifier}.json')
    # resource.save(resource_path)

    package.add_resource(descriptor=resource.descriptor)
    folder = config.get_data_packages_folder_path()
    filename = f'data-json-{encoded_identifier}.json'
    package_path = os.path.join(folder, filename)

    # do not rewrite if the file exists
    if not os.path.isfile(package_path):
        package.save(target=package_path)
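# Hypothetical call, assuming `config` and `encode_identifier` are provided by the
# surrounding project: persist one data.json dataset entry as a data package.
# save_as_data_packages({'identifier': 'example-dataset', 'title': 'Example dataset'})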
Example #3
    def get_resource(self, idx=None, path=None, name=None, source_only=True):
        if idx is None:
            if path:
                all_paths = [
                    r.descriptor.get("path") for r in self.datapackage.resources
                ]
                if path in all_paths:
                    idx = all_paths.index(path)
                else:
                    logger.error(f"path = {path} is not in resources.")
            elif name:
                all_names = [
                    r.descriptor.get("name") for r in self.datapackage.resources
                ]
                if name in all_names:
                    idx = all_names.index(name)
                else:
                    logger.error(f"name = {name} is not in resources.")
            else:
                raise Exception(
                    f"Please specify at least one of the keywords: idx, path, name."
                )

        if self.is_local:
            logger.debug(
                f"Using local dataset for {self.id}, sync it if you need the updated version."
            )
            r = self.datapackage.resources[idx]
            logger.debug(f"using base_path: {str(self.base_path)}")
            logger.debug(f"using descriptor: {r.descriptor}")
            resource = Resource(r.descriptor, base_path=str(self.base_path))
            logger.debug(f"base_path of r_1: {resource._Resource__base_path}")
        elif (not self.is_local) and (self.source == "git"):
            logger.debug(f"Using remote data")
            self.remote_path = f"{self.metadata_uri[:-16]}"
            r = self.datapackage.resources[idx]
            resource = Resource(
                {
                    **(r.descriptor),
                    **{"path": self.remote_path + r.descriptor.get("path", "")},
                }
            )
        elif (not self.is_local) and (self.source == "s3"):
            logger.debug(f"Using remote data")
            logger.debug(
                f"Direct resource from S3 is not supported yet. "
                f"Please sync the dataset to local using the command line first.\n"
                f"TODO: Sync S3 to local after confirmation from here."
            )
            resource = self.datapackage.resources[idx]
        else:
            logger.error("Resource is not supported. Currently supporting S3 and git.")
            resource = self.datapackage.resources[idx]

        if source_only:
            return resource.source
        else:
            return resource
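# Hypothetical call, assuming `dataset` is an instance of the containing class
# (compare the __main__ block in a later example): fetch a resource's underlying
# source by its declared path instead of its index.
# source = dataset.get_resource(path="dataset/stackoverflow_job_listing.csv")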
Example #4
 def process_datapackage(self, dp: Package):
     name = self.name
     if name is None:
         name = 'res_{}'.format(len(dp.resources) + 1)
     self.res = Resource(dict(name=name, path='{}.csv'.format(name)),
                         storage=iterable_storage(self.handle_iterable()))
     self.res.infer()
     dp.descriptor.setdefault('resources', []).append(self.res.descriptor)
     return dp
Example #5
 def process_datapackage(self, dp: Package):
     if isinstance(self.load_source, tuple):
         datapackage_descriptor, _ = self.load_source
         dp.descriptor.setdefault('resources', [])
         self.resource_matcher = ResourceMatcher(self.resources,
                                                 datapackage_descriptor)
         for resource_descriptor in datapackage_descriptor['resources']:
             if self.resource_matcher.match(resource_descriptor['name']):
                 dp.add_resource(resource_descriptor)
     else:  # load_source is string:
         if self.load_source.startswith('env://'):
             env_var = self.load_source[6:]
             self.load_source = os.environ.get(env_var)
             if self.load_source is None:
                 raise ValueError(
                     f"Couldn't find value for env var '{env_var}'")
         if os.path.basename(self.load_source) == 'datapackage.json':
             self.load_dp = Package(self.load_source)
             self.resource_matcher = ResourceMatcher(
                 self.resources, self.load_dp)
             dp.descriptor.setdefault('resources', [])
             for resource in self.load_dp.resources:
                 if self.resource_matcher.match(resource.name):
                     dp.add_resource(resource.descriptor)
         else:
             if os.path.exists(self.load_source):
                 base_path = os.path.dirname(self.load_source) or '.'
                 self.load_source = os.path.basename(self.load_source)
             else:
                 base_path = None
             descriptor = dict(path=self.load_source,
                               profile='tabular-data-resource')
             descriptor['format'] = self.options.get('format')
             if 'encoding' in self.options:
                 descriptor['encoding'] = self.options['encoding']
             if descriptor['format'] == 'xml' or self.load_source.endswith(
                     '.xml'):
                 self.options.setdefault('custom_parsers',
                                         {})['xml'] = XMLParser
             self.options.setdefault('ignore_blank_headers', True)
             self.options.setdefault('headers', 1)
             self.res = Resource(descriptor,
                                 base_path=base_path,
                                 **self.options)
             self.res.infer(confidence=1, limit=1000)
             if self.name is not None:
                 self.res.descriptor['name'] = self.name
             if self.force_strings:
                 for f in self.res.descriptor['schema']['fields']:
                     f['type'] = 'string'
             self.res.commit()
             self.res.descriptor['path'] = '{name}.{format}'.format(
                 **self.res.descriptor)
             dp.add_resource(self.res.descriptor)
     return dp
Example #6
 def test_raw_data(self, simpsons_dataset, simpsons_datapackage,
                   simpsons_descriptor_path):
     for r in simpsons_datapackage.resources:
         resource = Resource(
             r.descriptor, base_path=path.dirname(simpsons_descriptor_path))
         once = simpsons_dataset.raw_data[r.descriptor['name']]
         twice = simpsons_dataset.raw_data[r.descriptor['name']]
         assert_that(once, equal_to(resource.raw_read()))
         # Not a generator
         for _ in once:
             pass  # Consume iterable
         assert_that(once, equal_to(twice))
Example #7
def package_from_resources(resource_path, output_path, clean=True):
    """ Collects resource descriptors and merges them into a datapackage.json

    Parameters
    ----------
    resource_path: string
        Path to directory with resources (in .json format)
    output_path: string
        Root path of the datapackage where the newly created datapackage.json
        is stored
    clean: boolean
        If true, the resource directory is removed after merging (the
        individual resource files are deleted in any case)
    """
    p = Package()

    p.descriptor["profile"] = "tabular-data-package"
    p.commit()

    for f in os.listdir(resource_path):
        path = os.path.join(resource_path, f)

        r = Resource(path)

        p.add_resource(r.descriptor)

        p.commit()

        os.remove(path)

    if clean:
        os.rmdir(resource_path)

    p.save(os.path.join(output_path, "datapackage.json"))
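# Hypothetical usage: merge every resource descriptor found in ./resources into a new
# ./datapackage.json and remove the emptied resource directory afterwards.
# package_from_resources(resource_path="resources", output_path=".", clean=True)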
Example #8
def _make_package(source, publisher, config):

    os.chdir(source)
    files = [f for f in os.listdir('data') if f.endswith('.csv')]
    package = Package({'publisher': publisher})

    for f in files:
        path = f"data/{f}"
        name = f.replace('.csv', '')
        schema = f"https://raw.githubusercontent.com/digital-land/alpha-data/master/schema/{name}-schema.json"
        resource = Resource({'path': path, 'schema': schema})
        package.add_resource(resource.descriptor)

    package.commit()
    package.infer()

    errors = False
    for r in package.resources:
        try:
            r.read(keyed=True)
            r.check_relations()
        except (CastError, RelationError) as e:
            print('Error in', os.path.join(source, r.descriptor['path']))
            print(e, e.errors)
            errors = True
    if not errors:
        package.save('datapackage.zip')
        print('saved datapackage.json to', source)

        s3 = boto3.client(
            's3',
            aws_access_key_id=config['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=config['AWS_SECRET_ACCESS_KEY'])

        bucket = 'developer-contributions-datapackages'
        key = f'{publisher}/{uuid.uuid4()}/datapackage.zip'
        s3.upload_file(f'{source}/datapackage.zip',
                       bucket,
                       key,
                       ExtraArgs={'ACL': 'public-read'})

        config = s3._client_config
        config.signature_version = botocore.UNSIGNED

        datapackage_url = boto3.resource(
            's3',
            config=config).meta.client.generate_presigned_url('get_object',
                                                              ExpiresIn=0,
                                                              Params={
                                                                  'Bucket':
                                                                  bucket,
                                                                  'Key': key
                                                              })

        return datapackage_url
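# Hypothetical usage with placeholder credentials: build, validate and upload a
# datapackage.zip for every CSV found under <source>/data.
# url = _make_package(source="/path/to/source", publisher="my-publisher",
#                     config={"AWS_ACCESS_KEY_ID": "...",
#                             "AWS_SECRET_ACCESS_KEY": "..."})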
Example #9
    def save_datasets_as_data_packages(self, folder_path):
        """ save each dataset from a data.json source as _datapackage_ """
        for dataset in self.package_list:
            package = Package()

            #TODO check this, I'm learning datapackages
            resource = Resource({'data': dataset})
            resource.infer()  #adds "name": "inline"

            identifier = dataset['id']
            bytes_identifier = identifier.encode('utf-8')
            encoded = base64.b64encode(bytes_identifier)
            encoded_identifier = str(encoded, "utf-8")

            resource_path = os.path.join(
                folder_path, f'resource_ckan_api_{encoded_identifier}.json')
            if not resource.valid:
                raise Exception('Invalid resource')

            resource.save(resource_path)

            package.add_resource(descriptor=resource.descriptor)
            package_path = os.path.join(
                folder_path, f'pkg_ckan_api_{encoded_identifier}.zip')
            package.save(target=package_path)
Example #10
    def save_datasets_as_data_packages(self, folder_path):
        """ save each dataset from a data.json source as _datapackage_ """
        for dataset in self.datasets:
            package = Package()

            #TODO check this, I'm learning datapackages
            resource = Resource({'data': dataset})
            resource.infer()  #adds "name": "inline"

            # FIXME: the identifier uses characters that are invalid in paths (e.g. /).
            # Duplicate paths could also arise from different resources;
            # use BASE64 or hashes instead.
            idf = slugify(dataset['identifier'])

            resource_path = os.path.join(folder_path,
                                         f'resource_data_json_{idf}.json')
            if not resource.valid:
                raise Exception('Invalid resource')

            resource.save(resource_path)

            package.add_resource(descriptor=resource.descriptor)
            package_path = os.path.join(folder_path,
                                        f'pkg_data_json_{idf}.zip')
            package.save(target=package_path)
Example #11
def update_package_descriptor():
    """
    """
    p = Package("datapackage.json")

    for f in os.listdir("resources"):
        path = os.path.join("resources", f)

        r = Resource(path)

        p.add_resource(r.descriptor)

        p.commit()

        os.remove(path)

    os.rmdir("resources")

    p.save("datapackage.json")
Example #12
def infer_resources(directory="data/elements"):
    """ Method looks at all files in `directory` and creates
    datapackage.Resource object that will be stored

    Parameters
    ----------
    directory: string
        Path to directory from where resources are inferred

    """
    if not os.path.exists("resources"):
        os.makedirs("resources")

    # create meta data resources
    for f in os.listdir(directory):
        r = Resource({"path": os.path.join(directory, f)})
        r.infer()
        r.save(os.path.join("resources", f.replace(".csv", ".json")))
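# Hypothetical usage: write one inferred JSON descriptor per CSV in data/elements into
# the ./resources directory.
# infer_resources(directory="data/elements")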
Example #13
    def save_datasets_as_data_packages(self, folder_path, identifier_field):
        """ save each dataset from a data.json source as _datapackage_ """
        for dataset in self.datasets:
            package = Package()

            #TODO check this, I'm learning datapackages
            resource = Resource({'data': dataset})
            resource.infer()  #adds "name": "inline"

            idf = slugify(dataset[identifier_field])

            resource_path = os.path.join(folder_path, f'resource_data_json_{idf}.json')
            if not resource.valid:
                raise Exception('Invalid resource')

            resource.save(resource_path)

            package.add_resource(descriptor=resource.descriptor)
            package_path = os.path.join(folder_path, f'pkg_data_json_{idf}.zip')
            package.save(target=package_path)
Example #14
def infer_metadata(
    package_name="default-name",
    keep_resources=False,
    foreign_keys={
        "bus": [
            "volatile",
            "dispatchable",
            "storage",
            "load",
            "reservoir",
            "shortage",
            "excess",
        ],
        "profile": ["load", "volatile", "ror"],
        "from_to_bus": ["connection", "line", "conversion"],
        "chp": ["backpressure", "extraction", "chp"],
    },
    path=None,
):
    """ Add basic meta data for a datapackage

    Parameters
    ----------
    package_name: string
        Name of the data package
    keep_resources: boolean
        Flag indicating whether the resources' meta data JSON files should be
        kept after the main datapackage.json is created. The resource meta data
        will be stored in the `resources` directory.
    foreign_keys: dict
        Dictionary with the foreign key specification. Keys of the dictionary
        are: 'bus', 'profile', 'from_to_bus', 'chp'. Values are lists of
        strings with the names of the resources.
    path: string
        Absolute path to the root folder of the datapackage
    """
    current_path = os.getcwd()
    if path:
        print("Setting current work directory to {}".format(path))
        os.chdir(path)

    p = Package()
    p.descriptor["name"] = package_name
    p.descriptor["profile"] = "tabular-data-package"
    p.commit()
    if not os.path.exists("resources"):
        os.makedirs("resources")

    # create meta data resources for elements
    if not os.path.exists("data/elements"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/elements"):
            r = Resource({"path": os.path.join("data/elements", f)})
            r.infer()
            r.descriptor["schema"]["primaryKey"] = "name"

            if r.name in foreign_keys.get("bus", []):
                r.descriptor["schema"]["foreignKeys"] = [{
                    "fields": "bus",
                    "reference": {
                        "resource": "bus",
                        "fields": "name"
                    },
                }]

                if r.name in foreign_keys.get("profile", []):
                    r.descriptor["schema"]["foreignKeys"].append({
                        "fields": "profile",
                        "reference": {
                            "resource": r.name + "_profile"
                        },
                    })

            elif r.name in foreign_keys.get("from_to_bus", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "from_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                    {
                        "fields": "to_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                ]

            elif r.name in foreign_keys.get("chp", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "fuel_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                    {
                        "fields": "electricity_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                    {
                        "fields": "heat_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                ]

            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    # create meta data resources for sequences
    if not os.path.exists("data/sequences"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/sequences"):
            r = Resource({"path": os.path.join("data/sequences", f)})
            r.infer()
            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    p.commit()
    p.save("datapackage.json")

    if not keep_resources:
        shutil.rmtree("resources")

    os.chdir(current_path)
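# Hypothetical usage: create datapackage.json for the package rooted at ./my_package,
# using the default foreign-key mapping and discarding intermediate resource files.
# infer_metadata(package_name="my-energy-package", keep_resources=False, path="my_package")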
Example #15
def create_resource(path):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'timeindex'
    resource.descriptor['description'] = (
        'Profiles for Run of River (ROR) components. The profile is assumed' +
        ' to be constant during the year.')
    resource.descriptor['title'] = 'ROR profiles'
    resource.descriptor['sources'] = [{'title': 'Assumption'}]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
Example #16
def create_resource(path):
    """
    """

    mapper = {}

    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'name'
    resource.descriptor[
        'description'] = 'Installed capacities, costs and technical parameters for components'
    resource.descriptor['title'] = '{} components'.format(
        resource.name.title())
    resource.descriptor['sources'] = [{
        'title':
        'E-Highway 2050 installed capacities',
        'path':
        'http://www.e-highway2050.eu/fileadmin/documents/Results/e-Highway2050_2050_Country_and_cluster_installed_capacities_31-03-2015.xlsx'
    }]

    resource.descriptor['schema']['foreignKeys'] = [{
        "fields": "bus",
        "reference": {
            "resource": "bus",
            "fields": "name"
        }
    }]

    if 'demand' in resource.name:
        resource.descriptor['schema']['foreignKeys'].append({
            "fields": "profile",
            "reference": {
                "resource": "demand-profiles"
            }
        })

    elif 'volatile-generator' in resource.name:
        resource.descriptor['schema']['foreignKeys'].append({
            "fields": "profile",
            "reference": {
                "resource": "generator-profiles"
            }
        })

    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
    else:
        print('Resource is not valid, writing resource anyway...')
        resource.save('resources/' + resource.name + '.json')
Example #17
            herb = herb.get("herb")
            return herb
        else:
            logger.error(f"Could not find herb {id}")
            return


if __name__ == "__main__":

    from datapackage import Resource

    fl = Flora(flora="/Users/leima/dataherb/flora/flora.json")

    hb = fl.herb("git-data-science-job")

    print(f"herb base_path: {hb.base_path}")

    rs = hb.resources[0]

    rs_1 = Resource(rs.descriptor, base_path=str(hb.base_path))

    print(f"{rs.tabular}")

    # rs_2.read()

    rs.read()

    print(hb.get_resource(path="dataset/stackoverflow_job_listing.csv"))

    logger.debug("End of Game")
Beispiel #18
0
from datapackage import Resource

description_mapper = {
    'Pinst': 'installed capacity in MW',
    'efactor': 'energy that can be gained from the water kwh/m3',
    'head': 'difference in altitude in m',
    'total_flow': 'inflow to the power plant in mio m3',
    'flo_river_ror': 'next downstream res_nr',
    'status': 'operational status of the plant',
    'company': None,
    'turbtype': 'optional: turbine type',
    'geodbid': 'specified id for geo referencing',
    'river': 'river in which the plant is located',
    'river_km': 'km from stream source',
    'level_meter': 'assigned level meter for flow curve'
}

# create resource
r = Resource({'path': 'data/runofriver.csv'})

# get basic metadata from data
r.infer()

# add description for fields based on mapper
for i in range(len(r.descriptor['schema']['fields'])):
    r.descriptor['schema']['fields'][i]['description'] = \
        description_mapper[r.descriptor['schema']['fields'][i]['name']]

# commit (apply) changes to resource
r.commit()

# save the resource
r.save('dataresource.json')
Example #19
def create_resource(path, title):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'name'
    resource.descriptor[
        'description'] = 'Installed capacities, costs and technical parameters for components'
    resource.descriptor['title'] = title
    resource.descriptor['sources'] = [{
        'title':
        'Restore 2050 hydro inflow timeseries',
        'path':
        'https://zenodo.org/record/804244/files/Hydro_Inflow.zip'
    }, {
        'title':
        'E-Highway 2050 installed capacities',
        'path':
        'http://www.e-highway2050.eu/fileadmin/documents/Results/e-Highway2050_2050_Country_and_cluster_installed_capacities_31-03-2015.xlsx'
    }, {
        'title':
        'DIW Berlin - Current and Prospective Costs of Electricity Generation until 2050',
        'path':
        'https://www.diw.de/documents/publikationen/73/diw_01.c.424566.de/diw_datadoc_2013-068.pdf'
    }]

    resource.descriptor['schema']['foreignKeys'] = [{
        "fields": "bus",
        "reference": {
            "resource": "bus",
            "fields": "name"
        }
    }]

    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
Example #20
def create_resource(path):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'timeindex'
    resource.descriptor['description'] = 'Demand profiles per country'
    resource.descriptor['title'] = 'Demand profiles'
    resource.descriptor['sources'] = [{
        'title':
        'OPSD timeseries',
        'path':
        'https://data.open-power-system-data.org/time_series/2017-07-09/' +
        'time_series_60min_singleindex.csv'
    }]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
Example #21
def temporal_clustering(datapackage, n, path="/tmp", how="daily"):
    """ Creates a new datapackage by aggregating sequences inside the
    `sequence` folder of the specified datapackage by clustering `n` timesteps

    Parameters
    ----------
    datapackage: string
        Path to the meta data file datapackage.json
    n: integer
        Number of clusters
    path: string
        Path to directory where the aggregated datapackage is stored
    how: string
        How to cluster 'daily' or 'hourly'
    """
    if how == "weekly":
        raise NotImplementedError("Weekly clustering is not implemented!")

    p = Package(datapackage)

    cwd = os.getcwd()

    copied_package_name = (p.descriptor["name"] + "__temporal_cluster__" +
                           how + "_" + str(n))

    copy_path = os.path.join(path, p.descriptor["name"], copied_package_name)

    copied_root = copy_datapackage(datapackage,
                                   os.path.abspath(copy_path),
                                   subset="data")

    sequence_resources = [
        r for r in p.resources
        if re.match(r"^data/sequences/.*$", r.descriptor["path"])
    ]

    dfs = {
        r.name:
        pd.DataFrame(r.read(keyed="True")).set_index("timeindex").astype(float)
        for r in sequence_resources
    }
    sequences = pd.concat(dfs.values(), axis=1)

    if how == "daily":
        hoursPerPeriod = 24
    elif how == "hourly":
        hoursPerPeriod = 1
    elif how == "weekly":
        hoursPerPeriod = 24 * 7

    aggregation = tsam.TimeSeriesAggregation(
        sequences,
        noTypicalPeriods=n,
        rescaleClusterPeriods=False,
        hoursPerPeriod=hoursPerPeriod,
        clusterMethod="hierarchical",
    )

    cluster_weights = {
        aggregation.clusterCenterIndices[n]: w
        for n, w in aggregation.clusterPeriodNoOccur.items()
    }
    if how == "daily":
        temporal = pd.Series(
            {
                d: cluster_weights[d.dayofyear]
                for d in sequences.index
                if d.dayofyear in aggregation.clusterCenterIndices
            },
            name="weighting",
        )
        temporal.index.name = "timeindex"

    elif how == "hourly":
        temporal = pd.Series(
            {
                h: cluster_weights[sequences.index.get_loc(h)]
                for h in sequences.index if sequences.index.get_loc(h) in
                aggregation.clusterCenterIndices
            },
            name="weighting",
        )
        temporal.index.name = "timeindex"

    # write resources to the copied package (should not interfere with the
    # meta data, as columns are not removed or sorted when written)
    os.chdir(copied_root)
    for r in sequence_resources:
        write_sequences(r.name + ".csv",
                        dfs[r.name].loc[temporal.index],
                        replace=True)

    # write temporal information from clustering
    temporal.to_csv(
        "data/temporal.csv",
        header=True,
        sep=";",
        date_format="%Y-%m-%dT%H:%M:%SZ",
    )
    # add meta data for new temporal information
    r = Resource({"path": "data/temporal.csv"})
    r.infer()
    # TODO: Add meta-data description
    r.descriptor[
        "description"] = "Temporal selection based on hierarchical clustering..."

    # Update meta-data of copied package
    cp = Package("datapackage.json")
    cp.descriptor["name"] = copied_package_name
    cp.descriptor["resources"].append(r.descriptor)
    cp.commit()
    cp.save("datapackage.json")

    # change back to the old working directory
    os.chdir(cwd)

    return copied_root
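# Hypothetical usage: aggregate the package's sequences into 10 typical days and store
# the aggregated copy below /tmp.
# copied_root = temporal_clustering("datapackage.json", n=10, path="/tmp", how="daily")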
Example #22
# -*- coding: utf-8 -*-
"""
"""
import os
from datapackage import Package, Resource

p = Package('datapackage.json')

p.descriptor['profile'] = 'tabular-data-package'

for f in os.listdir('resources'):
    path = os.path.join('resources', f)

    r = Resource(path)

    p.add_resource(r.descriptor)

    p.commit()

    os.remove(path)

os.rmdir('resources')

p.save('datapackage.json')
Example #23
from datapackage import Resource

# Create
resource = Resource({'path': 'data/data.csv'})
resource.tabular  # True
resource.headers  # ['city', 'location']
print(resource.read(keyed=True))
# [
#   {'city': 'london', 'location': '51.50,-0.11'},
#   {'city': 'paris', 'location': '48.85,2.30'},
#   {'city': 'rome', 'location': 'N/A'},
# ]

# Infer
resource.infer()
print(resource.descriptor)
# {'path': 'data.csv',
#  'profile': 'tabular-data-resource',
#  'encoding': 'utf-8',
#  'name': 'data',
#  'format': 'csv',
#  'mediatype': 'text/csv',
#  'schema': {'fields': [{...}, {...}], 'missingValues': ['']}}
# resource.read(keyed=True)
# Fails with a data validation error

# Tweak
resource.descriptor['schema']['missingValues'] = 'N/A'
resource.commit()
resource.valid # False
print(resource.errors)
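# Follow-up sketch: per the Table Schema spec, `missingValues` must be a list of
# strings, so replacing the bare string with a list makes the resource valid again
# and lets 'N/A' cells be treated as missing values.
resource.descriptor['schema']['missingValues'] = ['', 'N/A']
resource.commit()
resource.valid  # True
# resource.read(keyed=True) now succeeds; 'N/A' is returned as None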
Example #24
def create_resource(path):
    """
    """
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'name'
    resource.descriptor[
        'description'] = 'Contains the hubs (nodes) for the energy system representation'
    resource.descriptor[
        'title'] = 'Energy system hubs for DE and its electrical neighbours'
    resource.descriptor['sources'] = [{
        'title':
        'NUTS Shapefiles',
        'path':
        'http://ec.europa.eu/eurostat/cache/GISCO/geodatafiles/NUTS_2013_10M_SH.zip',
        'files': [
            'NUTS_2013_10M_SH/data/NUTS_RG_10M_2013.shp',
            'NUTS_2013_10M_SH/data/NUTS_RG_10M_2013.dbf'
        ]
    }]
    resource.commit()
    resource.descriptor

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
Example #25
def create_resource(path):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'name'
    resource.descriptor[
        'description'] = 'Installed transmission capacities from the e-highway 2050 scenario'
    resource.descriptor['title'] = 'Installed transmission capacities'
    resource.descriptor['sources'] = [{
        'title':
        'E-Highway 2050 transmission capacities',
        'path':
        'http://www.e-highway2050.eu/fileadmin/documents/' +
        'Results/e-Highway_database_per_country-08022016.xlsx'
    }]

    resource.descriptor['schema']['foreignKeys'] = [{
        "fields": "from_bus",
        "reference": {
            "resource": "bus",
            "fields": "name"
        }
    }, {
        "fields": "to_bus",
        "reference": {
            "resource": "bus",
            "fields": "name"
        }
    }]

    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
Example #26
def create_resource(path):
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'timeindex'
    resource.descriptor[
        'description'] = 'PV profiles (capacity factors) from renewables ninja for each country'
    resource.descriptor['title'] = 'PV profiles'
    resource.descriptor['sources'] = [{
        'title':
        'Renewables Ninja PV Capacity Factors',
        'path':
        'https://www.renewables.ninja/static/downloads/ninja_europe_pv_v1.1.zip'
    }]
    resource.commit()

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
Example #27
def temporal_skip(datapackage, n, path="/tmp", name=None, *args):
    """ Creates a new datapackage by aggregating sequences inside the
    `sequence` folder of the specified datapackage by skipping `n` timesteps

    Parameters
    ----------
    datapackage: string
        Path to the meta data file datapackage.json
    n: integer
        Number of timesteps to skip
    path: string
        Path to directory where the aggregated datapackage is stored
    name: string
        Name of the new, aggregated datapackage. If not specified a name will
        be given
    """
    p = Package(datapackage)

    cwd = os.getcwd()

    if name is None:
        copied_package_name = (p.descriptor["name"] + "__temporal_skip__" +
                               str(n))
    else:
        copied_package_name = name

    copy_path = os.path.join(path, copied_package_name)

    copied_root = copy_datapackage(datapackage,
                                   os.path.abspath(copy_path),
                                   subset="data")

    sequence_resources = [
        r for r in p.resources
        if re.match(r"^data/sequences/.*$", r.descriptor["path"])
    ]

    dfs = {
        r.name:
        pd.DataFrame(r.read(keyed="True")).set_index("timeindex").astype(float)
        for r in sequence_resources
    }
    sequences = pd.concat(dfs.values(), axis=1)

    skip_sequences = sequences.loc[::n]

    temporal = pd.Series(data=n, index=skip_sequences.index, name="weighting")
    temporal.index.name = "timeindex"

    os.chdir(copied_root)

    for r in sequence_resources:
        write_sequences(r.name + ".csv",
                        dfs[r.name].loc[temporal.index],
                        replace=True)

    # write temporal information from skipping
    temporal.to_csv(
        "data/temporal.csv",
        header=True,
        sep=";",
        date_format="%Y-%m-%dT%H:%M:%SZ",
    )
    # add meta data for new temporal information
    r = Resource({"path": "data/temporal.csv"})
    r.infer()

    r.descriptor[
        "description"] = "Temporal selection based on skipped timesteps. Skipped n={}".format(
            n)

    # Update meta-data of copied package
    cp = Package("datapackage.json")
    cp.descriptor["name"] = copied_package_name
    cp.descriptor["resources"].append(r.descriptor)
    cp.commit()
    cp.save("datapackage.json")

    # change back to the old working directory
    os.chdir(cwd)

    return copied_root
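# Hypothetical usage: keep only every 4th timestep of the sequences and write the
# reduced copy below /tmp under the given name.
# copied_root = temporal_skip("datapackage.json", 4, path="/tmp", name="skip-4")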
Example #28
def create_resource(path):
    """
    """
    from datapackage import Resource
    resource = Resource({'path': path})
    resource.infer()
    resource.descriptor['schema']['primaryKey'] = 'name'
    resource.descriptor[
        'description'] = 'Excess slacks for each electricity hub in the energy system representation'
    resource.descriptor[
        'title'] = 'Excess slacks for DE and its electrical neighbours'

    resource.descriptor['schema']['foreignKeys'] = [{
        "fields": "bus",
        "reference": {
            "resource": "bus",
            "fields": "name"
        }
    }]

    resource.commit()
    resource.descriptor

    if resource.valid:
        resource.save('resources/' + resource.name + '.json')
Example #29
class load(DataStreamProcessor):
    def __init__(self,
                 load_source,
                 name=None,
                 resources=None,
                 validate=False,
                 strip=True,
                 **options):
        super(load, self).__init__()
        self.load_source = load_source
        self.options = options
        self.name = name
        self.resources = resources
        self.load_dp = None
        self.validate = validate
        self.strip = strip
        self.force_strings = options.get('force_strings') is True

    def process_datapackage(self, dp: Package):
        if isinstance(self.load_source, tuple):
            datapackage_descriptor, _ = self.load_source
            dp.descriptor.setdefault('resources', [])
            self.resource_matcher = ResourceMatcher(self.resources,
                                                    datapackage_descriptor)
            for resource_descriptor in datapackage_descriptor['resources']:
                if self.resource_matcher.match(resource_descriptor['name']):
                    dp.add_resource(resource_descriptor)
        else:  # load_source is string:
            if self.load_source.startswith('env://'):
                env_var = self.load_source[6:]
                self.load_source = os.environ.get(env_var)
                if self.load_source is None:
                    raise ValueError(
                        f"Couldn't find value for env var '{env_var}'")
            if os.path.basename(self.load_source) == 'datapackage.json':
                self.load_dp = Package(self.load_source)
                self.resource_matcher = ResourceMatcher(
                    self.resources, self.load_dp)
                dp.descriptor.setdefault('resources', [])
                for resource in self.load_dp.resources:
                    if self.resource_matcher.match(resource.name):
                        dp.add_resource(resource.descriptor)
            else:
                if os.path.exists(self.load_source):
                    base_path = os.path.dirname(self.load_source) or '.'
                    self.load_source = os.path.basename(self.load_source)
                else:
                    base_path = None
                descriptor = dict(path=self.load_source,
                                  profile='tabular-data-resource')
                descriptor['format'] = self.options.get('format')
                if 'encoding' in self.options:
                    descriptor['encoding'] = self.options['encoding']
                if descriptor['format'] == 'xml' or self.load_source.endswith(
                        '.xml'):
                    self.options.setdefault('custom_parsers',
                                            {})['xml'] = XMLParser
                self.options.setdefault('ignore_blank_headers', True)
                self.options.setdefault('headers', 1)
                self.res = Resource(descriptor,
                                    base_path=base_path,
                                    **self.options)
                self.res.infer(confidence=1, limit=1000)
                if self.name is not None:
                    self.res.descriptor['name'] = self.name
                if self.force_strings:
                    for f in self.res.descriptor['schema']['fields']:
                        f['type'] = 'string'
                self.res.commit()
                self.res.descriptor['path'] = '{name}.{format}'.format(
                    **self.res.descriptor)
                dp.add_resource(self.res.descriptor)
        return dp

    def stripper(self, iterator):
        for r in iterator:
            yield dict((k, v.strip()) if isinstance(v, str) else (k, v)
                       for k, v in r.items())

    def process_resources(self, resources):
        yield from super(load, self).process_resources(resources)
        if isinstance(self.load_source, tuple):
            datapackage_descriptor, resources = self.load_source
            yield from (resource for resource, descriptor in zip(
                resources, datapackage_descriptor['resources'])
                        if self.resource_matcher.match(descriptor['name']))
        elif self.load_dp is not None:
            yield from (resource.iter(keyed=True)
                        for resource in self.load_dp.resources
                        if self.resource_matcher.match(resource.name))
        else:
            it = self.res.iter(keyed=True, cast=False)
            if self.validate:
                it = schema_validator(self.res, it)
            if self.strip:
                it = self.stripper(it)
            yield it
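# Hypothetical instantiation, assuming the surrounding data-flows framework: load rows
# from a local CSV, name the resource and force all inferred field types to string.
# loader = load('data/cities.csv', name='cities', force_strings=True)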