Example #1
def get_data_co2_temp():

    data_url = 'https://datahub.io/core/global-temp/datapackage.json'

    # to load Data Package into storage
    package = datapackage.Package(data_url)

    # to load only tabular data
    resources = package.resources
    data_temperature = []

    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])
            data_temperature.append(data)

    data_temperature_annual = data_temperature[0]
    data_temperature_annual = data_temperature_annual[
        data_temperature_annual['Year'] >= 1980]
    data_temperature_annual = data_temperature_annual.sort_values(by=['Year'])
    data_temperature_annual = data_temperature_annual[
        data_temperature_annual['Source'] == 'GISTEMP']

    # Get CO2 data
    data_url = 'https://datahub.io/core/co2-ppm/datapackage.json'

    # to load Data Package into storage
    package = datapackage.Package(data_url)

    # to load only tabular data
    resources = package.resources
    data_co2 = []
    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])
            data_co2.append(data)

    data_co2_annual = data_co2[-2]
    data_co2_annual = data_co2_annual[data_co2_annual['Year'] < 2017]

    data_annual_gistemp = data_co2_annual.rename(columns={'Mean': 'CO2'})
    data_annual_gistemp = pd.merge(
        data_annual_gistemp, data_temperature_annual,
        on='Year').rename(columns={'Mean': 'Temperature'})
    data_annual_gistemp['Group'] = 0
    for year in range(1980, 2016, 6):
        if year != 2010:
            data_annual_gistemp.loc[(data_annual_gistemp['Year'] >= year) &
                                    (data_annual_gistemp['Year'] <=
                                     (year + 5)),
                                    'Group'] = str(year) + '-' + str(year + 5)
        else:
            data_annual_gistemp.loc[(data_annual_gistemp['Year'] >= year) &
                                    (data_annual_gistemp['Year'] <=
                                     (year + 6)),
                                    'Group'] = str(year) + '-' + str(year + 6)
    return data_annual_gistemp
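A minimal usage sketch for the helper above (hypothetical; it assumes the datahub resources still expose 'Year' and 'Mean' columns so the renamed 'CO2' and 'Temperature' columns exist):

# Hypothetical usage; requires network access to datahub.io.
df = get_data_co2_temp()
print(df[['Year', 'CO2', 'Temperature', 'Group']].head())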
Example #2
def test_uk_steel_delivered_to_uk_is_consistent():
    """Two tables provide this: (A) the difference between UK production and
    exports in Tables 16 and 17; and (B) Table 18 "supply to UK market".
    This test checks they match.

    """

    # Load data
    p = datapackage.Package(DATAPACKAGE)
    dfs = {r.name: pd.DataFrame(r.read(keyed=True)) for r in p.resources}

    # Version A: production - exports
    prod1 = dfs['production_ecsc'].set_index(['year', 'product'])['mass']
    prod2 = dfs['production_derived'].set_index(['year', 'product'])['mass']
    exports = dfs['exports'].set_index(['year', 'product'])['mass']
    prod = pd.concat([prod1, prod2])
    A = (prod - exports).loc[2016]

    # Version B: supply
    supply = dfs['supply'].groupby(['product']).sum()
    B = supply[['uk_production_to_stockholders', 'uk_production_to_industry']] \
        .sum(axis='columns')

    # Compare
    df = pd.concat({'A': A, 'B': B}, axis=1)
    df['diff'] = df['A'].astype(float) - df['B']

    problems = (abs(df['diff']) > 0.5) | (pd.isnull(df['diff']))
    assert not any(problems), \
        'Differences found:\n\n%s\n' % df[problems]
Example #3
    def __init__(self, descriptor_file):

        self._datapackage = datapackage.Package(descriptor_file)

        self.__descriptor_file = descriptor_file
        self.__base_path = os.path.dirname(
            os.path.abspath(self.__descriptor_file))

        # Index resources by name
        self.__resources = {r.descriptor['name']: r
                            for r in self._datapackage.resources}
        self.__tabular_resources = {k: self._sanitize_resource(r)
                                    for (k, r) in self.__resources.items()
                                    if r.tabular and
                                    r.descriptor['path'].startswith('data')}

        self.__invalid_schemas = []  # Resource names with invalid schemas

        # All formats
        self.raw_data = LazyLoadedDict.from_keys(
            self.__resources.keys(),
            self._load_raw_data,
            'bytes')

        # Tabular formats
        self.tables = LazyLoadedDict.from_keys(
            self.__tabular_resources.keys(),
            self._load_table,
            type_hint='list of rows')

        self.dataframes = LazyLoadedDict.from_keys(
            self.__tabular_resources.keys(),
            self._load_dataframe,
            type_hint='pandas.DataFrame')
Example #4
def load_dataframe(filename, resource):
    """Load one table from a datapackage."""
    package = datapackage.Package(filename)
    r = package.get_resource(resource)
    if r is None:
        raise KeyError('No resource found: %s' % resource)
    return pd.DataFrame(r.read(), columns=r.headers)
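A hedged usage sketch; the file path and resource name below are placeholders, not values confirmed by the example:

# Hypothetical call; 'datapackage.json' and 'production_ecsc' are placeholder names.
df = load_dataframe('datapackage.json', 'production_ecsc')
print(df.head())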
Example #5
 def test_it(client, doi):
     jsr = client.remote_datapackage_json(doi)
     dp = datapackage.Package(jsr)
     if not dp.valid:
         raise AssertionError(
             f"Invalid datapackage.json found for {doi} "
             f"({jsr['name']}).")
Example #6
def getdata_old(data_url='https://datahub.io/core/covid-19/datapackage.json',
                resourcename='countries-aggregated_csv'):
    """
    Get data from the web.
    datahub is outdated. Rewrote code to read directly from CSSE.

    Parameters:
    ===========

    data_url : string with url of the data from datahub
    
    resourcename : resource to use

    Output:
    =======

    pd.DataFrame with the epidemic statistics data

    """

    # to load Data Package into storage
    package = datapackage.Package(data_url)
    resources = package.resources

    data = None  # stays None if no resource matches resourcename
    for resource in resources:
        if resource.name == resourcename:
            url = resource.descriptor['path']
            print('Importing', url)
            s = requests.get(url).text
            data = pd.read_csv(StringIO(s))
    return data
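A usage sketch relying on the defaults above; note the docstring's caveat that the datahub source is outdated:

# Hypothetical call with the default datahub URL and resource name; requires network access.
covid = getdata_old()
print(covid.tail())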
Example #7
def extract_airports_data(source, target):
    """
    Description: 
        This function is to extract airports codes data file in csv format and load into s3 data lakes in parquet format using pandas.  
    Arguments: 
        source: location for source json file
        target: location for output parquet file
    Returns: 
        None
    """
    print("INFO: Extracting and loading airports data")
    # to load Data Package into storage
    package = datapackage.Package(source)
    # to load only tabular data
    resources = package.resources
    for resource in resources:
        if resource.descriptor['datahub']['type'] == 'derived/csv':
            parquet_file_name = resource.name.split('_')[0] + ".parquet"
            parquet_file_path = os.path.join(target, parquet_file_name)
            fs = s3fs.S3FileSystem(anon=False)
            if fs.exists(parquet_file_path):
                print("INFO: {} already processed".format(parquet_file_name))
            else:
                df = pd.read_csv(resource.descriptor['path'])
                df.to_parquet(parquet_file_path)
Example #8
def validate_save_pkg(pkg_descriptor, pkg_dir):
    """
    Validate a data package descriptor and save it to a json file.

    Args:
        pkg_descriptor (dict): the data package descriptor to validate.
        pkg_dir (path-like): directory in which to save datapackage.json.

    Returns:
        report: the goodtables validation report.

    """
    # Use that descriptor to instantiate a Package object
    data_pkg = datapackage.Package(pkg_descriptor)

    # Validate the data package descriptor before going any further
    if not data_pkg.valid:
        logger.error(f"""
            Invalid tabular data package: {data_pkg.descriptor["name"]}
            Errors: {data_pkg.errors}""")

    # pkg_json is the datapackage.json that we ultimately output:
    pkg_json = os.path.join(pkg_dir, "datapackage.json")
    data_pkg.save(pkg_json)
    logger.info('Validating the data package...')
    # Validate the data within the package using goodtables:
    report = goodtables.validate(pkg_json, row_limit=1000)
    if not report['valid']:
        logger.error("Data package validation failed.")
    else:
        logger.info('Congrats! You made a valid data package!')
    return report
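A hedged usage sketch; the descriptor and output directory are stand-ins, and the goodtables step only passes if the referenced CSV actually exists in pkg_dir:

# Hypothetical usage; 'example-pkg', 'data.csv' and '/tmp/example-pkg' are placeholders.
descriptor = {
    "name": "example-pkg",
    "profile": "tabular-data-package",
    "resources": [{
        "name": "data",
        "path": "data.csv",
        "profile": "tabular-data-resource",
        "schema": {"fields": [{"name": "id", "type": "integer"}]},
    }],
}
report = validate_save_pkg(descriptor, "/tmp/example-pkg")
print(report["valid"])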
Example #9
    def prepare(self, stream, schema, extra):

        # Prepare package
        if 'datapackage' not in extra or 'resource-name' not in extra:
            return False
        descriptor = extra['datapackage']
        if descriptor.strip().startswith('{'):
            descriptor = json.loads(descriptor)
        self.__package = datapackage.Package(descriptor)

        # Prepare schema
        if not schema:
            return False
        if not schema.foreign_keys:
            return False
        self.__schema = schema

        # Prepare foreign keys values
        try:
            self.__relations = _get_relations(
                self.__package,
                self.__schema,
                current_resource_name=extra['resource-name'])
            self.__foreign_keys_values = _get_foreign_keys_values(
                self.__schema, self.__relations)
            self.__relations_exception = None
        except _ReferenceTableError as exception:
            self.__relations_exception = exception

        return True
Example #10
def generate_metadata(pkg_settings, tables, pkg_dir, uuid_pkgs=None):
    # Generate a fresh UUID per call; a uuid.uuid4() default argument would be
    # evaluated only once, when the function is defined.
    if uuid_pkgs is None:
        uuid_pkgs = uuid.uuid4()
    # pkg_json is the datapackage.json that we ultimately output:
    pkg_json = os.path.join(pkg_dir, "datapackage.json")
    # Create a tabular data resource for each of the tables.
    resources = []
    for t in tables:
        resources.append(get_tabular_data_resource_2(t, pkg_dir=pkg_dir))

    data_sources = pudl.helpers.data_sources_from_tables_pkg(tables)
    sources = []
    for src in data_sources:
        if src in pudl.constants.data_sources:
            sources.append({"title": src, "path": pc.base_data_urls[src]})

    contributors = set()
    for src in data_sources:
        for c in pudl.constants.contributors_by_source[src]:
            contributors.add(c)

    pkg_descriptor = {
        "name": pkg_settings["name"],
        "profile": "tabular-data-package",
        "title": pkg_settings["title"],
        "id": uuid_pkgs,
        "description": pkg_settings["description"],
        # "keywords": pkg_settings["keywords"],
        "homepage": "https://catalyst.coop/pudl/",
        "created": (datetime.datetime.utcnow()
                    .replace(microsecond=0).isoformat() + 'Z'),
        "contributors": [pudl.constants.contributors[c] for c in contributors],
        "sources": sources,
        "licenses": [pudl.constants.licenses["cc-by-4.0"]],
        "resources": resources,
    }

    # Use that descriptor to instantiate a Package object
    data_pkg = datapackage.Package(pkg_descriptor)

    # Validate the data package descriptor before going any further
    if not data_pkg.valid:
        logger.warning(f"""
            Invalid tabular data package: {data_pkg.descriptor["name"]}
            Errors: {data_pkg.errors}""")

    data_pkg.save(pkg_json)
    # Validate the data within the package using goodtables:
    report = goodtables.validate(pkg_json, row_limit=1000)
    if not report['valid']:
        logger.warning("Data package data validation failed.")

    return data_pkg, report
Example #11
 def _validate_datapackage(self, datapackage_json: dict):
     """Checks the correctness of datapackage.json metadata. Throws ValueError if invalid."""
     dp = datapackage.Package(datapackage_json)
     if not dp.valid:
         msg = f"Found {len(dp.errors)} datapackage validation errors:\n"
         for e in dp.errors:
             msg = msg + f"  * {e}\n"
         raise ValueError(msg)
Example #12
 def __new__(cls):
     data_url = 'https://datahub.io/core/s-and-p-500-companies-financials/datapackage.json'
     package = datapackage.Package(data_url)
     resources = package.resources
     for resource in resources:
         if resource.tabular:
             data = pd.read_csv(resource.descriptor['path'])
     return data
Example #13
def load_datapackage_tables(filename):
    """Load all the tables from a datapackage."""
    package = datapackage.Package(filename)
    tables = {
        r.name: pd.DataFrame(r.read(), columns=r.headers)
        for r in package.resources
    }
    return {k: df.set_index(['year', 'product']) for k, df in tables.items()}
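A hedged usage sketch; the path is a placeholder and the resources are assumed to carry 'year' and 'product' columns, as the function requires:

# Hypothetical call; 'datapackage.json' is a placeholder path.
tables = load_datapackage_tables('datapackage.json')
print(list(tables.keys()))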
Example #14
def get_dataset():
    data_url = 'https://datahub.io/machine-learning/iris/datapackage.json'
    package = datapackage.Package(data_url)
    resources = package.resources

    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])
            return data
Example #15
def downloadData(url: str, descriptor: str):
    """
        Returns first corrence of provided descriptor from provided url as a file handler
    """
    assets = datapackage.Package(url).resources

    for data in filter(
            lambda x: x.tabular and x.descriptor['name'] == descriptor,
            assets):
        response = requests.get(data.descriptor['path'])
        return io.StringIO(response.content.decode('utf-8'))
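A hedged usage sketch; the URL reuses the finance-vix package from a later example, while the resource name is a guess that may not exist, in which case downloadData returns None:

# Hypothetical call; 'vix-daily_csv' is a placeholder resource name.
fh = downloadData('https://datahub.io/core/finance-vix/datapackage.json', 'vix-daily_csv')
if fh is not None:
    print(pd.read_csv(fh).head())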
Example #16
def test_cli_init():
    resource_path = 'data/valid.csv'

    result = CliRunner().invoke(init, [resource_path])

    assert result.exit_code == 0

    dp = datapackage.Package(json.loads(result.output), strict=True)
    resource = dp.resources[0]
    assert resource.descriptor['path'] == resource_path
    assert 'schema' in resource.descriptor
Example #17
    def get_s_and_p_symbols(self):

        package = dp.Package(self.url)

        # print list of all resources:
        print(package.resource_names)

        storage_dict = self.convert_to_dfs(package)

        df = storage_dict['constituents']

        return df['Symbol'].drop_duplicates()
Example #18
def get(market):
    data_url = 'https://datahub.io/core/' + market + '/datapackage.json'
    # to load Data Package into storage
    package = datapackage.Package(data_url)

    # to load only tabular data
    resources = package.resources
    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])

            return data
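A usage sketch; 'gold-prices' is a datahub slug that appears in a later example, so the constructed URL is https://datahub.io/core/gold-prices/datapackage.json:

# Hypothetical call; requires network access to datahub.io.
gold = get('gold-prices')
print(gold.head())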
Example #19
 def __load_data(self, csv_source):
     start = time.time()
     data = None
     if 'http' in csv_source:
         import datapackage
         package = datapackage.Package(csv_source)
         resources = package.resources
         for resource in resources:
             if resource.tabular:
                 data = pd.read_csv(resource.descriptor['path'])
                 break
     else:
         data = pd.read_csv(csv_source)
     # Time and return after both branches; in the original the two lines below
     # were unreachable because each branch returned early.
     end = time.time()
     print("Loaded data in {} seconds".format(end - start))
     return data
Example #20
def meta_omniglot(data_root_folder=None,
                  std_num_classes=None,
                  std_num_examples=None,
                  one_hot_enc=True,
                  rand=0,
                  n_splits=None):
    """
    Loads, and downloads if necessary, Omniglot meta-dataset
    """
    data_folder_name = 'omniglot_resized'

    if em is None:
        return experiment_manager_not_available('meta_omniglot NOT AVAILABLE!')

    if data_root_folder is None:
        data_root_folder = os.path.join(os.getcwd(), 'DATA')
        if not os.path.exists(data_root_folder):
            os.mkdir(data_root_folder)
    data_folder = os.path.join(data_root_folder, data_folder_name)

    if os.path.exists(data_folder):
        print('DATA FOLDER IS:', data_folder)
        print('LOADING META-DATASET')
        return em.load.meta_omniglot(data_folder,
                                     std_num_classes=std_num_classes,
                                     std_num_examples=std_num_examples,
                                     one_hot_enc=one_hot_enc,
                                     _rand=rand,
                                     n_splits=n_splits)
    else:
        print('DOWNLOADING DATA')

        package = datapackage.Package(
            'https://datahub.io/lucfra/omniglot_resized/datapackage.json')

        with open('tmp_omniglot_resized.zip', 'wb') as f:
            f.write(package.get_resource('omniglot_resized').raw_read())

        import zipfile
        zip_ref = zipfile.ZipFile('tmp_omniglot_resized.zip', 'r')
        print('EXTRACTING DATA')
        zip_ref.extractall(data_root_folder)
        zip_ref.close()

        os.remove('tmp_omniglot_resized.zip')

        print('DONE')

        # os.tmpfile()
        return meta_omniglot(data_root_folder, std_num_classes,
                             std_num_examples, one_hot_enc, rand, n_splits)
Example #21
def extract_tickers():
    data_url = 'https://datahub.io/core/nasdaq-listings/datapackage.json'

    # to load Data Package into storage
    package = datapackage.Package(data_url)

    # to load only tabular data
    resources = package.resources
    for resource in resources:
        if resource.tabular:
            NASDAQ = pd.read_csv(resource.descriptor['path'])

    NASDAQ.to_csv('../data/NASDAQ_Update.csv', index=False)
Example #22
def validate_save_pkg(pkg_descriptor, pkg_dir):
    """
    Validate a data package descriptor and save it to a json file.

    Args:
        pkg_descriptor (dict): the data package descriptor to validate.
        pkg_dir (path-like): directory in which to save datapackage.json.

    Returns:
        report: the goodtables validation report.

    """
    # Use that descriptor to instantiate a Package object
    data_pkg = datapackage.Package(pkg_descriptor)

    # Validate the data package descriptor before going any further
    logger.info(
        f"Validating JSON descriptor for {data_pkg.descriptor['name']} "
        f"tabular data package...")
    if not data_pkg.valid:
        raise ValueError(
            f"Invalid tabular data package: {data_pkg.descriptor['name']} "
            f"Errors: {data_pkg.errors}")
    logger.info('JSON descriptor appears valid!')

    # pkg_json is the datapackage.json that we ultimately output:
    pkg_json = os.path.join(pkg_dir, "datapackage.json")
    data_pkg.save(pkg_json)
    logger.info(
        f"Validating a sample of data from {data_pkg.descriptor['name']} "
        f"tabular data package using goodtables...")
    # Validate the data within the package using goodtables:
    report = goodtables.validate(
        pkg_json,
        # TODO: check which checks are applied... and uncomment out the line
        # below when the checks are integrated
        # checks=['structure', 'schema', 'foreign-key'],
        row_limit=1000)
    if not report["valid"]:
        goodtables_errors = ""
        for table in report["tables"]:
            if not table["valid"]:
                goodtables_errors += str(table["source"])
                goodtables_errors += str(table["errors"])
        raise ValueError(
            f"Data package data validation failed with goodtables. "
            f"Errors: {goodtables_errors}")
    logger.info('Congrats! You made a valid data package!')
    return report
Example #23
def read_gold_prices():
    data_url = 'https://datahub.io/core/gold-prices/datapackage.json'
    package = datapackage.Package(data_url)
    resources = package.resources
    data = None
    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])
            #print(data.head())

    date_field = pd.to_datetime(data['Date'].astype(str), format='%Y-%m')
    data['Date'] = date_field
    #print(data)
    for index, row in data.iterrows():
        GoldPrice.objects.create(date=row['Date'], price=row['Price'])
Example #24
 def __init__(self, descriptor=None, title=None, **kwargs):
     """
     You should not use the constructor directly but use the infer_from_file static method.
     :param descriptor: An initial json data package descriptor. If none it will generate a blank
     :param title: if a title is given it will be set in the descriptor along with the required name.
     :param kwargs: kwargs that can de passed to the underlying frictionless datapackage.Package
     """
     descriptor = descriptor or {}
     if title:
         descriptor['title'] = title
         descriptor['name'] = slugify(title)
     self.package = datapackage.Package(descriptor, **kwargs)
     self.biosys_errors = []
     # set the dataset type to be generic.
     self.dataset_type = Dataset.TYPE_GENERIC
Example #25
 def __load_data(self, csv_source):
     if 'http' in csv_source:
         import datapackage
         while 1:
             try:
                 package = datapackage.Package(csv_source)
                 resources = package.resources
                 for resource in resources:
                     if resource.tabular:
                         return pd.read_csv(resource.descriptor['path'])
             except Exception:
                 print(
                     "Failed to load data from {}. Will reload. Traceback: {}"
                     .format(csv_source, traceback.format_exc()))
     else:
         return pd.read_csv(csv_source)
Example #26
def create_custom(base_fp, agreement_type="both"):
    # Set "both" agreement type
    if agreement_type == "both":
        agreement_type = ["plain", "explicit"]
    else:
        agreement_type = [agreement_type]
    # TODO: Add a plain/explicit question to the front of GUI and to this...
    # ... this will choose the questions to ask and format to print
    base_data = json.load(base_fp)
    # TODO: The code exists to parse the correct datapackage data from the decisions list
    #       But, the current Jinja templates just take directly from the datapackage
    #       Eventually we want to just use the decisions template.
    # Append Datapackages to base_data
    try:
        pkg = datapackage.Package(get_datapackage_path())
        for name in pkg.resource_names:
            log.debug("loading datapackage {0}".format(name))
            resource = pkg.get_resource(name)
            base_data[name] = resource.table.read(keyed=True)
    except datapackage.exceptions.CastError as _e:
        log.debug("Error while attempting to read datapackage resource")
        log.error(_e.errors)
        raise _e
    if "plain" in agreement_type:
        # Create plain
        res = render("templates/plain.j2", base_data)
        md_path = "outputs/plain_custom.md"
        compose_agreement(md_path, res)
        pdf_path = "outputs/plain_custom.pdf"
        pandoc_command = [
            "pandoc", "-V", "geometry:margin=1in", "-f", "markdown_github",
            "-t", "latex", "-o", pdf_path, md_path
        ]
        subprocess.check_call(pandoc_command)
    if "explicit" in agreement_type:
        # Create Explicit
        res = render("templates/explicit.j2", base_data)
        md_path = "outputs/explicit_custom.md"
        compose_agreement(md_path, res)
        pdf_path = "outputs/explicit_custom.pdf"
        pandoc_command = [
            "pandoc", "-V", "geometry:margin=1in", "-f", "markdown_github",
            "-t", "latex", "-o", pdf_path, md_path
        ]
        subprocess.check_call(pandoc_command)
    print("DONE")
Example #27
def cases_per_capita(df, df3):
    data_url = "https://datahub.io/JohnSnowLabs/population-figures-by-country/datapackage.json"

    package = datapackage.Package(data_url)

    # to load only tabular data
    resources = package.resources
    for resource in resources:
        if resource.tabular:
            df2 = pd.read_csv(resource.descriptor['path'])
    df3 = new_cases_modified(df)

    dct2 = {}
    list_of_countries_sorted = df3['Country/Region'].tolist()
    new_cases_per_sorted_country = df3.iloc[:, -1].tolist()

    dct = {}
    result_dict = {}

    for i in list_of_countries_sorted:
        value = df2.loc[df2['Country'] == str(i)]['Year_2016'].tolist()
        for j in value:
            dct[str(i)] = j

    for key, value in dct.items():
        if key not in list_of_countries_sorted:
            print(key)

    for (country, value) in zip(list_of_countries_sorted,
                                new_cases_per_sorted_country):
        dct2[str(country)] = value

    country_list = []
    cases_Capita = []

    for keys in dct.keys():
        country_list.append(keys)

    for (key1, value1), (key2, value2) in zip(dct.items(), dct2.items()):
        per_capita = value2 / value1
        cases_Capita.append(per_capita)

    for (i, j) in zip(country_list, cases_Capita):
        result_dict[i] = str(j)

    return result_dict
Example #28
def objeto_del_gasto(config):
    CT = COLUMN_MAPPING
    CN = dict((k, v.replace(':', '-')) for k, v in CT.items())

    lookup = {}
    codes = datapackage.Package(
        os.path.join(os.path.dirname(__file__),
                     'objeto_del_gasto.datapackage.zip'))
    for resource in codes.resources:
        kind = resource.name
        lookup[kind] = {}
        for row in resource.iter(keyed=True):
            key = row[kind.upper().replace('Í', 'I')]
            value = row['DESCRIPCION']
            lookup[kind][key] = value

    def process(row):
        year = int(row['date-fiscal-year'])

        # Skip the last year of the dataset; it already has split columns
        if year < 2018:
            objeto = row[CN['ID_CONCEPTO']]
            if objeto:
                row[CN['ID_CAPITULO']] = objeto[0] + '000'
                row[CN['ID_CONCEPTO']] = objeto[:2] + '00'
                row[CN['DESC_CAPITULO']] = lookup['capitulo'].get(
                    row[CN['ID_CAPITULO']])
                row[CN['DESC_CONCEPTO']] = lookup['concepto'].get(
                    row[CN['ID_CONCEPTO']])

                nb_generica_digits = 4 if year in (2008, 2009, 2010) else 3

            if objeto and len(objeto) >= 4:
                row[CN['ID_PARTIDA_GENERICA']] = objeto[:nb_generica_digits]

            row[CN['DESC_PARTIDA_GENERICA']] = lookup['partida_generica'].get(
                row.get(CN['ID_PARTIDA_GENERICA']))

            if year not in (2008, 2009, 2010):
                if objeto and len(objeto) >= 5:
                    row[CN['ID_PARTIDA_ESPECIFICA']] = objeto
                    row[CN['DESC_PARTIDA_ESPECIFICA']] = \
                        lookup['partida_específica'].get(row.get(CN['ID_PARTIDA_ESPECIFICA']))

    return process
Example #29
def get_corr_assets(dataset):
    data_url = 'https://datahub.io/core/finance-vix/datapackage.json'

    # to load Data Package into storage
    package = datapackage.Package(data_url)

    # to load only tabular data
    resources = package.resources
    for resource in resources:
        if resource.tabular:
            data = pd.read_csv(resource.descriptor['path'])
            break
    data['Date'] = pd.to_datetime(data.Date, format='%Y-%m-%d')
    start = data[data['Date'] == dataset['Date'][0]].index[0]

    # Create VIX Close and Open
    dataset['vixClose'] = data['VIX Close'][start:].reset_index().drop('index', axis=1)
    dataset['vixOpen'] = data['VIX Open'][start:].reset_index().drop('index', axis=1)
    return dataset
Example #30
def init_datapackage(resource_paths):
    """Create tabular data package with resources.

    It will also infer the tabular resources' schemas.

    Args:
        resource_paths (List[str]): Paths to the data package resources.

    Returns:
        datapackage.Package: The data package.
    """
    dp = datapackage.Package({
        'name': 'change-me',
        'schema': 'tabular-data-package',
    })

    for path in resource_paths:
        dp.infer(path)

    return dp
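A hedged usage sketch; the CSV path reuses 'data/valid.csv' from Example #16 as a placeholder:

# Hypothetical usage; 'data/valid.csv' is a placeholder path to an existing CSV.
dp = init_datapackage(['data/valid.csv'])
print(dp.descriptor['resources'][0]['name'])
dp.save('datapackage.json')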