def main():
    print('running etl...')
    data = pd.read_excel(open_google_spreadsheet(DOCID), sheet_name=SHEET)

    measures = []

    # write one datapoints file per source column, keyed by its concept id
    for c, df in gen_datapoints(data):
        c_id = COLUMN_TO_CONCEPT[c]
        df.columns = [c_id]
        serve_datapoint(df, OUT_DIR, c_id)

        measures.append((c_id, c))

    measures_df = pd.DataFrame(measures, columns=['concept', 'name'])
    measures_df['concept_type'] = 'measure'

    dimensions_df = pd.DataFrame.from_dict(
        dict(concept=DIMENSIONS,
             name=list(map(str.title, DIMENSIONS)),
             concept_type=['entity_domain', 'time']))
    others_df = pd.DataFrame.from_dict(
        dict(concept=['name'], name=['name'], concept_type=['string']))
    concepts_df = pd.concat([measures_df, dimensions_df, others_df],
                            ignore_index=True)
    concepts_df.to_csv(osp.join(OUT_DIR, 'ddf--concepts.csv'), index=False)

    geo_df = create_geo_domain(data)
    geo_df.to_csv(osp.join(OUT_DIR, 'ddf--entities--geo.csv'), index=False)

    # datapackage
    dump_json(osp.join(OUT_DIR, 'datapackage.json'),
              get_datapackage(OUT_DIR, update=True))
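The snippet above leans on module-level configuration that is not shown; a sketch of what it plausibly looks like (every value below is a placeholder, not from the source):

# placeholder constants assumed by main(); not from the source
DOCID = '<google-spreadsheet-id>'
SHEET = 'data'
OUT_DIR = '.'
COLUMN_TO_CONCEPT = {'Population': 'population'}  # source column -> concept id
DIMENSIONS = ['geo', 'year']  # pairs with ['entity_domain', 'time'] above

if __name__ == '__main__':
    main()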
Example #2
    def to_package(self, dirname, attrs=None):
        """ Save data to a DDF package.

        Parameters
        ----------
        dirname : str
            Name of the DDF directory to be created.
        attrs : dict, optional
            Attributes to add/update in datapackage.json.
        """

        attrs = attrs or {}
        cwd = os.getcwd()
        dirpath = os.path.join(cwd, dirname)

        if os.path.exists(dirpath):
            shutil.rmtree(dirpath)

        os.mkdir(dirpath)

        # Create data files; each table appears to be a (filename, dataframe) pair
        for dataset in self.data:
            for table in dataset.tables:
                path = os.path.join(dirpath, table[0])
                table[1].to_csv(path, index=False)

        # Create entity files
        for entity_name, entity_items in self.entities.items():
            path = os.path.join(dirpath, f'ddf--entities--{entity_name}.csv')
            entity_items.to_csv(path, index=False)

        # Create concepts file
        path = os.path.join(dirpath, 'ddf--concepts.csv')
        concepts = pd.DataFrame([x.data for x in self.concepts])
        concepts = concepts.drop_duplicates(subset=['concept'])
        concepts.to_csv(path, index=False)

        # Create datapackage.json
        meta = package.create_datapackage(dirpath, **attrs)
        dump_json(os.path.join(dirpath, 'datapackage.json'), meta)
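A usage sketch, assuming ds is an instance of the class this method belongs to; the attrs keys are illustrative and are passed straight through to create_datapackage:

# hypothetical instance and attrs; adjust to your dataset
ds.to_package('ddf--example--dataset', attrs={'title': 'Example dataset'})

Note that to_package deletes any existing directory of the same name before writing, so point it at a disposable path.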
Example #3
def run_recipe(recipe, outdir, ddf_dir, update, dry_run, gen_dp, show_tree):
    """generate new ddf dataset with recipe"""
    from ddf_utils.chef.api import Chef
    from ddf_utils.package import create_datapackage
    from ddf_utils.io import dump_json
    import json

    coloredlogs.install(logger=logging.getLogger('Chef'),
                        fmt='%(asctime)s %(name)s %(levelname)s %(message)s',
                        level=LOG_LEVEL)

    click.echo('building recipe...')
    if ddf_dir:
        chef = Chef.from_recipe(recipe, ddf_dir=ddf_dir)
    else:
        chef = Chef.from_recipe(recipe)
    if show_tree:
        chef.dag.tree_view()
        return
    if update:
        # the update option is currently a no-op
        pass
    serve = not dry_run
    chef.run(serve=serve, outpath=outdir)
    if serve and gen_dp:
        click.echo('creating datapackage file...')
        datapackage_path = os.path.join(outdir, 'datapackage.json')
        if os.path.exists(datapackage_path):
            click.echo('backup old datapackage.json to datapackage.json.bak')
            shutil.copyfile(datapackage_path, os.path.join(outdir, 'datapackage.json.bak'))
            with open(datapackage_path) as f:
                dp_old = json.load(f)
            # copy translations info. other info should be in the recipe.
            if 'translations' in dp_old.keys():
                chef = chef.add_metadata(translations=dp_old['translations'])
        dump_json(os.path.join(outdir, 'datapackage.json'),
                  create_datapackage(outdir, gen_schema=True, **chef.metadata))
    click.echo("Done.")
Example #4
def _gen_dp(d):
    dp = get_datapackage(d, update=True)
    dump_json(osp.join(d, 'datapackage.json'), dp)
Example #5
def get_concept_type(series):
    # assumed name: the original definition line was lost; maps a column's
    # dtype to its DDF concept_type
    if series.dtype in numeric_dtypes:
        return 'measure'
    if series.dtype == 'bool':
        return 'boolean'
    return 'string'
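
# assumed definition of numeric_dtypes referenced above (not in the source)
numeric_dtypes = ['int16', 'int32', 'int64',
                  'float16', 'float32', 'float64']

# e.g. get_concept_type(pd.Series([1.5]))   -> 'measure'
#      get_concept_type(pd.Series(['abc'])) -> 'string'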


if __name__ == '__main__':
    df = pd.read_csv(source_path)

    # country entity
    country = df[['CountryCode', 'CountryName']]
    country = ddf_table(country, key=['country'], renames=renames, id_concepts=id_concepts)

    # datapoints
    indicator_cols = [col for col in df.columns if col != 'CountryName']
    data = df[indicator_cols]
    data = ddf_table(data, key=['country', 'day'], renames=renames, id_concepts=id_concepts)

    # concepts
    concepts = get_concepts([country, data])
    ddf_table(concepts, key=['concept'])

    # datapackage
    dp = get_datapackage(output_dir, update=True)
    dp['source'] = {}
    with open(sha_path, 'r') as f:
        dp['source']['sha'] = f.readline().strip()
    dp_path = osp.join(output_dir, 'datapackage.json')
    dump_json(dp_path, dp)
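ddf_table, renames, id_concepts, source_path and output_dir are defined elsewhere in this script; the sketch below is one plausible reading of the call sites above, not the actual helper:

import os.path as osp

def ddf_table(df, key, renames=None, id_concepts=None):
    # sketch: rename source columns to concept ids, then write the frame
    # to a DDF csv named after its key; id_concepts (presumably the entity
    # id columns) is not interpreted in this sketch
    df = df.rename(columns=renames or {})
    if key == ['concept']:
        fname = 'ddf--concepts.csv'
    elif len(key) == 1:
        fname = 'ddf--entities--{}.csv'.format(key[0])
    else:
        measures = [c for c in df.columns if c not in key]
        fname = 'ddf--datapoints--{}--by--{}.csv'.format(
            '--'.join(measures), '--'.join(key))
    df.to_csv(osp.join(output_dir, fname), index=False)
    return df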
Example #6
    imported.extend(['Geo', 'Geo Name', 'Name', 'Year'])

    imported_dict = {k: concept_dict[k] for k in imported}

    concepts_df['concept'] = imported_dict.values()
    concepts_df['name'] = [x.strip() for x in imported_dict.keys()]
    concepts_df['concept_type'] = 'measure'

    concepts_df = concepts_df.set_index('concept')
    concepts_df.loc['geo', 'concept_type'] = 'entity_domain'
    concepts_df.loc['name', 'concept_type'] = 'string'
    concepts_df.loc['year', 'concept_type'] = 'time'
    concepts_df.loc['geo_name', 'concept_type'] = 'string'

    fn_concept = os.path.join(out_dir, 'ddf--concepts.csv')
    concepts_df.sort_values(by=['concept_type', 'name']).to_csv(fn_concept)

    # datapackage
    dp = get_datapackage(out_dir, use_existing=True, update=True)
    dump_json(os.path.join(out_dir, 'datapackage.json'), dp)

    print('tabs not imported:')
    for i in not_imported:
        print(i)
    print('If any of these tabs should have been imported, please modify the script.')
    print('Done.')
Example #7
cleanup(outputFolder)

concepts = pd.DataFrame()

# species entity domain
inputFile = os.path.join(inputFolder, 'bulk', 'taxonomy.csv')
outputFile = os.path.join(outputFolder, 'ddf--entities--species.csv')
df = processEntityDomain(inputFile, conceptDict={ 'internalTaxonId': 'species' })
df.to_csv(outputFile, index=False, encoding='utf8')
concepts = extractConcepts(df, concepts)

# assessment entity domain
inputFile = os.path.join(inputFolder, 'bulk', 'assessments.csv')
outputFile = os.path.join(outputFolder, 'ddf--entities--assessment.csv')
df = processEntityDomain(
    inputFile,
    conceptDict={ 'internalTaxonId': 'species', 'assessmentId': 'assessment' },
    dropCols=['scientificName']
)
# df = filterSpecies(df)
df.to_csv(outputFile, index=False, encoding='utf8')
concepts = extractConcepts(df, concepts)

# concepts
outputFile = os.path.join(outputFolder, 'ddf--concepts.csv')
concepts = concepts.rename_axis('concept')
concepts.to_csv(outputFile, index=True, encoding='utf8')

# datapackage
dump_json(os.path.join(outputFolder, 'datapackage.json'),
          get_datapackage(outputFolder, update=True))
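
The helpers this script calls (cleanup, processEntityDomain, extractConcepts) are defined elsewhere; the sketches below are assumptions inferred from the call sites, not the real implementations:

import os
import shutil
import pandas as pd

def cleanup(folder):
    # start from an empty output folder
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.makedirs(folder)

def processEntityDomain(inputFile, conceptDict, dropCols=None):
    # read the source csv, drop unwanted columns, and rename id columns
    # to their DDF concept names
    df = pd.read_csv(inputFile)
    df = df.drop(columns=dropCols or [])
    return df.rename(columns=conceptDict)

def extractConcepts(df, concepts):
    # accumulate each newly seen column as a row of the concepts table,
    # indexed by concept id ('string' is a placeholder concept_type)
    new = pd.DataFrame({'concept_type': 'string'}, index=df.columns)
    return pd.concat([concepts, new[~new.index.isin(concepts.index)]])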