def drop_sql(args, doc, r):
    try:
        st = r.schema_term
    except AttributeError:
        return ''

    if not st:
        err(f"Resource '{r.name}' does not have a schema term")

    table_name = mk_table_name(r, doc)

    table = Table(table_name, MetaData(bind=None))

    dialect = dialect_map.get(args.dialect, mysql.dialect())

    lines = str(DropTable(table).compile(dialect=dialect)) \
        .replace('DROP TABLE', 'DROP TABLE IF EXISTS')

    out = []
    for l in lines.splitlines():
        if l.strip():
            out.append(l + ';')

    return '\n'.join(out)
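
# Illustrative sketch, not part of the CLI: shows the SQLAlchemy pattern drop_sql()
# relies on, compiling a DropTable against a specific dialect and rewriting it to be
# idempotent. The table and column names here are hypothetical; only sqlalchemy is needed.
def _example_drop_sql_sketch():
    from sqlalchemy import Table, Column, MetaData, Text
    from sqlalchemy.schema import DropTable
    from sqlalchemy.dialects import mysql

    table = Table('example_table', MetaData(), Column('name', Text))

    stmt = str(DropTable(table).compile(dialect=mysql.dialect()))
    return stmt.replace('DROP TABLE', 'DROP TABLE IF EXISTS').strip() + ';'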
def convert_hugo(nb_path, hugo_path):
    from os import environ
    from os.path import abspath

    # Total hack. Would like the -H to be allowed to have no arg, and then use the env var,
    # but I don't know how to do that. This is the case where the user types
    # -H nb_path, so just go with it.
    if hugo_path and not nb_path:
        nb_path = hugo_path
        hugo_path = environ.get('METAPACK_HUGO_DIR')

    if not hugo_path:
        err("Must specify value for -H or the METAPACK_HUGO_DIR environment var")

    if not exists(nb_path):
        err("Notebook path does not exist: '{}' ".format(nb_path))

    c = Config()

    c.HugoExporter.hugo_dir = abspath(hugo_path)  # Exports assume rel path is rel to notebook

    he = HugoExporter(config=c, log=logger)

    output, resources = he.from_filename(nb_path)

    prt('Writing Notebook to Hugo Markdown')

    prt('    Writing ', resources['unique_key'] + resources['output_extension'])

    for k, v in resources['outputs'].items():
        prt('    Writing ', k)

    fw = FilesWriter()
    fw.write(output, resources, notebook_name=resources['unique_key'])
def write_eda_notebook(m):
    # Get the EDA notebook file from Github
    url = "https://raw.githubusercontent.com/Metatab/exploratory-data-analysis/master/eda.ipynb"

    resource = m.get_resource()

    if not resource:
        warn('Must specify a resource. Select one of:')
        list_rr(m.doc)
        sys.exit(0)

    r = requests.get(url, allow_redirects=True)
    r.raise_for_status()

    nb_path = Path('notebooks/{}-{}.ipynb'.format(splitext(basename(url))[0], resource.name))

    ensure_dir(nb_path.parent)

    if nb_path.exists():
        err("Notebook {} already exists".format(nb_path))

    with nb_path.open('wb') as f:
        f.write(r.content)

    prt('Wrote {}'.format(nb_path))

    with edit_notebook(nb_path) as nb:
        set_cell_source(nb, 'resource_name', "resource_name='{}'".format(resource.name))
def create_sql(args, doc, r):
    try:
        st = r.schema_term
    except AttributeError:
        return ''

    if not st:
        err(f"Resource '{r.name}' does not have a schema term")

    table_name = mk_table_name(r, doc)

    table = Table(table_name, MetaData(bind=None))

    comment_rows = []

    for col in r.columns():
        # print(col)
        sql_type = type_map.get(col['datatype'], Text)
        table.append_column(Column(col['header'], sql_type, comment=col.get('description')))
        comment_rows.append((col['header'], sql_type.__name__, col['description']))

    dialect = dialect_map.get(args.dialect, mysql.dialect())

    comment = dedent(f"""
Table: {table_name}
Description: {r.description}
Dataset: {r.doc.name}
Columns:
{tabulate(comment_rows, tablefmt='simple')}""")

    return textwrap.indent(comment, '-- ') + '\n' + \
        str(CreateTable(table).compile(dialect=dialect)).strip() + ';'
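
# Illustrative sketch, not part of the CLI: a simplified version of what create_sql()
# does, compiling a CreateTable with column comments for the MySQL dialect and prefixing
# a '-- ' commented header with textwrap.indent. Table and column names are hypothetical.
def _example_create_sql_sketch():
    import textwrap
    from sqlalchemy import Table, Column, MetaData, Text, Integer
    from sqlalchemy.schema import CreateTable
    from sqlalchemy.dialects import mysql

    table = Table('example_table', MetaData(),
                  Column('name', Text, comment='A name column'),
                  Column('age', Integer, comment='Age in years'))

    header = textwrap.indent('\nTable: example_table\nDescription: demo\n', '-- ')
    ddl = str(CreateTable(table).compile(dialect=mysql.dialect())).strip() + ';'

    return header + '\n' + ddl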
def get_config():
    config = _get_config()

    if config is None:
        err("No metatab configuration found. Can't get Github credentials. Maybe create '~/.metapack.yaml'")

    if not config.get('github', {}).get('token'):
        err('No token set in config file at github.token')

    return config
def extract_metatab(nb_path):
    if not exists(nb_path):
        err("Notebook path does not exist: '{}' ".format(nb_path))

    c = Config()

    with open(nb_path) as f:
        nb = nbformat.reads(f.read(), as_version=4)

    return ExtractInlineMetatabDoc(package_url="metapack+file:" + dirname(nb_path)).run(nb)
def convert_notebook(nb_path):
    prt('Convert notebook to Metatab source package')

    if not exists(nb_path):
        err("Notebook path does not exist: '{}' ".format(nb_path))

    c = Config()

    pe = NotebookExecutor(config=c, log=logger)

    prt('Running the notebook')
    output, resources = pe.from_filename(nb_path)

    fw = FilesWriter()
    fw.build_directory = pe.output_dir
    fw.write(output, resources, notebook_name=DEFAULT_METATAB_FILE)

    de = DocumentationExporter(config=c, log=logger, metadata=doc_metadata(pe.doc))

    prt('Exporting documentation')
    output, resources = de.from_filename(nb_path)

    fw.build_directory = join(pe.output_dir, 'docs')
    fw.write(output, resources, notebook_name='notebook')

    new_mt_file = join(pe.output_dir, DEFAULT_METATAB_FILE)

    doc = MetapackDoc(new_mt_file)

    de.update_metatab(doc, resources)

    for lib_dir in pe.lib_dirs:
        lib_dir = normpath(lib_dir).lstrip('./')

        doc['Resources'].new_term("Root.PythonLib", lib_dir)

        path = abspath(lib_dir)
        dest = join(pe.output_dir, lib_dir)

        ensure_dir(dest)
        copytree(path, join(pe.output_dir, lib_dir))

    doc.write_csv()

    # Reset the input to use the new data
    prt('Running with new package file: {}'.format(new_mt_file))
def run_colmap_new(args):
    m = MetapackCliMemo(args, downloader)

    resources = get_resources(m)

    if not resources:
        err(f"No resources found with colmap name '{m.args.colmap_name}'")

    # Collect all of the headers, into a list of headers,
    # and the union of all of them in col_index
    col_index = []
    headers = []
    for r in resources:
        h = r.headers
        col_index += [alt_col_name(c) for c in h if alt_col_name(c) not in col_index]
        headers.append(h)

    # Create lists, of the same length as the index, of the source
    # column names, at the same position as the alt_col_name is in the col_index
    data = [col_index]
    for header in headers:
        new_row = [None] * len(col_index)
        for c in header:
            new_row[col_index.index(alt_col_name(c))] = c
        data.append(new_row)

    t = [['index'] + [r.name for r in resources]] + list(zip(*data))  # zip transposes rows into columns.

    path = Path(f"colmap-{m.args.colmap_name}.csv")

    if m.args.print:
        from tabulate import tabulate
        prt(tabulate(t[1:], headers=t[0]))
    else:
        if path.exists() and not m.args.force:
            err(f"Col map file '{str(path)}' already exists. Use -f to overwrite")
        else:
            with path.open('w') as f:
                csv.writer(f).writerows(t)
            prt(f"Wrote {str(path)}")
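
# Illustrative sketch, not part of the CLI: shows the table layout run_colmap_new()
# builds, with one row per normalized column name and one column per resource, using
# zip(*data) to transpose the per-resource rows. The headers and resource names are
# hypothetical, and str.lower() stands in for alt_col_name().
def _example_colmap_layout_sketch():
    col_index = ['name', 'age']               # union of normalized column names
    headers = [['Name', 'Age'], ['AGE']]      # headers from two hypothetical resources

    data = [col_index]
    for header in headers:
        new_row = [None] * len(col_index)
        for c in header:
            new_row[col_index.index(c.lower())] = c
        data.append(new_row)

    # First column is the index, remaining columns are the source headers per resource.
    return [['index', 'resource_1', 'resource_2']] + list(zip(*data))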
def run_url_scrape(args):
    m = MetapackCliMemo(args, downloader)

    from metapack.util import scrape_urls_from_web_page

    doc = m.doc
    url = m.args.url

    doc['resources'].new_term('DownloadPage', url)

    d = scrape_urls_from_web_page(url)

    if d.get('error'):
        err(d.get('error'))

    new_resources = 0
    new_documentation = 0

    if not args.no_resources:
        for k, v in d['sources'].items():
            u = parse_app_url(v['url'])
            t = doc['Resources'].new_term('DataFile', v['url'], name=u.fspath.stem,
                                          description=v.get('description'))
            new_resources += 1
            if args.verbose:
                prt(t, t.props)

    if not args.no_docs:
        for k, v in d['external_documentation'].items():
            term_name = classify_url(v['url'])
            u = parse_app_url(v['url'])
            t = doc['Documentation'].new_term(term_name, v['url'], name=u.fspath.stem,
                                              description=v.get('description'))
            new_documentation += 1
            if args.verbose:
                prt(t, t.props)

    prt("Added {} resource and {} documentation terms".format(new_resources, new_documentation))

    if not args.dry_run:
        write_doc(doc)
def __init__(self, args, downloader):
    super().__init__(args, downloader)

    self.term = self.args.term[0]
    self.value = self.args.value[0] if hasattr(self.args, 'value') else None

    parts = self.term.split('.')

    if len(parts) != 2 and len(parts) != 3:
        err('Term arg must have 2 or 3 words separated by a period')

    if len(parts) == 3:
        self.section, parts = parts[0], parts[1:]
        self.term = '.'.join(parts)
    else:
        self.section = 'Root'
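
# Worked examples (hypothetical term arguments) of the section/term split above:
#   'Root.Name'               -> section 'Root',     term 'Root.Name'      (2 parts: section defaults to 'Root')
#   'Contacts.Wrangler.Email' -> section 'Contacts', term 'Wrangler.Email' (3 parts: first part becomes the section)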
def get_col_map(r):
    return r.header_map

    # Note: the return above makes the remainder unreachable; it reads the
    # column map from a colmap-<name>.csv file.
    cm_name = r.get_value('colmap')

    if not cm_name:
        err(f"Resource '{r.name}' does not have a ColMap property")

    path = Path(f"colmap-{cm_name}.csv")

    if not path.exists():
        err(f"Colmap file '{str(path)}' does not exist")

    with path.open() as f:
        cm = {}
        for row in csv.DictReader(f):
            if row[r.name]:
                cm[row[r.name]] = row['index']

        return cm
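
# Illustrative sketch, not part of the CLI: how the DictReader loop above maps a
# resource's source column names to the shared index names. The colmap contents and
# the 'resource_1' column name are hypothetical.
def _example_read_colmap_sketch():
    import csv
    import io

    colmap_csv = "index,resource_1\nname,Name\nage,AGE\n"

    cm = {}
    for row in csv.DictReader(io.StringIO(colmap_csv)):
        if row['resource_1']:
            cm[row['resource_1']] = row['index']

    return cm  # {'Name': 'name', 'AGE': 'age'}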
def write_notebook(m):
    # Get the package notebook template file from Github
    url = "https://raw.githubusercontent.com/Metatab/notebook-templates/master/package-notebook.ipynb"

    r = requests.get(url, allow_redirects=True)
    r.raise_for_status()

    p = Path(m.args.new_notebook)

    nb_path = f'notebooks/{p.stem}.ipynb'

    ensure_dir(dirname(nb_path))

    if exists(nb_path):
        err("Notebook {} already exists".format(nb_path))

    with open(nb_path, 'wb') as f:
        f.write(r.content)

    prt('Wrote {}'.format(nb_path))
def load_sql(args, doc, r):
    try:
        st = r.schema_term
    except AttributeError:
        return ''

    if not st:
        err(f"Resource '{r.name}' does not have a schema term")

    table_name = mk_table_name(r, doc)

    if args.dialect == 'redshift':

        if args.access_key and args.secret:
            access_key, secret = args.access_key, args.secret
        elif args.s3profile:
            access_key, secret = get_credentials(args.s3profile)
        else:
            err('For redshift loading, must specify --access_key and --secret or --profile')

        if r.get_value('s3url'):
            cred = f"ACCESS_KEY_ID '{access_key}' SECRET_ACCESS_KEY '{secret}' ;"

            return f"""COPY {table_name} FROM '{r.s3url}' CSV {cred}"""

    elif args.dialect == 'postgresql':
        if args.load_prog:
            url = r.url
            return f"""COPY {table_name} FROM PROGRAM '{args.load_prog} "{url}"' WITH CSV HEADER ENCODING 'utf8';"""

    elif args.dialect == 'sqlite':
        u = r.resolved_url.get_resource().get_target()
        return f".mode csv {table_name}\n.import '{str(u.fspath)}' {table_name}"

    return ''
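
# For reference, the statements produced above have these shapes; the table name,
# URLs, paths and the 'curl' loader program are hypothetical examples:
#
#   redshift:    COPY example_table FROM 's3://bucket/data.csv' CSV ACCESS_KEY_ID '...' SECRET_ACCESS_KEY '...' ;
#   postgresql:  COPY example_table FROM PROGRAM 'curl "http://example.com/data.csv"' WITH CSV HEADER ENCODING 'utf8';
#   sqlite:      .mode csv example_table
#                .import '/path/to/data.csv' example_table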
def convert_metatab_notebook(m):
    m.doc['Documentation'].get_or_new_term('Root.Readme').value = get_readme(m)

    return

    # The code below is unreachable because of the return above.
    source = None  # Path(source)

    if source.suffix == '.csv':
        dest = source.with_suffix('.ipynb')

        doc = MetapackDoc(source)
        doc.ensure_identifier()
        doc.update_name(create_term=True)

        # _write_metatab_notebook(doc, dest)

    elif source.suffix == '.ipynb':
        dest = source.with_suffix('.csv')

        doc = None  # extract_notebook_metatab(source)
        doc.ensure_identifier()
        doc.update_name(create_term=True)
        write_doc(doc, dest)

    else:
        err("Source file must be either .ipynb or .csv")
def _build_cmd(args):
    from rowgenerators.rowpipe.exceptions import TooManyCastingErrors

    downloader.set_callback(build_downloader_callback)

    m = MetapackCliMemo(args, downloader)

    if m.args.profile:
        from metatab.s3 import set_s3_profile
        set_s3_profile(m.args.profile)

    if m.args.clean_cache:
        clean_cache('metapack')

    try:
        changes = metatab_derived_handler(m)
        prt(f"{changes} changes")
    except TooManyCastingErrors as e:
        prt('Casting Errors:')
        for error in e.errors:
            prt(error)
        if m.args.exceptions:
            raise e
        else:
            err(e)
    except Exception as e:
        if m.args.exceptions:
            raise e
        else:
            err(e)

    clean_cache(m.cache)

    return changes
def update_custom(m):
    try:
        r = m.doc.get_lib_module_dict()
        r['custom_update'](m.doc, m.args.remainder)
    except ImportError:
        err('No custom function')
def new_cmd(args):
    from metapack import MetapackDoc
    from metapack.util import make_metatab_file, datetime_now, ensure_dir
    from metapack.cli.core import write_doc, prt, err
    from os.path import exists, join, expanduser
    from metatab import DEFAULT_METATAB_FILE
    from os import getenv

    if args.config:
        config_file = args.config
    elif getenv("METAPACK_CONFIG"):
        config_file = getenv("METAPACK_CONFIG")
    elif exists(expanduser('~/.metapack-defaults.csv')):
        config_file = expanduser('~/.metapack-defaults.csv')
    else:
        config_file = None

    if config_file and exists(config_file):
        prt(f"Using defaults file '{config_file}'")
        config = MetapackDoc(config_file)
    else:
        config = MetapackDoc()

    if args.jupyter:
        import tempfile

        with tempfile.NamedTemporaryFile(suffix='.ipynb', delete=False) as fp:
            r = requests.get(TEMPLATE_NOTEBOOK, allow_redirects=True)
            r.raise_for_status()
            fp.write(r.content)
            nb_path = Path(fp.name)

        doc = MetapackDoc(str(nb_path))
    else:
        doc = make_metatab_file(args.template)

    doc['Root']['Created'] = datetime_now()

    origin = args.origin or config.get_value('Root.Origin')

    if not origin:
        err("Must specify a value for origin, either on command line or in defaults file")

    (doc['Root'].find_first('Root.Origin') or et).value = origin
    (doc['Root'].find_first('Root.Dataset') or et).value = args.dataset
    (doc['Root'].find_first('Root.Space') or et).value = args.space or config.get_value('Root.Space')
    (doc['Root'].find_first('Root.Time') or et).value = args.time or config.get_value('Root.Time')
    (doc['Root'].find_first('Root.Grain') or et).value = args.grain or config.get_value('Root.Grain')
    (doc['Root'].find_first('Root.Variant') or et).value = args.variant or config.get_value('Root.Variant')

    v = doc['Root'].get_or_new_term('Root.Version')
    v.get_or_new_child('Version.Major').value = args.revision or config.get_value('Root.Version')
    v.get_or_new_child('Version.Minor').value = 1
    v.get_or_new_child('Version.Patch').value = 1

    # Copy contacts in
    if 'Contacts' in config:
        for c in config['Contacts']:
            doc['Contacts'].add_term(c)

    if args.title:
        doc['Root'].find_first('Root.Title').value = args.title.strip()

    nv_name = doc.as_version(None)

    if args.example:
        doc['Resources'].new_term(
            'Root.Datafile',
            'http://public.source.civicknowledge.com/example.com/sources/random-names.csv',
            name='random_names')

        doc['Documentation'].new_term('Root.Homepage', 'http://metatab.org', title='Metatab Home Page')

    doc.ensure_identifier()
    doc.update_name(create_term=True)

    if getattr(args, 'jupyter'):  # b/c maybe metatab_jupyter is not installed
        from metapack_jupyter.convert import write_metatab_notebook
        from metapack_jupyter.core import edit_notebook, set_cell_source, get_cell_source

        new_nb_path = Path(f'{nv_name}.ipynb')

        doc['Resources'].new_term(
            'Root.Datafile',
            './' + str(new_nb_path) + "#df",
            name='local_dataframe',
            description='Example of using a local Dataframe')

        if new_nb_path.exists():
            err(f"Notebook {new_nb_path} already exists")

        copyfile(nb_path, new_nb_path)

        write_metatab_notebook(doc, new_nb_path)

        with edit_notebook(new_nb_path) as nb:
            init = get_cell_source(nb, 'init')
            init += f"\nthis_package_name = '{str(new_nb_path.name)}'"
            set_cell_source(nb, 'init', init)

        nb_path.unlink()

    else:
        doc['Documentation'].new_term('Root.Documentation', 'file:README.md', title='README')

        if exists(nv_name):
            err(f"Directory {nv_name} already exists")

        if args.csv:
            fn = doc.nonver_name + '.csv'
            write_doc(doc, fn)
            prt(f"Writing to {fn}")

        else:
            ensure_dir(nv_name)

            pylib_dir = join(nv_name, 'pylib')
            ensure_dir(pylib_dir)
            with open(join(pylib_dir, '__init__.py'), 'w') as f_out, open(pylib.__file__) as f_in:
                f_out.write(f_in.read())

            if args.example:
                doc['Resources'].new_term('Root.Datafile', 'python:pylib#row_generator',
                                          name='row_generator')

            prt(f"Writing to '{nv_name}'")

            write_doc(doc, join(nv_name, DEFAULT_METATAB_FILE))

            add_missing_files(nv_name)

            if args.title:
                readme = '# {}\n'.format(args.title)
            else:
                readme = '# {}\n'.format(doc.get_value('Root.Name'))

            with open(join(nv_name, 'README.md'), 'w') as f:
                f.write(readme)
def upload_packages(m):
    """"""

    dist_urls = []
    fs_p = None

    files_processed = []

    # For each package in _packages with the same name as this document...
    for ptype, purl, cache_path in generate_packages(m):

        au = m.bucket.access_url(cache_path)

        # Just copy the Excel and Zip files directly to S3
        if ptype in ('xlsx', 'zip'):

            with open(purl.path, mode='rb') as f:
                access_url = m.bucket.write(f.read(), basename(purl.path), m.acl)

            if m.bucket.last_reason:
                files_processed.append([*m.bucket.last_reason, access_url,
                                        '/'.join(purl.path.split(os.sep)[-2:])])

            prt("Added {} distribution: {} ".format(ptype, au))
            dist_urls.append(au)

        elif ptype == 'fs':
            # Write all of the FS package files to S3
            try:
                s3_package_root = MetapackPackageUrl(str(m.s3_url), downloader=m.downloader)

                # fake-out: it's not actually an S3 CSV package; it's a FS package on S3.
                fs_p = S3CsvPackageBuilder(purl.metadata_url, s3_package_root, callback=prt,
                                           env={}, acl='public-read')

                url = fs_p.save()

                prt("Package saved to: {}".format(url))

                # fs_url = MetapackUrl(url, downloader=purl.metadata_url.downloader)

            except NoCredentialsError:
                print(getenv('AWS_SECRET_ACCESS_KEY'))
                err("Failed to find boto credentials for S3. "
                    "See http://boto3.readthedocs.io/en/latest/guide/configuration.html ")

            # A crappy hack. make_s3_package should return the correct url
            if fs_p:
                if m.acl == 'private':
                    au = fs_p.private_access_url.inner
                else:
                    au = fs_p.public_access_url.inner

                dist_urls.append(au)

    if fs_p:
        fs_p.files_processed += files_processed  # Ugly encapsulation-breaking hack.

    return dist_urls, fs_p