Ejemplo n.º 1
0
        def __init__(self, raw_args):
            self.cwd = getcwd()

            self.raw_args = raw_args

            self.args = parser.parse_args(self.raw_args[1:])

            self.cache = get_cache('metapack')

            # This one is for loading packages that have just been
            # written to S3.
            self.tmp_cache = get_cache('temp')
            clean_cache(self.tmp_cache)

            if self.args.all_s3:
                self.args.s3 = self.args.all_s3
                self.args.excel = True
                self.args.zip = True
                self.args.csv = True
                self.args.fs = True

            self.mtfile_arg = self.args.metatabfile if self.args.metatabfile else join(
                self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(
                self.mtfile_url.rebuild_url(False, False))

            self.args.fs = self.args.csv or self.args.fs
Ejemplo n.º 2
0
    def test_ipy(self):
        from rowgenerators import SourceSpec, Url, RowGenerator, get_cache

        from rowgenerators.fetch import download_and_cache

        urls = ('ipynb+file:foobar.ipynb',
                'ipynb+http://example.com/foobar.ipynb', 'ipynb:foobar.ipynb')

        for url in urls:
            u = Url(url)
            print(u, u.path, u.resource_url)

            s = SourceSpec(url)
            print(s, s.proto, s.scheme, s.resource_url, s.target_file,
                  s.target_format)
            self.assertIn(s.scheme, ('file', 'http'))
            self.assertEquals('ipynb', s.proto)

        gen = RowGenerator(cache=get_cache(),
                           url='ipynb:scripts/Py3Notebook.ipynb#lst',
                           working_dir=test_data(),
                           generator_args={'mult': lambda x: x * 3})

        rows = gen.generator.execute()

        print(len(rows))
Ejemplo n.º 3
0
        def __init__(self, args):
            self.cwd = getcwd()
            self.args = args
            self.cache = get_cache('metapack')

            self.mtfile_arg = args.metatabfile if args.metatabfile else join(
                self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(
                self.mtfile_url.rebuild_url(False, False))
Ejemplo n.º 4
0
    def mt_materialize(self, line):
        """Materialize a single dataframe
        """

        from json import dumps
        from rowgenerators import get_cache
        from rowgenerators.generator.python import PandasDataframeSource
        from metapack.util import ensure_dir
        import numpy as np
        import csv
        from os.path import join

        args = parse_argstring(self.mt_materialize, line)

        dr = args.dir.strip("'")

        ensure_dir(dr)

        try:
            cache = self.mt_doc._cache
        except KeyError:
            cache = get_cache()

        path = join(dr, args.df_name + ".csv")
        try:
            df = fill_categorical_na(self.shell.user_ns[args.df_name].copy())
        except ValueError:
            # For categorical columns, the fill value must be one of the categories, and ''
            # probably isn't.
            pass

        if len(df.index.names) == 1 and df.index.names[
                0] is None and df.index.dtype != np.dtype('O'):
            # Simple index; ignore it.
            df.to_csv(path, index=False)
        else:
            # PandasDataFrameSource has some more complex handling of multi-level indices
            gen = PandasDataframeSource(parse_app_url(path), df, cache=cache)

            with open(path, 'w') as f:
                w = csv.writer(f)
                w.writerows(gen)

        print(dumps({
            'df_name': args.df_name,
            'path': path,
        }, indent=4))
Ejemplo n.º 5
0
def _cached_get(url, cache=True):
    """Return the results of a GET request, possibly cached.
    Assumes the response is JSON"""

    cache_fs = get_cache()

    cache_key = slugify(url)

    if cache and cache_fs.exists(cache_key):
        data = json.loads(cache_fs.gettext(cache_key))
    else:
        try:
            r = requests.get(url)
            data = r.json()
            r.raise_for_status()
        except:
            raise AccessException("ERROR " + r.text)

        if cache:
            cache_fs.settext(cache_key, json.dumps(data, indent=4))

    return data
Ejemplo n.º 6
0
def metatab():
    import argparse
    parser = argparse.ArgumentParser(
        prog='metatab',
        description='Matatab file parser, version {}'.format(_meta.__version__))

    parser.add_argument('-C', '--clean-cache', default=False, action='store_true',
                        help="Clean the download cache")

    g = parser.add_mutually_exclusive_group(required=True)

    g.add_argument('-i', '--info', default=False, action='store_true',
                   help="Show configuration information")

    g.add_argument('-c', '--create', action='store', nargs='?', default=False,
                   help="Create a new metatab file, from named template. With no argument, uses the 'metatab' template ")

    g.add_argument('-t', '--terms', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, before interpretation')

    g.add_argument('-I', '--interp', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, after interpretation')

    g.add_argument('-j', '--json', default=False, action='store_true',
                   help='Parse a file and print out a JSON representation')

    g.add_argument('-y', '--yaml', default=False, action='store_true',
                   help='Parse a file and print out a YAML representation')

    g.add_argument('-R', '--resource', default=False, action='store_true',
                   help='If the URL has no fragment, dump the resources listed in the metatab file. With a fragment, dump a resource as a CSV')

    g.add_argument('-H', '--head', default=False, action='store_true',
                   help="Dump the first 20 lines of a resoruce ")

    g.add_argument('-S', '--schema',
                   help='Dump the schema for one named resource')

    parser.add_argument('-d', '--show-declaration', default=False, action='store_true',
                        help='Parse a declaration file and print out declaration dict. Use -j or -y for the format')

    parser.add_argument('-D', '--declare', help='Parse and incorporate a declaration before parsing the file.' +
                                                ' (Adds the declaration to the start of the file as the first term. )')

    parser.add_argument('file', nargs='?', default=DEFAULT_METATAB_FILE, help='Path to a Metatab file')

    args = parser.parse_args(sys.argv[1:])

    # Specing a fragment screws up setting the default metadata file name
    if args.file.startswith('#'):
        args.file = DEFAULT_METATAB_FILE + args.file

    cache = get_cache('metapack')

    if args.info:
        prt('Version  : {}'.format(_meta.__version__))
        prt('Cache dir: {}'.format(str(cache.getsyspath('/'))))
        exit(0)

    if args.clean_cache:
        clean_cache(cache)

    if args.create is not False:
        new_metatab_file(args.file, args.create)
        exit(0)

    if args.resource or args.head:

        limit = 20 if args.head else None

        u = Url(args.file)
        resource = u.parts.fragment
        metadata_url = u.rebuild_url(False, False)

        package_url, metadata_url = resolve_package_metadata_url(metadata_url)

        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except OSError as e:
            err("Failed to open Metatab doc: {}".format(e))
            return # Never reached

        if resource:
            dump_resource(doc, resource, limit)
        else:
            dump_resources(doc)


        exit(0)

    if args.show_declaration:

        doc = MetatabDoc()
        doc.load_declarations([args.file])

        print(json.dumps({
            'terms': doc.decl_terms,
            'sections': doc.decl_sections
        }, indent=4))
        exit(0)
    else:

        package_url, metadata_url = resolve_package_metadata_url(args.file)
        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except IOError as e:
            raise
            err("Failed to open '{}': {}".format(metadata_url, e))

    if args.terms:
        for t in doc._term_parser:
            print(t)

    elif args.json:
        print(json.dumps(doc.as_dict(), indent=4))


    elif args.yaml:
        import yaml
        print(yaml.safe_dump(doc.as_dict(), default_flow_style=False, indent=4))


    elif args.schema:
        dump_schema(doc, args.schema)

    exit(0)
Ejemplo n.º 7
0
        def __init__(self, args):
            self.cwd = getcwd()
            self.args = args
            self.cache = get_cache('metapack')

            self.set_mt_arg(args.metatabfile)