Beispiel #1
0
        def __init__(self, raw_args):
            """Parse the command line and resolve the target Metatab file.

            :param raw_args: Full argument vector. The first element is
                skipped and the remainder is passed to ``parser.parse_args``.
            """
            self.cwd = getcwd()
            self.raw_args = raw_args
            self.args = parser.parse_args(self.raw_args[1:])

            self.cache = get_cache('metapack')

            # A second cache, used for loading packages that have just been
            # written to S3. It is cleaned each time this object is built.
            self.tmp_cache = get_cache('temp')
            clean_cache(self.tmp_cache)

            if self.args.all_s3:
                # ``all_s3`` supplies the ``s3`` value and switches on every
                # package format flag.
                self.args.s3 = self.args.all_s3
                for flag in ('excel', 'zip', 'csv', 'fs'):
                    setattr(self.args, flag, True)

            # Fall back to the default Metatab file in the working directory
            # when no file was given on the command line.
            self.mtfile_arg = (self.args.metatabfile
                               or join(self.cwd, DEFAULT_METATAB_FILE))

            self.mtfile_url = Url(self.mtfile_arg)
            # The URL fragment, if any, is kept as the resource selector.
            self.resource = self.mtfile_url.parts.fragment

            # NOTE(review): rebuild_url(False, False) presumably drops the
            # fragment before resolution -- confirm against Url.
            bare_url = self.mtfile_url.rebuild_url(False, False)
            self.package_url, self.mt_file = \
                resolve_package_metadata_url(bare_url)

            # Requesting ``csv`` also turns on ``fs``.
            self.args.fs = self.args.csv or self.args.fs
Beispiel #2
0
        def __init__(self, args):
            """Record the parsed arguments and resolve the Metatab file URLs.

            :param args: Parsed argument object; only its ``metatabfile``
                attribute is read here.
            """
            self.cwd = getcwd()
            self.args = args
            self.cache = get_cache('metapack')

            # Use the default Metatab file in the working directory when the
            # caller did not name one.
            self.mtfile_arg = (args.metatabfile
                               or join(self.cwd, DEFAULT_METATAB_FILE))

            self.mtfile_url = Url(self.mtfile_arg)
            # The URL fragment, if any, is kept as the resource selector.
            self.resource = self.mtfile_url.parts.fragment

            # NOTE(review): rebuild_url(False, False) presumably drops the
            # fragment before resolution -- confirm against Url.
            bare_url = self.mtfile_url.rebuild_url(False, False)
            self.package_url, self.mt_file = \
                resolve_package_metadata_url(bare_url)
Beispiel #3
0
def get_resource_urls(doc):
    """Map resource file names to URLs for each distribution in ``doc``.

    Every ``Root.Distribution`` term is resolved to a package URL and handled
    according to the URL's ``resource_format``:

    * ``zip``  -- skipped.
    * ``xlsx`` -- the package itself is added, keyed by its base name.
    * ``csv``  -- the package is added, then the package is opened and each of
      its resources is added under ``<resource name>.csv``.
    * anything else -- skipped.

    :param doc: Metatab document providing ``find("Root.Distribution")``.
    :return: dict mapping a display name to the URL of the package/resource.
    """

    resources = {}

    for dist in doc.find("Root.Distribution"):

        # Only the package URL is needed here; the metadata URL is ignored.
        package_url, _metadata_url = resolve_package_metadata_url(dist.value)

        fmt = Url(package_url).resource_format

        if fmt == 'zip':
            prt("Skipping ZIP package ", package_url)

        elif fmt == 'xlsx':
            resources[basename(package_url)] = package_url
            prt("Adding XLS package ", package_url)

        elif fmt == 'csv':

            resources[basename(package_url)] = package_url

            prt("Adding CSV package {}".format(basename(package_url)))

            try:
                p = open_package(package_url)
            except (IOError, MetatabError) as e:
                err("Failed to open package '{}' from reference '{}': {}".
                    format(package_url, dist.value, e))
                # err() is expected to terminate; if it ever returns, skip
                # this distribution rather than reading an unbound ``p``.
                continue

            for r in p.resources():
                # '.csv': Data>world currently get the format from the name,
                # not the URL
                resources[r.name + '.csv'] = r.resolved_url
                prt("Adding CSV resource {}".format(r.name))
        else:
            prt('Skipping {}'.format(package_url))

    return resources
Beispiel #4
0
    def test_metapack(self):
        """Check URL resolution and row generation for a published package.

        Resolves a ``metatab+http`` URL to its package and metadata URLs,
        checks the resolved URL of one resource, then verifies that the
        directory, ZIP and XLSX forms of the package each yield 101 rows.
        The test reads from library.metatab.org.
        """

        from metatab import open_package, resolve_package_metadata_url

        cache = cache_fs()

        url = 'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1#random-names'

        rg = RowGenerator(cache=cache, url=url)

        package_url, metadata_url = resolve_package_metadata_url(
            rg.generator.spec.resource_url)

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(
            'http://library.metatab.org/example.com-simple_example-2017-us-1/',
            package_url)
        self.assertEqual(
            'http://library.metatab.org/example.com-simple_example-2017-us-1/metadata.csv',
            metadata_url)

        doc = open_package(rg.generator.spec.resource_url, cache=cache)

        self.assertEqual(
            'http://library.metatab.org/example.com-simple_example-2017-us-1/data/random-names.csv',
            doc.resource('random-names').resolved_url)

        urls = [
            'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1#random-names',
            'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1.zip#random-names',
            'metatab+http://library.metatab.org/example.com-simple_example-2017-us-1.xlsx#random-names'
        ]

        for url in urls:
            gen = None
            try:
                gen = RowGenerator(cache=cache, url=url)

                rows = list(gen)

                self.assertEqual(101, len(rows))
            except Exception:
                # Report which URL failed before re-raising the failure.
                print("ERROR URL", url)
                print("Row Generator ", gen)
                raise
Beispiel #5
0
        def set_mt_arg(self, metatabfile):
            """Resolve the Metatab file URLs and load the CKAN connection settings.

            :param metatabfile: Path or URL of the Metatab file. When falsy,
                the default Metatab file in ``self.cwd`` is used.

            The CKAN URL and API key come from ``self.args`` first and the
            ``METAKAN_CKAN_URL`` / ``METAKAN_API_KEY`` environment variables
            second. ``err`` is called for each one that is missing.
            """

            self.mtfile_arg = metatabfile or join(self.cwd,
                                                  DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            # The URL fragment, if any, is kept as the resource selector.
            self.resource = self.mtfile_url.parts.fragment

            # NOTE(review): rebuild_url(False, False) presumably drops the
            # fragment before resolution -- confirm against Url.
            bare_url = self.mtfile_url.rebuild_url(False, False)
            self.package_url, self.mt_file = \
                resolve_package_metadata_url(bare_url)

            self.api_key = self.args.api or getenv('METAKAN_API_KEY')
            self.ckan_url = self.args.ckan or getenv('METAKAN_CKAN_URL')

            # Each required setting is paired with the message reported when
            # it is missing; the CKAN URL is checked first.
            required = (
                (self.ckan_url,
                 "Set the --ckan option or the METAKAN_CKAN_URL env var to set the URL of a ckan instance"),
                (self.api_key,
                 "Set the --api option METAKAN_API_KEY env var  with the API key to a CKAN instance"),
            )

            for value, message in required:
                if not value:
                    err(message)
Beispiel #6
0
def metatab():
    """Entry point for the ``metatab`` command line program.

    Parses ``sys.argv`` and performs exactly one of the mutually exclusive
    actions (info, create, terms, interp, json, yaml, resource, head, schema).
    ``--clean-cache``, ``--show-declaration`` and ``--declare`` are accepted
    alongside an action. The function always finishes through ``sys.exit``.
    """
    import argparse
    parser = argparse.ArgumentParser(
        prog='metatab',
        description='Matatab file parser, version {}'.format(_meta.__version__))

    parser.add_argument('-C', '--clean-cache', default=False, action='store_true',
                        help="Clean the download cache")

    g = parser.add_mutually_exclusive_group(required=True)

    g.add_argument('-i', '--info', default=False, action='store_true',
                   help="Show configuration information")

    g.add_argument('-c', '--create', action='store', nargs='?', default=False,
                   help="Create a new metatab file, from named template. With no argument, uses the 'metatab' template ")

    g.add_argument('-t', '--terms', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, before interpretation')

    g.add_argument('-I', '--interp', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, after interpretation')

    g.add_argument('-j', '--json', default=False, action='store_true',
                   help='Parse a file and print out a JSON representation')

    g.add_argument('-y', '--yaml', default=False, action='store_true',
                   help='Parse a file and print out a YAML representation')

    g.add_argument('-R', '--resource', default=False, action='store_true',
                   help='If the URL has no fragment, dump the resources listed in the metatab file. With a fragment, dump a resource as a CSV')

    g.add_argument('-H', '--head', default=False, action='store_true',
                   help="Dump the first 20 lines of a resoruce ")

    g.add_argument('-S', '--schema',
                   help='Dump the schema for one named resource')

    parser.add_argument('-d', '--show-declaration', default=False, action='store_true',
                        help='Parse a declaration file and print out declaration dict. Use -j or -y for the format')

    parser.add_argument('-D', '--declare', help='Parse and incorporate a declaration before parsing the file.' +
                                                ' (Adds the declaration to the start of the file as the first term. )')

    parser.add_argument('file', nargs='?', default=DEFAULT_METATAB_FILE, help='Path to a Metatab file')

    args = parser.parse_args(sys.argv[1:])

    # Specifying only a fragment would otherwise replace the default metadata
    # file name, so prepend the default name to it.
    if args.file.startswith('#'):
        args.file = DEFAULT_METATAB_FILE + args.file

    cache = get_cache('metapack')

    if args.info:
        prt('Version  : {}'.format(_meta.__version__))
        prt('Cache dir: {}'.format(str(cache.getsyspath('/'))))
        sys.exit(0)

    if args.clean_cache:
        clean_cache(cache)

    # ``-c`` without a value yields None, so compare against the False
    # default rather than testing truthiness.
    if args.create is not False:
        new_metatab_file(args.file, args.create)
        sys.exit(0)

    if args.resource or args.head:

        # --head limits the dump to 20 rows; --resource dumps everything.
        limit = 20 if args.head else None

        u = Url(args.file)
        resource = u.parts.fragment
        metadata_url = u.rebuild_url(False, False)

        package_url, metadata_url = resolve_package_metadata_url(metadata_url)

        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except OSError as e:
            err("Failed to open Metatab doc: {}".format(e))
            return  # Never reached

        if resource:
            dump_resource(doc, resource, limit)
        else:
            dump_resources(doc)

        sys.exit(0)

    if args.show_declaration:

        doc = MetatabDoc()
        doc.load_declarations([args.file])

        print(json.dumps({
            'terms': doc.decl_terms,
            'sections': doc.decl_sections
        }, indent=4))
        sys.exit(0)

    package_url, metadata_url = resolve_package_metadata_url(args.file)
    try:
        doc = MetatabDoc(metadata_url, cache=cache)
    except IOError as e:
        # Report the failure through err(), as the resource/head path does.
        # A stray bare ``raise`` used to sit here and made this unreachable.
        err("Failed to open '{}': {}".format(metadata_url, e))
        return  # Never reached

    if args.terms:
        for t in doc._term_parser:
            print(t)

    elif args.json:
        print(json.dumps(doc.as_dict(), indent=4))

    elif args.yaml:
        import yaml
        print(yaml.safe_dump(doc.as_dict(), default_flow_style=False, indent=4))

    elif args.schema:
        dump_schema(doc, args.schema)

    sys.exit(0)
Beispiel #7
0
def send_to_ckan(m):
    """Create or update a CKAN dataset from a Metatab document.

    Opens the document at ``m.mt_file``, looks up (or creates) the matching
    CKAN dataset, and fills in its title, notes, version, groups, tags,
    organization, extras and resources. The dataset is then pushed to CKAN,
    and the CKAN id, organization and groups are written back to the
    Metatab file.

    :param m: Object providing ``mt_file``, ``cache``, ``ckan_url`` and
        ``api_key``.
    """

    from ckanapi import RemoteCKAN, NotFound
    try:
        doc = MetatabDoc(m.mt_file, cache=m.cache)
    except (IOError, MetatabError) as e:
        err("Failed to open metatab '{}': {}".format(m.mt_file, e))
        # err() is expected to terminate; never continue with ``doc`` unbound.
        return

    c = RemoteCKAN(m.ckan_url, apikey=m.api_key)

    ckanid = doc.find_first_value('Root.Ckanid')
    # The term was previously misspelled 'Root.Identitfier', so the
    # identifier was never found.
    identifier = doc.find_first_value('Root.Identifier')
    name = doc.find_first('Root.Name')

    # The CKAN name is the package name with '.' replaced by '-'.
    ckan_name = name.value.replace('.', '-')

    # Prefer a CKAN id already recorded in the document over the derived name.
    id_name = ckanid or ckan_name

    try:
        pkg = c.action.package_show(name_or_id=id_name)
        prt("Updating CKAN dataset for '{}'".format(ckan_name))
    except NotFound:
        pkg = c.action.package_create(name=ckan_name, package_id=identifier)
        prt("Adding CKAN dataset for '{}'".format(ckan_name))

    # Fall back to the description when the document has no title.
    pkg['title'] = doc.find_first_value('Root.Title')

    if not pkg['title']:
        pkg['title'] = doc.find_first_value('Root.Description')

    try:
        pkg['notes'] = doc.markdown  #doc.find_first_value('Root.Description')
    except OSError as e:
        warn(e)

    pkg['version'] = name.properties.get('version')

    pkg['groups'] = [{'name': g.value} for g in doc['Root'].find('Root.Group')]

    pkg['tags'] = [{'name': g.value} for g in doc['Root'].find('Root.Tag')]

    # The organization comes from the 'Origin' child of the name term, with
    # the document's Root.CkanOrg value as fallback. The fallback is a plain
    # value and not a term, so only read ``.value`` when it exists.
    org_name = name.get('Origin', doc['Root'].find_first_value('Root.CkanOrg'))

    org_name_slug = None

    if org_name:
        org_value = getattr(org_name, 'value', org_name)
        slug = org_value.replace('.', '-')
        try:
            pkg['owner_org'] = c.action.organization_show(id=slug).get('id')
            org_name_slug = slug
        except NotFound:
            warn("Didn't find org for '{}'; not setting organization ".format(
                slug))

    # Every Root term except the distributions, plus the children of the
    # name term, is published as a CKAN "extra".
    extras = {}

    for t in doc.find('*.*', section='Root'):
        if not t.term_is('Root.Distribution'):
            extras[t.qualified_term] = t.value

    for t in name.children:
        extras[t.qualified_term] = t.value

    pkg['extras'] = [{'key': k, 'value': v} for k, v in extras.items()]

    resources = []

    for dist in doc.find("Root.Distribution"):

        package_url, metadata_url = resolve_package_metadata_url(dist.value)

        u = Url(package_url)

        if u.resource_format == 'zip':
            d = dict(url=package_url,
                     name=basename(package_url),
                     format='ZIP',
                     mimetype=mimetypes.guess_type(package_url)[0],
                     description='ZIP version of package')
            resources.append(d)
            prt("Adding ZIP package ", d['name'])

        elif u.resource_format == 'xlsx':
            d = dict(url=package_url,
                     name=basename(package_url),
                     format='XLSX',
                     mimetype=mimetypes.guess_type(package_url)[0],
                     description='Excel version of package')
            resources.append(d)
            prt("Adding XLS package ", d['name'])

        elif u.resource_format == 'csv':

            d = dict(url=package_url,
                     name=basename(package_url),
                     format='csv',
                     mimetype=mimetypes.guess_type(metadata_url)[0],
                     description='CSV Package Metadata in Metatab format')

            resources.append(d)
            prt("Adding {} package {}".format(d['format'], d['name']))

            try:
                p = open_package(package_url)
            except (IOError, MetatabError) as e:
                err("Failed to open package '{}' from reference '{}': {}".
                    format(package_url, dist.value, e))
                # err() is expected to terminate; if it returns, skip the
                # resources of this package rather than reading an unbound
                # ``p``.
                continue

            # A CSV package also publishes each of its resources.
            for r in p.resources():

                mimetype = mimetypes.guess_type(r.resolved_url)[0]

                # The format is the guessed extension without its leading
                # dot, or None when the type or extension is unknown.
                extension = mimetypes.guess_extension(mimetype) if mimetype else None
                ext = extension[1:] if extension else None

                d = dict(name=r.name,
                         format=ext,
                         url=r.resolved_url,
                         mimetype=mimetype,
                         description=r.markdown)

                resources.append(d)
                prt("Adding {} resource {}".format(d['format'], d['name']))

    pkg['resources'] = resources

    c.action.package_update(**pkg)

    # Re-read the dataset so the values written back below are CKAN's own.
    pkg = c.action.package_show(name_or_id=ckan_name)

    update_dist(doc, [], join(m.ckan_url, 'dataset', ckan_name))

    ##
    ## Add a term with CKAN info.

    doc['Root'].get_or_new_term('CkanId', pkg['id'])

    if org_name_slug is None and pkg.get('organization'):
        doc['Root'].get_or_new_term('CkanOrg', (pkg.get('organization')
                                                or {}).get('name'))

    # Replace the document's groups with the groups CKAN reports. The terms
    # are snapshotted first so that removal cannot disturb the iteration.
    # NOTE(review): whether find() is lazy is not visible here.
    for g in list(doc['Root'].find('Group')):
        doc.remove_term(g)

    for group in pkg.get('groups', []):
        doc['Root'].new_term('Group', group['name'])

    write_doc(doc, m.mt_file)