Example 1
def test_user(args):
    from botocore.exceptions import ClientError
    import boto3

    account = get_iam_account(l, args, args.user_name)

    if not account.access_key:
        err("Can't test user {}; library does not have record for account ( by arn ) "
            .format(args.user_name))

    session = boto3.Session(aws_access_key_id=account.access_key,
                            aws_secret_access_key=account.secret)

    root_s3 = get_resource(args, 's3')
    s3 = session.resource('s3')

    bn, prefix = split_bucket_name(args.bucket, default=None)

    root_bucket = root_s3.Bucket(bn)
    bucket = s3.Bucket(bn)

    prefixes = [prefix] if prefix else TOP_LEVEL_DIRS

    for prefix in prefixes:
        k = prefix + '/test/' + args.user_name
        rk = k + '-root'

        ro = root_bucket.put_object(Key=rk, Body=args.user_name)

        try:
            o = bucket.Object(rk)
            c = o.get()
            read = True
        except ClientError as e:
            read = False

        try:
            o = bucket.put_object(Key=k, Body=args.user_name)
            write = True
        except ClientError as e:
            write = False

        try:
            o.delete()
            delete = True
        except ClientError as e:
            delete = False

        #ro.delete()

        prt("{:<35s} {:<5s} {:<5s} {:<6s} {}".format(
            k, 'read' if read else '', 'write' if write else '',
            'delete' if delete else '', 'no access' if not any(
                (read, write, delete)) else ''))
Example 2
def metatab_derived_handler(m, skip_if_exists=None):
    """Create local Zip, Excel and Filesystem packages

    :param m: CLI Arguments object
    :param skip_if_exists: If True, don't recreate a package that already exists
    :return: List of (format, url, created) tuples for the packages that were built
    """
    from metatab.package import PackageError

    create_list = []
    url = None

    doc = MetatabDoc(m.mt_file)

    env = get_lib_module_dict(doc)

    if (m.args.excel is not False or m.args.zip is not False or
            (hasattr(m.args, 'filesystem') and m.args.filesystem is not False) ):
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

    if m.args.force:
        skip_if_exists = False

    try:

        # Always create a filesystem package before ZIP or Excel, so we can use it as a source for
        # data for the other packages. This means that Transform processes and programs only need
        # to be run once.
        if any([m.args.filesystem, m.args.excel, m.args.zip]):

            _, url, created = make_filesystem_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('fs', url, created))

            m.mt_file = url

            env = {}  # Don't need it anymore, since no more programs will be run.

        if m.args.excel is not False:
            _, url, created = make_excel_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('xlsx', url, created))

        if m.args.zip is not False:
            _, url, created = make_zip_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('zip', url, created))

        if m.args.csv is not False:
            _, url, created = make_csv_package(m.mt_file, m.cache, env, skip_if_exists)
            create_list.append(('csv', url, created))

    except PackageError as e:
        err("Failed to generate package: {}".format(e))

    return create_list
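
This handler is one of the four dispatched from the metapack() entry point shown in Example 10. A minimal invocation sketch, assuming m is the MetapackCliMemo built there and the argparse flags it defines:

# Build every local package type; each entry of the returned list is a
# (format, url, created) tuple, as assembled in create_list above.
m.args.excel = m.args.zip = m.args.csv = m.args.filesystem = True

for fmt, url, created in metatab_derived_handler(m, skip_if_exists=True):
    prt("{:<5s} {} ({})".format(fmt, url, 'created' if created else 'already existed'))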
Example 3
def package_info(doc):

    client = dw.api_client()

    username = '******'

    title = doc.find_first_value("Root.Title")
    key = join(username, slugify(title))

    try:
        ds = client.get_dataset(key)
        prt(json.dumps(ds, indent=4))
    except RestApiError as e:
        err(e)
Example 4
def metatab_query_handler(m):
    if m.args.resource or m.args.head:

        limit = 20 if m.args.head else None

        try:
            doc = MetatabDoc(m.mt_file, cache=m.cache)
        except OSError as e:
            err("Failed to open Metatab doc: {}".format(e))
            return

        if m.resource:
            dump_resource(doc, m.resource, limit)
        else:
            dump_resources(doc)
Example 5
def get_resource_urls(doc):

    resources = {}

    for dist in doc.find("Root.Distribution"):

        package_url, metadata_url = resolve_package_metadata_url(dist.value)

        u = Url(package_url)

        if u.resource_format == 'zip':
            prt("Skipping ZIP package ", package_url)

        elif u.resource_format == 'xlsx':
            resources[basename(package_url)] = package_url
            prt("Adding XLS package ", package_url)

        elif u.resource_format == 'csv':

            resources[basename(package_url)] = package_url

            prt("Adding CSV package {}".format(basename(package_url)))

            try:
                p = open_package(package_url)
            except (IOError, MetatabError) as e:
                err("Failed to open package '{}' from reference '{}': {}".
                    format(package_url, dist.value, e))

            for r in p.resources():

                mimetype = mimetypes.guess_type(r.resolved_url)[0]

                try:
                    ext = mimetypes.guess_extension(mimetype)[1:]
                except Exception:
                    ext = None

                # '.csv': Data.world currently gets the format from the name, not the URL
                resources[r.name + '.csv'] = r.resolved_url
                prt("Adding CSV resource {}".format(r.name))
        else:
            prt('Skipping {}'.format(package_url))

    return resources
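
The mapping returned above is keyed by file name, with each resource of a CSV package forced to a '.csv' key per the comment in the loop. A hypothetical result, purely for illustration (names and URLs are invented):

# Hypothetical return value of get_resource_urls(); keys are file names,
# values are package or resource URLs. All values here are invented examples.
example_resources = {
    'example-package.xlsx': 'http://example.com/example-package.xlsx',
    'countries.csv': 'http://example.com/example-package/data/countries.csv',
}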
Example 6
def metaworld():
    import argparse
    parser = argparse.ArgumentParser(
        prog='metakan',
        description='Publish packages to Data.World, version {}'.format(
            _meta.__version__))

    parser.add_argument('-i',
                        '--info',
                        default=False,
                        action='store_true',
                        help="Show package information")

    parser.add_argument('metatabfile',
                        nargs='?',
                        default=DEFAULT_METATAB_FILE,
                        help='Path to a Metatab file')

    class MetapackCliMemo(object):
        def __init__(self, args):
            self.cwd = getcwd()
            self.args = args
            self.cache = get_cache('metapack')

            self.mtfile_arg = args.metatabfile if args.metatabfile else join(
                self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(
                self.mtfile_url.rebuild_url(False, False))

    m = MetapackCliMemo(parser.parse_args(sys.argv[1:]))

    try:
        doc = MetatabDoc(m.mt_file, cache=m.cache)
    except (IOError, MetatabError) as e:
        err("Failed to open metatab '{}': {}".format(m.mt_file, e))

    if m.args.info:
        package_info(doc)
    else:
        send_to_dw(doc)

    exit(0)
Example 7
    def write(self, body, path, acl=None):
        from botocore.exceptions import ClientError
        import mimetypes

        acl = acl if acl is not None else self._acl

        #if isinstance(body, six.string_types):
        #    with open(body,'rb') as f:
        #        body = f.read()

        key = join(self._prefix, path).strip('/')

        try:
            o = self._bucket.Object(key)
            if o.content_length == len(body):
                prt("File '{}' already in bucket; skipping".format(key))
                return self.access_url(path)
            else:
                prt("File '{}' already in bucket, but length is different; re-wirtting"
                    .format(key))

        except ClientError as e:
            if int(e.response['Error']['Code']) in (403, 405):
                err("S3 Access failed for '{}:{}': {}\nNOTE: With Docker, this error is often the result of container clock drift. Check your container clock. "
                    .format(self._bucket_name, key, e))
            elif int(e.response['Error']['Code']) != 404:
                err("S3 Access failed for '{}:{}': {}".format(
                    self._bucket_name, key, e))

        ct = mimetypes.guess_type(key)[0]

        try:
            self._bucket.put_object(
                Key=key,
                Body=body,
                ACL=acl,
                ContentType=ct if ct else 'binary/octet-stream')
        except Exception as e:
            self.err("Failed to write '{}': {}".format(key, e))

        return self.access_url(path)
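
For context, Example 13 below calls this method when it syncs freshly built packages to S3. A minimal sketch of that call pattern, assuming the same module context (S3Bucket, prt and basename already available); the bucket URL and file name are placeholders, and the profile argument is omitted:

# Sketch of the call pattern from create_packages() in Example 13.
# 'example-bucket/prefix' and 'example-1.0.0.zip' are placeholder values.
s3 = S3Bucket('example-bucket/prefix', acl='public-read')

with open('example-1.0.0.zip', mode='rb') as f:
    # write() skips the upload if an object with the same key and length already exists
    url = s3.write(f.read(), basename('example-1.0.0.zip'), 'public-read')

prt("Package available at {}".format(url))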
Example 8
def delete_user(args):
    from botocore.exceptions import ClientError

    client = get_client(args, 'iam')

    try:
        resource = get_resource(args, 'iam')
        user = resource.User(args.user_name)

        for key in user.access_keys.all():
            prt("Deleting user key: {}".format(key))
            key.delete()

        for policy in user.policies.all():
            prt("Deleting user policy: {}".format(policy.name))
            policy.delete()

        response = client.delete_user(UserName=args.user_name)
        prt("Deleted user: {}".format(args.user_name))

    except ClientError as e:
        err("Could not delete user: {}".format(e))
Example 9
        def set_mt_arg(self, metatabfile):

            self.mtfile_arg = metatabfile if metatabfile else join(
                self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(
                self.mtfile_url.rebuild_url(False, False))

            self.api_key = self.args.api or getenv('METAKAN_API_KEY')

            self.ckan_url = self.args.ckan or getenv('METAKAN_CKAN_URL')

            if not self.ckan_url:
                err("Set the --ckan option or the METAKAN_CKAN_URL env var to set the URL of a ckan instance"
                    )

            if not self.api_key:
                err("Set the --api option METAKAN_API_KEY env var  with the API key to a CKAN instance"
                    )
Example 10
def metapack():
    import argparse

    parser = argparse.ArgumentParser(
        prog='metapack',
        description='Create and manipulate metatab data packages, version {}'.format(_meta.__version__))

    parser.add_argument('metatabfile', nargs='?',
                        help="Path or URL to a metatab file. If not provided, defaults to 'metadata.csv' ")

    parser.add_argument('-p', '--profile', help="Name of a BOTO or AWS credentials profile", required=False)

    parser.add_argument('--exceptions', default=False, action='store_true',
                             help='Show full stack trace for some unhandled exceptions')

    parser.set_defaults(handler=None)

    ##
    ## Build Group

    build_group = parser.add_argument_group('Building Metatab Files', 'Build and manage a metatab file for a package')

    build_group.add_argument('-c', '--create', action='store', nargs='?', default=False,
                             help="Create a new metatab file, from named template. With no argument, uses the "
                                  "'metatab' template ")

    build_group.add_argument('-a', '--add', default=False,
                             help='Add a file or url to the resources. With a directory, add all data files in the directory. '
                                  'If given a URL to a web page, will add all links that point to CSV, Excel files and '
                                  'data files in ZIP files. (Caution: it will download and cache all of these files.)')

    # build_group.add_argument('-S', '--scrape',
    #                help='Similar to --add, but scrape a web page for links to data files, documentation '
    #                     'and web pages and add the links as resources ')

    # build_group.add_argument('-r', '--resources', default=False, action='store_true',
    #                    help='Rebuild the resources, intuiting rows and encodings from the URLs')

    build_group.add_argument('-s', '--schemas', default=False, action='store_true',
                             help='Rebuild the schemas for files referenced in the resource section')

    build_group.add_argument('-d', '--datapackage', action='store_true', default=False,
                             help="Write a datapackage.json file adjacent to the metatab file")

    build_group.add_argument('-u', '--update', action='store_true', default=False,
                             help="Update the Name from the Datasetname, Origin and Version terms")

    build_group.add_argument('-F', '--force', action='store_true', default=False,
                             help='Force some operations, like updating the name and building packages')

    ##
    ## Derived Package Group

    derived_group = parser.add_argument_group('Derived Packages', 'Generate other types of packages')

    derived_group.add_argument('-e', '--excel', action='store_true', default=False,
                               help='Create an excel archive from a metatab file')

    derived_group.add_argument('-z', '--zip', action='store_true', default=False,
                               help='Create a zip archive from a metatab file')

    derived_group.add_argument('-f', '--filesystem', action='store_true', default=False,
                               help='Create a filesystem archive from a metatab file')

    derived_group.add_argument('-v', '--csv', action='store_true', default=False,
                               help='Create a CSV archive from a metatab file')


    ##
    ## QueryPackage Group

    query_group = parser.add_argument_group('Query', 'Return information and data from a package')

    query_group.add_argument('-r', '--resource', default=False, action='store_true',
                             help='If the URL has no fragment, dump the resources listed in the metatab file.'
                                  ' With a fragment, dump a resource as a CSV')

    query_group.add_argument('-H', '--head', default=False, action='store_true',
                             help="Dump the first 20 lines of a resource ")

    ##
    ## Administration Group

    admin_group = parser.add_argument_group('Administration', 'Information and administration')

    admin_group.add_argument('--clean-cache', default=False, action='store_true',
                             help="Clean the download cache")

    admin_group.add_argument('-C', '--clean', default=False, action='store_true',
                             help="For some operations, like updating schemas, clear the section of existing terms first")

    admin_group.add_argument('-i', '--info', default=False, action='store_true',
                             help="Show configuration information")

    admin_group.add_argument('-n', '--name', default=False, action='store_true',
                             help="Print the name of the package")

    admin_group.add_argument('-E', '--enumerate',
                             help='Enumerate the resources referenced from a URL. Does not alter the Metatab file')

    admin_group.add_argument('--html', default=False, action='store_true',
                             help='Generate HTML documentation')

    admin_group.add_argument('--markdown', default=False, action='store_true',
                             help='Generate Markdown documentation')

    # cmd = parser.add_subparsers(title='Plugin Commands', help='Additional command supplied by plugins')
    # load_plugins(cmd)


    class MetapackCliMemo(object):
        def __init__(self, args):
            self.cwd = getcwd()
            self.args = args
            self.cache = get_cache('metapack')

            if args.metatabfile and args.metatabfile.startswith('#'):
                # It's just a fragment, default metatab file
                args.metatabfile = join(self.cwd, DEFAULT_METATAB_FILE) + args.metatabfile

            self.mtfile_arg = args.metatabfile if args.metatabfile else join(self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)

            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(self.mtfile_url.rebuild_url(False, False))

    m = MetapackCliMemo(parser.parse_args(sys.argv[1:]))

    if m.args.info:
        metatab_info(m.cache)
        exit(0)

    if m.args.profile:
        from metatab.s3 import set_s3_profile
        set_s3_profile(m.args.profile)

    try:
        for handler in (metatab_build_handler, metatab_derived_handler, metatab_query_handler, metatab_admin_handler):
            handler(m)
    except Exception as e:
        if m.args.exceptions:
            raise e
        else:
            err(e)

    clean_cache(m.cache)
Example 11
def metatab_build_handler(m):
    if m.args.create is not False:

        template = m.args.create if m.args.create else 'metatab'

        if not exists(m.mt_file):

            doc = make_metatab_file(template)

            doc['Root']['Identifier'] = six.text_type(uuid4())

            doc['Root']['Created'] = datetime_now()

            write_doc(doc, m.mt_file)

            prt('Created', m.mt_file)
        else:
            err('File', m.mt_file, 'already exists')

    if m.args.add:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        add_resource(m.mt_file, m.args.add, cache=m.cache)

    if False:  # m.args.resources:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        doc = MetatabDoc(m.mt_file)

        try:
            doc['Schema'].clean()
        except KeyError:
            pass

        for t in list(doc['Resources']):  # w/o list(), will iterate over new terms

            if not t.term_is('root.datafile'):
                continue

            if t.as_dict().get('url'):
                add_resource(doc, t.as_dict()['url'], m.cache)

            else:
                warn("Entry '{}' on row {} is missing a url; skipping".format(t.join, t.row))

        write_doc(doc, m.mt_file)

    if m.args.schemas:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        process_schemas(m.mt_file, cache=m.cache, clean=m.args.clean)

    if m.args.datapackage:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        from metatab.datapackage import convert_to_datapackage

        doc = MetatabDoc(m.mt_file)

        u = Url(m.mt_file)

        if u.proto == 'file':
            dpj_file = join(dirname(abspath(u.parts.path)), 'datapackage.json')
        else:
            dpj_file = join(getcwd(), 'datapackage.json')

        try:
            with open(dpj_file, 'w') as f:
                f.write(json.dumps(convert_to_datapackage(doc), indent=4))
        except ConversionError as e:
            err(e)

    if m.mtfile_url.scheme == 'file' and m.args.update:
        update_name(m.mt_file, fail_on_missing=True, force=m.args.force)
Example 12
def metatab():
    import argparse
    parser = argparse.ArgumentParser(
        prog='metatab',
        description='Metatab file parser, version {}'.format(_meta.__version__))

    parser.add_argument('-C', '--clean-cache', default=False, action='store_true',
                        help="Clean the download cache")

    g = parser.add_mutually_exclusive_group(required=True)

    g.add_argument('-i', '--info', default=False, action='store_true',
                   help="Show configuration information")

    g.add_argument('-c', '--create', action='store', nargs='?', default=False,
                   help="Create a new metatab file, from named template. With no argument, uses the 'metatab' template ")

    g.add_argument('-t', '--terms', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, before interpretation')

    g.add_argument('-I', '--interp', default=False, action='store_true',
                   help='Parse a file and print out the stream of terms, after interpretation')

    g.add_argument('-j', '--json', default=False, action='store_true',
                   help='Parse a file and print out a JSON representation')

    g.add_argument('-y', '--yaml', default=False, action='store_true',
                   help='Parse a file and print out a YAML representation')

    g.add_argument('-R', '--resource', default=False, action='store_true',
                   help='If the URL has no fragment, dump the resources listed in the metatab file. With a fragment, dump a resource as a CSV')

    g.add_argument('-H', '--head', default=False, action='store_true',
                   help="Dump the first 20 lines of a resoruce ")

    g.add_argument('-S', '--schema',
                   help='Dump the schema for one named resource')

    parser.add_argument('-d', '--show-declaration', default=False, action='store_true',
                        help='Parse a declaration file and print out declaration dict. Use -j or -y for the format')

    parser.add_argument('-D', '--declare', help='Parse and incorporate a declaration before parsing the file.' +
                                                ' (Adds the declaration to the start of the file as the first term. )')

    parser.add_argument('file', nargs='?', default=DEFAULT_METATAB_FILE, help='Path to a Metatab file')

    args = parser.parse_args(sys.argv[1:])

    # Specifying a fragment screws up setting the default metadata file name
    if args.file.startswith('#'):
        args.file = DEFAULT_METATAB_FILE + args.file

    cache = get_cache('metapack')

    if args.info:
        prt('Version  : {}'.format(_meta.__version__))
        prt('Cache dir: {}'.format(str(cache.getsyspath('/'))))
        exit(0)

    if args.clean_cache:
        clean_cache(cache)

    if args.create is not False:
        new_metatab_file(args.file, args.create)
        exit(0)

    if args.resource or args.head:

        limit = 20 if args.head else None

        u = Url(args.file)
        resource = u.parts.fragment
        metadata_url = u.rebuild_url(False, False)

        package_url, metadata_url = resolve_package_metadata_url(metadata_url)

        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except OSError as e:
            err("Failed to open Metatab doc: {}".format(e))
            return # Never reached

        if resource:
            dump_resource(doc, resource, limit)
        else:
            dump_resources(doc)


        exit(0)

    if args.show_declaration:

        doc = MetatabDoc()
        doc.load_declarations([args.file])

        print(json.dumps({
            'terms': doc.decl_terms,
            'sections': doc.decl_sections
        }, indent=4))
        exit(0)
    else:

        package_url, metadata_url = resolve_package_metadata_url(args.file)
        try:
            doc = MetatabDoc(metadata_url, cache=cache)
        except IOError as e:
            err("Failed to open '{}': {}".format(metadata_url, e))

    if args.terms:
        for t in doc._term_parser:
            print(t)

    elif args.json:
        print(json.dumps(doc.as_dict(), indent=4))


    elif args.yaml:
        import yaml
        print(yaml.safe_dump(doc.as_dict(), default_flow_style=False, indent=4))


    elif args.schema:
        dump_schema(doc, args.schema)

    exit(0)
Example 13
def create_packages(m, second_stage_mtfile, distupdated=None):
    """ Create Excel, ZIP, FS and CSV packages for upload to S3

    :param m: CLI Arguments object
    :param second_stage_mtfile: Path to a Metatab file, which must have distribution entries
    :param distupdated: If True, the distribution entries were updated, so packages are rebuilt even if they already exist
    :return: List of (format, url) tuples for the packages copied to S3
    """

    create_list = []
    url = None

    doc = MetatabDoc(second_stage_mtfile)

    access_value = doc.find_first_value('Root.Access')

    if access_value == 'private':
        acl = 'private'
    else:
        acl = 'public-read'

    # Only the first Filesystem package needs an env; the others won't need to run processing, since they
    # are building from processed files.
    env = {}

    s3 = S3Bucket(m.args.s3, acl=acl, profile=m.args.profile)

    urls = []

    if (m.args.excel is not False or m.args.zip is not False or
        (hasattr(m.args, 'filesystem') and m.args.filesystem is not False)):
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

    if m.args.force or distupdated is True:
        skip_if_exists = False
    else:
        skip_if_exists = True

    try:

        # Always create a filesystem package before ZIP or Excel, so we can use it as a source for
        # data for the other packages. This means that Transform processes and programs only need
        # to be run once.

        _, third_stage_mtfile, created = make_filesystem_package(
            second_stage_mtfile, m.cache, get_lib_module_dict(doc),
            skip_if_exists)

        if m.args.excel is not False:
            _, ex_url, created = make_excel_package(third_stage_mtfile,
                                                    m.cache, env,
                                                    skip_if_exists)
            with open(ex_url, mode='rb') as f:
                urls.append(('excel', s3.write(f.read(), basename(ex_url),
                                               acl)))

        if m.args.zip is not False:
            _, zip_url, created = make_zip_package(third_stage_mtfile, m.cache,
                                                   env, skip_if_exists)
            with open(zip_url, mode='rb') as f:
                urls.append(('zip', s3.write(f.read(), basename(zip_url),
                                             acl)))

        # Note! This is a FileSystem package on the remote S3 bucket, not locally
        if m.args.fs is not False:
            try:
                fs_p, fs_url, created = make_s3_package(
                    third_stage_mtfile, m.args.s3, m.cache, env, acl,
                    skip_if_exists)
            except NoCredentialsError:
                print(getenv('AWS_SECRET_ACCESS_KEY'))
                err("Failed to find boto credentials for S3. "
                    "See http://boto3.readthedocs.io/en/latest/guide/configuration.html "
                    )

            urls.append(('fs', fs_url))

        # Make the CSV package from the filesystem package on S3; this will ensure that the
        # package's resource URLs point to the S3 objects
        if m.args.csv is not False:

            # Using the signed url in case the bucket is private
            p = CsvPackage(fs_p.access_url, cache=m.tmp_cache)
            csv_url = p.save(PACKAGE_PREFIX)
            with open(csv_url, mode='rb') as f:
                urls.append(('csv', s3.write(f.read(), basename(csv_url),
                                             acl)))

    except PackageError as e:
        err("Failed to generate package: {}".format(e))

    return urls
Example 14
def metasync():
    import argparse
    parser = argparse.ArgumentParser(
        prog='metasync',
        description='Create packages and store them in s3 buckets, version {}'.
        format(_meta.__version__),
    )

    parser.add_argument('-i',
                        '--info',
                        default=False,
                        action='store_true',
                        help="Show configuration information")

    parser.add_argument('-v',
                        '--verbose',
                        default=False,
                        action='store_true',
                        help="For some command, be more verbose")

    parser.add_argument(
        '-F',
        '--force',
        action='store_true',
        default=False,
        help='Force building packages, even when they already exist')

    parser.add_argument('-p',
                        '--profile',
                        help="Name of a BOTO or AWS credentails profile",
                        required=False)

    parser.add_argument('-s',
                        '--s3',
                        help="URL to S3 where packages will be stored",
                        required=False)

    parser.add_argument('-S',
                        '--all-s3',
                        help="Synonym for `metasync -c -e -f -z -s <url>`",
                        required=False)

    parser.add_argument(
        '-e',
        '--excel',
        action='store_true',
        default=False,
        help='Create an excel package from a metatab file and copy it to S3. ')

    parser.add_argument(
        '-z',
        '--zip',
        action='store_true',
        default=False,
        help='Create a zip package from a metatab file and copy it to S3. ')

    parser.add_argument(
        '-c',
        '--csv',
        action='store_true',
        default=False,
        help=
        'Create a csv package from a metatab file and copy it to S3. Requires building a file system package'
    )

    parser.add_argument(
        '-f',
        '--fs',
        action='store_true',
        default=False,
        help=
        'Create a Filesystem package. Unlike -e and -z, only writes the package to S3.'
    )

    parser.add_argument('-D',
                        '--docker',
                        help="Re-run the metasync command through docker",
                        action='store_true',
                        default=False)

    parser.add_argument(
        '-C',
        '--credentials',
        help="Show S3 Credentials and exit. "
        "Eval this string to setup credentials in other shells.",
        action='store_true',
        default=False)

    parser.add_argument('metatabfile',
                        nargs='?',
                        help='Path to a Metatab file')

    class MetapackCliMemo(object):
        def __init__(self, raw_args):
            self.cwd = getcwd()

            self.raw_args = raw_args

            self.args = parser.parse_args(self.raw_args[1:])

            self.cache = get_cache('metapack')

            # This one is for loading packages that have just been
            # written to S3.
            self.tmp_cache = get_cache('temp')
            clean_cache(self.tmp_cache)

            if self.args.all_s3:
                self.args.s3 = self.args.all_s3
                self.args.excel = True
                self.args.zip = True
                self.args.csv = True
                self.args.fs = True

            self.mtfile_arg = self.args.metatabfile if self.args.metatabfile else join(
                self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)
            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(
                self.mtfile_url.rebuild_url(False, False))

            self.args.fs = self.args.csv or self.args.fs

    m = MetapackCliMemo(sys.argv)

    if m.args.credentials:
        show_credentials(m.args.profile)
        exit(0)

    if m.args.docker:
        run_docker(m)

    if m.args.info:
        metatab_info(m.cache)
        exit(0)

    if not m.args.s3:
        doc = MetatabDoc(m.mt_file)
        m.args.s3 = doc['Root'].find_first_value('Root.S3')

    if not m.args.s3:
        err("Must specify either -S or -s")

    if m.args.excel is not False or m.args.zip is not False or m.args.fs is not False:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

    doc = MetatabDoc(m.mt_file)
    doc['Root'].get_or_new_term('Root.S3', m.args.s3)
    write_doc(doc, m.mt_file)

    second_stage_mtfile, distupdated = update_distributions(m)

    if second_stage_mtfile != m.mt_file:
        prt("Building packages from: ", second_stage_mtfile)

    created = create_packages(m, second_stage_mtfile, distupdated=distupdated)

    prt("Synchronized these Package Urls")
    prt(tabulate(created))

    exit(0)
Example 15
from os.path import join, basename

from metatab import _meta, DEFAULT_METATAB_FILE, resolve_package_metadata_url, MetatabDoc, MetatabError, open_package
from metatab.cli.core import err
from rowgenerators import get_cache, Url
from .core import prt

from metatab.util import slugify
import json
import mimetypes

try:
    import datadotworld as dw
    from datadotworld.client.api import RestApiError
except ImportError:
    err("To run the Metataworld importer, you must first install the datadotworld package. See https://github.com/datadotworld/data.world-py"
        )


def metaworld():
    import argparse
    parser = argparse.ArgumentParser(
        prog='metakan',
        description='Publish packages to Data.World, version {}'.format(
            _meta.__version__))

    parser.add_argument('-i',
                        '--info',
                        default=False,
                        action='store_true',
                        help="Show package information")
Example 16
def send_to_ckan(m):

    from ckanapi import RemoteCKAN, NotFound
    try:
        doc = MetatabDoc(m.mt_file, cache=m.cache)
    except (IOError, MetatabError) as e:
        err("Failed to open metatab '{}': {}".format(m.mt_file, e))

    c = RemoteCKAN(m.ckan_url, apikey=m.api_key)

    ckanid = doc.find_first_value('Root.Ckanid')
    identifier = doc.find_first_value('Root.Identifier')
    name = doc.find_first('Root.Name')

    ckan_name = name.value.replace('.', '-')

    id_name = ckanid or ckan_name

    try:
        pkg = c.action.package_show(name_or_id=id_name)
        prt("Updating CKAN dataset for '{}'".format(ckan_name))
    except NotFound:
        pkg = c.action.package_create(name=ckan_name, package_id=identifier)
        prt("Adding CKAN dataset for '{}'".format(ckan_name))

    pkg['title'] = doc.find_first_value('Root.Title')

    if not pkg['title']:
        pkg['title'] = doc.find_first_value('Root.Description')

    try:
        pkg['notes'] = doc.markdown  #doc.find_first_value('Root.Description')
    except OSError as e:
        warn(e)

    pkg['version'] = name.properties.get('version')

    pkg['groups'] = [{'name': g.value} for g in doc['Root'].find('Root.Group')]

    pkg['tags'] = [{'name': g.value} for g in doc['Root'].find('Root.Tag')]

    def get_org(name):

        if not name:
            return None

        try:
            return
        except NotFound:
            return None

    org_name = name.get('Origin', doc['Root'].find_first_value('Root.CkanOrg'))

    if org_name:
        org_name_slug = org_name.value.replace('.', '-')
        try:

            owner_org = c.action.organization_show(id=org_name_slug).get('id')
            pkg['owner_org'] = owner_org
        except NotFound:
            warn("Didn't find org for '{}'; not setting organization ".format(
                org_name_slug))
            org_name_slug = None
    else:
        org_name_slug = None

    extras = {}

    for t in doc.find('*.*', section='Root'):
        if not t.term_is('Root.Distribution'):
            extras[t.qualified_term] = t.value

    for t in name.children:
        extras[t.qualified_term] = t.value

    pkg['extras'] = [{'key': k, 'value': v} for k, v in extras.items()]

    resources = []

    for dist in doc.find("Root.Distribution"):

        package_url, metadata_url = resolve_package_metadata_url(dist.value)

        u = Url(package_url)

        if u.resource_format == 'zip':
            d = dict(url=package_url,
                     name=basename(package_url),
                     format='ZIP',
                     mimetype=mimetypes.guess_type(package_url)[0],
                     description='ZIP version of package')
            resources.append(d)
            prt("Adding ZIP package ", d['name'])

        elif u.resource_format == 'xlsx':
            d = dict(url=package_url,
                     name=basename(package_url),
                     format='XLSX',
                     mimetype=mimetypes.guess_type(package_url)[0],
                     description='Excel version of package')
            resources.append(d)
            prt("Adding XLS package ", d['name'])

        elif u.resource_format == 'csv':

            d = dict(url=package_url,
                     name=basename(package_url),
                     format='csv',
                     mimetype=mimetypes.guess_type(metadata_url)[0],
                     description='CSV Package Metadata in Metatab format')

            resources.append(d)
            prt("Adding {} package {}".format(d['format'], d['name']))

            try:
                p = open_package(package_url)
            except (IOError, MetatabError) as e:
                err("Failed to open package '{}' from reference '{}': {}".
                    format(package_url, dist.value, e))

            for r in p.resources():

                mimetype = mimetypes.guess_type(r.resolved_url)[0]

                try:
                    ext = mimetypes.guess_extension(mimetype)[1:]
                except Exception:
                    ext = None

                d = dict(name=r.name,
                         format=ext,
                         url=r.resolved_url,
                         mimetype=mimetype,
                         description=r.markdown)

                resources.append(d)
                prt("Adding {} resource {}".format(d['format'], d['name']))

    pkg['resources'] = resources

    c.action.package_update(**pkg)

    pkg = c.action.package_show(name_or_id=ckan_name)

    update_dist(doc, [], join(m.ckan_url, 'dataset', ckan_name))

    ##
    ## Add a term with CKAN info.

    doc['Root'].get_or_new_term('CkanId', pkg['id'])

    if org_name_slug is None and pkg.get('organization'):
        doc['Root'].get_or_new_term('CkanOrg', (pkg.get('organization')
                                                or {}).get('name'))

    groups = doc['Root'].find('Group')
    for g in groups:
        doc.remove_term(g)

    for group in pkg.get('groups', []):
        doc['Root'].new_term('Group', group['name'])

    write_doc(doc, m.mt_file)