Example #1
def convert_documentation(nb_path):
    """Run only the document conversion portion of the notebook conversion

      The final document will not be completel
    """

    with open(nb_path) as f:
        nb = nbformat.reads(f.read(), as_version=4)

    doc = ExtractInlineMetatabDoc(package_url="metapack+file:" +
                                  dirname(nb_path)).run(nb)

    package_name = doc.as_version(None)  # package name with the version stripped

    output_dir = join(getcwd(), package_name)

    de = DocumentationExporter(config=Config(),
                               log=logger,
                               metadata=doc_metadata(doc))
    prt('Converting documentation')
    output, resources = de.from_filename(nb_path)

    fw = FilesWriter()

    fw.build_directory = join(output_dir, 'docs')
    fw.write(output, resources, notebook_name='notebook')
    prt("Wrote documentation to {}".format(fw.build_directory))
Example #2
def convert_notebook(nb_path):
    prt('Convert notebook to Metatab source package')

    if not exists(nb_path):
        err("Notebook path does not exist: '{}' ".format(nb_path))

    c = Config()

    pe = NotebookExecutor(config=c, log=logger)

    prt('Running the notebook')
    output, resources = pe.from_filename(nb_path)

    fw = FilesWriter()
    fw.build_directory = pe.output_dir

    fw.write(output, resources, notebook_name=DEFAULT_METATAB_FILE)

    de = DocumentationExporter(config=c,
                               log=logger,
                               metadata=doc_metadata(pe.doc))

    prt('Exporting documentation')
    output, resources = de.from_filename(nb_path)

    fw.build_directory = join(pe.output_dir, 'docs')
    fw.write(output, resources, notebook_name='notebook')

    new_mt_file = join(pe.output_dir, DEFAULT_METATAB_FILE)

    doc = MetapackDoc(new_mt_file)

    de.update_metatab(doc, resources)

    for lib_dir in pe.lib_dirs:
        # Record each Python library dir as a resource, then copy it into the
        # package. Note: lstrip('./') strips a leading run of '.' and '/'
        # characters, not the literal prefix './'.
        lib_dir = normpath(lib_dir).lstrip('./')

        doc['Resources'].new_term("Root.PythonLib", lib_dir)

        path = abspath(lib_dir)
        dest = join(pe.output_dir, lib_dir)

        ensure_dir(dest)
        copytree(path, dest)  # copytree must tolerate an existing destination

    doc.write_csv()

    # Reset the input to use the new data

    prt('Running with new package file: {}'.format(new_mt_file))
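The full conversion can be driven the same way; a hedged sketch, again with an illustrative path:

# Executes the notebook, writes the Metatab source package to pe.output_dir,
# exports documentation to its docs/ subdirectory, and records lib dirs as
# Root.PythonLib resources before rewriting the package CSV.
convert_notebook('examples/my-dataset.ipynb')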
Example #3
        def __init__(self, args):
            self.cwd = getcwd()
            self.args = args
            self.cache = get_cache('metapack')

            if args.metatabfile and args.metatabfile.startswith('#'):
                # Just a fragment; prepend the path to the default metatab file
                args.metatabfile = join(self.cwd, DEFAULT_METATAB_FILE) + args.metatabfile

            self.mtfile_arg = args.metatabfile if args.metatabfile else join(self.cwd, DEFAULT_METATAB_FILE)

            self.mtfile_url = Url(self.mtfile_arg)

            self.resource = self.mtfile_url.parts.fragment

            self.package_url, self.mt_file = resolve_package_metadata_url(self.mtfile_url.rebuild_url(False, False))
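The fragment convention handled above can be exercised in isolation. A minimal, self-contained sketch; the DEFAULT_METATAB_FILE value is an assumption here (the real constant comes from metatab):

from os import getcwd
from os.path import join

DEFAULT_METATAB_FILE = 'metadata.csv'  # assumed; metatab defines the real value

metatabfile = '#population'            # a bare fragment names a resource
if metatabfile.startswith('#'):
    # Same expansion as __init__ above: fragment -> default file + fragment
    metatabfile = join(getcwd(), DEFAULT_METATAB_FILE) + metatabfile
print(metatabfile)                     # e.g. /work/metadata.csv#population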
Example #4
def make_metatab_file(template='metatab'):
    """Create a new MetatabDoc from one of the CSV templates bundled with metatab."""
    import metatab.templates as tmpl

    template_path = join(dirname(tmpl.__file__), template + '.csv')

    doc = MetatabDoc(template_path)

    return doc
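Hypothetical usage, filling in the same Root terms that Example #7 sets when it creates a file; the name value is illustrative:

doc = make_metatab_file('metatab')
doc['Root']['Name'] = 'example.com-demo'  # illustrative term value
write_doc(doc, 'metadata.csv')            # write_doc as used in Example #7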
Example #5
def download_and_cache(spec, cache_fs, account_accessor=None, clean=False, logger=None,
                       working_dir='', callback=None):

    parts = {}

    working_dir = working_dir if working_dir else ''

    if spec.scheme == 'file':
        parts['cache_path'] = parse_url_to_dict(spec.resource_url)['path']
        parts['download_time'] = None

        locations = { # What a mess ...
            abspath(parts['cache_path']),
            abspath(parts['cache_path'].lstrip('/')),
            abspath(join(working_dir, parts['cache_path'])),
            abspath(join(working_dir, parts['cache_path'].lstrip('/')))
        }

        for l in locations:
            if exists(l):
                parts['sys_path'] = l
                break
        else:
            raise DownloadError(("File resource does not exist. Found none of:"
                                "\n{}\n\nWorking dir = {}\ncache_path={}\nspec_path={}")
                          .format('\n'.join(locations), working_dir, parts['cache_path'], spec.path))

    else:
        cache_fs = cache_fs or get_cache()

        try:
            parts['cache_path'], parts['download_time'] = \
                download(spec.resource_url, cache_fs, account_accessor,
                         clean=clean, logger=logger, callback=callback)
        except AccessError as e:
            try:
                parts['cache_path'], parts['download_time'] = \
                    download(spec.auth_resource_url, cache_fs, account_accessor,
                             clean=clean, logger=logger, callback=callback)
            except AttributeError:
                raise e


        parts['sys_path'] = cache_fs.getsyspath(parts['cache_path'])

    return parts
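A sketch of the file-scheme branch, using a stand-in for the spec object; the attribute names are taken from the code above, and the paths are illustrative:

from types import SimpleNamespace

spec = SimpleNamespace(scheme='file',
                       resource_url='file:///tmp/data.csv',
                       path='/tmp/data.csv')

parts = download_and_cache(spec, cache_fs=None)  # file scheme never touches the cache
# When /tmp/data.csv exists, parts is:
# {'cache_path': '/tmp/data.csv', 'download_time': None, 'sys_path': '/tmp/data.csv'}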
Example #6
    def __init__(self, args):

        self.cwd = getcwd()

        self.args = args

        self.downloader = Downloader.get_instance()

        self.cache = self.downloader.cache

        self.mtfile_arg = self.args.metatabfile if self.args.metatabfile else join(
            self.cwd, DEFAULT_METATAB_FILE)

        self.mtfile_url = MetapackUrl(self.mtfile_arg,
                                      downloader=self.downloader)

        self.resource = self.mtfile_url.target_file

        self.package_url = self.mtfile_url.package_url
        self.mt_file = self.mtfile_url.metadata_url

        self.package_root = self.package_url.join(PACKAGE_PREFIX)

        if not self.args.s3:
            doc = MetapackDoc(self.mt_file)
            self.args.s3 = doc['Root'].find_first_value('Root.S3')

        self.s3_url = parse_app_url(self.args.s3)

        # A bare bucket name parses without the s3 scheme; coerce it to s3://
        if self.s3_url and self.s3_url.scheme != 's3':
            self.s3_url = parse_app_url("s3://{}".format(self.args.s3))

        self.doc = MetapackDoc(self.mt_file)

        access_value = self.doc.find_first_value('Root.Access')

        self.acl = 'private' if access_value == 'private' else 'public-read'

        self.bucket = S3Bucket(
            self.s3_url, acl=self.acl,
            profile=self.args.profile) if self.s3_url else None
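A sketch of the S3 normalization step above; the bucket name is illustrative, and parse_app_url is assumed to come from the appurl/rowgenerators package, as in the code:

u = parse_app_url('my-bucket')           # a bare bucket name parses without the s3 scheme
if u and u.scheme != 's3':
    u = parse_app_url('s3://my-bucket')  # coerced, as __init__ does above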
Example #7
def metatab_build_handler(m):
    if m.args.create is not False:
        # args.create is False when --create was not given; it is None or a
        # template name when it was.
        template = m.args.create if m.args.create else 'metatab'

        if not exists(m.mt_file):

            doc = make_metatab_file(template)

            doc['Root']['Identifier'] = six.text_type(uuid4())

            doc['Root']['Created'] = datetime_now()

            write_doc(doc, m.mt_file)

            prt('Created', m.mt_file)
        else:
            err('File', m.mt_file, 'already exists')

    if m.args.add:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        add_resource(m.mt_file, m.args.add, cache=m.cache)

    if False:  # disabled; was: m.args.resources
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        doc = MetatabDoc(m.mt_file)

        try:
            doc['Schema'].clean()
        except KeyError:
            pass

        for t in list(doc['Resources']):  # w/o list(), will iterate over new terms

            if not t.term_is('root.datafile'):
                continue

            if t.as_dict().get('url'):
                add_resource(doc, t.as_dict()['url'], m.cache)

            else:
                warn("Entry '{}' on row {} is missing a url; skipping".format(t.join, t.row))

        write_doc(doc, m.mt_file)

    if m.args.schemas:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        process_schemas(m.mt_file, cache=m.cache, clean=m.args.clean)

    if m.args.datapackage:
        update_name(m.mt_file, fail_on_missing=False, report_unchanged=False)

        from metatab.datapackage import convert_to_datapackage

        doc = MetatabDoc(m.mt_file)

        u = Url(m.mt_file)

        if u.proto == 'file':
            dpj_file = join(dirname(abspath(u.parts.path)), 'datapackage.json')
        else:
            dpj_file = join(getcwd(), 'datapackage.json')

        try:
            with open(dpj_file, 'w') as f:
                f.write(json.dumps(convert_to_datapackage(doc), indent=4))
        except ConversionError as e:
            err(e)

    if m.mtfile_url.scheme == 'file' and m.args.update:
        update_name(m.mt_file, fail_on_missing=True, force=m.args.force)
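The handler can also be driven programmatically. A hedged sketch using an argparse-style namespace whose attribute names mirror what the handler reads; all values are illustrative:

from argparse import Namespace

args = Namespace(create='metatab', add=None, schemas=False, datapackage=False,
                 update=False, force=False, clean=False)
# With m built as in Example #6: metatab_build_handler(m)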
Example #8
def download(url, cache_fs, account_accessor=None, clean=False, logger=None, callback=None):
    """
    Download a URL and store it in the cache.

    :param url:
    :param cache_fs:
    :param account_accessor: callable of one argument (url) returning dict with credentials.
    :param clean: Remove files from cache and re-download
    :param logger:
    :param callback:
    :return:
    """
    import os.path
    import time
    from fs.errors import DirectoryExpected, NoSysPath, ResourceInvalid, DirectoryExists

    assert isinstance(url, string_types)

    url = url.replace('\\','/')

    # The fs module gets upset when given byte strings, so we need to decode
    # bytes to unicode. UTF-8 is a WAG.
    try:
        parsed = urlparse(url.decode('utf8'))
    except AttributeError:
        parsed = urlparse(url)

    # Create a name for the file in the cache, based on the URL. The '\'
    # replacement above is because pyfs only wants UNIX path separators, while
    # os.path.join uses the separator for the host operating system.
    cache_path = join(parsed.netloc, parsed.path.strip('/'))

    # If there is a query, hash it and add it to the path
    if parsed.query:
        query_hash = hashlib.sha224(parsed.query.encode('utf8')).hexdigest()
        cache_path = join(cache_path, query_hash)

    if not cache_fs.exists(cache_path):

        cache_dir = os.path.dirname(cache_path)

        try:
            cache_fs.makedirs(cache_dir, recreate=True)
        except DirectoryExpected as e:

            # Probably b/c the dir name is already a file
            dn = os.path.dirname(cache_path)
            bn = os.path.basename(cache_path)
            for i in range(10):
                try:
                    cache_path = join(dn + str(i), bn)
                    cache_fs.makedirs(os.path.dirname(cache_path))
                    break
                except DirectoryExpected:
                    continue
                except DirectoryExists:
                    pass  # ? No idea what's supposed to happen here.
                raise e  # only reachable via the DirectoryExists branch

    try:
        from filelock import FileLock
        lock = FileLock(cache_fs.getsyspath(cache_path + '.lock'))

    except NoSysPath:
        # mem:// caches, and others, don't have sys paths.
        # FIXME: should check for multiprocess operation and raise if there
        # would be contention. Mem caches are only for testing with single
        # processes.
        lock = _NoOpFileLock()

    with lock:
        if cache_fs.exists(cache_path):
            if clean:
                try:
                    cache_fs.remove(cache_path)
                except ResourceInvalid:
                    pass  # Well, we tried.
            else:
                return cache_path, None

        try:
            _download(url, cache_fs, cache_path, account_accessor, logger, callback)

            return cache_path, time.time()

        except HTTPError as e:
            if e.response.status_code == 403:
                raise AccessError("Access error on download: {}".format(e))
            else:
                raise DownloadError("Failed to download: {}".format(e))

        except (KeyboardInterrupt, Exception):
            # This is really important -- it's really bad to have partly
            # downloaded files being confused with fully downloaded ones.
            # FIXME: should also handle signals; deleting partly downloaded
            # files is important. Maybe use a sentinel file, or download to
            # another name and move the file when done.
            if cache_fs.exists(cache_path):
                cache_fs.remove(cache_path)

            raise

    assert False, 'Should never get here'
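A minimal usage sketch, assuming a pyfilesystem2 OSFS as the cache; the cache directory and URL are illustrative:

from fs.osfs import OSFS

cache = OSFS('/tmp/dl-cache', create=True)
path, t = download('http://example.com/data.csv', cache)  # t is None on a cache hit
with cache.open(path, 'rb') as f:
    head = f.read(64)                                     # read back the cached copy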