Example #1
def test_ImageProvider_retrieve(self):
        from pytsammalex.image_providers.base import ImageProvider

        repos = create_repos(self.tmp_path())
        fname = self.tmp_path('test')
        media = self.tmp_path('media.json')
        with fname.open('w', encoding='utf8') as fp:
            fp.write('test')

        class P(ImageProvider):
            def __init__(self, md):
                self._md = md
                ImageProvider.__init__(self, repos)

            def metadata(self, item):
                return self._md

        self.assertIsNone(P({}).retrieve(None, None, None, None))

        staged_image = Staged_images.fromdict({'id': 'abc', 'taxa__id': 'abc'})
        prov = P({'source_url': fname})
        with self.assertRaises(ValueError):
            prov.retrieve(staged_image, None, [md5(fname)], None)

        cdstar = MagicMock(
            create=MagicMock(return_value=[(None, None, MOCK_CDSTAR_OBJECT)]))
        prov = P({'source_url': 'x'})
        with patch('pytsammalex.util.requests', MockRequests()):
            with MediaCatalog(media.name, repos=repos) as mcat:
                prov.retrieve(staged_image, cdstar, [], mcat)
        self.assertTrue(cdstar.create.called)
        self.assertEqual(len(MediaCatalog(media.name, repos=repos)), 1)
Example #2
def _download_sql_dump(rel, log):
    target = Path('glottolog-{0}.sql.gz'.format(rel['version']))
    log.info('retrieving {0}'.format(rel['sql_dump_url']))
    urlretrieve(rel['sql_dump_url'], target.as_posix())
    assert md5(target) == rel['sql_dump_md5']
    unpacked = target.with_suffix('')
    with gzip.open(target.as_posix()) as f, unpacked.open('wb') as u:
        shutil.copyfileobj(f, u)
    target.unlink()
    log.info('SQL dump for Glottolog release {0} written to {1}'.format(
        rel['version'], unpacked))
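
The pattern above (download, verify the md5 checksum against a published value, gunzip, remove the archive) recurs in the next example and, with variations, throughout this page. A minimal standalone sketch of it, assuming only the standard library plus clldutils.path.md5; the helper name download_and_verify is hypothetical:

import gzip
import shutil
from pathlib import Path
from urllib.request import urlretrieve

from clldutils.path import md5


def download_and_verify(url, target, expected_md5):
    # Download url to target, verify its checksum, then unpack the gzip archive.
    target = Path(target)
    urlretrieve(url, str(target))
    if md5(target) != expected_md5:
        raise ValueError('checksum mismatch for {0}'.format(target))
    unpacked = target.with_suffix('')  # 'dump.sql.gz' -> 'dump.sql'
    with gzip.open(str(target)) as f, unpacked.open('wb') as u:
        shutil.copyfileobj(f, u)
    target.unlink()
    return unpacked

Verifying before unpacking keeps a corrupted download from silently replacing a good dump.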
Example #3
def download_sql_dump(self, log):
    target = self.dump_fname(zipped=True)
    log.info('retrieving {0}'.format(self.sql_dump_url))
    urlretrieve(self.sql_dump_url, str(target))
    assert md5(target) == self.sql_dump_md5
    unpacked = target.with_suffix('')
    with gzip.open(str(target)) as f, unpacked.open('wb') as u:
        shutil.copyfileobj(f, u)
    target.unlink()
    log.info('SQL dump for Glottolog release {0} written to {1}'.format(
        self.version, unpacked))
Example #4
    def _upload(self, sfn, files):
        """
        Upload files for SoundfileName sfn.
        """
        print(sfn)
        # Lookup the SoundfileName in catalog:
        cat_obj = self[sfn] if sfn in self else None
        # Retrieve or create the corresponding CDSTAR object:
        obj = self.api.get_object(cat_obj.id if cat_obj else None)
        print(obj.id)
        md = {'collection': 'soundcomparisons', 'name': sfn, 'type': 'soundfile'}
        changed = False
        if not cat_obj:  # If the object is already in the catalog, the metadata does not change!
            obj.metadata = md
        for f in files:
            fmt = f.suffix[1:]
            if fmt not in self.mimetypes:
                continue
            create = True
            if cat_obj:
                for cat_bitstream in cat_obj.bitstreams:
                    if cat_bitstream.id.endswith(f.suffix):
                        # A bitstream for this mimetype already exists!
                        if cat_bitstream.md5 == md5(f):
                            # If the md5 sum is the same, don't bother uploading!
                            create = False
                        else:
                            # Otherwise we have to delete the old bitstream before uploading the
                            # new one.
                            for bs in obj.bitstreams:
                                if bs.id == cat_bitstream.id:
                                    bs.delete()
                                    break
                        break

            if create:
                changed = True
                print('uploading {0}'.format(f.name))
                obj.add_bitstream(fname=str(f), name=f.name, mimetype=self.mimetypes[fmt])
                time.sleep(0.1)
            else:
                print('skipping {0}'.format(f.name))

        if changed:
            obj.read()
            self.add(obj, metadata=md, update=True)
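
The core of the loop above is a checksum comparison: an upload is skipped only when a bitstream with the same file extension already carries the same md5 sum; a stale bitstream is deleted first otherwise. A minimal sketch of that decision in isolation; needs_upload is a hypothetical helper, and bitstream objects are assumed to expose id and md5 as they do above:

from clldutils.path import md5


def needs_upload(local_file, bitstreams):
    # False only if a bitstream with the same suffix carries the same checksum.
    for bs in bitstreams:
        if bs.id.endswith(local_file.suffix):
            return bs.md5 != md5(local_file)
    return True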
Example #5
    def retrieve(self, item, cdstar_catalog, checksums, mediacatalog):
        """
        - download
        - compute checksum
        - upload to CDSTAR
        - add to cdstar.json

        :return: Image instance
        """
        md = self.metadata(item) or {}
        source_url = md.pop('source_url', None)
        if not source_url:
            return
        # We turn the Staged_images instance into a `dict`, which we will enrich and then
        # turn into an Images instance.
        item = dict(zip(item.fields(), item.csv_row()))
        with TemporaryDirectory() as tmp:
            if isinstance(source_url, Path):
                fname = tmp.joinpath(source_url.name)
                copy(source_url, fname)
            else:
                # download the thing
                fname = self._download(source_url, tmp)
                if not fname:
                    return
            checksum = md5(fname)
            if checksum in checksums:
                raise ValueError('duplicate item {0} {1}'.format(item['id'], checksum))
            item.update(md)
            item['id'] = checksum
            item['collection'] = 'Tsammalex'
            img = Images.fromdict(item)
            if checksum not in mediacatalog.items:
                # now upload to CDSTAR
                _, _, obj = list(cdstar_catalog.create(fname, item))[0]
                mediacatalog.add(obj)
            return img
Example #6
def test_image_provider_retrieve(tmpdir):
    repos = create_repos(tmpdir)
    fname = tmpdir.join('test')

    with fname.open('w', encoding='utf8') as fp:
        fp.write('test')

    class TestProvider(ImageProvider):
        def identify(self, name):
            pass

        def __init__(self, md):
            self._md = md
            ImageProvider.__init__(self, repos)

        def metadata(self, item):
            return self._md

    assert TestProvider({}).retrieve(None, None, None, None) is None

    staged_image = Staged_images.fromdict({'id': 'abc', 'taxa__id': 'abc'})
    prov = TestProvider({'source_url': fname})

    with pytest.raises(ValueError):
        prov.retrieve(staged_image, None, [md5(fname)], None)

    cdstar = MagicMock(
        create=MagicMock(return_value=[(None, None, MOCK_CDSTAR_OBJECT)]))
    prov = TestProvider({'source_url': 'x'})

    with patch('pytsammalex.util.requests', MockRequests()):
        with MediaCatalog('media.json', repos=Path(repos)) as mcat:
            prov.retrieve(staged_image, cdstar, [], mcat)

    assert cdstar.create.called
    assert len(MediaCatalog('media.json', repos=Path(repos))) == 1
Example #7
def downloadSoundFiles(args, out_path=os.path.join(os.getcwd(), "sound"), db_needed=False):
    """
    Downloads desired sound files as {sound/}FilePathPart/FilePathPart_WordID.EXT from CDSTAR
    to {current_folder}/sound or to out_path if passed.
    By default it downloads all stored sound files; with the argument {EXT} you can pass the
    desired sound file extensions.
    Usage:
    --sc-repo {--db-host --db-name --db-user --db-password} downloadSoundFiles ITEM {EXT}
      Valid ITEMs:
        UID(s): EAEA0-3A11-8354-556E-0 EAEA0-303B-3625-4014-0 ...
        Study Name(s): Brazil Europe ...
        FilePathPart(s): Clt_Bryth_Wel_Dyfed_Pem_Maenclochog_Dl ...
        FilePathPart(s)+Word: Clt_Bryth_Wel_Dyfed_Pem_Maenclochog_Dl_909_praised_maalato ...
        FilePathPart(s)+Word.EXT: Clt_Bryth_Wel_Dyfed_Pem_Maenclochog_Dl_909_praised_maalato.mp3 ...
        Language_Index: 11121250509 11131000008 ...
      Valid EXTs: mp3 ogg wav
        (if an extension is not stored it falls back to the first ext mentioned in catalog,
         otherwise no sound file)

    db_needed can stay False if all items can be resolved as keys of catalog.json (FilePathPart {+ WordID}).
    """

    if 'db_needed' in args.args:
        db_needed = True

    if db_needed:
        db = _db(args)

    catalog = _get_catalog(args, 'soundfiles')

    # holds all desired FilePathParts+WordIDs
    desired_keys = set()

    # get desired extensions
    valid_ext = catalog.mimetypes.keys()
    desired_ext = list(set(args.args) & set(valid_ext))
    if len(desired_ext) == 0:
        desired_ext = list(valid_ext)
    else:
        # remove ext from args.args
        args.args = list(set(args.args) - set(valid_ext))

    if db_needed:
        # get desired keys via study names
        try:
            valid_studies = _get_all_study_names(db)
            desired_studies = list(set(args.args) & set(valid_studies))
            if len(desired_studies) > 0:
                # remove study names from args.args
                args.args = list(set(args.args) - set(desired_studies))
                q = " UNION ".join([
                    "SELECT DISTINCT FilePathPart AS f FROM Languages_%s" % (s) for s in desired_studies])
                for x in list(db(q)):
                    new_keys = [
                        SoundfileName(k) for k in catalog.get_soundfilenames(x['f'])]
                    if len(new_keys) == 0:
                        args.log.warning(
                            "Nothing found for %s in catalog - will be ignored" % (
                                x['f']))
                    desired_keys.update(new_keys)
        except ValueError as e:
            args.log.warning(e)
        except Exception as e:
            args.log.error("Check DB settings!")
            args.log.error(e)
            return

        # mapping LanguageIx -> FilePathPart
        if len(args.args) > 0:
            q = " UNION ".join([
                """SELECT DISTINCT
                    FilePathPart AS f, LanguageIx AS i
                   FROM Languages_%s""" % (s) for s in valid_studies])
            try:
                idx_map = {str(x['i']): x['f'] for x in list(db(q))}
            except Exception as e:
                args.log.error("Check DB settings!")
                args.log.error(e)
                return

            # parse LanguageIxs
            for i in args.args:
                if re.match(r"^\d{11,}$", i):
                    # remove found LanguageIx from args.args
                    args.args = list(set(args.args) - set([i]))
                    if i in idx_map.keys():  # LanguageIx ?
                        new_keys = [
                            SoundfileName(k) for k in catalog.get_soundfilenames(idx_map[i])]
                        if len(new_keys) == 0:
                            args.log.warning(
                                "No sounds for LanguageIx %s (%s) - will be ignored" % (
                                    i, idx_map[i]))
                        desired_keys.update(new_keys)
                    else:
                        args.log.warning("LanguageIx %s unknown - will be ignored" % (i))

    for i in args.args:
        if i in catalog:  # UID or SoundfileName?
            try:  # SoundfileName
                desired_keys.add(SoundfileName(i))
            except ValueError:  # UID
                try:
                    desired_keys.add(SoundfileName(catalog[i].metadata['name']))
                except ValueError:
                    args.log.warning('Path for {0} is not valid - will be skipped'.format(i))
        else:
            desired_keys.update(SoundfileName(k) for k in catalog.get_soundfilenames(i))

    args.log.info('{0} sound files selected'.format(len(desired_keys)))

    out_path = Path(out_path)
    if not out_path.exists():
        out_path.mkdir()

    desired_mimetypes = [catalog.mimetypes[ext] for ext in desired_ext]

    # pb = tqdm(total=len(desired_keys))
    for folder, sfns in groupby(sorted(desired_keys), lambda s: s.variety):
        args.log.info(' ... {0}'.format(folder))
        folder = out_path / folder
        if not folder.exists():
            try:
                folder.mkdir()
            except Exception:
                # retry once before giving up on this folder
                try:
                    folder.mkdir()
                except Exception:
                    args.log.warning(' ... cannot make folder {0}'.format(folder))
                    continue

        for obj in [catalog[sfn] for sfn in sfns]:
            # pb.update()
            for bs in catalog.matching_bitstreams(obj, mimetypes=desired_mimetypes):
                target = folder / bs.id
                if (not target.exists()) or md5(target) != bs.md5:
                    try:
                        urlretrieve(catalog.bitstream_url(obj, bs), str(target))
                    except Exception:
                        # retry once before flagging the file for manual checking
                        try:
                            urlretrieve(catalog.bitstream_url(obj, bs), str(target))
                        except Exception:
                            args.log.warning(' ... ... {0} should be checked'.format(obj.metadata['name']))
Example #8
def run(args):
    ds = Dataset().cldf_reader()

    release_dir = args.out / '{0}_audio'.format(Dataset().id)
    zenodo_file_name = 'zenodo.json'

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        f2c = {r['ID']: r['Parameter_ID'] for r in ds['FormTable']}
        audio = args.out / 'audio'
        audio.mkdir(exist_ok=True)

    if not args.update_zenodo:
        for row in tqdm.tqdm([r for r in ds['media.csv']]):
            if args.list:
                size[row['mimetype']] += int(row['size'])
                number.update([row['mimetype']])
            else:
                d = audio / f2c[row['Form_ID']]
                d.mkdir(exist_ok=True)
                url = ds.get_row_url('media.csv', row)
                target = d / '{}.{}'.format(row['ID'], url.split('.')[-1])
                if (not target.exists()) or md5(target) != row['ID']:
                    if (args.mimetype is None) or target.suffix.endswith(
                            args.mimetype):
                        create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k, str(number[k]), format_size(v)]))

    if args.create_release:
        assert audio.exists(), 'No folder "audio" found in {0}'.format(
            audio.resolve())

        release_dir.mkdir(exist_ok=True)

        args.log.info('creating audio ZIP archive per parameter folder ...')
        try:
            zipf = zipfile.ZipFile(str(release_dir / 'audio.zip'), 'w',
                                   zipfile.ZIP_DEFLATED)
            fp = args.out
            for root, dirs, files in tqdm.tqdm(os.walk(audio)):
                for f in files:
                    if not f.startswith('.') and not f.startswith('__')\
                            and ((args.mimetype is None) or f.endswith(args.mimetype)):
                        zipf.write(os.path.join(root, f),
                                   os.path.relpath(os.path.join(root, f), fp))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def contrib(d):
            return {
                k: v
                for k, v in d.items()
                if k in {'name', 'affiliation', 'orcid', 'type'}
            }

        with jsonlib.update(release_dir / zenodo_file_name,
                            indent=4,
                            default=collections.OrderedDict()) as md:
            contribs = Dataset().dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(
                    encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [contrib(p) for p in contributors]
            if COMMUNITIES:
                md['communities'] = [{
                    'id': community_id
                } for community_id in COMMUNITIES]
            md.update({
                'title': '{0} Audio Files'.format(Dataset().metadata.title),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'video',
                'version': VERSION,
                'related_identifiers': [
                    {
                        'scheme': 'doi',
                        'identifier': '10.5281/zenodo.4309141',
                        'relation': 'isPartOf'
                    },
                    {
                        'scheme': 'url',
                        'identifier': '{0}{1}/tree/v{2}'.format(
                            GITHUB_PREFIX, Dataset().id, VERSION),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if Dataset().metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': Dataset().metadata.url,
                    'relation': 'isAlternateIdentifier'
                })
            md['description'] = html.escape(
                DESCRIPTION.format(
                    GITHUB_PREFIX,
                    Dataset().id,
                    Dataset().metadata.url if Dataset().metadata.url else '',
                    VERSION))

            license_md = ''
            if Dataset().metadata.zenodo_license:
                md['license'] = {'id': Dataset().metadata.zenodo_license}
                license_md = LISENCE.format(Dataset().metadata.zenodo_license)

            DataDir(release_dir).write(
                'README.md',
                RELEASE_NOTE.format(md['title'], GITHUB_PREFIX,
                                    Dataset().id,
                                    Dataset().metadata.title, license_md))

    if args.update_zenodo:
        assert release_dir.exists()
        assert (release_dir / zenodo_file_name).exists()

        md = {}
        md.update(jsonlib.load(release_dir / zenodo_file_name))

        api_url = API_URL
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=ACCESS_TOKEN)
        rec = api.record_from_id('{0}record/{1}'.format(
            zenodo_url, args.update_zenodo))
        args.log.info('  DOI:   ' + rec.metadata.doi)
        args.log.info('  Title: ' + rec.metadata.title)
        args.log.info('  Date:  ' + rec.metadata.publication_date)
        args.log.info('  Files: ' + ', '.join([f.key for f in rec.files]))
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != zenodoclient.models.PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
Example #9
File: app.py Project: clld/clld
def includeme(config):
    """Upgrading:

    - register utilities "by hand", after config.include('clld.web.app')
    - add routes by hand (and remove these from the **kw passed to Configurator)

    :param config:
    :return:
    """
    #
    # now we exploit the default package layout as created via the CLLD scaffold:
    #
    # note: the following exploits the import time side effect of modifying the webassets
    # environment!
    root_package = config.root_package.__name__
    pkg_dir = Path(config.root_package.__file__).parent.resolve()
    maybe_import('%s.assets' % root_package, pkg_dir=pkg_dir)

    json_renderer = JSON()
    json_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    json_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('json', json_renderer)

    jsonp_renderer = JSONP(param_name='callback')
    jsonp_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    jsonp_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('jsonp', jsonp_renderer)

    config.set_request_factory(ClldRequest)
    config.registry.registerUtility(CtxFactoryQuery(), interfaces.ICtxFactoryQuery)
    config.registry.registerUtility(OlacConfig(), interfaces.IOlacConfig)
    config.registry.registerUtility(CldfConfig(), interfaces.ICldfConfig)

    # initialize the db connection
    engine = engine_from_config(config.registry.settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    Base.metadata.bind = engine

    try:
        git_tag = git_describe(Path(pkg_dir).parent)
    except ValueError:  # pragma: no cover
        git_tag = None

    config.add_settings({
        'pyramid.default_locale_name': 'en',
        'clld.pkg': root_package,
        'clld.git_tag': git_tag,
        'clld.parameters': {}})
    if 'clld.files' in config.registry.settings:
        # deployment-specific location of static data files
        abspath = Path(config.registry.settings['clld.files']).resolve()
        config.add_settings({'clld.files': abspath})
        config.add_static_view('files', str(abspath))

    # event subscribers:
    config.add_subscriber(add_localizer, events.NewRequest)
    config.add_subscriber(init_map, events.ContextFound)
    config.add_subscriber(
        partial(
            add_renderer_globals,
            maybe_import('%s.util' % root_package, pkg_dir=pkg_dir)),
        events.BeforeRender)

    #
    # make it easy to register custom functionality
    #
    for name, func in {
        'register_utility': register_utility,
        'register_datatable': partial(register_cls, interfaces.IDataTable),
        'register_map': partial(register_cls, interfaces.IMap),
        'register_menu': register_menu,
        'register_resource': register_resource,
        'register_adapter': register_adapter,
        'register_adapters': register_adapters,
        'register_download': register_download,
        'register_staticresource': register_staticresource,
        'add_route_and_view': add_route_and_view,
        'add_settings_from_file': add_settings_from_file,
        'add_301': add_301,
        'add_410': add_410,
        'add_page': add_page,
        'register_resource_routes_and_views': register_resource_routes_and_views,
    }.items():
        config.add_directive(name, func)

    #
    # routes and views
    #
    config.add_static_view('clld-static', 'clld:web/static')
    config.add_static_view('static', '%s:static' % root_package)

    config.add_route_and_view('_js', '/_js', js, http_cache=3600)

    # add some maintenance hatches
    config.add_route_and_view('_raise', '/_raise', _raise)
    config.add_route_and_view('_ping', '/_ping', _ping, renderer='json')

    # sitemap support:
    config.add_route_and_view('robots', '/robots.txt', robots)
    config.add_route_and_view('sitemapindex', '/sitemap.xml', sitemapindex)
    config.add_route_and_view('sitemap', '/sitemap.{rsc}.{n}.xml', sitemap)
    config.add_route('resourcemap', '/resourcemap.json')
    config.add_view(resourcemap, route_name='resourcemap', renderer='jsonp')
    config.add_route_and_view(
        'select_combination', '/_select_combination', select_combination)

    config.add_route_and_view('unapi', '/unapi', unapi)
    config.add_route_and_view('olac', '/olac', olac)

    config.add_settings_from_file(pkg_dir.joinpath('appconf.ini'))
    if not config.registry.settings.get('mako.directories'):
        config.add_settings({'mako.directories': ['clld:web/templates']})

    for rsc in RESOURCES:
        config.register_resource_routes_and_views(rsc)
        config.register_datatable(
            rsc.plural, getattr(datatables, rsc.plural.capitalize(), DataTable))
        register_resource_adapters(config, rsc)

    # maps
    config.register_map('languages', Map)
    config.register_map('language', LanguageMap)
    config.register_map('parameter', ParameterMap)
    config.register_map('combination', CombinationMap)

    config.include('clld.web.adapters')

    for icon in ICONS:
        config.registry.registerUtility(icon, interfaces.IIcon, name=icon.name)
    config.registry.registerUtility(ORDERED_ICONS, interfaces.IIconList)
    config.registry.registerUtility(MapMarker(), interfaces.IMapMarker)

    #
    # inspect default locations for views and templates:
    #
    home_comp = OrderedDict()
    for name, template in [
        ('introduction', False),
        ('about', False),
        ('terms', False),
        ('glossary', False),
        ('history', False),
        ('changes', False),
        ('credits', False),
        ('legal', True),
        ('download', True),
        ('contact', True),
        ('help', False),
    ]:
        home_comp[name] = template

    if pkg_dir.joinpath('templates').exists():
        for p in pkg_dir.joinpath('templates').iterdir():
            if p.stem in home_comp and p.suffix == '.mako':
                home_comp[p.stem] = True

    for name, template in home_comp.items():
        if template:
            config.add_page(name)

    config.add_settings({'home_comp': [k for k in home_comp.keys() if home_comp[k]]})

    if 'clld.favicon' not in config.registry.settings:
        favicon = {'clld.favicon': 'clld:web/static/images/favicon.ico'}
        # hard to test (in particular on travis) and without too much consequence
        # (and the consequences faced are easy to spot).
        if pkg_dir.joinpath('static', 'favicon.ico').exists():  # pragma: no cover
            favicon['clld.favicon'] = root_package + ':static/favicon.ico'
        config.add_settings(favicon)

    config.add_settings({
        'clld.favicon_hash': md5(abspath_from_asset_spec(
            config.registry.settings['clld.favicon']))})

    translation_dirs = ['clld:locale']
    if pkg_dir.joinpath('locale').exists():
        translation_dirs.append('%s:locale' % root_package)  # pragma: no cover
    config.add_translation_dirs(*translation_dirs)

    if pkg_dir.joinpath('static/publisher_logo.png').exists():  # pragma: no cover
        config.add_settings(
            {'clld.publisher_logo': '%s:static/publisher_logo.png' % root_package})

    if asbool(config.registry.settings.get('clld.pacific_centered_maps')):
        geojson.pacific_centered()

    v = maybe_import('%s.views' % root_package, pkg_dir=pkg_dir)
    if v:
        config.scan(v)  # pragma: no cover

    menuitems = config.registry.settings.get(
        'clld.menuitems_list',
        ['contributions', 'parameters', 'languages', 'contributors'])
    config.register_menu(('dataset', dict(label='Home')), *menuitems)

    config.include('pyramid_mako')

    for name in ['adapters', 'datatables', 'maps']:
        mod = maybe_import('%s.%s' % (root_package, name), pkg_dir=pkg_dir)
        if mod and hasattr(mod, 'includeme'):
            config.include(mod)

    config.register_download(CldfDownload(common.Dataset, root_package))
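
For context, includeme is not called directly; a downstream CLLD app pulls it in through Pyramid's config.include mechanism. A minimal sketch of such an app factory, assuming the standard CLLD scaffold layout; the spot after the include is where the docstring's "by hand" registrations would go:

from pyramid.config import Configurator


def main(global_config, **settings):
    # Hypothetical app factory; config.include('clld.web.app') runs includeme() above.
    config = Configurator(settings=settings)
    config.include('clld.web.app')
    # register custom utilities, datatables, maps etc. here
    return config.make_wsgi_app()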
Example #10
# coding: utf8
from __future__ import unicode_literals, print_function, division

from six.moves.urllib.request import urlretrieve

from clldutils.jsonlib import load
from clldutils.path import Path, md5

import glottolog3

DOWNLOAD_DIR = Path(glottolog3.__file__).parent.joinpath('static', 'download')

for rel, spec in load(DOWNLOAD_DIR.parent / 'downloads.json').items():
    d = DOWNLOAD_DIR / rel
    if not d.exists():
        d.mkdir()
    for bs in spec['bitstreams']:
        url = 'https://cdstar.shh.mpg.de//bitstreams/{0}/{1}'.format(
            spec['oid'], bs['bitstreamid'])
        target = d.joinpath(bs['bitstreamid'].replace('_', '-'))
        if (not target.exists()) or bs['checksum'] != md5(target):
            print('retrieving {0} {1}'.format(rel, target))
            urlretrieve(url, str(target))
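
The guard in the last lines (fetch only when the local copy is missing or its checksum no longer matches) is the same idiom used in Examples #7, #8 and #11. Isolated as a hypothetical helper, with fetch standing in for whatever transport a caller uses:

from clldutils.path import md5


def ensure_file(target, expected_md5, fetch):
    # Re-fetch target unless it already exists with the expected checksum.
    if (not target.exists()) or md5(target) != expected_md5:
        fetch(target)
    return target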
Example #11
def run(args):

    ds = get_dataset(args)
    ds_cldf = ds.cldf_reader()
    release_dir = args.out / '{0}_{1}'.format(ds.id, MEDIA)

    if ds_cldf.get('media.csv', None) is None:  # pragma: no cover
        args.log.error('Dataset has no media.csv')
        raise ParserError
    if args.parent_doi and not Zenodo.DOI_PATTERN.match(args.parent_doi):
        args.log.error('Invalid passed DOI')
        raise ParserError
    if args.update_zenodo:
        if not release_dir.exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir))
            raise ParserError
        if not (release_dir / ZENODO_FILE_NAME).exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir / ZENODO_FILE_NAME))
            raise ParserError
        if args.create_release:
            args.log.error(
                'You cannot create the release and update zenodo at the same time.'
            )
            raise ParserError
    if args.create_release:
        if not args.parent_doi:
            args.log.error(
                'The corresponding DOI is required (via --parent-doi).')
            raise ParserError

    mime_types = None
    if args.mimetype:
        mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))]

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        media_dir = args.out / MEDIA
        media_dir.mkdir(exist_ok=True)
        media = []

    if not args.update_zenodo:
        used_file_extensions = set()
        with UnicodeWriter(
                media_dir / INDEX_CSV if not args.list else None) as w:
            for i, row in enumerate(
                    tqdm.tqdm([r for r in ds_cldf['media.csv']],
                              desc='Getting {0} items'.format(MEDIA))):
                url = ds_cldf.get_row_url('media.csv', row)
                if isinstance(url, rfc3986.URIReference):
                    url = url.normalize().unsplit()
                    row['URL'] = url
                f_ext = url.split('.')[-1].lower()
                if args.debug and i > 500:
                    break
                if (mime_types is None) or f_ext in mime_types\
                        or any(row['mimetype'].startswith(x) for x in mime_types):
                    if args.list:
                        m = '{0} ({1})'.format(row['mimetype'], f_ext)
                        size[m] += int(row['size'])
                        number.update([m])
                    else:
                        used_file_extensions.add(f_ext.lower())
                        d = media_dir / row['ID'][:2]
                        d.mkdir(exist_ok=True)
                        fn = '.'.join([row['ID'], f_ext])
                        target = d / fn
                        row['local_path'] = pathlib.Path(row['ID'][:2]) / fn
                        if i == 0:
                            w.writerow(row)
                        w.writerow(row.values())
                        media.append(target)
                        if (not target.exists()) or md5(target) != row['ID']:
                            _create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k.ljust(20), str(number[k]), format_size(v)]))
        return

    # Waiting for the download threads to finish
    if 'download_threads' in globals():
        for t in download_threads:
            t.join()

    if args.create_release:
        assert media_dir.exists(), 'No folder "{0}" found in {1}'.format(
            MEDIA, media_dir.resolve())

        release_dir.mkdir(exist_ok=True)

        media.append(media_dir / INDEX_CSV)

        try:
            zipf = zipfile.ZipFile(str(release_dir / '{0}.zip'.format(MEDIA)),
                                   'w', zipfile.ZIP_DEFLATED)
            fp = args.out
            for f in tqdm.tqdm(media, desc='Creating {0}.zip'.format(MEDIA)):
                zipf.write(str(f), str(os.path.relpath(str(f), str(fp))))
            zipf.close()
        except Exception as e:
            args.log.error(e)
            raise

        def _contrib(d):
            return {
                k: v
                for k, v in d.items()
                if k in {'name', 'affiliation', 'orcid', 'type'}
            }

        version_v = git_describe('.').split('-')[0]
        version = version_v.replace('v', '')
        git_url = [r for r in ds.repo.repo.remotes
                   if r.name == 'origin'][0].url.replace('.git', '')
        with jsonlib.update(release_dir / ZENODO_FILE_NAME,
                            indent=4,
                            default=collections.OrderedDict()) as md:
            contribs = ds.dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(
                    encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [_contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [_contrib(p) for p in contributors]
            communities = [r["identifier"] for r in md.get("communities", [])] + \
                [c.strip() for c in nfilter(args.communities.split(','))] + \
                COMMUNITIES
            if communities and not args.debug:
                md['communities'] = [{
                    "identifier": community_id
                } for community_id in sorted(set(communities))]
            md.update({
                'title': '{0} {1} Files'.format(ds.metadata.title, MEDIA.title()),
                'access_right': 'open',
                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type': 'dataset',
                'publication_date': datetime.today().strftime('%Y-%m-%d'),
                'version': version,
                'related_identifiers': [
                    {
                        'scheme': 'url',
                        'identifier': '{0}/tree/{1}'.format(git_url, version_v),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            if args.parent_doi:
                md['related_identifiers'].append({
                    'scheme': 'doi',
                    'identifier': args.parent_doi,
                    'relation': 'isPartOf'
                })
                supplement_to = " - Supplement to dataset " \
                                "<a href='https://doi.org/{0}'>{1}</a> ".format(
                    args.parent_doi, ds.metadata.title)  # noqa: E122
            if ds.metadata.url:
                md['related_identifiers'].append({
                    'scheme': 'url',
                    'identifier': ds.metadata.url,
                    'relation': 'isAlternateIdentifier'
                })

            formats = ', '.join(sorted(used_file_extensions))
            descr = '<br /><br />' + ds.metadata.description if ds.metadata.description else ''
            online_url, online = '', ''
            if ds.metadata.url:
                online_url = ds.metadata.url
                online = "<br /><br />Available online at: <a href='{0}'>{0}</a>".format(
                    online_url)
            md['description'] = html.escape(
                DESCRIPTION.format(
                    url=online_url,
                    formats=' ({0})'.format(formats) if formats else '',
                    title=md['title'],
                    supplement_to=supplement_to,
                    descr=descr,
                    online=online))

            license_md = ''
            if ds.metadata.zenodo_license:
                md['license'] = {'id': ds.metadata.zenodo_license}
                license_md = LICENCE.format(ds.metadata.zenodo_license)

            DataDir(release_dir).write(
                'README.md',
                README.format(
                    title=md['title'],
                    doi='https://doi.org/{0}'.format(args.parent_doi),
                    ds_title=ds.metadata.title,
                    license=license_md,
                    formats=' ({0})'.format(formats) if formats else '',
                    media=MEDIA,
                    index=INDEX_CSV))

    if args.update_zenodo:

        md = {}
        md.update(jsonlib.load(release_dir / ZENODO_FILE_NAME))

        if args.debug:
            api_url = API_URL_SANDBOX
            access_token = os.environ.get('ZENODO_SANDBOX_ACCESS_TOKEN')
        else:
            api_url = API_URL
            access_token = ACCESS_TOKEN
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=access_token)
        try:
            rec = api.record_from_id('{0}record/{1}'.format(
                zenodo_url, args.update_zenodo))
        except Exception as e:
            args.log.error(
                'Check connection and credentials for accessing Zenodo.\n{0}'.
                format(e))
            return
        latest_version = rec.links['latest'].split('/')[-1]
        if latest_version != args.update_zenodo:
            args.log.warn(
                'Passed deposit ID does not refer to latest version {0}!'.
                format(latest_version))
        args.log.info('  DOI:     ' + rec.metadata.doi)
        args.log.info('  Title:   ' + rec.metadata.title)
        args.log.info('  Version: ' + rec.metadata.version)
        args.log.info('  Date:    ' + rec.metadata.publication_date)
        args.log.info('  Files:   ' + ', '.join([f.key for f in rec.files]))
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')
Example #12
def exists(self, repos: 'GEOROC') -> bool:
    """
    Check whether the specified file exists with the correct checksum in the repository.
    """
    p = repos.csvdir / self.name
    return p.exists() and md5(p) == self.md5
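
A caller might use this to decide which files need (re-)downloading; a minimal sketch, where file_records and its items' interface are assumptions matching the method above:

def stale_files(file_records, repos):
    # Records whose local copy is missing or has the wrong checksum.
    return [f for f in file_records if not f.exists(repos)]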
Example #13
def run(args):
    #
    # FIXME: look up oid for release in downloads.json! if it exists, replace the bitstreams
    # rather than creating a new object!
    #
    dlfname = args.pkg_dir.joinpath('static', 'downloads.json')
    downloads = load(dlfname)
    release = args.version
    title_pattern = re.compile('glottolog (?P<version>[0-9.]+) - downloads')
    with args.catalog_class(args.catalog, args.url, args.user,
                            args.pwd) as cat:
        #
        # FIXME: there must be a way to overwrite old releases in case of bugfixes!
        #
        if release in downloads:
            print('adding bitstreams to {0}'.format(downloads[release]['oid']))
            # This is a bugfix release, we don't have to create a new object on CDSTAR!
            obj = cat.api.get_object(uid=downloads[release]['oid'])
        else:
            obj = cat.api.get_object()
            obj.metadata = {
                "creator": "pycdstar",
                "title": "glottolog %s - downloads" % release,
                "description": "Custom downloads for release %s of "
                               "[Glottolog](http://glottolog.org)" % release,
            }
        bitstreams = obj.bitstreams[:]
        for fname in args.pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                bsname = fname.name.replace('-', '_')
                bitstream, skip = None, False
                for bitstream in bitstreams:
                    if bitstream.id == bsname:
                        break
                else:
                    bitstream = None
                if bitstream:
                    if bitstream._properties['checksum'] != md5(fname):
                        bitstream.delete()
                    else:
                        skip = True
                        print('skipping {0}'.format(fname.name))
                if not skip:
                    print(fname.name)
                    obj.add_bitstream(fname=fname.as_posix(), name=bsname)
        obj.read()
        cat.add(obj, update=True)

    with update(dlfname,
                default=collections.OrderedDict(),
                indent=4,
                sort_keys=True) as downloads:
        for oid, spec in load(args.catalog).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    if (match.group('version') not in downloads
                            or match.group('version') == release):
                        args.log.info('update info for release {0}'.format(
                            match.group('version')))
                        spec['oid'] = oid
                        downloads[match.group('version')] = spec
    args.log.info('{0} written'.format(dlfname))
    args.log.info('{0}'.format(args.catalog))
Example #14
    def test_md5(self):
        from clldutils.path import md5

        self.assertIsNotNone(re.match('[a-f0-9]{32}$', md5(__file__)))
Example #15
def test_md5():
    from clldutils.path import md5

    assert re.match('[a-f0-9]{32}$', md5(__file__))
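
Both tests only check the shape of the return value: a 32-character lowercase hex digest, i.e. md5 hashes the file's contents and returns hexdigest(). For comparison, an equivalent built on the standard library, reading in chunks so large files are not loaded into memory at once; a sketch, not necessarily the clldutils implementation:

import hashlib
import re


def md5_stdlib(fname, bufsize=32768):
    # md5 hex digest of a file's contents, read in fixed-size chunks.
    h = hashlib.md5()
    with open(fname, 'rb') as fp:
        for chunk in iter(lambda: fp.read(bufsize), b''):
            h.update(chunk)
    return h.hexdigest()

assert re.match('[a-f0-9]{32}$', md5_stdlib(__file__))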
Example #16
def cdstar(args):
    try:
        from cdstarcat.catalog import Catalog
    except ImportError:
        args.log.error('pip install cdstarcat')
        return

    #
    # FIXME: look up oid for release in downloads.json! if it exists, replace the bitstreams
    # rather than creating a new object!
    #
    dlfname = args.pkg_dir.joinpath('static', 'downloads.json')
    downloads = load(dlfname)
    release = args.args[0]
    title_pattern = re.compile('glottolog (?P<version>[0-9.]+) - downloads')
    with Catalog(
            Path(os.environ['CDSTAR_CATALOG']),
            cdstar_url=os.environ['CDSTAR_URL'],
            cdstar_user=os.environ['CDSTAR_USER'],
            cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
        #
        # FIXME: there must be a way to overwrite old releases in case of bugfixes!
        #
        if release in downloads:
            # This is a bugfix release, we don't have to create a new object on CDSTAR!
            obj = cat.api.get_object(uid=downloads[release]['oid'])
        else:
            obj = cat.api.get_object()
            obj.metadata = {
                "creator": "pycdstar",
                "title": "glottolog %s - downloads" % release,
                "description": "Custom downloads for release %s of "
                               "[Glottolog](http://glottolog.org)" % release,
            }
        bitstreams = obj.bitstreams[:]
        for fname in args.pkg_dir.joinpath('static', 'download').iterdir():
            if fname.is_file() and not fname.name.startswith('.'):
                bsname = fname.name.replace('-', '_')
                bitstream, skip = None, False
                for bitstream in bitstreams:
                    if bitstream.id == bsname:
                        break
                else:
                    bitstream = None
                if bitstream:
                    if bitstream._properties['checksum'] != md5(fname):
                        bitstream.delete()
                    else:
                        skip = True
                        print('skipping {0}'.format(fname.name))
                if not skip:
                    print(fname.name)
                    obj.add_bitstream(fname=fname.as_posix(), name=bsname)
        cat.add(obj, update=True)

    with update(dlfname, default=collections.OrderedDict(), indent=4, sort_keys=True) as downloads:
        for oid, spec in load(Path(os.environ['CDSTAR_CATALOG'])).items():
            if 'metadata' in spec and 'title' in spec['metadata']:
                match = title_pattern.match(spec['metadata']['title'])
                if match:
                    if (match.group('version') not in downloads) or match.group('version') == release:
                        args.log.info('update info for release {0}'.format(match.group('version')))
                        spec['oid'] = oid
                        downloads[match.group('version')] = spec
    args.log.info('{0} written'.format(dlfname))
    args.log.info('{0}'.format(os.environ['CDSTAR_CATALOG']))