Example #1
    def test_remove(self):
        from clldutils.path import remove

        self.assertRaises(OSError, remove, self.tmp_path('nonexistingpath'))
        tmp = self.make_file('test')
        self.assertTrue(tmp.exists())
        remove(tmp)
        self.assertFalse(tmp.exists())
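The test above pins down the contract of clldutils.path.remove: it raises OSError for a path that does not exist and deletes an existing file. A minimal standalone sketch of the same behaviour, written against a throwaway temporary directory; it assumes nothing beyond what the assertions above exercise:

from tempfile import mkdtemp

from clldutils.path import Path, remove

tmpdir = Path(mkdtemp())
target = tmpdir.joinpath('test.txt')
with target.open('w', encoding='utf8') as fp:
    fp.write('content')
remove(target)                       # an existing file is deleted
assert not target.exists()
try:
    remove(tmpdir.joinpath('nonexistingpath'))
except OSError:                      # a missing path raises OSError
    pass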
Example #2
File: util.py Project: clld/clld
@contextmanager
def safe_overwrite(fname):
    fname = Path(fname)
    if not fname.parent.exists():
        fname.parent.mkdir()
    assert fname.parent.exists()
    tmp = fname.parent
    while tmp.exists():
        tmp = fname.parent.joinpath('%s.%s' % (fname.name, random_string(6)))
    yield tmp
    if fname.exists():
        remove(fname)
    move(tmp, fname)
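Because it is decorated with @contextmanager and yields, safe_overwrite is used as a context manager: callers write to the yielded temporary sibling path, and on normal exit the temporary file replaces the target via remove() and move(), so readers never see a half-written file. A usage sketch with a hypothetical target path:

# 'report.csv' is an illustrative path, not taken from the project.
with safe_overwrite('report.csv') as tmp:
    with tmp.open('w', encoding='utf8') as fp:
        fp.write('id,name\n')
# after the block, report.csv holds the newly written content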
Example #3
    def test_generate_extract(self):
        xml = self.tmp_path('test.xml')
        self._run_main('-v -o {0} {1}'.format(xml.as_posix(), config_path('basic')))
        self.assertTrue(xml.exists())
        # Overwriting existing files must be specified explicitly:
        self._run_main('-o {0} {1}'.format(
            xml.as_posix(), config_path('basic')), status=4)
        self._run_main('--overwrite -o {0} {1}'.format(
            xml.as_posix(), config_path('basic')), status=0)
        tcfg = Path('beastling_test.conf')
        self._run_main('--extract {0}'.format(xml.as_posix()))
        self.assertTrue(tcfg.exists())
        remove(tcfg)
Example #4
def recode(args):
    """Assign a new glottocode to an existing languoid.

    glottolog recode <code>
    """
    lang = find_languoid(glottocode=args.args[0])
    if not lang:
        raise ParserError('languoid not found')
    lang.id = Glottocode.from_name(lang.name)
    new_dir = lang.dir.parent.joinpath(lang.id)
    copytree(lang.dir, new_dir)
    lang.write_info(new_dir)
    remove(new_dir.joinpath('%s.ini' % args.args[0]))
    rmtree(lang.dir)
    print("%s -> %s" % (args.args[0], lang.id))
Example #5
    def test_Matrix(self):
        from wals3.adapters import Matrix

        p = Path(mktemp())
        assert not p.exists()

        class TestMatrix(Matrix):
            def abspath(self, req):
                return p

            def query(self, req):
                return Matrix.query(self, req).filter(Language.pk < 100)

        m = TestMatrix(Language, "wals3", description="Feature values CSV")
        m.create(self.env["request"], verbose=False)
        assert p.exists()
        remove(p)
Example #6
    def test_extractor(self):
        config = self.make_cfg(
            [config_path(f).as_posix() for f in ("admin", "mk", "embed_data")])
        xml = beastling.beastxml.BeastXml(config)
        xmlfile = self.tmp.joinpath("beastling.xml")
        xml.write_file(xmlfile.as_posix())
        self.assertTrue(bool(self._extract(xmlfile)))

        config = self.make_cfg({
            'admin': {'basename': 'abcdefg'},
            'model': {
                'model': 'mk',
                'data': data_path('basic.csv').as_posix()}})
        xml = beastling.beastxml.BeastXml(config)
        xmlfile = self.tmp.joinpath("beastling.xml")
        xml.write_file(xmlfile.as_posix())
        beastling.extractor.extract(xmlfile)
        p = Path('abcdefg.conf')
        self.assertTrue(p.exists())
        cfg = INI(interpolation=None)
        cfg.read(p.as_posix())
        remove(p)
        self.assertEqual(cfg['admin']['basename'], 'abcdefg')
        self.assertEqual(cfg['model']['model'], 'mk')

        fname = self.tmp.joinpath('test.xml')
        datafile = self.tmp.joinpath('test.csv')
        self.assertFalse(datafile.exists())
        with fname.open('w', encoding='utf8') as fp:
            fp.write("""<?xml version="1.0" encoding="UTF-8"?>
<r>
  <!--%s
%s
[admin]
[model]
-->
  <!--%s:%s-->
</r>
""" % (beastling.extractor._generated_str,
       beastling.extractor._config_file_str,
       beastling.extractor._data_file_str,
       datafile.as_posix()))
        res = self._extract(fname)
        self.assertIn(datafile.name, ''.join(res))
Example #7
    def create(self, req, filename=None, verbose=True):
        p = self.abspath(req)
        if not p.parent.exists():  # pragma: no cover
            p.parent.mkdir()
        tmp = Path('%s.tmp' % p.as_posix())

        if self.rdf:
            # we do not create archives with a readme for rdf downloads, because each
            # RDF entity points to the dataset and the void description of the dataset
            # covers all relevant metadata.
            #
            # TODO: write test for the file name things!?
            #
            with closing(GzipFile(
                    filename=Path(tmp.stem).stem, fileobj=tmp.open('wb')
            )) as fp:
                self.before(req, fp)
                for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
        else:
            with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
                if not filename:
                    fp = self.get_stream()
                    self.before(req, fp)
                    for i, item in enumerate(
                            page_query(self.query(req), verbose=verbose)):
                        self.dump(req, fp, item, i)
                    self.after(req, fp)
                    zipfile.writestr(self.name, self.read_stream(fp))
                else:  # pragma: no cover
                    zipfile.write(filename, self.name)
                zipfile.writestr(
                    'README.txt',
                    README.format(
                        req.dataset.name,
                        '=' * (
                            len(req.dataset.name)
                            + len(' data download')),
                        req.dataset.license,
                        TxtCitation(None).render(req.dataset, req)).encode('utf8'))
        if p.exists():  # pragma: no cover
            remove(p)
        move(tmp, p)
Example #8
def dump(out, version, all_langs, identifiers):
    if out.exists():
        for p in out.iterdir():
            remove(p)
    else:
        out.mkdir()

    langs = all_langs[version].values()
    langs_by_pk = {l.pk: l for l in langs}

    children = {
        pk: list(c)
        for pk, c in groupby(sorted(langs, key=lambda l: l.fpk or 0), lambda l: l.fpk)}

    for lang in langs:
        ancestors, fpk = [], lang.fpk
        while fpk and fpk in langs_by_pk:
            ancestors.append(langs_by_pk[fpk])
            fpk = langs_by_pk[fpk].fpk

        versions = [
            '<strong><a href="http://glottolog.org/resource/languoid/id/{0}">[{0}] in current Glottolog</a></strong>'.format(lang.id)]
        for v in sorted(all_langs.keys()):
            if v != version:
                if lang.id in all_langs[v]:
                    versions.append(all_langs[v][lang.id].cross_version_link)
        clf = [link_list(children.get(lang.pk, []))]
        clf.append(lang.text)
        clf.extend(a.link for a in ancestors)
        write_text(
            out.joinpath('{0}.html'.format(lang.id)),
            T.render_unicode(
                version=version,
                lang=lang,
                clf=reduce(wrap, clf) if not lang.replacements else '',
                versions=versions,
                identifiers=identifiers.get(lang.pk, []),
                replacements=[all_langs[version][lid].link for lid in lang.replacements
                              if lid in all_langs[version]],
                wrap=wrap,
                link_list=link_list,
            )
        )
Example #9
def test_Dataset_validate(tmpdir, mocker):
    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds.write(ValueTable=[])
    values = tmpdir / 'new' / 'values.csv'
    assert values.check()
    remove(str(values))
    log = mocker.Mock()
    assert not ds.validate(log=log)
    assert log.warn.called

    ds.write(ValueTable=[])
    assert ds.validate()

    ds['ValueTable'].tableSchema.columns = []
    with pytest.raises(ValueError):
        ds.validate()
    assert not ds.validate(log=mocker.Mock())
    ds.tablegroup.tables = []
    with pytest.raises(ValueError):
        ds.validate()

    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds.add_component('LanguageTable')
    ds.write(ValueTable=[], LanguageTable=[])
    assert ds.validate()

    # test violation of referential integrity:
    ds.write(ValueTable=[{'ID': '1', 'Value': '1', 'Language_ID': 'lid', 'Parameter_ID': 'pid'}], LanguageTable=[])
    assert not ds.validate(log=mocker.Mock())

    # test an invalid CLDF URL:
    ds['LanguageTable'].common_props['dc:conformsTo'] = 'http://cldf.clld.org/404'
    with pytest.raises(ValueError):
        ds.validate()

    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds['ValueTable'].get_column('Source').propertyUrl = URITemplate(
        'http://cldf.clld.org/404')
    ds.write(ValueTable=[])
    with pytest.raises(ValueError):
        ds.validate()
Example #10
    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.name.exists():
            remove(self.name)
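For context, a hypothetical enclosing class consistent with this __exit__ (class and attribute names are illustrative assumptions, not taken from the original project): a context manager that hands out a Path as self.name and removes it on exit if it still exists.

from clldutils.path import Path, remove


class TempPath(object):
    # Sketch only: expose a path for the duration of a with-block and
    # clean it up afterwards.
    def __init__(self, name):
        self.name = Path(name)

    def __enter__(self):
        return self.name

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.name.exists():
            remove(self.name)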
Example #11
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == "cleanup":
        for fname in args.data_file("gbs").glob("*.json"):
            try:
                data = jsonlib.load(fname)
                if data.get("totalItems") == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source).order_by(common.Source.id).options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file("gbs", "source%s.json" % source.id)

        if command == "update":
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ["verify", "update"]:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn("no JSON object found in: %s" % filepath)
                    continue
                if not data["totalItems"]:
                    continue
                item = data["items"][0]
            else:
                continue

        if command == "verify":
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item["volumeInfo"].get("publishedDate", "").split("-")[0]
            if not year or year != slug(source.year or ""):
                needs_check = True
            twords = words(stitle)
            iwords = words(item["volumeInfo"]["title"] + " " + item["volumeInfo"].get("subtitle", ""))
            if (
                twords == iwords
                or (len(iwords) > 2 and iwords.issubset(twords))
                or (len(twords) > 2 and twords.issubset(iwords))
            ):
                needs_check = False
            if int(source.id) == 241:
                log.info("%s" % sorted(words(stitle)))
                log.info("%s" % sorted(iwords))
            if needs_check:
                log.info("------- %s -> %s" % (source.id, item["volumeInfo"].get("industryIdentifiers")))
                log.info("%s %s" % (item["volumeInfo"]["title"], item["volumeInfo"].get("subtitle", "")))
                log.info(stitle)
                log.info(item["volumeInfo"].get("publishedDate"))
                log.info(source.year)
                log.info(item["volumeInfo"].get("authors"))
                log.info(source.author)
                log.info(item["volumeInfo"].get("publisher"))
                log.info(source.publisher)
                if not confirm("Are the records the same?"):
                    log.warn("---- removing ----")
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == "update":
            source.google_book_search_id = item["id"]
            source.update_jsondata(gbs=item)
            count += 1
        elif command == "download":
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    "inauthor:" + quote_plus(source.author.encode("utf8")),
                    "intitle:" + quote_plus(title.encode("utf8")),
                ]
                if source.publisher:
                    q.append("inpublisher:" + quote_plus(source.publisher.encode("utf8")))
                url = api_url + "q=%s&key=%s" % ("+".join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={"accept": "application/json"})
                log.info("%s - %s" % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), "w") as fp:
                        fp.write(r.text.encode("utf8"))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == "update":
        log.info("assigned gbs ids for %s out of %s sources" % (count, i))
    elif command == "download":
        log.info("queried gbs for %s sources" % count)
Example #12
File: util.py Project: clld/clld
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == 'cleanup':
        for fname in args.data_file('gbs').glob('*.json'):
            try:
                data = jsonlib.load(fname)
                if data.get('totalItems') == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn('no JSON object found in: %s' % filepath)
                    continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(
                item['volumeInfo']['title'] + ' '
                + item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(words(stitle)))
                log.info('%s' % sorted(iwords))
            if needs_check:
                log.info('------- %s -> %s' % (
                    source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (
                    item['volumeInfo']['title'], item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' + quote_plus(
                        source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)
Example #13
    def create(self, req, filename=None, verbose=True):
        p = self.abspath(req)
        if not p.parent.exists():  # pragma: no cover
            p.parent.mkdir()
        tmp = Path('%s.tmp' % p)

        language_url_pattern = self.route_url_pattern(req, 'language')

        with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
            tables = []
            for param in DBSession.query(Parameter).options(joinedload(Parameter.domain)):
                fname = '%s-%s.csv' % (req.dataset.id, param.id)
                zipfile.writestr(fname, self.get_values(param, language_url_pattern))
                tables.append({
                    '@type': 'Table',
                    'url': fname,
                    'notes': [
                        {
                            '@id': req.resource_url(param),
                            'dc:identifier': param.id,
                            'dc:title': param.name,
                            'dc:description': param.description or ''}] + [
                        {
                            '@type': 'DomainElement',
                            'name': de.name,
                            'description': de.description,
                            'numeric': de.number
                        } for de in param.domain
                    ],
                })

            md = CsvmJsonAdapter.csvm_basic_doc(req, tables=tables)
            md.update({
                '@type': 'TableGroup',
                'dc:language': list(self.get_languages(req, language_url_pattern)),
                'tableSchema': {
                    "columns": [
                        {
                            "name": "ID",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Language_ID",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Parameter_ID",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Contribution_ID",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Value",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Source",
                            "datatype": "string",
                        },
                        {
                            "name": "Comment",
                            "datatype": "string",
                        },
                    ],
                    "primaryKey": "ID",
                    'aboutUrl': self.route_url_pattern(req, 'value', '{ID}'),
                },
            })
            zipfile.writestr(
                '%s.csv-metadata.json' % req.dataset.id, json.dumps(md, indent=4))
            bib = Database([
                rec.bibtex() for rec in DBSession.query(Source).order_by(Source.name)])
            zipfile.writestr('%s.bib' % req.dataset.id, ('%s' % bib).encode('utf8'))
            zipfile.writestr(
                'README.txt',
                README.format(
                    req.dataset.name,
                    '=' * (
                        len(req.dataset.name)
                        + len(' data download')),
                    req.dataset.license,
                    TxtCitation(None).render(req.dataset, req)).encode('utf8'))
        if p.exists():  # pragma: no cover
            remove(p)
        move(tmp, p)