Example #1
def llod_func(args):  # pragma: no cover
    """Create an RDF dump and compute some statistics about it."""
    tmp = Path(mkdtemp())
    count_rsc = 0
    count_triples = 0

    tmp_dump = tmp.joinpath('rdf.n3')
    with open(as_posix(tmp_dump), 'w') as fp:
        for rsc in RESOURCES:
            args.log.info('Resource type %s ...' % rsc.name)
            try:
                q = DBSession.query(rsc.model)
            except InvalidRequestError:
                args.log.info('... skipping')
                continue
            for obj in page_query(q.order_by(rsc.model.pk), n=10000, verbose=True):
                graph = get_graph(obj, args.env['request'], rsc.name)
                count_triples += len(graph)
                count_rsc += 1
                fp.write(n3(graph, with_head=count_rsc == 1))
            args.log.info('... finished')

    # put in args.data_file('..', 'static', 'download')?
    md = {'path': as_posix(tmp), 'resources': count_rsc, 'triples': count_triples}
    md.update(count_links(as_posix(tmp_dump)))
    jsonlib.dump(md, args.data_file('rdf-metadata.json'))
    print(md)

    dataset = Dataset.first()
    rdf_dump = args.module_dir.joinpath(
        'static', 'download', '%s-dataset.n3' % dataset.id)
    tmp_dump.copy(rdf_dump)
    check_call('gzip -f %s' % rdf_dump, shell=True)
    print(str(rdf_dump))
Example #2
def freeze_func(args, dataset=None, with_history=True):
    dataset = dataset or args.env['request'].dataset
    dump_dir = args.data_file('dumps')
    if not dump_dir.exists():
        dump_dir.mkdir()
    dump_dir = dump_dir.resolve()

    with dump_dir.joinpath('README.txt').open('w', encoding='utf8') as fp:
        fp.write(freeze_readme(dataset, args.env['request']))

    db_version = get_alembic_version(DBSession)

    for table in Base.metadata.sorted_tables:
        csv = dump_dir.joinpath('%s.csv' % table.name)
        if with_history or not table.name.endswith('_history'):
            _freeze(table, csv)

        if csv.exists():
            csvm = '%s.%s' % (table.name, CsvmJsonAdapter.extension)
            doc = CsvmJsonAdapter.csvm_doc(
                csvm, args.env['request'], [(col.name, col) for col in table.columns])
            if db_version:
                # We (ab)use a dc:identifier property to pass the alembic revision of the
                # database to the unfreeze script.
                doc["dc:identifier"] = db_version  # pragma: no cover
            jsonlib.dump(doc, dump_dir.joinpath(csvm))

    with ZipFile(
            as_posix(args.data_file('..', 'data.zip')), 'w', ZIP_DEFLATED) as zipfile:
        for f in dump_dir.iterdir():
            if f.is_file():
                with f.open('rb') as fp:
                    zipfile.writestr(f.name, fp.read())
Example #3
def freeze_func(args, dataset=None, with_history=True):
    dataset = dataset or args.env['request'].dataset
    dump_dir = args.data_file('dumps')
    if not dump_dir.exists():
        dump_dir.mkdir()
    dump_dir = dump_dir.resolve()

    with dump_dir.joinpath('README.txt').open('w', encoding='utf8') as fp:
        fp.write(freeze_readme(dataset, args.env['request']))

    db_version = get_alembic_version(DBSession)

    for table in Base.metadata.sorted_tables:
        csv = dump_dir.joinpath('%s.csv' % table.name)
        if with_history or not table.name.endswith('_history'):
            _freeze(table, csv)

        if csv.exists():
            csvm = '%s.%s' % (table.name, CsvmJsonAdapter.extension)
            doc = CsvmJsonAdapter.csvm_doc(csvm, args.env['request'],
                                           [(col.name, col)
                                            for col in table.columns])
            if db_version:
                # We (ab)use a dc:identifier property to pass the alembic revision of the
                # database to the unfreeze script.
                doc["dc:identifier"] = db_version
            jsonlib.dump(doc, dump_dir.joinpath(csvm))

    with ZipFile(as_posix(args.data_file('..', 'data.zip')), 'w',
                 ZIP_DEFLATED) as zipfile:
        for f in dump_dir.iterdir():
            if f.is_file():
                with f.open('rb') as fp:
                    zipfile.writestr(f.name, fp.read())
Example #4
def freeze_func(args, dataset=None, with_history=True):
    dataset = dataset or args.env["request"].dataset
    dump_dir = args.data_file("dumps")
    if not dump_dir.exists():
        dump_dir.mkdir()
    dump_dir = dump_dir.resolve()

    with dump_dir.joinpath("README.txt").open("w", encoding="utf8") as fp:
        fp.write(freeze_readme(dataset, args.env["request"]))

    db_version = get_alembic_version(DBSession)

    for table in Base.metadata.sorted_tables:
        csv = dump_dir.joinpath("%s.csv" % table.name)
        if with_history or not table.name.endswith("_history"):
            _freeze(table, csv)

        if csv.exists():
            csvm = "%s.%s" % (table.name, CsvmJsonAdapter.extension)
            doc = CsvmJsonAdapter.csvm_doc(csvm, args.env["request"], [(col.name, col) for col in table.columns])
            if db_version:
                # We (ab)use a dc:identifier property to pass the alembic revision of the
                # database to the unfreeze script.
                doc["dc:identifier"] = db_version
            jsonlib.dump(doc, dump_dir.joinpath(csvm))

    with ZipFile(as_posix(args.data_file("..", "data.zip")), "w", ZIP_DEFLATED) as zipfile:
        for f in dump_dir.iterdir():
            if f.is_file():
                with f.open("rb") as fp:
                    zipfile.writestr(f.name, fp.read())
Example #5
def rmtree(d, **kw):
    d = as_posix(d)
    for path in (os.path.join(d, f) for f in os.listdir(d)):
        if os.path.isdir(path):
            rmtree(path)
        else:
            os.unlink(path)
    os.rmdir(d)
Example #6
def download_and_unpack_zipfiles(url, dataset, *paths):
    """Download zipfiles and immediately unpack the content"""
    with TemporaryDirectory() as tmpdir:
        urlretrieve(url, tmpdir.joinpath('ds.zip').as_posix())
        with zipfile.ZipFile(tmpdir.joinpath('ds.zip').as_posix()) as zipf:
            for path in paths:
                zipf.extract(as_posix(path), path=tmpdir.as_posix())
                copy(tmpdir.joinpath(path), dataset.raw)
Example #7
def iterentries(filename, encoding=None):
    encoding = encoding or 'utf8'
    with memorymapped(as_posix(filename)) as source:
        try:
            for entrytype, (bibkey, fields) in iterentries_from_text(source, encoding):
                yield entrytype, (bibkey, fields)
        except PybtexSyntaxError as e:  # pragma: no cover
            debug_pybtex(source, e)
Example #8
def rmtree(d, **kw):
    """More performant way to remove large directory structures."""
    d = as_posix(d)
    for path in (os.path.join(d, f) for f in os.listdir(d)):
        if os.path.isdir(path):
            rmtree(path)
        else:
            os.unlink(path)
    os.rmdir(d)
Example #9
def load(path, **kw):
    """python 2 + 3 compatible version of json.load.

    :param kw: Keyword parameters are passed to json.load
    :return: The python object read from path.
    """
    _kw = {}
    if PY3:  # pragma: no cover
        _kw['encoding'] = 'utf8'
    with open(as_posix(path), **_kw) as fp:
        return json.load(fp, **kw)
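
For context, a minimal usage sketch of this helper (the file name and keyword argument below are illustrative, not taken from the surrounding examples):

from collections import OrderedDict

# Read a JSON file, preserving key order; extra keyword arguments
# are forwarded to json.load.
data = load('metadata.json', object_pairs_hook=OrderedDict)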
Example #10
def load(path, **kw):
    """python 2 + 3 compatible version of json.load.

    :param kw: Keyword parameters are passed to json.load
    :return: The python object read from path.
    """
    _kw = {}
    if PY3:  # pragma: no cover
        _kw['encoding'] = 'utf-8'
    with open(as_posix(path), **_kw) as fp:
        return json.load(fp, **kw)
Example #11
def unfreeze_func(args, engine=None):
    try:
        importlib.import_module(args.module.__name__)
    except ImportError:
        pass  # pragma: no cover
    engine = engine or DBSession.get_bind()
    data_dir = Path(mkdtemp())

    with ZipFile(as_posix(args.module_dir.joinpath('..', 'data.zip'))) as fp:
        fp.extractall(as_posix(data_dir))

    db_version = None
    for table in Base.metadata.sorted_tables:
        csv = data_dir.joinpath('%s.csv' % table.name)
        if csv.exists():
            db_version = load(table, csv, engine)

    if db_version:
        set_alembic_version(engine, db_version)  # pragma: no cover

    rmtree(data_dir)
Example #12
def unfreeze_func(args, engine=None):
    try:
        importlib.import_module(args.module.__name__)
    except ImportError:
        pass  # pragma: no cover
    engine = engine or DBSession.get_bind()
    data_dir = Path(mkdtemp())

    with ZipFile(as_posix(args.module_dir.joinpath('..', 'data.zip'))) as fp:
        fp.extractall(as_posix(data_dir))

    db_version = None
    for table in Base.metadata.sorted_tables:
        csv = data_dir.joinpath('%s.csv' % table.name)
        if csv.exists():
            db_version = load(table, csv, engine)

    if db_version:
        set_alembic_version(engine, db_version)

    rmtree(data_dir)
Example #13
def dump(obj, path, **kw):
    """python 2 + 3 compatible version of json.dump.

    :param obj: The object to be dumped.
    :param path: The path of the JSON file to be written.
    :param kw: Keyword parameters are passed to json.dump
    """
    _kw = dict(mode='w')
    if PY3:  # pragma: no cover
        _kw['encoding'] = 'utf8'
    with open(as_posix(path), **_kw) as fp:
        return json.dump(obj, fp, **kw)
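
A small usage sketch, reusing the metadata file name seen in the llod_func example above; the values are placeholders, and indent is simply passed through to json.dump:

# Write a dict as pretty-printed UTF-8 JSON.
dump({'resources': 0, 'triples': 0}, 'rdf-metadata.json', indent=4)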
Example #14
def llod_func(args):  # pragma: no cover
    """Create an RDF dump and compute some statistics about it."""
    tmp = Path(mkdtemp())
    count_rsc = 0
    count_triples = 0

    tmp_dump = tmp.joinpath('rdf.n3')
    with open(as_posix(tmp_dump), 'w') as fp:
        for rsc in RESOURCES:
            args.log.info('Resource type %s ...' % rsc.name)
            try:
                q = DBSession.query(rsc.model)
            except InvalidRequestError:
                args.log.info('... skipping')
                continue
            for obj in page_query(q.order_by(rsc.model.pk),
                                  n=10000,
                                  verbose=True):
                graph = get_graph(obj, args.env['request'], rsc.name)
                count_triples += len(graph)
                count_rsc += 1
                fp.write(n3(graph, with_head=count_rsc == 1))
            args.log.info('... finished')

    # put in args.data_file('..', 'static', 'download')?
    md = {
        'path': as_posix(tmp),
        'resources': count_rsc,
        'triples': count_triples
    }
    md.update(count_links(as_posix(tmp_dump)))
    jsonlib.dump(md, args.data_file('rdf-metadata.json'))
    print(md)

    dataset = Dataset.first()
    rdf_dump = args.module_dir.joinpath('static', 'download',
                                        '%s-dataset.n3' % dataset.id)
    tmp_dump.copy(rdf_dump)
    check_call('gzip -f %s' % rdf_dump, shell=True)
    print(str(rdf_dump))
Example #15
    def write(self, filename, encoding='utf-8'):
        """Write the list of entries to a file.

        :param filename:
        :param encoding:
        :return:
        """
        if isinstance(filename, Path):
            filename = as_posix(filename)
        with io.open(filename, 'w', encoding=encoding) as fp:
            for entry in self:
                fp.write(entry.__unicode__())
                fp.write('\n\n')
Example #16
def xls2csv(fname, outdir=None):
    res = {}
    outdir = outdir or fname.parent
    wb = xlrd.open_workbook(as_posix(fname))
    for sname in wb.sheet_names():
        sheet = wb.sheet_by_name(sname)
        if sheet.nrows:
            path = outdir.joinpath(fname.stem + '.' +
                                   slug(sname, lowercase=False) + '.csv')
            with UnicodeWriter(path) as writer:
                for i in range(sheet.nrows):
                    writer.writerow([col.value for col in sheet.row(i)])
            res[sname] = path
    return res
Example #17
    def languoids(self, ids=None, maxlevel=models.Level.dialect):
        nodes = {}

        for dirpath, dirnames, filenames in os.walk(as_posix(self.tree)):
            dp = Path(dirpath)
            if dp.name in nodes and nodes[dp.name][2] > maxlevel:
                del dirnames[:]

            for dirname in dirnames:
                if ids is None or dirname in ids:
                    lang = languoids.Languoid.from_dir(dp.joinpath(dirname),
                                                       nodes=nodes)
                    if lang.level <= maxlevel:
                        yield lang
Example #18
def parse(filename, encoding, entry_sep, entry_prefix):
    if isinstance(filename, Path):
        filename = as_posix(filename)

    # we cannot use codecs.open, because it does not understand mode U.
    with open(filename, 'rU', encoding=encoding) as fp:
        content = fp.read()

    for block in content.split(entry_sep):
        if block.strip():
            block = entry_prefix + block
        else:
            continue  # pragma: no cover
        yield [(k, v.strip()) for k, v in marker_split(block.strip()) if v.strip()]
Example #19
    def write(self, filename, encoding='utf8'):
        """
        Write the list of entries to a file.

        :param filename:
        :param encoding:
        :return:
        """
        if isinstance(filename, Path):
            filename = as_posix(filename)
        with open(filename, 'w', encoding=encoding) as fp:
            for entry in self:
                fp.write(entry.__unicode__())
                fp.write('\n\n')
Example #20
def parse(filename, encoding, entry_sep, entry_prefix, keep_empty=False):
    if isinstance(filename, Path):
        filename = as_posix(filename)

    with io.open(filename, 'r', encoding=encoding, newline=None) as fp:
        content = fp.read()

    for block in content.split(entry_sep):
        if block.strip():
            block = entry_prefix + block
        else:
            continue  # pragma: no cover
        yield [(k, v.strip()) for k, v in marker_split(block.strip())
               if v.strip() or keep_empty]
Example #21
def parse(filename, encoding, entry_sep, entry_prefix, keep_empty=False):
    if isinstance(filename, Path):
        filename = as_posix(filename)

    # we cannot use codecs.open, because it does not understand mode U.
    with io.open(filename, 'rU', encoding=encoding) as fp:
        content = fp.read()

    for block in content.split(entry_sep):
        if block.strip():
            block = entry_prefix + block
        else:
            continue  # pragma: no cover
        yield [(k, v.strip()) for k, v in marker_split(block.strip())
               if v.strip() or keep_empty]
Example #22
    def download_and_unpack(self, url, *paths, **kw):
        """
        Download a zipfile and immediately unpack selected content.

        :param url:
        :param paths:
        :param kw:
        :return:
        """
        with self.temp_download(url, 'ds.zip', log=kw.pop('log',
                                                          None)) as zipp:
            with TemporaryDirectory() as tmpdir:
                with zipfile.ZipFile(zipp.as_posix()) as zipf:
                    for path in paths:
                        zipf.extract(as_posix(path), path=tmpdir.as_posix())
                        copy(tmpdir.joinpath(path), self)
Example #23
def dump(obj, path, **kw):
    """Python 2 + 3 compatible version of json.dump.

    :param obj: The object to be dumped.
    :param path: The path of the JSON file to be written.
    :param kw: Keyword parameters are passed to json.dump
    """
    open_kw = {'mode': 'w'}
    if PY3:  # pragma: no cover
        open_kw['encoding'] = 'utf-8'

    # avoid indented lines ending with ", " on PY2
    if kw.get('indent') and kw.get('separators') is None:
        kw['separators'] = (',', ': ')

    with open(as_posix(path), **open_kw) as fp:
        return json.dump(obj, fp, **kw)
Example #24
def read_comments(filename):
    parser = CommentParser.get_parser()
    with open(as_posix(filename), "rb") as fp:
        parser.feed(fp.read())
    return [e for e in parser.close() if e.tag == ET.Comment]
Example #25
def save(entries, filename, sortkey, encoding='utf8'):
    with io.open(as_posix(filename), 'w', encoding=encoding,
                 errors='strict') as fd:
        dump(entries, fd, sortkey, encoding, None)
Example #26
    def __init__(self, fname, mode='r'):
        ZipFile.__init__(
            self, as_posix(fname), mode=mode, compression=ZIP_DEFLATED, allowZip64=True)
Example #27
    def __init__(self, fname, mode='r', **kwargs):
        for k, v in iteritems(self._init_defaults):
            kwargs.setdefault(k, v)
        super(ZipArchive, self).__init__(as_posix(fname), mode=mode, **kwargs)
Example #28
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == "cleanup":
        for fname in args.data_file("gbs").glob("*.json"):
            try:
                data = jsonlib.load(fname)
                if data.get("totalItems") == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source).order_by(common.Source.id).options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file("gbs", "source%s.json" % source.id)

        if command == "update":
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ["verify", "update"]:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn("no JSON object found in: %s" % filepath)
                    continue
                if not data["totalItems"]:
                    continue
                item = data["items"][0]
            else:
                continue

        if command == "verify":
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item["volumeInfo"].get("publishedDate", "").split("-")[0]
            if not year or year != slug(source.year or ""):
                needs_check = True
            twords = words(stitle)
            iwords = words(item["volumeInfo"]["title"] + " " + item["volumeInfo"].get("subtitle", ""))
            if (
                twords == iwords
                or (len(iwords) > 2 and iwords.issubset(twords))
                or (len(twords) > 2 and twords.issubset(iwords))
            ):
                needs_check = False
            if int(source.id) == 241:
                log.info("%s" % sorted(words(stitle)))
                log.info("%s" % sorted(iwords))
            if needs_check:
                log.info("------- %s -> %s" % (source.id, item["volumeInfo"].get("industryIdentifiers")))
                log.info("%s %s" % (item["volumeInfo"]["title"], item["volumeInfo"].get("subtitle", "")))
                log.info(stitle)
                log.info(item["volumeInfo"].get("publishedDate"))
                log.info(source.year)
                log.info(item["volumeInfo"].get("authors"))
                log.info(source.author)
                log.info(item["volumeInfo"].get("publisher"))
                log.info(source.publisher)
                if not confirm("Are the records the same?"):
                    log.warn("---- removing ----")
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == "update":
            source.google_book_search_id = item["id"]
            source.update_jsondata(gbs=item)
            count += 1
        elif command == "download":
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    "inauthor:" + quote_plus(source.author.encode("utf8")),
                    "intitle:" + quote_plus(title.encode("utf8")),
                ]
                if source.publisher:
                    q.append("inpublisher:" + quote_plus(source.publisher.encode("utf8")))
                url = api_url + "q=%s&key=%s" % ("+".join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={"accept": "application/json"})
                log.info("%s - %s" % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), "w") as fp:
                        fp.write(r.text.encode("utf8"))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == "update":
        log.info("assigned gbs ids for %s out of %s sources" % (count, i))
    elif command == "download":
        log.info("queried gbs for %s sources" % count)
Example #29
def save(entries, filename, sortkey, encoding='utf-8', normalize='NFC'):
    with io.open(as_posix(filename), 'w', encoding=encoding,
                 errors='strict') as fd:
        dump(entries, fd, sortkey, normalize)
Example #30
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == 'cleanup':
        for fname in args.data_file('gbs').glob('*.json'):
            try:
                data = jsonlib.load(fname)
                if data.get('totalItems') == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn('no JSON object found in: %s' % filepath)
                    continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(
                item['volumeInfo']['title'] + ' '
                + item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(words(stitle)))
                log.info('%s' % sorted(iwords))
            if needs_check:
                log.info('------- %s -> %s' % (
                    source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (
                    item['volumeInfo']['title'], item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' + quote_plus(
                        source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)
Example #31
def test_as_posix():
    from clldutils.path import as_posix, Path

    with pytest.raises(ValueError):
        as_posix(5)
    assert as_posix('.') == as_posix(Path('.'))
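
The test pins down the contract: str and Path arguments are accepted and yield the same POSIX-style string, while other types raise ValueError. A minimal sketch satisfying that contract, assuming pathlib semantics (an illustration, not necessarily clldutils' actual implementation):

from pathlib import Path

def as_posix_sketch(p):
    # Accept Path or str, reject anything else, as the test above requires.
    if isinstance(p, Path):
        return p.as_posix()
    if isinstance(p, str):
        return Path(p).as_posix()
    raise ValueError('cannot convert %r to a posix path' % p)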
Example #32
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == 'cleanup':
        for fname in args.data_file('gbs').glob('*.json'):
            try:
                data = jsonlib.load(fname)
                if data.get('totalItems') == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn('no JSON object found in: %s' % filepath)
                    continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(item['volumeInfo']['title'] + ' ' +
                           item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(words(stitle)))
                log.info('%s' % sorted(iwords))
            if needs_check:
                log.info(
                    '------- %s -> %s' %
                    (source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (item['volumeInfo']['title'],
                                    item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' +
                             quote_plus(source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)
Example #33
def check(filename, encoding=None):
    parser = CheckParser(encoding=encoding)
    parser.parse_file(as_posix(filename))
    return parser.error_count
Example #34
    def test_as_posix(self):
        from clldutils.path import as_posix, Path

        self.assertRaises(ValueError, as_posix, 5)
        self.assertEquals(as_posix('.'), as_posix(Path('.')))