def main():
    """Convert a TEI input file into a slob file.

    Options come from ``parse_args()``. When no output file is given, the
    name is built from ``basename_notext(args.input_file)`` plus a ``slob``
    extension. Once writing is done the result is reopened to look up its
    ``edition`` tag; a truthy edition is inserted into the file name.
    """
    logging.basicConfig()
    observer = slob.SimpleTimingObserver()
    args = parse_args()

    outname = args.output_file
    if outname is None:
        stem = basename_notext(args.input_file)
        outname = os.path.extsep.join((stem, 'slob'))

    def p(s):
        # Flush right away so progress marks appear as they are produced.
        sys.stdout.write(s)
        sys.stdout.flush()

    with slob.create(outname,
                     compression=args.compression,
                     workdir=args.work_dir,
                     min_bin_size=args.bin_size * 1024,
                     observer=observer) as slb:
        observer.begin('all')
        observer.begin('content')

        # Create the tags up front; Tag items yielded by the TEI source
        # below may set them again with real values.
        initial_tags = (
            ('label', ''),
            ('license.name', ''),
            ('license.url', ''),
            ('source', os.path.basename(args.input_file)),
            ('uri', ''),
            ('copyright', ''),
            ('created.by', args.created_by),
        )
        for tag_name, tag_value in initial_tags:
            slb.tag(tag_name, tag_value)

        tei = TEI(os.path.expanduser(args.input_file))

        # Bundle the js/css directories that live next to this script.
        slob.add_dir(slb, os.path.dirname(__file__),
                     include_only={'js', 'css'}, prefix='~/')

        print('Adding content...')
        for count, entry in enumerate(tei):
            if count:
                # A dot every 100 items, the running total every 5000.
                if not count % 100:
                    p('.')
                if not count % 5000:
                    p(' {}\n'.format(count))
            if isinstance(entry, Tag):
                slb.tag(entry.name, entry.value)
                continue
            slb.add(entry.text, *entry.keys, content_type=entry.type)

    # Reopen the finished file to find out which edition was recorded.
    with slob.open(outname) as s:
        edition = s.tags.get('edition')

    if edition:
        stem, suffix = os.path.splitext(outname)
        newname = '{noext}-{edition}{ext}'.format(noext=stem,
                                                  edition=edition,
                                                  ext=suffix)
        os.rename(outname, newname)

    print('\nAll done in %s\n' % observer.end('all'))
def main():
    """Convert a TEI input file into a slob file.

    Options come from ``parse_args()``. When no output file is given, the
    name is built from ``basename_notext(args.input_file)`` plus a ``slob``
    extension. Once writing is done the result is reopened to look up its
    ``edition`` tag; a truthy edition is inserted into the file name.
    """
    logging.basicConfig()
    observer = slob.SimpleTimingObserver()
    args = parse_args()

    outname = args.output_file
    if outname is None:
        stem = basename_notext(args.input_file)
        outname = os.path.extsep.join((stem, 'slob'))

    def p(s):
        # Flush right away so progress marks appear as they are produced.
        sys.stdout.write(s)
        sys.stdout.flush()

    with slob.create(outname,
                     compression=args.compression,
                     workdir=args.work_dir,
                     min_bin_size=args.bin_size * 1024,
                     observer=observer) as slb:
        observer.begin('all')
        observer.begin('content')

        # Create the tags up front; Tag items yielded by the TEI source
        # below may set them again with real values.
        initial_tags = (
            ('label', ''),
            ('license.name', ''),
            ('license.url', ''),
            ('source', os.path.basename(args.input_file)),
            ('uri', ''),
            ('copyright', ''),
            ('created.by', args.created_by),
        )
        for tag_name, tag_value in initial_tags:
            slb.tag(tag_name, tag_value)

        tei = TEI(os.path.expanduser(args.input_file))

        # Bundle the js/css directories that live next to this script.
        slob.add_dir(slb, os.path.dirname(__file__),
                     include_only={'js', 'css'}, prefix='~/')

        print('Adding content...')
        for count, entry in enumerate(tei):
            if count:
                # A dot every 100 items, the running total every 5000.
                if not count % 100:
                    p('.')
                if not count % 5000:
                    p(' {}\n'.format(count))
            if isinstance(entry, Tag):
                slb.tag(entry.name, entry.value)
                continue
            slb.add(entry.text, *entry.keys, content_type=entry.type)

    # Reopen the finished file to find out which edition was recorded.
    with slob.open(outname) as s:
        edition = s.tags.get('edition')

    if edition:
        stem, suffix = os.path.splitext(outname)
        newname = '{noext}-{edition}{ext}'.format(noext=stem,
                                                  edition=edition,
                                                  ext=suffix)
        os.rename(outname, newname)

    print('\nAll done in %s\n' % observer.end('all'))
def main():
    """Convert an XDXF input file into a slob file.

    Options come from ``parse_args()``. When no output file is given, every
    extension is stripped from the input file's base name and ``slob`` is
    appended. The unstripped base name is stored in the ``source`` tag.
    """
    logging.basicConfig()
    observer = slob.SimpleTimingObserver()
    args = parse_args()

    basename = os.path.basename(args.input_file)
    outname = args.output_file
    if outname is None:
        # Peel off extensions one at a time until none is left
        # (e.g. "name.xdxf.gz" -> "name").
        stem, suffix = os.path.splitext(basename)
        while suffix:
            stem, suffix = os.path.splitext(stem)
        outname = os.path.extsep.join((stem, 'slob'))

    def p(s):
        # Flush right away so progress marks appear as they are produced.
        sys.stdout.write(s)
        sys.stdout.flush()

    with slob.create(outname,
                     compression=args.compression,
                     workdir=args.work_dir,
                     min_bin_size=args.bin_size * 1024,
                     observer=observer) as slb:
        observer.begin('all')
        observer.begin('content')

        # Create the tags up front; Tag items yielded by the XDXF source
        # below may set them again with real values.
        initial_tags = (
            ('label', ''),
            ('license.name', ''),
            ('license.url', ''),
            ('source', basename),
            ('uri', ''),
            ('copyright', ''),
            ('created.by', args.created_by),
        )
        for tag_name, tag_value in initial_tags:
            slb.tag(tag_name, tag_value)

        xdxf = XDXF(make_input(args.input_file),
                    skip_article_title=args.skip_article_title,
                    remove_newline=args.remove_newline)

        # Bundle the js/css directories that live next to this script.
        slob.add_dir(slb, os.path.dirname(__file__),
                     include_only={'js', 'css'}, prefix='~/')

        print('Adding content...')
        for count, entry in enumerate(xdxf):
            if count:
                # A dot every 100 items, the running total every 5000.
                if not count % 100:
                    p('.')
                if not count % 5000:
                    p(' {}\n'.format(count))
            if isinstance(entry, Tag):
                slb.tag(entry.name, entry.value)
                continue
            slb.add(entry.text, *entry.keys, content_type=entry.type)

    print('\nAll done in %s\n' % observer.end('all'))
def main():
    """Build a slob file from a CouchDB article source and report timings.

    Options come from ``parse_args()``. When no output file is given, the
    name is derived from the last path component of ``args.couch_url``
    followed by the compression name and a ``slob`` extension.
    """

    def p(text):
        # Flush right away so progress messages appear as they are produced.
        sys.stdout.write(text)
        sys.stdout.flush()

    # Start timestamps of the phases currently being timed, keyed by name.
    times = {}

    def begin(name):
        times[name] = time.time()

    def end(name):
        # Return the whole seconds elapsed since begin(name) as a timedelta.
        t0 = times.pop(name)
        return timedelta(seconds=int(time.time() - t0))

    def observer(e):
        # Callback handed to slob.create(); reports phase boundaries by
        # event name and times each phase.
        if e.name == 'begin_finalize':
            p('\nFinished adding content in %s' % end('content'))
            p('\nFinalizing...')
            begin('finalize')
        elif e.name == 'end_finalize':
            p('\nFinalized in %s' % end('finalize'))
        elif e.name == 'begin_resolve_aliases':
            p('\nResolving aliases...')
            begin('aliases')
        elif e.name == 'end_resolve_aliases':
            p('\nResolved aliases in %s' % end('aliases'))
        elif e.name == 'begin_sort':
            p('\nSorting...')
            begin('sort')
        elif e.name == 'end_sort':
            p(' sorted in %s' % end('sort'))

    args = parse_args()

    outname = args.output_file
    if outname is None:
        basename = os.path.basename(args.couch_url)
        noext, _ext = os.path.splitext(basename)
        outname = os.path.extsep.join((noext, args.compression, 'slob'))

    with slob.create(outname,
                     compression=args.compression,
                     workdir=args.work_dir,
                     min_bin_size=args.bin_size * 1024,
                     observer=observer) as slb:
        begin('content')
        begin('all')
        slb.tag('license.name', args.license_name)
        slb.tag('license.url', args.license_url)
        slb.tag('created.by', args.created_by)
        slb.tag('copyright', '')

        # add_dir() is given directory names relative to this script, so
        # switch into the script's directory while bundling them.
        # abspath() guards against __file__ having no directory component
        # (os.chdir('') raises), and the finally clause restores the
        # caller's working directory even when add_dir() fails.
        current_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.abspath(__file__)))
        try:
            add_dir(slb, 'js')
            add_dir(slb, 'css')
            add_dir(slb, 'MathJax')
        finally:
            os.chdir(current_dir)

        CouchArticleSource(args, slb).run()

    p('\nAll done in %s\n' % end('all'))
def main():
    """Build a slob file from a CouchDB article source and report timings.

    Options come from ``parse_args()``. When no output file is given, the
    name is derived from the last path component of ``args.couch_url``
    followed by the compression name and a ``slob`` extension. Built-in
    resource directories and any extra ``args.content_dirs`` are added
    after the articles.
    """
    logging.basicConfig()

    def p(text):
        # Flush right away so progress messages appear as they are produced.
        sys.stdout.write(text)
        sys.stdout.flush()

    # Start timestamps of the phases currently being timed, keyed by name.
    started = {}

    def begin(name):
        started[name] = time.time()

    def end(name):
        # Whole seconds elapsed since begin(name), as a timedelta.
        since = started.pop(name)
        return timedelta(seconds=int(time.time() - since))

    def observer(e):
        # Callback handed to slob.create(); dispatches on the event name.
        event = e.name
        if event == 'begin_finalize':
            p('\nFinished adding content in %s' % end('content'))
            p('\nFinalizing...')
            begin('finalize')
        elif event == 'end_finalize':
            p('\nFinalized in %s' % end('finalize'))
        elif event == 'begin_resolve_aliases':
            p('\nResolving aliases...')
            begin('aliases')
        elif event == 'end_resolve_aliases':
            p('\nResolved aliases in %s' % end('aliases'))
        elif event == 'begin_sort':
            p('\nSorting...')
            begin('sort')
        elif event == 'end_sort':
            p(' sorted in %s' % end('sort'))

    args = parse_args()

    outname = args.output_file
    if outname is None:
        url_tail = os.path.basename(args.couch_url)
        stem, _suffix = os.path.splitext(url_tail)
        outname = os.path.extsep.join((stem, args.compression, 'slob'))

    def set_tag_from_args(slb, name):
        # Tag "a.b" is backed by the command line attribute "a_b"; empty
        # values leave the existing tag untouched.
        arg_value = getattr(args, name.replace('.', '_'))
        if not arg_value:
            return
        slb.tag(name, arg_value)

    with slob.create(outname,
                     compression=args.compression,
                     workdir=args.work_dir,
                     min_bin_size=args.bin_size * 1024,
                     observer=observer) as slb:
        begin('content')

        # Create the tags with empty values first.
        for blank_tag in ('license.name', 'license.url',
                          'created.by', 'copyright'):
            slb.tag(blank_tag, '')

        article_source = CouchArticleSource(args, slb)
        begin('all')

        # Command line arguments override the article source.
        for overridable in ('license.name', 'license.url', 'created.by'):
            set_tag_from_args(slb, overridable)

        article_source.run()

        # Resource directories next to this script; MathJax is optional.
        include_built_in = {'js', 'css', 'images'}
        if not args.no_math:
            include_built_in.add('MathJax')
        slob.add_dir(slb, os.path.dirname(__file__),
                     include_only=include_built_in, prefix='~/')

        for extra_dir in args.content_dirs or ():
            slob.add_dir(slb, extra_dir)

    p('\nAll done in %s\n' % end('all'))
def main():
    """Build a slob file from a CouchDB article source and report timings.

    Options come from ``parse_args()``. When no output file is given, the
    name is derived from the last path component of ``args.couch_url``
    followed by the compression name and a ``slob`` extension. Built-in
    resource directories and any extra ``args.content_dirs`` are added
    after the articles.
    """
    logging.basicConfig()

    def p(text):
        # Flush right away so progress messages appear as they are produced.
        sys.stdout.write(text)
        sys.stdout.flush()

    # Start timestamps of the phases currently being timed, keyed by name.
    started = {}

    def begin(name):
        started[name] = time.time()

    def end(name):
        # Whole seconds elapsed since begin(name), as a timedelta.
        since = started.pop(name)
        return timedelta(seconds=int(time.time() - since))

    def observer(e):
        # Callback handed to slob.create(); dispatches on the event name.
        event = e.name
        if event == 'begin_finalize':
            p('\nFinished adding content in %s' % end('content'))
            p('\nFinalizing...')
            begin('finalize')
        elif event == 'end_finalize':
            p('\nFinalized in %s' % end('finalize'))
        elif event == 'begin_resolve_aliases':
            p('\nResolving aliases...')
            begin('aliases')
        elif event == 'end_resolve_aliases':
            p('\nResolved aliases in %s' % end('aliases'))
        elif event == 'begin_sort':
            p('\nSorting...')
            begin('sort')
        elif event == 'end_sort':
            p(' sorted in %s' % end('sort'))

    args = parse_args()

    outname = args.output_file
    if outname is None:
        url_tail = os.path.basename(args.couch_url)
        stem, _suffix = os.path.splitext(url_tail)
        outname = os.path.extsep.join((stem, args.compression, 'slob'))

    def set_tag_from_args(slb, name):
        # Tag "a.b" is backed by the command line attribute "a_b"; empty
        # values leave the existing tag untouched.
        arg_value = getattr(args, name.replace('.', '_'))
        if not arg_value:
            return
        slb.tag(name, arg_value)

    with slob.create(outname,
                     compression=args.compression,
                     workdir=args.work_dir,
                     min_bin_size=args.bin_size * 1024,
                     observer=observer) as slb:
        begin('content')

        # Create the tags with empty values first.
        for blank_tag in ('license.name', 'license.url',
                          'created.by', 'copyright'):
            slb.tag(blank_tag, '')

        article_source = CouchArticleSource(args, slb)
        begin('all')

        # Command line arguments override the article source.
        for overridable in ('license.name', 'license.url', 'created.by'):
            set_tag_from_args(slb, overridable)

        article_source.run()

        # Resource directories next to this script; MathJax is optional.
        include_built_in = {'js', 'css', 'images'}
        if not args.no_math:
            include_built_in.add('MathJax')
        slob.add_dir(slb, os.path.dirname(__file__),
                     include_only=include_built_in, prefix='~/')

        for extra_dir in args.content_dirs or ():
            slob.add_dir(slb, extra_dir)

    p('\nAll done in %s\n' % end('all'))
def main():
    """Convert one or more dictionary volumes into a single slob file.

    Options come from ``parse_args()``. When no output file is given, the
    name is derived from the first input file's base name followed by the
    compression name and a ``slob`` extension. Articles in the selected
    ``start..end`` range of each volume are converted in a process pool
    and added either as content or as aliases.

    Relies on names this function does not define (``p``, ``read_file``,
    ``convert``, ``LINK_TAG``, ``NIGHT_LINK_TAG``, ``CSS``, ``HTML``),
    which must be provided at module level.
    """
    args = parse_args()

    fnames = [os.path.expanduser(name) for name in args.input_file]

    outname = args.output_file
    if outname is None:
        basename = os.path.basename(fnames[0])
        noext, _ext = os.path.splitext(basename)
        outname = os.path.extsep.join((noext, args.compression, 'slob'))

    # t0 first marks the start of content adding, then the start of
    # finalization; the other two are set when their phase begins.
    t0 = time.time()
    sort_t0 = None
    aliases_t0 = None

    def observer(e):
        # Callback handed to slob.create(); reports phase boundaries by
        # event name together with the time each phase took.
        nonlocal t0, sort_t0, aliases_t0
        if e.name == 'begin_finalize':
            p('\nFinished adding content in %.2fs' % (time.time() - t0))
            t0 = time.time()
            p('\nFinalizing...')
        elif e.name == 'end_finalize':
            p('\nFinalized in %.2fs' % (time.time() - t0))
        elif e.name == 'begin_resolve_aliases':
            p('\nResolving aliases...')
            aliases_t0 = time.time()
        elif e.name == 'end_resolve_aliases':
            p('\nResolved aliases in %.2fs' % (time.time() - aliases_t0))
        elif e.name == 'begin_sort':
            p('\nSorting...')
            sort_t0 = time.time()
        elif e.name == 'end_sort':
            p(' sorted in %.2fs' % (time.time() - sort_t0))

    with slob.create(outname,
                     compression=args.compression,
                     workdir=args.work_dir,
                     min_bin_size=args.bin_size * 1024,
                     observer=observer) as w:

        # Store the stylesheets under '_'-prefixed keys and collect the
        # markup that refers to them; it is passed along with each article.
        css_tags = []
        for name in ('shared.css',
                     'mediawiki_shared.css',
                     'mediawiki_monobook.css'):
            key = '_' + name
            w.add(read_file(name), key, content_type=CSS)
            css_tags.append(LINK_TAG.format(key))

        name = 'night.css'
        key = '_' + name
        w.add(read_file(name), key, content_type=CSS)
        css_tags.append(NIGHT_LINK_TAG.format(key))

        css_tags = '\n'.join(css_tags)

        w.tag('license.name', args.license_name)
        w.tag('license.url', args.license_url)
        w.tag('created.by', args.created_by)

        for fname in fnames:
            with closing(dictionary.Volume(fname)) as d:
                article_url_template = d.article_url
                w.tag('label', d.title if d.title else fname)
                source = d.source if d.source else ''
                w.tag('source', source)
                w.tag('uri', args.uri if args.uri else source)
                w.tag('copyright', d.copyright if d.copyright else '')

                count = len(d.articles)
                start = args.start if args.start else 0
                end = args.end if args.end else count
                # Progress is measured against the selected range, not the
                # whole volume.
                total = end - start

                articles = ((d.words[i], d.articles[i],
                             css_tags, article_url_template)
                            for i in range(start, end))

                # The pool is scoped to this volume so its worker
                # processes are released before the next one is opened.
                with multiprocessing.Pool() as workers:
                    result = workers.imap_unordered(convert, articles)
                    for i, converted in enumerate(result):
                        if i % 100 == 0 and i != 0:
                            p('.')
                        if i and i % 5000 == 0:
                            p(' {0:.2f}%\n'.format(100 * (i / total)))
                        redirect, content, key_frag = converted
                        if content is None:
                            # Mark an article that produced no content.
                            p('x')
                            continue
                        if redirect:
                            w.add_alias(content, key_frag)
                        else:
                            w.add(content, key_frag, content_type=HTML)

    p('\nWrote {0}\n'.format(outname))
def main():
    """Convert one or more dictionary volumes into a single slob file.

    Options come from ``parse_args()``. When no output file is given, the
    name is derived from the first input file's base name followed by the
    compression name and a ``slob`` extension. Articles in the selected
    ``start..end`` range of each volume are converted in a process pool
    and added either as content or as aliases.

    Relies on names this function does not define (``p``, ``read_file``,
    ``convert``, ``LINK_TAG``, ``NIGHT_LINK_TAG``, ``CSS``, ``HTML``),
    which must be provided at module level.
    """
    args = parse_args()

    fnames = [os.path.expanduser(name) for name in args.input_file]

    outname = args.output_file
    if outname is None:
        basename = os.path.basename(fnames[0])
        noext, _ext = os.path.splitext(basename)
        outname = os.path.extsep.join((noext, args.compression, 'slob'))

    # t0 first marks the start of content adding, then the start of
    # finalization; the other two are set when their phase begins.
    t0 = time.time()
    sort_t0 = None
    aliases_t0 = None

    def observer(e):
        # Callback handed to slob.create(); reports phase boundaries by
        # event name together with the time each phase took.
        nonlocal t0, sort_t0, aliases_t0
        if e.name == 'begin_finalize':
            p('\nFinished adding content in %.2fs' % (time.time() - t0))
            t0 = time.time()
            p('\nFinalizing...')
        elif e.name == 'end_finalize':
            p('\nFinalized in %.2fs' % (time.time() - t0))
        elif e.name == 'begin_resolve_aliases':
            p('\nResolving aliases...')
            aliases_t0 = time.time()
        elif e.name == 'end_resolve_aliases':
            p('\nResolved aliases in %.2fs' % (time.time() - aliases_t0))
        elif e.name == 'begin_sort':
            p('\nSorting...')
            sort_t0 = time.time()
        elif e.name == 'end_sort':
            p(' sorted in %.2fs' % (time.time() - sort_t0))

    with slob.create(outname,
                     compression=args.compression,
                     workdir=args.work_dir,
                     min_bin_size=args.bin_size * 1024,
                     observer=observer) as w:

        # Store the stylesheets under '_'-prefixed keys and collect the
        # markup that refers to them; it is passed along with each article.
        css_tags = []
        for name in ('shared.css',
                     'mediawiki_shared.css',
                     'mediawiki_monobook.css'):
            key = '_' + name
            w.add(read_file(name), key, content_type=CSS)
            css_tags.append(LINK_TAG.format(key))

        name = 'night.css'
        key = '_' + name
        w.add(read_file(name), key, content_type=CSS)
        css_tags.append(NIGHT_LINK_TAG.format(key))

        css_tags = '\n'.join(css_tags)

        w.tag('license.name', args.license_name)
        w.tag('license.url', args.license_url)
        w.tag('created.by', args.created_by)

        for fname in fnames:
            with closing(dictionary.Volume(fname)) as d:
                article_url_template = d.article_url
                w.tag('label', d.title if d.title else fname)
                source = d.source if d.source else ''
                w.tag('source', source)
                w.tag('uri', args.uri if args.uri else source)
                w.tag('copyright', d.copyright if d.copyright else '')

                count = len(d.articles)
                start = args.start if args.start else 0
                end = args.end if args.end else count
                # Progress is measured against the selected range, not the
                # whole volume.
                total = end - start

                articles = ((d.words[i], d.articles[i],
                             css_tags, article_url_template)
                            for i in range(start, end))

                # The pool is scoped to this volume so its worker
                # processes are released before the next one is opened.
                with multiprocessing.Pool() as workers:
                    result = workers.imap_unordered(convert, articles)
                    for i, converted in enumerate(result):
                        if i % 100 == 0 and i != 0:
                            p('.')
                        if i and i % 5000 == 0:
                            p(' {0:.2f}%\n'.format(100 * (i / total)))
                        redirect, content, key_frag = converted
                        if content is None:
                            # Mark an article that produced no content.
                            p('x')
                            continue
                        if redirect:
                            w.add_alias(content, key_frag)
                        else:
                            w.add(content, key_frag, content_type=HTML)

    p('\nWrote {0}\n'.format(outname))