def yield_actions_for_lang(self, lang, size):
    stored_mtimes = {
        hit["_id"]: hit["_source"]["mtime"]
        for hit in scan(
            self.es,
            index=self.index_name,
            doc_type="text",
            _source_include=["mtime"],
            query=None,
            size=500,
        )
    }
    current_html_mtimes = list(get_db().aql.execute(
        CURRENT_MTIMES,
        bind_vars={'lang': lang, '@collection': 'html_text'},
    ))
    current_bilara_mtimes = list(get_db().aql.execute(
        CURRENT_BILARA_MTIMES,
        bind_vars={'lang': lang, '@collection': 'sc_bilara_texts'},
    ))
    to_add = set()
    to_delete = set(stored_mtimes)
    for doc in chain(current_html_mtimes, current_bilara_mtimes):
        _id = self.make_id(doc['uid'], doc['author_uid'])
        if _id in to_delete:
            to_delete.remove(_id)
        if _id not in stored_mtimes or stored_mtimes[_id] != int(doc['mtime']):
            to_add.add(_id)
    delete_actions = [{'_id': _id, '_op_type': 'delete'} for _id in to_delete]
    if delete_actions:
        print(f'Deleting {len(delete_actions)} documents from {lang} index')
        yield delete_actions
    if to_add:
        print(f'Indexing {len(to_add)} new or modified texts to {lang} index')
        yield from self.yield_html_texts(lang, size, to_add=to_add)
        yield from self.yield_bilara_texts(lang, size, to_add=to_add)
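# make_id is not shown here; it is assumed to derive the Elasticsearch document
# id from a text's uid and author. A minimal sketch consistent with its use
# above (the separator is an assumption):
@staticmethod
def make_id(uid, author_uid):
    # e.g. ('dn1', 'sujato') -> 'dn1_sujato'
    return f'{uid}_{author_uid}'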
def get(self):
    db = get_db()
    try:
        data = list(db.aql.execute(PWA.SIZES))[0]
        return data, 200
    except IndexError:
        return 'Language not found', 404
def get(self): """ Send list of textual information paragraphs for the sutta view --- responses: 200: schema: id: paragraphs type: array items: $ref: '#/definitions/paragraph' definitions: paragraph: type: object properties: uid: type: string description: type: string """ db = get_db() data = db.aql.execute(PARAGRAPHS) return list(data), 200
def get(self):
    """
    responses:
        200:
            description: Summary of translation counts by language
            schema:
                type: object
                properties:
                    modern:
                        type: array
                        items:
                            $ref: '#/definitions/TranslationCount'
                    ancient:
                        type: array
                        items:
                            $ref: '#/definitions/TranslationCount'
    definitions:
        TranslationCount:
            type: object
            properties:
                iso_code:
                    type: string
                name:
                    type: string
                total:
                    type: number
    """
    db = get_db()
    response = next(db.aql.execute(TRANSLATION_COUNT_BY_LANGUAGE))
    return response, 200
def remove_collections(self):
    """Remove old dictionary collections."""
    db = get_db()
    db.delete_collection('dictionaries')
    db.delete_collection('dictionary_full')
def get(self, division, vol, page):
    """
    Send list of images for given division.
    ---
    responses:
        200:
            schema:
                id: images
                type: array
                items:
                    type: object
                    properties:
                        name:
                            type: string
                        page:
                            type: number
    """
    db = get_db()
    data = db.aql.execute(
        IMAGES,
        bind_vars={'division': division, 'vol': vol, 'page': page},
    )
    return list(data), 200
def create_collection(self):
    db = get_db()
    images = db.create_collection('images')
    images.add_hash_index(fields=['division'], unique=False)
    images.add_hash_index(fields=['name'], unique=True)
    images.add_hash_index(fields=['vol'], unique=False)
def create_view(self):
    db = get_db()
    db.create_view(
        'v_text',
        view_type='arangosearch',
        properties={
            'links': {
                'html_text': {
                    'fields': {
                        'uid': {'analyzers': ['identity']},
                        'lang': {'analyzers': ['identity']},
                    }
                },
                'po_strings': {
                    'fields': {
                        'uid': {'analyzers': ['identity']},
                        'lang': {'analyzers': ['identity']},
                    }
                },
            }
        },
    )
def get_data(self, submenu_id=None, menu_query=MENU, submenu_query=SUBMENU, **kwargs):
    db = get_db()
    bind_vars = kwargs.get('bind_vars', {})
    if submenu_id:
        bind_vars['submenu_id'] = submenu_id
        divisions = db.aql.execute(submenu_query, bind_vars=bind_vars)
        data = list(divisions)
    else:
        divisions = db.aql.execute(menu_query, bind_vars=bind_vars)
        data = self.group_by_parents(divisions, ['pitaka'])
        for pitaka in data:
            if 'children' in pitaka:
                uid = pitaka['uid']
                children = pitaka.pop('children')
                if uid == 'pitaka/sutta':
                    pitaka['children'] = self.group_by_parents(children, ['grouping'])
                else:
                    pitaka['children'] = self.group_by_parents(children, ['sect'])
                self.group_by_language(pitaka, exclude={'sect/other'})
    self.recursive_cleanup(data, mapping={})
    return data
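# group_by_parents is not shown in this section; it is assumed to bucket the
# flat division records under their parent nodes. A hypothetical sketch of that
# behaviour (the 'parents' field is an assumption about the record shape):
def group_by_parents(self, divisions, parent_types):
    grouped = {}
    for division in divisions:
        for parent in division.get('parents', []):
            if parent['type'] in parent_types:
                node = grouped.setdefault(parent['uid'], {**parent, 'children': []})
                node['children'].append(division)
    return list(grouped.values())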
def get(self): """ Send list of random epigraphs --- responses: 200: schema: id: epigraphs type: array items: type: object properties: uid: type: string epigraph: type: string """ db = get_db() try: limit = int(request.args.get('limit', '10')) except ValueError: limit = 10 data = db.aql.execute(EPIGRAPHS, bind_vars={'number': limit}) return list(data), 200
def create_collection(self):
    db = get_db()
    epigraphs = db.create_collection('epigraphs')
    epigraphs.add_hash_index(fields=['uid'], unique=False)
    db.create_collection('why_we_read')
def create_collections(self):
    db = get_db()
    collections = [
        ('grouping', False),
        ('language', False),
        ('pitaka', False),
        ('sect', False),
        ('root', False),
        ('root_edges', True),
        ('relationship', True),
        ('html_text', False),
        ('unicode_points', False),
        ('mtimes', False),
    ]
    for name, edge in collections:
        db.create_collection(name=name, edge=edge)

    # create indexes
    db['html_text'].add_hash_index(fields=['uid'], unique=False)
    db['html_text'].add_hash_index(fields=['author_uid'], unique=False)
    db['html_text'].add_hash_index(fields=['lang'], unique=False)
    db['root'].add_hash_index(fields=['uid'], unique=False)
def update_view(self):
    db = get_db()
    db.replace_view(
        'v_text',
        properties={
            'links': {
                'html_text': {
                    'fields': {
                        'uid': {'analyzers': ['identity']},
                        'lang': {'analyzers': ['identity']},
                    }
                },
                'sc_bilara_texts': {
                    'fields': {
                        'uid': {'analyzers': ['identity']},
                        'lang': {'analyzers': ['identity']},
                    }
                },
            }
        },
    )
def get(self, collection=None): """ Accept list of languages in format `?languages=lang1,lang2,...` --- parameters: - in: query name: languages type: string required: true - in: query name: include_root type: boolean required: false responses: 200: type: object properties: menu: type: array items: type: string suttaplex: type: array items: type: string texts: type: array items: type: object properties: uid: type: string translations: type: array items: type: object properties: lang: type: string authors: type: array items: type: string """ languages = request.args.get('languages', '') root_lang = request.args.get('root_lang', 'false').lower() root_lang = {'true': True, 'false': False}[root_lang] if not languages and not root_lang: return 'Language not specified', 404 languages = languages.split(',') if languages else [] db = get_db() return next( db.aql.execute(PWA.MENU, bind_vars={ 'languages': languages, 'include_root': root_lang }))
def get(self, url):
    db = get_db()
    parts = url.split('/')
    if len(parts) == 2:
        lang, uid = parts
        if lang == 'pi':
            lang = 'pli'
        languages = db.collection('language')
        if lang in languages:
            hits = db.aql.execute('''
                LET modern = (
                    FOR text IN po_strings
                        FILTER text.lang == @lang
                        FILTER text.uid == @uid
                        RETURN {author_uid: text.author_uid, legacy: false}
                )
                LET legacy = (
                    FOR text IN html_text
                        FILTER text.lang == @lang
                        FILTER text.uid == @uid
                        RETURN {author_uid: text.author_uid, legacy: true}
                )
                RETURN APPEND(modern, legacy)
            ''', bind_vars={'lang': lang, 'uid': uid}).next()
            if hits:
                author_uid = hits[0]['author_uid']
                return 'Redirect', 301, {'Location': f'/{uid}/{lang}/{author_uid}'}
            else:
                root = db.collection('root')
                if uid in root:
                    return 'Redirect', 301, {'Location': f'/{uid}'}
    return 'Not found', 404
def get(self, uid):
    db = get_db()
    full_path = db.aql.execute(SUTTA_PATH, bind_vars={'uid': uid}).next()
    if full_path['full_path'].count('/sutta/minor') > 1:
        r_index = full_path['full_path'].rfind('/sutta/minor')
        full_path['full_path'] = full_path['full_path'][:r_index]
    return full_path
def create_collections(self):
    db = get_db()
    collections = ['po_markup', 'po_strings', 'uid_expansion']
    for name in collections:
        db.create_collection(name=name)
def get_data(self, submenu_id=None, menu_query=MENU, submenu_query=SUBMENU,
             language=None, bind_vars=None):
    db = get_db()
    if bind_vars is None:
        bind_vars = {'language': language}
    if submenu_id:
        bind_vars['submenu_id'] = submenu_id
        data = list(db.aql.execute(submenu_query, bind_vars=bind_vars))
    else:
        divisions = list(db.aql.execute(menu_query, bind_vars=bind_vars))
        data = self.group_by_parents(divisions, ['pitaka'])
        for pitaka in data:
            if 'children' in pitaka:
                children = pitaka.pop('children')
                if pitaka['uid'] == 'pitaka/sutta':
                    pitaka['children'] = self.group_by_parents(children, ['grouping'])
                else:
                    pitaka['children'] = self.group_by_parents(children, ['sect'])
                self.group_by_language(pitaka, exclude={'sect/other'})
    self.recursive_cleanup(data, language=language, mapping={})
    self.make_yellow_brick_road(data, language)
    return data
def create_view(self):
    db = get_db()
    db.create_view(
        'v_text',
        view_type='arangosearch',
        properties={
            'links': {
                'html_text': {
                    'fields': {
                        'uid': {'analyzers': ['identity']},
                        'lang': {'analyzers': ['identity']},
                    }
                },
                'po_strings': {
                    'fields': {
                        'uid': {'analyzers': ['identity']},
                        'lang': {'analyzers': ['identity']},
                    }
                },
            }
        },
    )
def delete_db():
    arangodb.delete_db(arangodb.get_db())
    from flask import current_app
    storage_dir = current_app.config.get('STORAGE_DIR')
    for file_path in storage_dir.glob('.*'):
        file_path.unlink()
def retrieve_data(division_uid, language, author):
    db = get_db()
    docs = list(db.aql.execute(
        QUERY,
        bind_vars={'uid': division_uid, 'author': author, 'language': language},
    ))
    docs = [doc for doc in docs if doc['type'] != 'text' or doc['text']]
    texts = [doc['text'] for doc in docs if doc['text']]
    author = texts[0]['author']
    for text in texts:
        if 'name' in text:
            text['title'] = text.pop('name')

    toc = ['<div><h1>Guide</h1>']
    last_depth = -1
    for i, doc in enumerate(docs):
        depth = doc['depth']
        if depth < 7:
            if depth > last_depth:
                toc.append('<ul>\n')
            elif depth < last_depth:
                toc.append('</ul>\n' * (last_depth - depth))
            if doc['name']:
                toc.append('<li>\n')
                acronym = get_acronym(doc)
                if doc['text']:
                    doc['text']['acronym'] = acronym
                    title = doc['text']['title']
                    long_title = f'{acronym}{": " if title else ""}{title}'
                    toc.append(f'<b><a href="./{doc["uid"]}.xhtml">{long_title}</a></b>')
                    if doc.get('name'):
                        toc.append(f'<br><i>{doc["name"]}</i>')
                    doc['text']['long_title'] = long_title
                else:
                    toc.append(f'<b>{doc["name"]}</b>')
                    if doc.get('title'):
                        toc.append(f'<br><i>{doc["title"]}</i>')
                if doc['blurb'] and i > 0:
                    toc.append(f'<br>{doc["blurb"]}')
                if i >= len(docs) - 1 or docs[i + 1]['depth'] <= doc['depth']:
                    toc.append('</li>\n')
            last_depth = depth
    root = lxml.html.fromstring(''.join(toc))
    toc_string = lxml.html.tostring(root, encoding='unicode')
    return {
        'root_title': docs[0]['name'],
        'blurb': docs[0]['blurb'],
        'author': author,
        'toc': toc_string,
        'texts': texts,
    }
def has_translated_descendent(uid, lang, _cache={}):
    if lang not in _cache:
        db = get_db()
        uids = next(db.aql.execute(AVAILABLE_TRANSLATIONS_LIST, bind_vars={'lang': lang}))
        _cache[lang] = set(uids)
    lang_mapping = _cache[lang]
    return uid in lang_mapping
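# The mutable default `_cache={}` is evaluated once at function definition, so
# it persists across calls and serves as a per-process memo of translated uids
# per language. A hypothetical call pattern:
#
#     has_translated_descendent('dn', 'en')    # first 'en' call queries the db
#     has_translated_descendent('dn1', 'en')   # later calls reuse _cache['en']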
def get(self, word=None):
    db = get_db()
    data = db.aql.execute(
        DICTIONARY_SIMILAR,
        bind_vars={'word': word, 'word_ascii': asciify(word)},
    )
    return list(data), 200
def run():
    languages = get_non_root_languages(get_db())
    data = {}
    for lang in tqdm(languages):
        size = check_language(lang)
        data[lang] = size
    data['root'] = get_root_size()
    data['root']['lookup'] = get_lookup_sizes()
    save_results(data)
def add_indexes(self):
    db = get_db()
    po_markup = db['po_markup']
    po_markup.add_hash_index(fields=['uid'], unique=False)
    language = db['language']
    language.add_hash_index(fields=['uid'], unique=True)
    language.add_hash_index(fields=['is_root'], unique=False)
def get(self): """ Send parallel information for given sutta. --- parameters: - in: query name: from type: string required: true - in: query name: to type: string - in: query name: fallback type: string responses: 200: schema: id: dictionary type: object properties: from: type: string to: type: string dictionary: type: object items: type: array items: type: string """ to_lang = request.args.get('to', current_app.config.get('DEFAULT_LANGUAGE')) from_lang = request.args.get('from', None) fallback = request.args.get('fallback', 'false') main_dict = False if fallback == 'true' else True if from_lang is None: return 'from not specified', 422 db = get_db() result = db.aql.execute( DICTIONARIES, bind_vars={ 'from': from_lang, 'to': to_lang, 'main': main_dict }, ) try: return result.next(), 200 except StopIteration: return 'Dictionary not found', 404
def create_collection(self):
    db = get_db()
    dictionaries = db.create_collection('dictionaries')
    dictionaries.add_hash_index(fields=['from'], unique=False)
    dictionaries.add_hash_index(fields=['to'], unique=False)
    dictionaries.add_hash_index(fields=['lookup'], unique=False)
    dictionaries.add_hash_index(fields=['main'], unique=False)
    dictionaries.add_hash_index(fields=['type'], unique=False)
def yield_actions_for_lang(self, lang, size):
    stored_mtimes = {
        hit["_id"]: hit["_source"]["mtime"]
        for hit in scan(
            self.es,
            index=self.index_name,
            doc_type="text",
            _source_include=["mtime"],
            query=None,
            size=500,
        )
    }
    current_html_mtimes = list(get_db().aql.execute(
        CURRENT_MTIMES,
        bind_vars={'lang': lang, '@collection': 'html_text'},
    ))
    current_po_mtimes = list(get_db().aql.execute(
        CURRENT_MTIMES,
        bind_vars={'lang': lang, '@collection': 'po_strings'},
    ))
    to_add = set()
    to_delete = set(stored_mtimes)
    for doc in chain(current_html_mtimes, current_po_mtimes):
        _id = self.make_id(doc['uid'], doc['author_uid'])
        if _id in to_delete:
            to_delete.remove(_id)
        if _id not in stored_mtimes or stored_mtimes[_id] != int(doc['mtime']):
            to_add.add(_id)
    delete_actions = [{'_id': _id, '_op_type': 'delete'} for _id in to_delete]
    if delete_actions:
        print(f'Deleting {len(delete_actions)} documents from {lang} index')
        yield delete_actions
    if to_add:
        print(f'Indexing {len(to_add)} new or modified texts to {lang} index')
        yield from self.yield_po_texts(lang, size, to_add=to_add)
        yield from self.yield_html_texts(lang, size, to_add=to_add)
def test_ensure_migration_collection_exists():
    db = get_db()
    collection = base.Migration.migrations_collection
    db.collection(collection)  # the collection exists before the test
    db.delete_collection(collection)
    runner._ensure_migration_collection_exists()
    db.collection(collection)  # and has been recreated by the runner
def create_collections(self):
    db = get_db()
    difficulties = db.create_collection('difficulties')
    difficulties.add_hash_index(fields=['uid'], unique=False)
    difficulties.add_hash_index(fields=['difficulty'], unique=False)
    blurbs = db.create_collection('blurbs')
    blurbs.add_hash_index(fields=['uid'], unique=False)
    blurbs.add_hash_index(fields=['lang'], unique=False)
def generate_lookup_dict(_from, to):
    db = get_db()
    db['dictionaries'].insert({
        'from': _from,
        'to': to,
        'dictionary': [],
        'lookup': True,
        'main': True,
        'type': 'maindata',
    })
def run(self): """ Run all tasks from self.tasks list in given order. """ db = get_db() if not db.collection(self.migrations_collection).has(self.migration_id): logging.info(f' * Running {self.migration_id}') for task in self.tasks: getattr(self, task)() self.end_migrations()
def _ensure_migration_collection_exists():
    """Create the migration collection if it does not exist yet."""
    db = get_db()
    try:
        db.create_collection(Migration.migrations_collection, user_keys=True)
    except CollectionCreateError as e:
        if '[ERR 1207] duplicate name' not in str(e):
            raise
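# python-arango also exposes has_collection(), so a sketch-level alternative to
# catching CollectionCreateError would be the following (the try/except form
# above is safer if several runners might race to create the collection):
def _ensure_migration_collection_exists_alt():
    db = get_db()
    if not db.has_collection(Migration.migrations_collection):
        db.create_collection(Migration.migrations_collection, user_keys=True)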
def generate_lookup_dict(_from, to):
    db = get_db()
    db['dictionaries'].insert({
        'from': _from,
        'to': to,
        'dictionary': [],
        'lookup': True,
        'main': True,
        'type': 'maindata',
    })
def get(self, word=None):
    db = get_db()
    language = request.args.get('language', current_app.config.get('DEFAULT_LANGUAGE'))
    data = db.aql.execute(
        DICTIONARY_FULL,
        bind_vars={'word': word, 'language': language},
    ).next()
    return list(data), 200
def has_translated_descendent(uid, lang, _cache={}):
    if lang not in _cache:
        db = get_db()
        uids = next(db.aql.execute(AVAILABLE_TRANSLATIONS_LIST, bind_vars={'lang': lang}))
        _cache[lang] = set(uids)
    lang_mapping = _cache[lang]
    return uid in lang_mapping
def run():
    db = get_db()
    Path(GENERATED_PO_FILES_DIR).mkdir(parents=True, exist_ok=True)
    process_blurbs(db)
    process_menu(db)
    process_currencies(db)
    change_po_file_permissions()
def get(self, collection=None): """ Accept list of languages in format `?languages=lang1,lang2,...` --- parameters: - in: query name: languages type: string required: true - in: query name: include_root type: boolean required: false responses: 200: type: object properties: menu: type: array items: type: string suttaplex: type: array items: type: string texts: type: array items: type: object properties: uid: type: string translations: type: array items: type: object properties: lang: type: string authors: type: array items: type: string """ languages = request.args.get('languages', '') root_lang = request.args.get('root_lang', 'false').lower() root_lang = {'true': True, 'false': False}[root_lang] if not languages and not root_lang: return 'Language not specified', 404 languages = languages.split(',') if languages else [] db = get_db() return next(db.aql.execute(PWA.MENU, bind_vars={'languages': languages, 'include_root': root_lang}))
def end_migrations(self):
    """
    Run at the end of each migration file; records a document in the db
    marking the migration as finished.
    """
    db = get_db()
    migrations = db.collection(self.migrations_collection)
    migrations.insert({
        '_key': self.migration_id,
        'date': str(datetime.now()),
    })
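# Taken together, run(), end_migrations() and _ensure_migration_collection_exists()
# imply a usage pattern along these lines (the class name, migration_id and task
# below are hypothetical, not from the original codebase):
class ExampleMigration(Migration):
    migration_id = 'example_migration_001'
    tasks = ['create_collections']

    def create_collections(self):
        get_db().create_collection('example_collection')

# _ensure_migration_collection_exists()   # once, before running migrations
# ExampleMigration().run()                # no-op if migration_id is already recorded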
def yield_html_texts(self, lang, size, to_add):
    html_texts = list(get_db().aql.execute(TEXTS_BY_LANG, bind_vars={'lang': lang}))
    if not html_texts:
        return
    chunk = []
    chunk_size = 0
    for text in html_texts:
        uid = text['uid']
        author_uid = text['author_uid']
        _id = self.make_id(uid, author_uid)
        if _id not in to_add:
            continue
        try:
            with open(text['file_path'], 'rb') as f:
                html_bytes = f.read()
            chunk_size += len(html_bytes) + 512
            root_lang = text['root_lang']
            action = {
                'acronym': text['acronym'],
                '_id': _id,
                'uid': uid,
                'lang': lang,
                'root_lang': root_lang,
                'author': text['author'],
                'author_uid': author_uid,
                'author_short': text['author_short'],
                'is_root': lang == root_lang,
                'mtime': int(text['mtime']),
            }
            action.update(self.extract_fields_from_html(html_bytes))
            chunk.append(action)
        except (ValueError, IndexError) as e:
            logger.exception(f'{text["uid"]}, {e}')
        if chunk_size > size:
            yield chunk
            chunk = []
            chunk_size = 0
            time.sleep(0.25)
    if chunk:
        yield chunk
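# A minimal sketch of how these chunked action generators might be consumed on
# the Elasticsearch side; the client setup and the index keyword are assumptions:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def index_language(indexer, lang, max_chunk_bytes=1024 * 1024):
    es = Elasticsearch()
    for chunk in indexer.yield_actions_for_lang(lang, size=max_chunk_bytes):
        # each chunk is a list of bulk actions (index payloads or deletes)
        bulk(es, chunk, index=indexer.index_name)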
def get(self): """ Send parallel information for given sutta. --- parameters: - in: query name: from type: string required: true - in: query name: to type: string - in: query name: fallback type: string responses: 200: schema: id: dictionary type: object properties: from: type: string to: type: string dictionary: type: object items: type: array items: type: string """ to_lang = request.args.get('to', current_app.config.get('DEFAULT_LANGUAGE')) from_lang = request.args.get('from', None) fallback = request.args.get('fallback', 'false') main_dict = False if fallback == 'true' else True if from_lang is None: return 'from not specified', 422 db = get_db() result = db.aql.execute(DICTIONARIES, bind_vars={'from': from_lang, 'to': to_lang, 'main': main_dict}) try: return result.next(), 200 except StopIteration: return 'Dictionary not found', 404
def retrieve_data(division_uid, language, author):
    db = get_db()
    docs = list(db.aql.execute(
        QUERY,
        bind_vars={'uid': division_uid, 'author': author, 'language': language},
    ))
    docs = [doc for doc in docs if doc['type'] != 'text' or doc['text']]
    texts = [doc['text'] for doc in docs if doc['text']]
    author = texts[0]['author']
    for text in texts:
        if 'name' in text:
            text['title'] = text.pop('name')

    toc = ['<div><h1>Guide</h1>']
    last_depth = -1
    for i, doc in enumerate(docs):
        depth = doc['depth']
        if depth < 7:
            if depth > last_depth:
                toc.append('<ul>\n')
            elif depth < last_depth:
                toc.append('</ul>\n' * (last_depth - depth))
            if doc['name']:
                toc.append('<li>\n')
                acronym = get_acronym(doc)
                if doc['text']:
                    doc['text']['acronym'] = acronym
                    title = doc['text']['title']
                    long_title = f'{acronym}{": " if title else ""}{title}'
                    toc.append(f'<b><a href="./{doc["uid"]}.xhtml">{long_title}</a></b>')
                    if doc.get('name'):
                        toc.append(f'<br><i>{doc["name"]}</i>')
                    doc['text']['long_title'] = long_title
                else:
                    toc.append(f'<b>{doc["name"]}</b>')
                    if doc.get('title'):
                        toc.append(f'<br><i>{doc["title"]}</i>')
                if doc['blurb'] and i > 0:
                    toc.append(f'<br>{doc["blurb"]}')
                if i >= len(docs) - 1 or docs[i + 1]['depth'] <= doc['depth']:
                    toc.append('</li>\n')
            last_depth = depth
    root = lxml.html.fromstring(''.join(toc))
    toc_string = lxml.html.tostring(root, encoding='unicode')
    return {
        'root_title': fix_main_title(docs[0]['name'], division_uid),
        'blurb': docs[0]['blurb'],
        'author': author,
        'toc': toc_string,
        'texts': texts,
    }
def update(force=False):
    def sort_key(d):
        # index English first, then Pali, then everything else
        if d == 'en':
            return 0
        if d == 'pli':
            return 1
        return 10

    db = get_db()
    languages = sorted(db.aql.execute('FOR l IN language RETURN l.uid'), key=sort_key)
    for lang in tqdm(languages):
        indexer = TextIndexer(lang)
        indexer.update()
def save(self): """Saves document to the db. """ db = get_db() try: collection = db.collection(self.collection) except CollectionLoadError: collection = db.create_collection(self.collection, edge=self.edge) result = collection.insert(self.document) self._rev = result['_rev'] self._id = result['_id'] return self
def get(self, word=None): """ Send list of similar terms to dictionary search word --- responses: glossary: type: array items: type: string """ db = get_db() data = db.aql.execute(DICTIONARY_SIMILAR, bind_vars={'word': word, 'word_ascii': asciify(word)}) return list(data), 200
def get(self, word=None): """ Send list of adjacent terms to dictionary search word --- responses: glossary: type: array items: type: string """ db = get_db() data = db.aql.execute(DICTIONARY_ADJACENT, bind_vars={'word': word}) return list(data), 200
def yield_po_texts(self, lang, size, to_add):
    po_texts = list(get_db().aql.execute(PO_TEXTS_BY_LANG, bind_vars={'lang': lang}))
    if not po_texts:
        return
    chunk = []
    chunk_size = 0
    for text in po_texts:
        uid = text['uid']
        author_uid = text['author_uid']
        _id = self.make_id(uid, author_uid)
        if _id not in to_add:
            continue
        with open(text['strings_path']) as f:
            strings = json.load(f)
        action = {
            'acronym': text['acronym'],
            '_id': _id,
            'uid': uid,
            'lang': lang,
            'author': text['author'],
            'author_uid': text['author_uid'],
            'author_short': text['author_short'],
            'is_root': lang == text['root_lang'],
            'mtime': int(text['mtime']),
            'heading': {
                'title': self.fix_text(text['title']),
                'division': [
                    self.fix_text(text['division_title'])
                    if 'division_title' in text else ''
                ],
            },
            'content': '\n\n'.join(strings.values()),
        }
        chunk_size += len(action['content'].encode('utf-8'))
        chunk.append(action)
        if chunk_size > size:
            yield chunk
            chunk = []
            chunk_size = 0
            time.sleep(0.25)
    if chunk:
        yield chunk