def test_add_dataset():
    """A dataset registered under a namespace is exposed as an attribute."""
    app = Cihai()
    app.add_dataset(SimplestDataset, namespace='simple')

    # The namespace shows up as an attribute holding a Dataset instance.
    assert hasattr(app, 'simple')
    dataset = app.simple
    assert isinstance(dataset, extend.Dataset)

    # Methods of the dataset class are reachable and callable through it.
    assert hasattr(dataset, 'a_method')
    assert callable(dataset.a_method)
    assert dataset.a_method() == 'hi'
def __init__(self):
    """Create the Cihai handle and bootstrap Unihan when it is missing."""
    super().__init__()
    self.c = Cihai()
    if self.c.is_bootstrapped:
        return

    # download and install Unihan to db
    bootstrap_unihan(self.c.metadata)
    self.c.reflect_db()
def run():
    """Exercise the custom dataset: exact lookup, search and reverse lookup."""
    c = Cihai(unihan=False)
    c.add_dataset(MyDataset, namespace='moo')

    dataset = c.moo
    dataset.bootstrap()

    exact = dataset.givemedata('好')
    print('Definitions exactly for 好', exact)

    matching = ', '.join(dataset.search('好'))
    print('Definitions matching with 你好:', matching)

    reverse = ', '.join(dataset.backwards('Good'))
    print('Reverse definition with Good:', reverse)
def test_reflect_db(tmpdb_file, unihan_options):
    """The Unihan class is only mapped once ``reflect_db()`` follows a bootstrap."""
    db_url = 'sqlite:///{tmpdb_file}'.format(tmpdb_file=tmpdb_file)
    c = Cihai({'database': {'url': db_url}})
    assert not c.is_bootstrapped

    bootstrap.bootstrap_unihan(c.metadata, unihan_options)
    # Bootstrapping alone does not expose the mapped class yet.
    assert not hasattr(c.base.classes, 'Unihan')

    c.reflect_db()
    assert hasattr(c.base.classes, 'Unihan')
def run(unihan_options=None):
    """Look up 好 and reverse-search "good" with an explicitly added Unihan dataset.

    Args:
        unihan_options: Options forwarded to ``c.unihan.bootstrap()`` when the
            dataset has not been bootstrapped yet. ``None`` (the default) is
            treated as an empty dict.
    """
    # Build a fresh dict per call: a ``{}`` default would be one object shared
    # by every call of this function.
    if unihan_options is None:
        unihan_options = {}

    c = Cihai(unihan=False)
    c.add_dataset('cihai.data.unihan.dataset.Unihan', namespace='unihan')

    if not c.unihan.is_bootstrapped:  # download and install Unihan to db
        c.unihan.bootstrap(unihan_options)

    query = c.unihan.lookup_char('好')
    glyph = query.first()
    print("lookup for 好: %s" % glyph.kDefinition)

    query = c.unihan.reverse_char('good')
    print('matches for "good": %s ' % ', '.join(glph.char for glph in query))
def run(unihan_options=None):
    """Print tricky cases of character-by-character Traditional/Simplified mapping.

    For every character that has both a ``kTraditionalVariant`` and a
    ``kSimplifiedVariant`` value, report whether the character is contained in
    both of its own variant sets ("Case 1") or not ("Case 2"), then list the
    variants.

    Args:
        unihan_options: Options forwarded to ``c.unihan.bootstrap()`` when the
            dataset has not been bootstrapped yet. ``None`` (the default) is
            treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if unihan_options is None:
        unihan_options = {}

    c = Cihai()
    if not c.unihan.is_bootstrapped:  # download and install Unihan to db
        c.unihan.bootstrap(unihan_options)

    c.unihan.add_plugin(
        'cihai.data.unihan.dataset.UnihanVariants', namespace='variants'
    )

    print(
        "This example prints some tricky cases of character-by-character "
        "Traditional-Simplified mapping."
    )
    print("https://www.unicode.org/reports/tr38/#N10211")
    print("3.7.1 bullet 4")

    for char in c.unihan.with_fields("kTraditionalVariant", "kSimplifiedVariant"):
        print("Character: {}".format(char.char))
        trad = set(char.untagged_vars("kTraditionalVariant"))
        simp = set(char.untagged_vars("kSimplifiedVariant"))
        # Test this row's own character. The previous code tested the mapped
        # class attribute ``Unihan.char``, which is the same object for every
        # row and therefore could never select "Case 1" for a given character.
        if char.char in trad and char.char in simp:
            print("Case 1")
        else:
            print("Case 2 (non-idempotent)")
        for trad_var in trad:
            print("s2t: {}".format(trad_var))
        for simp_var in simp:
            print("t2s: {}".format(simp_var))
def cli(ctx, config, log_level):
    """Retrieve CJK information via CLI. For help and example usage, see documentation: https://cihai-cli.git-pull.com and https://cihai.git-pull.com"""
    setup_logger(level=log_level.upper())

    # An explicit config file wins; otherwise use the default configuration.
    c = Cihai.from_file(config) if config else Cihai()

    unihan = c.unihan
    if not unihan.is_bootstrapped:
        click.echo("Bootstrapping Unihan database")
        unihan.bootstrap(options=c.config.get('unihan_options', {}))

    ctx.obj['c'] = c  # pass Cihai object down to other commands
def test_unihan_options(unihan_options, test_config_file):
    """Bootstrapping creates a ``Unihan`` table holding exactly the expected columns."""
    app = Cihai.from_file(test_config_file)
    bootstrap.bootstrap_unihan(app.metadata, unihan_options)

    tables = app.metadata.tables
    assert 'Unihan' in tables

    columns = tables['Unihan'].columns
    assert columns

    # Every Unihan field plus the two identifying columns.
    expected_columns = set(bootstrap.UNIHAN_FIELDS + ['ucn', 'char'])
    assert set(columns.keys()) == expected_columns

    assert bootstrap.is_bootstrapped(app.metadata)
def test_unihan_options(unihan_options, test_config_file):
    """Bootstrapping creates a ``Unihan`` table holding exactly the expected columns."""
    app = Cihai.from_file(test_config_file)
    bootstrap.bootstrap_unihan(app.sql.metadata, unihan_options)

    tables = app.sql.metadata.tables
    assert 'Unihan' in tables

    columns = tables['Unihan'].columns
    assert columns

    # Every Unihan field plus the two identifying columns.
    expected_columns = set(bootstrap.UNIHAN_FIELDS + ['ucn', 'char'])
    assert set(columns.keys()) == expected_columns

    assert bootstrap.is_bootstrapped(app.sql.metadata)
def test_config_dict_args():
    """A plain dict passed to the constructor is readable through ``app.config``."""
    settings = {'hello': 'world'}
    app = Cihai(settings)
    assert app.config['hello'] == 'world'
def test_add_dataset_unihan(unihan_options):
    """End-to-end check of the Unihan dataset and its variants plugin."""
    c = Cihai()
    c.add_dataset(Unihan, namespace='unihan')
    assert hasattr(c, 'unihan')
    assert isinstance(c.unihan, extend.Dataset)

    c.unihan.sql  # the attribute is only accessed; its value is discarded
    c.unihan.bootstrap(options=unihan_options)

    # Use the first row that carries a definition as the reference glyph.
    U = c.sql.base.classes.Unihan
    with_definition = c.unihan.sql.session.query(U).filter(
        U.kDefinition.isnot(None)
    )
    first_glyph = with_definition.first()
    char = first_glyph.char

    looked_up = c.unihan.lookup_char(char=char).first()
    assert looked_up.kDefinition == first_glyph.kDefinition

    by_list = c.unihan.reverse_char(hints=[first_glyph.kDefinition]).first()
    assert by_list.char == char, 'works with list of column value matches'

    by_string = c.unihan.reverse_char(hints=first_glyph.kDefinition).first()
    assert by_string.char == char, 'works with strings'

    c.unihan.add_plugin(UnihanVariants, 'variants')
    assert hasattr(c.unihan, 'variants')

    def variant_list(field):
        # Pair each row that has `field` set with the list of its variants.
        for glyph in c.unihan.with_fields(field):
            yield (glyph, list(glyph.untagged_vars(field)))

    result = dict(variant_list('kZVariant'))

    assert len(result.values()) > 0
    assert len(result.keys()) > 0
def run(unihan_options=None):
    """Wrapped so we can test in tests/test_examples.py

    Prints the variant data of three Unihan variant fields, one section each.

    Args:
        unihan_options: Options forwarded to ``c.unihan.bootstrap()`` when the
            dataset has not been bootstrapped yet. ``None`` (the default) is
            treated as an empty dict.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if unihan_options is None:
        unihan_options = {}

    print("This example prints variant character data.")

    c = Cihai()
    if not c.unihan.is_bootstrapped:  # download and install Unihan to db
        c.unihan.bootstrap(unihan_options)

    c.unihan.add_plugin(
        'cihai.data.unihan.dataset.UnihanVariants', namespace='variants'
    )

    # (heading, field) pairs, printed in this order.
    sections = (
        ("## ZVariants", "kZVariant"),
        ("## kSemanticVariant", "kSemanticVariant"),
        ("## kSpecializedSemanticVariant", "kSpecializedSemanticVariant"),
    )
    for heading, field in sections:
        print(heading)
        variant_list(c.unihan, field)
#!/usr/bin/env python3
import re
import sys

from cihai.core import Cihai

verbose = False
debug = ""
c_dict = Cihai()

# Leading hexadecimal digits of a token once its two-character prefix is removed.
_HEX_PREFIX = re.compile(r'[0-9A-Fa-f]+')


def codepoints2chars(codepoints):
    """Convert a whitespace-separated string of code points into characters.

    Each token is expected to look like ``U+XXXX``: the first two characters
    are skipped and the hexadecimal digits that follow give the code point.
    Anything after the digits (for example a dictionary name such as
    Matthews) is ignored.

    Args:
        codepoints: The code point string, or ``None``.

    Returns:
        ``None`` when *codepoints* is ``None``, otherwise a list with one
        single-character string per token.

    Raises:
        ValueError: If a token has no hexadecimal digits after its prefix or
            names a value outside the Unicode range.
    """
    if codepoints is None:
        return None

    results = []
    for token in codepoints.split():
        match = _HEX_PREFIX.match(token[2:])
        if match is None:
            raise ValueError('not a code point: {!r}'.format(token))
        # chr() handles code points of any length. Decoding a "\uXXXX" escape
        # only consumes four hex digits, which mangles characters outside the
        # Basic Multilingual Plane (e.g. U+20000).
        results.append(chr(int(match.group(), 16)))
    return results


def slow_search(zi, shujuku):
    global verbose
    global debug
    global c_dict
    # See cihai.git-pull.com/api.html
    query = c_dict.unihan.lookup_char(zi)
    glyph = query.first()
from typing import List, Optional, Tuple, Iterator, Set

from cihai.core import Cihai
from django.http.request import HttpRequest
from django.forms.models import model_to_dict
from django.db import IntegrityError
from django.db.models.query import QuerySet

from .forms import SenseForm
from .models import Headword, Sense, Phrase, Example

logger = logging.getLogger(__name__)

# Matches a single separator character: ';' or ','.
SEP_RE = re.compile(r'[;;,]')

# Module-level Cihai handle. It is created when this module is imported, and
# the Unihan dataset is bootstrapped right away if that has not happened yet.
C = Cihai()
if not C.unihan.is_bootstrapped:
    C.unihan.bootstrap()


# Plain record type; only the fields visible in this excerpt are listed.
# NOTE(review): the meaning of the individual fields is not shown here --
# confirm against the code that builds Entry objects.
@dataclass
class Entry:
    headword: str
    headword_sense_no: int
    char_strokes_first: str
    char_strokes_all: str
    only_letters: str
    root: str
    root_sense_no: int
    word_class: list
    focus: list
#!/usr/bin/env python
# -*- coding: utf8 - *-
from __future__ import unicode_literals, print_function

from cihai.core import Cihai
from cihai.bootstrap import bootstrap_unihan

c = Cihai()

# First run only: download and install Unihan to db, then automap the new
# table created during bootstrap.
if not c.is_bootstrapped:
    bootstrap_unihan(c.metadata)
    c.reflect_db()

# Forward lookup: the definition stored for a single character.
glyph = c.lookup_char('好').first()
print("lookup for 好: %s" % glyph.kDefinition)

# Reverse lookup: the characters matching an English hint.
query = c.reverse_char('good')
matching_chars = ', '.join(glph.char for glph in query)
print('matches for "good": %s ' % matching_chars)
from sqlalchemy.sql import column

from cihai.core import Cihai
from cihai.bootstrap import bootstrap_unihan
from utils import parse_vars

print("References from https://www.unicode.org/reports/tr38/#N10211")

cihan = Cihai()
# Bootstrap Unihan when it is not there yet, then reflect the database.
if not cihan.is_bootstrapped:
    bootstrap_unihan(cihan.metadata)
    cihan.reflect_db()

# All Unihan rows in which both variant columns are non-NULL.
double_var = (
    cihan.session.query(cihan.base.classes.Unihan)
    .filter(column("kTraditionalVariant").isnot(None))
    .filter(column("kSimplifiedVariant").isnot(None))
    .all()
)

print("## 3.7.1 bullet 4")
for c in double_var:
    print("Character: {}".format(c.char))
    trad = parse_vars(c.kTraditionalVariant)
    simp = parse_vars(c.kSimplifiedVariant)
    # "Case 1": the character is found in both of its own parsed variant values.
    if c.char in trad and c.char in simp:
        print("Case 1")
    else:
import sys

# Rebind sys.__stdout__ to the current sys.stdout before cihai is imported.
sys.__stdout__ = sys.stdout

from cihai.core import Cihai
from cihai.bootstrap import bootstrap_unihan

c = Cihai()

# download and install Unihan to db on first use
if not c.is_bootstrapped:
    bootstrap_unihan(c.metadata)
    c.reflect_db()


def jp_reverse(word):
    """Return the characters found by a reverse lookup of *word*, joined by ', '."""
    matches = c.reverse_char(word)
    return ', '.join(glyph.char for glyph in matches)
class ExplainKanji(BaseFilter):
    """Filter that follows a Japanese text chunk with per-character explanations.

    For every distinct non-kana character of the chunk that the Unihan lookup
    knows, the output lists the character, its on readings, its kun readings
    and its definition, wrapped in printed-only ``[`` ... ``]`` markers.
    """

    def __init__(self):
        """Create the Cihai handle and bootstrap Unihan when it is missing."""
        super().__init__()
        self.c = Cihai()
        if not self.c.is_bootstrapped:  # download and install Unihan to db
            bootstrap_unihan(self.c.metadata)
            self.c.reflect_db()

    def __call__(self, chunk):
        """Return ``[chunk copy, *explanation chunks]`` for a Japanese text chunk.

        Chunks that are not ``TextChunk`` instances, or whose language is not
        ``'japanese'``, are returned as a one-element list holding only the
        duplicated chunk.
        """
        from src.Sequencer import TextChunk, JingleChunk

        chunk = self._duplicate_chunk(chunk)
        result = [chunk]
        if not isinstance(chunk, TextChunk) or chunk.language != 'japanese':
            return result

        def printed_only(text):
            # Punctuation that is shown but never spoken.
            return TextChunk(
                text=text, audible=False, printable=True, final=True)

        def reading_chunks(label, readings):
            # Spoken English label, a pause, then each reading followed by a
            # pause and a printed-only separator.
            chunks = [
                TextChunk(text=label, language='english', audible=True,
                          printable=False, final=True),
                JingleChunk(jingle='silence'),
            ]
            for reading in readings:
                chunks.append(
                    TextChunk(text=reading, language='japanese', audible=True,
                              printable=True, final=True))
                chunks.append(JingleChunk(jingle='silence'))
                chunks.append(printed_only('、'))
            return chunks

        explanations = self._get_explanations(chunk.text)

        result.append(printed_only('['))
        for k, ons, kuns, explanation in explanations:
            result.append(
                TextChunk(text=k, language='japanese', audible=False,
                          printable=True, final=True))
            result.extend(reading_chunks('on', ons))
            # 'koon' is the text handed to the English voice for "kun".
            result.extend(reading_chunks('koon', kuns))
            result.append(JingleChunk(jingle='definition'))
            result.append(
                TextChunk(text=explanation, language='english', audible=True,
                          printable=True, final=True))
        result.append(printed_only(']'))
        result.append(
            JingleChunk(jingle='silence_long', audible=False, printable=True,
                        final=True))
        return result

    def _kanji_to_kana(self, char):
        """Return ``(on_readings, kun_readings, definition)`` for *char*.

        Returns ``None`` when the lookup finds no glyph. On readings are
        rendered in katakana and kun readings in hiragana; a reading field
        that is empty or ``None`` yields an empty list.
        """
        glyph = self.c.lookup_char(char).first()
        if glyph is None:
            return None

        # Read each reading from its own field (they used to be swapped, which
        # announced kun readings as "on" and vice versa). A character may lack
        # either reading, so a missing value counts as an empty string.
        romaji_on = (glyph.kJapaneseOn or '').lower()
        romaji_kun = (glyph.kJapaneseKun or '').lower()

        # split() without an argument returns [] for an empty string, so no
        # empty reading is emitted.
        jp_on = jaconv.hira2kata(jaconv.alphabet2kana(romaji_on)).split()
        jp_kun = jaconv.alphabet2kana(romaji_kun).split()
        return jp_on, jp_kun, glyph.kDefinition

    @staticmethod
    def is_kana(char):
        """Return True when *char* lies in the Katakana or Hiragana block."""
        return ('\u30A0' <= char <= '\u30FF') or (
            '\u3040' <= char <= '\u309F')  # Katakana and Hiragana blocks

    @classmethod
    def is_kanji(cls, char):
        """Return True for every character that is not kana.

        This is not a real kanji test: punctuation and Latin letters pass as
        well. `_get_explanations` skips characters whose lookup finds nothing.
        """
        return not cls.is_kana(char)

    def _get_explanations(self, text):
        """Return ``(char, on, kun, definition)`` tuples for the characters of *text*.

        Every distinct non-kana character is looked up once, in order of first
        appearance; characters without a glyph are skipped.
        """
        detail_list = []
        # dict.fromkeys removes duplicates like set() did but keeps the order
        # of first appearance, so the output order is deterministic.
        for k in dict.fromkeys(filter(self.is_kanji, text)):
            triplet = self._kanji_to_kana(k)
            if triplet is None:
                continue
            on, kun, definition = triplet
            detail_list.append((k, on, kun, definition))
        return detail_list
def test_cihai_without_unihan():
    """With ``unihan=False`` the config differs from UNIHAN_CONFIG and no dataset is attached."""
    app = Cihai(unihan=False)

    config_items = app.config.items()
    assert UNIHAN_CONFIG.items() != config_items, (
        'app can be initialized without unihan'
    )

    assert not hasattr(app, 'unihan')
def test_yaml_config_and_override(test_config_file):
    """A config loaded through ``from_file`` carries a truthy ``database`` entry."""
    config = Cihai.from_file(test_config_file).config
    assert config['database']
def test_bootstraps_unihan_by_default():
    """A default ``Cihai()`` uses UNIHAN_CONFIG and exposes a truthy ``unihan``."""
    app = Cihai()

    default_items = UNIHAN_CONFIG.items()
    assert default_items == app.config.items()

    assert app.unihan, 'cihai bootstraps unihan by default'
def test_config_defaults():
    """The default configuration contains a ``database`` key."""
    config = Cihai().config
    assert 'database' in config
from auto import * from pprint import pprint as pp from cihai.core import Cihai c=Cihai() # c.unihan.bootstrap({}) """ 造zào - create/make 话huà - speaking 辶 - radical - zǒu zhī páng (Chuò) - indicates motion 福fú - blessing 福fú - blessing 礻 - radical - shì - show reveal. 神shén - god. 一yī - one 口kǒu - mouth 田tián - field (garden) 造zào - create/make 話huà - speaking 辶 - radical 162 - zǒu zhī páng - walk/walking (indicates motion) 福fú - blessing 礻- radical 113 - shì (cult), represents 示Shì - reveal, manifest; demonstrate 神shén - god. 一yī - one 口kǒu - mouth 田tián - field (garden) 園yuán - garden 土tǔ - earth/clay 口kǒu - mouth 亻rén - radical (man)
# -*- coding: utf-8 -*- import xml.etree.ElementTree as elemTree import os, uuid from operator import methodcaller, itemgetter, mul from cihai.core import Cihai c = Cihai() import hms.tex as tex import hms.html as html import multiprocessing as mp from multiprocessing import Process, Lock, Queue, Value def generateIdent(): return str(uuid.uuid4()) def textify(e, spell, ident, coder=tex): total = '' part = e.text beforehand = False for child in e: if part != None: part = part.replace('.', '。').replace(',', ',').replace( ' ', '').replace('\t', '').replace('\n', '').replace(' ', '') if part != '': total += part beforehand = False if child.tag == 'quote': temp = textify(child, spell, ident, coder)