Beispiel #1
0
def test_add_dataset():
    """A dataset added under a namespace is exposed as an attribute of the app."""
    app = Cihai()
    app.add_dataset(SimplestDataset, namespace='simple')

    assert hasattr(app, 'simple')

    dataset = app.simple
    assert isinstance(dataset, extend.Dataset)

    # The dataset's own method must be reachable through the namespace.
    assert hasattr(dataset, 'a_method')
    assert callable(dataset.a_method)
    assert dataset.a_method() == 'hi'
Beispiel #2
0
    def __init__(self):
        """Create the Cihai instance, bootstrapping Unihan when it is missing."""
        super().__init__()

        self.c = Cihai()

        if self.c.is_bootstrapped:
            return

        # download and install Unihan to db, then reflect the database
        bootstrap_unihan(self.c.metadata)
        self.c.reflect_db()
Beispiel #3
0
def run():
    """Register ``MyDataset`` under the ``moo`` namespace and print three lookups."""
    app = Cihai(unihan=False)
    app.add_dataset(MyDataset, namespace='moo')

    dataset = app.moo
    dataset.bootstrap()

    exact = dataset.givemedata('好')
    print('Definitions exactly for 好', exact)

    matching = ', '.join(dataset.search('好'))
    print('Definitions matching with 你好:', matching)

    reverse = ', '.join(dataset.backwards('Good'))
    print('Reverse definition with Good:', reverse)
Beispiel #4
0
def run():
    """Bootstrap ``MyDataset`` as ``moo`` and print exact, matching and reverse lookups."""
    cihai_app = Cihai(unihan=False)
    cihai_app.add_dataset(MyDataset, namespace='moo')

    moo = cihai_app.moo
    moo.bootstrap()

    exact_definitions = moo.givemedata('好')
    print('Definitions exactly for 好', exact_definitions)

    matching_definitions = ', '.join(moo.search('好'))
    print('Definitions matching with 你好:', matching_definitions)

    reverse_definitions = ', '.join(moo.backwards('Good'))
    print('Reverse definition with Good:', reverse_definitions)
Beispiel #5
0
def test_reflect_db(tmpdb_file, unihan_options):
    """The ``Unihan`` class appears on ``base.classes`` only after ``reflect_db()``."""
    database_url = 'sqlite:///{tmpdb_file}'.format(tmpdb_file=tmpdb_file)
    config = {'database': {'url': database_url}}

    c = Cihai(config)
    assert not c.is_bootstrapped

    bootstrap.bootstrap_unihan(c.metadata, unihan_options)

    # ``c.base.classes`` is read afresh on each side of the reflection call.
    assert not hasattr(c.base.classes, 'Unihan')
    c.reflect_db()
    assert hasattr(c.base.classes, 'Unihan')
Beispiel #6
0
def run(unihan_options=None):
    """Look up one character and one reverse match in the Unihan dataset.

    Adds the Unihan dataset under the ``unihan`` namespace, bootstraps it if
    needed, then prints the definition of 好 and the characters matching
    "good".

    :param unihan_options: options forwarded to ``c.unihan.bootstrap``;
        defaults to an empty dict.
    """
    # Build the default per call so no dict is shared between invocations.
    if unihan_options is None:
        unihan_options = {}

    c = Cihai(unihan=False)
    c.add_dataset('cihai.data.unihan.dataset.Unihan', namespace='unihan')

    if not c.unihan.is_bootstrapped:  # download and install Unihan to db
        c.unihan.bootstrap(unihan_options)

    query = c.unihan.lookup_char('好')
    glyph = query.first()
    print("lookup for 好: %s" % glyph.kDefinition)

    query = c.unihan.reverse_char('good')
    print('matches for "good": %s ' % ', '.join(glph.char for glph in query))
Beispiel #7
0
def run(unihan_options=None):
    """Print Traditional/Simplified variant data for characters that have both.

    Bootstraps Unihan if needed, adds the ``UnihanVariants`` plugin under the
    ``variants`` namespace, then walks every character that has both
    ``kTraditionalVariant`` and ``kSimplifiedVariant`` set.

    :param unihan_options: options forwarded to ``c.unihan.bootstrap``;
        defaults to an empty dict.
    """
    # Build the default per call so no dict is shared between invocations.
    if unihan_options is None:
        unihan_options = {}

    c = Cihai()
    if not c.unihan.is_bootstrapped:  # download and install Unihan to db
        c.unihan.bootstrap(unihan_options)

    c.unihan.add_plugin('cihai.data.unihan.dataset.UnihanVariants',
                        namespace='variants')

    print("This example prints some tricky cases of character-by-character "
          "Traditional-Simplified mapping.")
    print("https://www.unicode.org/reports/tr38/#N10211")
    print("3.7.1 bullet 4")

    for char in c.unihan.with_fields("kTraditionalVariant",
                                     "kSimplifiedVariant"):
        print("Character: {}".format(char.char))
        trad = set(char.untagged_vars("kTraditionalVariant"))
        simp = set(char.untagged_vars("kSimplifiedVariant"))
        Unihan = c.sql.base.classes.Unihan
        # NOTE(review): this tests the class-level ``Unihan.char`` attribute,
        # not the current row (``char`` / ``char.char``) -- confirm which value
        # was meant before relying on the "Case 1" branch.
        if Unihan.char in trad and Unihan.char in simp:
            print("Case 1")
        else:
            print("Case 2 (non-idempotent)")
        for trad_var in trad:
            print("s2t: {}".format(trad_var))
        for simp_var in simp:
            print("t2s: {}".format(simp_var))
Beispiel #8
0
def cli(ctx, config, log_level):
    """Retrieve CJK information via CLI.

    For help and example usage, see documentation:

    https://cihai-cli.git-pull.com and https://cihai.git-pull.com"""
    setup_logger(level=log_level.upper())

    # Load from the given config file when there is one, otherwise use defaults.
    app = Cihai.from_file(config) if config else Cihai()

    unihan = app.unihan
    if not unihan.is_bootstrapped:
        click.echo("Bootstrapping Unihan database")
        bootstrap_options = app.config.get('unihan_options', {})
        unihan.bootstrap(options=bootstrap_options)

    # pass Cihai object down to other commands
    ctx.obj['c'] = app
Beispiel #9
0
def test_unihan_options(unihan_options, test_config_file):
    """Bootstrapping creates a ``Unihan`` table with the expected column names."""
    app = Cihai.from_file(test_config_file)
    bootstrap.bootstrap_unihan(app.metadata, unihan_options)

    tables = app.metadata.tables
    assert 'Unihan' in tables

    unihan_columns = tables['Unihan'].columns
    assert unihan_columns

    expected_names = set(bootstrap.UNIHAN_FIELDS + ['ucn', 'char'])
    assert set(unihan_columns.keys()) == expected_names

    assert bootstrap.is_bootstrapped(app.metadata)
Beispiel #10
0
def test_unihan_options(unihan_options, test_config_file):
    """Bootstrapping via ``app.sql.metadata`` creates the expected ``Unihan`` table."""
    app = Cihai.from_file(test_config_file)
    bootstrap.bootstrap_unihan(app.sql.metadata, unihan_options)

    tables = app.sql.metadata.tables
    assert 'Unihan' in tables

    unihan_columns = tables['Unihan'].columns
    assert unihan_columns

    expected_names = set(bootstrap.UNIHAN_FIELDS + ['ucn', 'char'])
    assert set(unihan_columns.keys()) == expected_names

    assert bootstrap.is_bootstrapped(app.sql.metadata)
Beispiel #11
0
def test_config_dict_args():
    """A dict passed to ``Cihai`` can be read back through ``app.config``."""
    expected = 'world'
    app = Cihai({'hello': expected})
    assert app.config['hello'] == expected
Beispiel #12
0
def test_add_dataset_unihan(unihan_options):
    """Exercise the Unihan dataset: lookups, reverse lookups and the variants plugin."""
    c = Cihai()
    c.add_dataset(Unihan, namespace='unihan')
    assert hasattr(c, 'unihan')
    assert isinstance(c.unihan, extend.Dataset)

    # Bare attribute access, kept as in the original test.
    c.unihan.sql

    c.unihan.bootstrap(options=unihan_options)
    U = c.sql.base.classes.Unihan

    # First row that actually has a definition to compare against.
    defined_rows = c.unihan.sql.session.query(U).filter(
        U.kDefinition.isnot(None))
    first_glyph = defined_rows.first()
    char = first_glyph.char

    looked_up = c.unihan.lookup_char(char=char).first()
    assert looked_up.kDefinition == first_glyph.kDefinition

    from_list = c.unihan.reverse_char(hints=[first_glyph.kDefinition]).first()
    assert from_list.char == char, 'works with list of column value matches'

    from_string = c.unihan.reverse_char(hints=first_glyph.kDefinition).first()
    assert from_string.char == char, 'works with strings'

    c.unihan.add_plugin(UnihanVariants, 'variants')
    assert hasattr(c.unihan, 'variants')

    def variant_list(field):
        """Yield ``(glyph, variants)`` pairs for every glyph with *field* set."""
        for glyph in c.unihan.with_fields(field):
            yield (glyph, list(glyph.untagged_vars(field)))

    result = dict(variant_list('kZVariant'))

    assert len(result.values()) > 0
    assert len(result.keys()) > 0
Beispiel #13
0
def run(unihan_options=None):
    """Wrapped so we can test in tests/test_examples.py

    Bootstraps Unihan if needed, adds the ``UnihanVariants`` plugin under the
    ``variants`` namespace, and calls ``variant_list`` for three variant
    fields, printing a heading before each.

    :param unihan_options: options forwarded to ``c.unihan.bootstrap``;
        defaults to an empty dict.
    """
    # Build the default per call so no dict is shared between invocations.
    if unihan_options is None:
        unihan_options = {}

    print("This example prints variant character data.")

    c = Cihai()
    if not c.unihan.is_bootstrapped:  # download and install Unihan to db
        c.unihan.bootstrap(unihan_options)

    c.unihan.add_plugin('cihai.data.unihan.dataset.UnihanVariants',
                        namespace='variants')

    print("## ZVariants")
    variant_list(c.unihan, "kZVariant")

    print("## kSemanticVariant")
    variant_list(c.unihan, "kSemanticVariant")

    print("## kSpecializedSemanticVariant")
    variant_list(c.unihan, "kSpecializedSemanticVariant")
Beispiel #14
0
#!/usr/bin/env python3

import sys
from cihai.core import Cihai

# Module-level state; slow_search() below declares all three names as globals.
verbose = False
debug = ""
# Cihai instance used for the Unihan character lookups.
c_dict = Cihai()

def codepoints2chars(codepoints):
	"""Convert a whitespace-separated string of ``U+XXXX`` code points to characters.

	Each token may carry a trailing ``<source`` tag (e.g. ``U+5973<kMatthews``);
	the tag is discarded. Code points of any length are supported, including
	supplementary-plane values such as ``U+20000``.

	:param codepoints: string of ``U+``-prefixed hex code points, or ``None``.
	:returns: list of single-character strings, or ``None`` when *codepoints*
		is ``None``.
	:raises ValueError: if a token does not contain a valid hex code point.
	"""
	if codepoints is None:
		return None

	results = []
	for token in codepoints.split():
		# Drop the "U+" prefix, then strip the dictionary (e.g. Matthews)
		# name if there is one.
		hex_digits = token[2:].partition('<')[0]
		# chr() handles 5- and 6-digit code points, which a "\uXXXX" escape
		# would truncate to its first four hex digits.
		results.append(chr(int(hex_digits, 16)))

	return results

def slow_search(zi, shujuku):

	global verbose
	global debug
	global c_dict

	# See cihai.git-pull.com/api.html
	query = c_dict.unihan.lookup_char(zi)
	glyph = query.first()
Beispiel #15
0
from typing import List, Optional, Tuple, Iterator, Set

from cihai.core import Cihai
from django.http.request import HttpRequest
from django.forms.models import model_to_dict
from django.db import IntegrityError
from django.db.models.query import QuerySet

from .forms import SenseForm
from .models import Headword, Sense, Phrase, Example

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Matches one separator character (semicolon or comma).
# NOTE(review): ';' appears twice in the character class, which is redundant;
# possibly a different (e.g. full-width) semicolon was intended -- confirm.
SEP_RE = re.compile(r'[;;,]')

# Shared Cihai instance; the Unihan dataset is bootstrapped at import time
# when it has not been bootstrapped yet.
C = Cihai()
if not C.unihan.is_bootstrapped:
    C.unihan.bootstrap()


@dataclass
class Entry:
    headword: str
    headword_sense_no: int
    char_strokes_first: str
    char_strokes_all: str
    only_letters: str
    root: str
    root_sense_no: int
    word_class: list
    focus: list
Beispiel #16
0
#!/usr/bin/env python
# -*- coding: utf8 - *-

from __future__ import unicode_literals, print_function

from cihai.core import Cihai
from cihai.bootstrap import bootstrap_unihan

# Create the Cihai object (no configuration passed).
c = Cihai()
if not c.is_bootstrapped:  # download and install Unihan to db
    bootstrap_unihan(c.metadata)
    c.reflect_db()  # automap new table created during bootstrap

# Look up one character and print the definition of the first result.
query = c.lookup_char('好')
glyph = query.first()
print("lookup for 好: %s" % glyph.kDefinition)

# Reverse lookup: print the character of every result returned for 'good'.
query = c.reverse_char('good')
print('matches for "good": %s ' % ', '.join([glph.char for glph in query]))
Beispiel #17
0
from sqlalchemy.sql import column
from cihai.core import Cihai
from cihai.bootstrap import bootstrap_unihan
from utils import parse_vars

print("References from https://www.unicode.org/reports/tr38/#N10211")

# Bootstrap the Unihan data only when it is not already present.
cihan = Cihai()
if not cihan.is_bootstrapped:
    bootstrap_unihan(cihan.metadata)

# Unlike the bootstrap above, reflection runs on every execution.
cihan.reflect_db()


# Every Unihan row whose kTraditionalVariant and kSimplifiedVariant columns
# are both non-NULL.
double_var = (
    cihan.session.query(cihan.base.classes.Unihan)
    .filter(column("kTraditionalVariant").isnot(None))
    .filter(column("kSimplifiedVariant").isnot(None))
    .all()
)


print("## 3.7.1 bullet 4")

for c in double_var:
    print("Character: {}".format(c.char))
    trad = parse_vars(c.kTraditionalVariant)
    simp = parse_vars(c.kSimplifiedVariant)
    if c.char in trad and c.char in simp:
        print("Case 1")
    else:
Beispiel #18
0
import sys
sys.__stdout__ = sys.stdout
from cihai.core import Cihai
from cihai.bootstrap import bootstrap_unihan

# Module-level Cihai instance used by jp_reverse() below.
c = Cihai()

if not c.is_bootstrapped:  # download and install Unihan to db
    bootstrap_unihan(c.metadata)
    # Reflect the database after the bootstrap has run.
    c.reflect_db()


def jp_reverse(word):
    """Return the characters found by a reverse lookup of *word*, joined by ``', '``."""
    matches = c.reverse_char(word)
    return ', '.join(match.char for match in matches)
Beispiel #19
0
class ExplainKanji(BaseFilter):
    """Filter that appends reading and definition chunks to Japanese text chunks.

    For a Japanese ``TextChunk`` the filter returns the duplicated chunk
    followed by a bracketed explanation of each non-kana character found in
    the Unihan database: its on readings, its kun readings and its
    definition. Any other chunk is returned alone.
    """

    def __init__(self):
        """Create the Cihai instance, bootstrapping Unihan when it is missing."""
        super().__init__()

        self.c = Cihai()

        if not self.c.is_bootstrapped:  # download and install Unihan to db
            bootstrap_unihan(self.c.metadata)
            self.c.reflect_db()

    def __call__(self, chunk):
        """Return ``[chunk]`` plus explanation chunks for Japanese text.

        :param chunk: the chunk to process; it is duplicated before use.
        :returns: a list whose first element is the duplicated chunk. For
            Japanese ``TextChunk`` input the explanation chunks follow.
        """
        # Imported at call time, as in the original code.
        from src.Sequencer import TextChunk, JingleChunk

        chunk = self._duplicate_chunk(chunk)
        result = [chunk]

        if not isinstance(chunk, TextChunk) or chunk.language != 'japanese':
            return result

        explanations = self._get_explanations(chunk.text)

        result.append(
            TextChunk(text='[', audible=False, printable=True, final=True))

        for k, ons, kuns, explanation in explanations:
            result.append(
                TextChunk(text=k,
                          language='japanese',
                          audible=False,
                          printable=True,
                          final=True))

            result.extend(self._reading_chunks('on', ons))
            result.extend(self._reading_chunks('koon', kuns))

            result.append(JingleChunk(jingle='definition'))
            result.append(
                TextChunk(text=explanation,
                          language='english',
                          audible=True,
                          printable=True,
                          final=True))

        result.append(
            TextChunk(text=']', audible=False, printable=True, final=True))
        result.append(
            JingleChunk(jingle='silence_long',
                        audible=False,
                        printable=True,
                        final=True))

        return result

    @staticmethod
    def _reading_chunks(label, readings):
        """Build the chunks announcing *label* followed by each reading.

        :param label: English text spoken (not printed) before the readings.
        :param readings: iterable of kana strings.
        :returns: list of chunks: the label, a silence jingle, then for each
            reading the reading itself, a silence jingle and a printed '、'.
        """
        from src.Sequencer import TextChunk, JingleChunk

        chunks = [
            TextChunk(text=label,
                      language='english',
                      audible=True,
                      printable=False,
                      final=True),
            JingleChunk(jingle='silence'),
        ]
        for reading in readings:
            chunks.append(
                TextChunk(text=reading,
                          language='japanese',
                          audible=True,
                          printable=True,
                          final=True))
            chunks.append(JingleChunk(jingle='silence'))
            chunks.append(
                TextChunk(text='、',
                          audible=False,
                          printable=True,
                          final=True))
        return chunks

    def _kanji_to_kana(self, char):
        """Look up *char* and return its readings and definition.

        :returns: ``(on_readings, kun_readings, definition)`` where the on
            readings are converted to katakana and the kun readings to
            hiragana, or ``None`` when the lookup finds no row for *char*.
        """
        glyph = self.c.lookup_char(char).first()
        if glyph is None:
            return None
        # On readings come from kJapaneseOn and kun readings from
        # kJapaneseKun; a missing (None) column is treated as "no readings".
        romaji_on = (glyph.kJapaneseOn or '').lower()
        romaji_kun = (glyph.kJapaneseKun or '').lower()
        # split() without an argument yields [] for an empty string, so no
        # blank reading chunk is produced.
        jp_on = jaconv.hira2kata(jaconv.alphabet2kana(romaji_on)).split()
        jp_kun = jaconv.alphabet2kana(romaji_kun).split()
        return jp_on, jp_kun, glyph.kDefinition

    @staticmethod
    def is_kana(char):
        """Return True when *char* lies in the Katakana or Hiragana blocks."""
        return ('\u30A0' <= char <= '\u30FF') or (
            '\u3040' <= char <= '\u309F')  # Katakana and Hiragana blocks

    @classmethod
    def is_kanji(cls, char):
        """Return True for any character that is not kana.

        This is a negative test only: punctuation, Latin letters and so on
        also pass it.
        """
        return not cls.is_kana(char)

    def _get_explanations(self, text):
        """Return ``(char, on, kun, definition)`` for each distinct non-kana char.

        Characters are reported once each, in order of first appearance in
        *text*; characters whose lookup returns nothing are skipped.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output order is deterministic (set iteration order is not).
        kanji = dict.fromkeys(filter(self.is_kanji, text))
        detail_list = []
        for k in kanji:
            triplet = self._kanji_to_kana(k)
            if triplet is None:
                continue
            on, kun, definition = triplet
            detail_list.append((k, on, kun, definition))
        return detail_list
Beispiel #20
0
def test_cihai_without_unihan():
    """With ``unihan=False`` the config differs from UNIHAN_CONFIG and no ``unihan`` attribute exists."""
    app = Cihai(unihan=False)

    config_differs = UNIHAN_CONFIG.items() != app.config.items()
    assert config_differs, 'app can be initialized without unihan'

    assert not hasattr(app, 'unihan')
Beispiel #21
0
def test_yaml_config_and_override(test_config_file):
    """An app loaded from the config file has a truthy ``database`` entry."""
    loaded_config = Cihai.from_file(test_config_file).config
    assert loaded_config['database']
Beispiel #22
0
def test_bootstraps_unihan_by_default():
    """A default app carries the UNIHAN_CONFIG items and a truthy ``unihan`` attribute."""
    app = Cihai()

    default_items = UNIHAN_CONFIG.items()
    assert default_items == app.config.items()

    assert app.unihan, 'cihai bootstraps unihan by default'
Beispiel #23
0
def test_config_defaults():
    """The default configuration contains a ``database`` key."""
    default_config = Cihai().config
    assert 'database' in default_config
Beispiel #24
0
from auto import *
from pprint import pprint as pp
from cihai.core import Cihai
# Create the Cihai object (no configuration passed).
c=Cihai()
# Bootstrap call left disabled:
# c.unihan.bootstrap({})

"""
造zào - create/make
话huà - speaking
辶 - radical - zǒu zhī páng (Chuò) - indicates motion
福fú - blessing
福fú - blessing
礻 - radical - shì - show reveal.
神shén - god.
一yī - one
口kǒu - mouth
田tián - field (garden)

造zào - create/make
話huà - speaking
辶 - radical 162 - zǒu zhī páng - walk/walking (indicates motion)
福fú - blessing
礻- radical 113 - shì (cult), represents 示Shì - reveal, manifest; demonstrate
神shén - god.
一yī - one
口kǒu - mouth
田tián - field (garden)
園yuán - garden
土tǔ - earth/clay
口kǒu - mouth
亻rén - radical (man)
Beispiel #25
0
def test_yaml_config_and_override(test_config_file):
    """Loading the app from a config file yields a truthy ``database`` setting."""
    config = Cihai.from_file(test_config_file).config
    database_settings = config['database']
    assert database_settings
Beispiel #26
0
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as elemTree
import os, uuid
from operator import methodcaller, itemgetter, mul
from cihai.core import Cihai

# Module-level Cihai instance, created at import time with no configuration passed.
c = Cihai()

import hms.tex as tex
import hms.html as html
import multiprocessing as mp
from multiprocessing import Process, Lock, Queue, Value


def generateIdent():
    """Return a freshly generated random UUID (version 4) as a string."""
    identifier = uuid.uuid4()
    return str(identifier)


def textify(e, spell, ident, coder=tex):
    total = ''
    part = e.text
    beforehand = False
    for child in e:
        if part != None:
            part = part.replace('.', '。').replace(',', ',').replace(
                ' ', '').replace('\t', '').replace('\n', '').replace(' ', '')
            if part != '':
                total += part
                beforehand = False
        if child.tag == 'quote':
            temp = textify(child, spell, ident, coder)