Ejemplo n.º 1
0
from wordfreq import zipf_frequency

from database import database, objects

import logging
import sqlite3
import sys

# Metadata records for the dictionary sources handled by this script.
# NOTE(review): the positional fields appear to be, in order: name, short
# name, version, description, legal notice, link, update URL, other --
# confirm against the definition of objects.SourceTuple.
# Long texts are split into adjacent string literals, which Python joins
# at compile time, so each value is unchanged.
sources = [
    # CC-CEDICT
    objects.SourceTuple(
        "CC-CEDICT",
        "CC",
        "2022-02-07",
        (
            "CC-CEDICT is a continuation of the CEDICT project started by Paul "
            "Denisowski in 1997 with the aim to provide a complete downloadable "
            "Chinese to English dictionary with pronunciation in pinyin for the "
            "Chinese characters."
        ),
        (
            "This work is licensed under a Creative Commons "
            "Attribution-ShareAlike 4.0 International License."
        ),
        "http://www.mdbg.net/chindict/chindict.php?page=cc-cedict",
        "",
        "",
    ),
    # CC-Canto
    objects.SourceTuple(
        "CC-CANTO",
        "CCY",
        "2017-02-02",
        (
            "CC-Canto is an open-source Cantonese-to-English dictionary with "
            "about 22,000 entries, designed to be used alongside CC-CEDICT."
        ),
        (
            "This work is licensed under a Creative Commons "
            "Attribution-ShareAlike 3.0 License."
        ),
        "http://cantonese.org/download.html",
        "",
        "",
    ),
]
Ejemplo n.º 2
0
               "<source version> <source description> <source legal> "
               "<source link> <source update url> <source other>"))
        print((
            "e.g. python3 script.py moedict.db ./dict-revised.json "
            '"Ministry of Education Dictionary (MoEDict)" MOE 2021-08-06 '
            '"本典為一部歷史語言辭典,記錄中古至現代各類詞語,並大量引用古典文獻書證,字 音部分則兼收現代及傳統音讀。" '
            '"中華民國教育部《重編國語辭典修訂本》資料採「創用CC-姓名標示- 禁止改作 3.0 臺灣授權條款」釋出'
            '本授權條款允許使用者重製、散布、傳輸著作(包括商業性利用),但不得修改該著作,使用時必須遵照「使用說明」之內容要求。" '
            '"https://language.moe.gov.tw/001/Upload/Files/site_content/M0001/respub/dict_reviseddict_download.html" "" "words,sentences"'
        ))
        sys.exit(1)

    cc_cedict.load()

    source = objects.SourceTuple(
        sys.argv[3],
        sys.argv[4],
        sys.argv[5],
        sys.argv[6],
        sys.argv[7],
        sys.argv[8],
        sys.argv[9],
        sys.argv[10],
    )

    logging.getLogger().setLevel(logging.INFO)

    words = []
    parse_file(sys.argv[2], words)
    write(sys.argv[1], source, words)
Ejemplo n.º 3
0
    chinese_sentences = {}  # Use this to store all the source sentences
    nonchinese_sentences = {}  # Use this to store all the target sentences
    intermediate_ids = set()  # Use this to store ids of sentences between source/target
    chinese_sentences_filtered = (
        {}
    )  # Store only source sentences that match a target sentence
    nonchinese_sentences_filtered = (
        {}
    )  # Store only target sentences that match a source sentence
    links = {}  # Use this to store all the links between sentences
    source = objects.SourceTuple(
        sys.argv[6],
        sys.argv[7],
        sys.argv[8],
        sys.argv[9],
        sys.argv[10],
        sys.argv[11],
        sys.argv[12],
        sys.argv[13],
    )
    parse_sentence_file(
        sys.argv[2],
        sys.argv[4],
        sys.argv[5],
        chinese_sentences,
        nonchinese_sentences,
        intermediate_ids,
        enable_jyutping,
        enable_pinyin,
    )
    parse_links_file(