# -*- coding: utf-8 -*-
"""Export word-frequency tables for the HKCanCor and Sun (1924) corpora as CSV."""
import csv

import pycantonese as pc


def _write_frequency_csv(freq, path):
    """Write a {word: count} mapping to *path*, one (word, count) row per entry."""
    with open(path, encoding='utf-8', mode='w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for word, count in freq.items():
            writer.writerow([word, count])


# HKCanCor corpus frequencies.
corpus = pc.hkcancor()
_write_frequency_csv(corpus.word_frequency(), 'hkcancorFrequency.csv')

# Sun (1924) tagged CHAT transcript frequencies.
sun = pc.read_chat('../00-source/sun_1924_tagged.cha')
# Bug fix: the original called sun.word_frequency() twice and discarded the
# first result; compute the frequency table once.
_write_frequency_csv(sun.word_frequency(), 'sunFrequency.csv')
import sys

import pytest

import pycantonese

# Load the corpus once at module import; both tests query it.
HKCANCOR = pycantonese.hkcancor()


def almost_equal(x, y, tolerance):
    """Return True when x and y differ by at most *tolerance*.

    Small helper so we don't bother importing numpy's
    assert_almost_equal just for these tests.
    """
    difference = x - y
    if difference < 0:
        difference = -difference
    return difference <= tolerance


def test_hkcancor_word_count():
    word_total = len(HKCANCOR.words())
    assert almost_equal(word_total, 149781, tolerance=3)


@pytest.mark.skipif(sys.version_info[0] == 2,
                    reason='character/unicode parsing not yet fixed '
                           'for python 2.7')
def test_hkcancor_character_count():
    character_total = len(HKCANCOR.characters())
    assert almost_equal(character_total, 186888, tolerance=3)
import sys
from collections import Counter
import json

from memoize import memoize
import pinyin
import jyutping
import pycantonese

corpus = pycantonese.hkcancor()

#from hanziconv import HanziConv
from opencc import OpenCC
s2hk = OpenCC('s2hk').convert

from mkdict import pinyin_to_zhuyin_real as pinyin_to_zhuyin
from mkdict import get_all_yue, get_merged_entries


def get_contents_in_dictionary(dictfile):
    """Return the lines of *dictfile* that appear after the '...' marker line.

    Everything up to and including the first line whose stripped content is
    exactly '...' is skipped (presumably a YAML-style header terminator —
    TODO confirm against the dictionary file format).  Remaining lines are
    returned with their trailing newlines intact.
    """
    # Bug fix: use a context manager so the file handle is closed.
    with open(dictfile) as f:
        lines = f.readlines()
    output = []
    is_started = False
    for line in lines:
        x = line.strip()
        if x == '...':
            is_started = True
            continue
        if not is_started:
            continue
        output.append(line)
    # Bug fix: the original built `output` but never returned it, so every
    # call produced None.
    return output
def main():
    """Train the POS tagger on HKCanCor and persist it to the pickle path.

    Builds a tagger from the shared parameter set, trains it on the tagged
    HKCanCor sentences, and saves the trained model to _PICKLE_PATH.
    """
    tagger = POSTagger(**_TAGGER_PARAMETERS)
    # Bug fix: _get_tagged_sents() takes no arguments (it loads hkcancor()
    # itself), so the original call _get_tagged_sents(corpus) raised a
    # TypeError; the now-unused local `corpus = hkcancor()` is dropped too.
    tagger.train(_get_tagged_sents(), save=_PICKLE_PATH)
#loading packages import pycantonese as pc import xlsxwriter import collections print("Please read the code and change directories accordingly (see line 9).") #setting up workbook workbook = xlsxwriter.Workbook('hkcancor_word.xlsx') worksheet = workbook.add_worksheet('words') #loading files c = pc.hkcancor() #getting words tokens = c.search(tone='[1-6]') #word frequency word_count = collections.Counter(tokens) #writing data row = 1 worksheet.write(0, 0, 'word') worksheet.write(0, 1, 'jyutping') worksheet.write(0, 2, 'occured') worksheet.write(0, 3, 'ratio') word_sum = sum(word_count.values()) for i in word_count.keys(): worksheet.write(row, 0, i[0]) worksheet.write(row, 1, i[2])
def _get_tagged_sents():
    """Return HKCanCor utterances as lists of (word, POS-tag) pairs.

    Each tag is remapped through _FIX_HKCANCOR_TAGS when a correction is
    registered for it; otherwise the corpus tag is kept unchanged.
    """
    tagged_sents = []
    for utterance in hkcancor().tokens(by_utterances=True):
        pairs = []
        for token in utterance:
            tag = _FIX_HKCANCOR_TAGS.get(token.pos, token.pos)
            pairs.append((token.word, tag))
        tagged_sents.append(pairs)
    return tagged_sents