def test_z2h():
    assert_equal(jctconv.z2h('ティロフィナーレ'), 'ティロフィナーレ')
    assert_equal(jctconv.z2h('ティロフィナーレ', ignore='ィ'), 'ティロフィナーレ')
    _compare(partial(jctconv.z2h, kana=True), FULL_KANA, HALF_KANA)
    _compare(partial(jctconv.z2h, ascii=True), FULL_ASCII, HALF_ASCII)
    _compare(partial(jctconv.z2h, digit=True), FULL_DIGIT, HALF_DIGIT)
    assert_equal(
        jctconv.z2h(_concat(FULL_KANA, FULL_ASCII, FULL_DIGIT),
                    ascii=True, digit=True, kana=True),
        _concat(HALF_KANA, HALF_ASCII, HALF_DIGIT))
def test_z2h():
    assert_equal(jctconv.z2h('ティロフィナーレ'), 'ティロフィナーレ')
    assert_equal(jctconv.z2h('ティロフィナーレ', ignore='ィ'), 'ティロフィナーレ')
    _compare(partial(jctconv.z2h, kana=True), FULL_KANA, HALF_KANA)
    _compare(partial(jctconv.z2h, ascii=True), FULL_ASCII, HALF_ASCII)
    _compare(partial(jctconv.z2h, digit=True), FULL_DIGIT, HALF_DIGIT)
    # Every flag combination: classes whose flag is off go in already
    # half-width, so the expected result is always the all-half-width string.
    for ascii in (True, False):
        for digit in (True, False):
            for kana in (True, False):
                assert_equal(
                    jctconv.z2h(_concat(FULL_KANA if kana else HALF_KANA,
                                        FULL_ASCII if ascii else HALF_ASCII,
                                        FULL_DIGIT if digit else HALF_DIGIT),
                                ascii=ascii, digit=digit, kana=kana),
                    _concat(HALF_KANA, HALF_ASCII, HALF_DIGIT))
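# A minimal sketch of the flag behaviour the tests above exercise,
# assuming jctconv is installed; each character class is enabled
# explicitly here:
import jctconv

assert jctconv.z2h(u'ＡＢＣ', ascii=True) == u'ABC'
assert jctconv.z2h(u'１２３', digit=True) == u'123'
assert jctconv.z2h(u'アイウ', kana=True) == u'アイウ'  # full-width kana to half-width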
import re

import jctconv


def convert_two_digit(string):
    '''Convert a run of exactly two full-width digits into half-width digits.'''
    # Full-width digit ranges ([０-９]) are required here: ASCII digits
    # are already half-width and never need conversion.
    patterns = re.findall(r'[^０-９][０-９]{2}[^０-９]', string)
    patterns += re.findall(r'^[０-９]{2}[^０-９]', string)
    for p in patterns:
        # Escape the matched snippet so any regex metacharacters in the
        # surrounding context characters are treated literally.
        string = re.sub(re.escape(p), jctconv.z2h(p, digit=True), string)
    return string
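# Hypothetical usage of convert_two_digit: only runs of exactly two
# full-width digits bounded by non-digits are narrowed, so the
# four-digit year below is deliberately left alone.
assert convert_two_digit(u'１２月から３１日まで') == u'12月から31日まで'
assert convert_two_digit(u'２０１５年') == u'２０１５年'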
# -*- coding: utf-8 -*-
import pickle
import sys
import time
import urllib2

import jctconv

# Crawl the Wikipedia HTML page for each title.
i = 0
f = open("synonym_dict.txt")
f2 = open("synonym_dict2.txt", "w")
synonym_dict = pickle.load(f)
for abb_pair_list in open(sys.argv[1], "r"):
    i += 1
    print i
    # Which field holds the target title depends on the input CSV file,
    # so this index has to be adjusted for each run.
    target_title_decode = abb_pair_list.strip().split(",")[1].decode("utf-8")
    target_title_encode = jctconv.z2h(target_title_decode, kana=False,
                                      digit=True, ascii=True).encode("utf-8")
    if target_title_encode not in synonym_dict:
        print "new_key"
        url_text = "http://ja.wikipedia.org/wiki/%s" % target_title_encode
        try:
            url_html = urllib2.urlopen(url_text).read()
            synonym_dict[target_title_encode] = url_html
            print "url_succeed"
            time.sleep(10.0)
        except Exception:
            # Cache the failure as an empty page so the title is not retried.
            synonym_dict[target_title_encode] = ""
            time.sleep(10.0)
            continue
pickle.dump(synonym_dict, f2)
f.close()
f2.close()
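# Note: the crawler above interpolates raw UTF-8 bytes into the URL,
# which many servers reject. A hypothetical fix using Python 2's
# urllib.quote, as a drop-in replacement for the url_text line:
import urllib
url_text = "http://ja.wikipedia.org/wiki/%s" % urllib.quote(target_title_encode)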
if despacing:
    # Example use case: before Japanese tokenization, remove suspect
    # white space. Note that wide-comma normalization occurs here.
    line = despace(line)
    if verbose > 1:
        sys.stderr.write('despace ' + repr(lno) + ": " + line.encode('utf-8') + "\n")
if widecase:
    # Wide-casing for the specified character classes.
    line = h2z(line, kana=jctkana, digit=jctsym, ascii=jctalpha)
    if verbose > 1:
        sys.stderr.write('wide ' + repr(lno) + ": " + line.encode('utf-8') + "\n")
if narrowcase:
    # Narrow-casing for the specified character classes.
    line = z2h(line, kana=jctkana, digit=jctsym, ascii=jctalpha)
    if verbose > 1:
        sys.stderr.write('narrow ' + repr(lno) + ": " + line.encode('utf-8') + "\n")

# Tokenize using the pipe.
ptok.sendline(line.encode('utf-8'))
if verbose > 1:
    sys.stderr.write('reading...\n')
line = ptok.readline()
if not line:
    sys.stderr.write('tokenizer EOF unexpected at line ' + repr(lno) + "\n")
    break
if verbose > 1:
    sys.stderr.write('tokenized ' + repr(lno) + ": " + line)
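# A guess at how the ptok handle above might be built: the
# sendline()/readline() pair matches pexpect's spawn API. Hypothetical
# sketch; the actual tokenizer command is not shown in the snippet.
import pexpect
ptok = pexpect.spawn('mecab -Owakati')  # any line-based tokenizer works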
def test_z2h():
    assert_equal(jctconv.z2h(u'ティロフィナーレ'), u'ティロフィナーレ')
    assert_equal(jctconv.z2h(FULL_KANA), HALF_KANA)
    assert_equal(jctconv.z2h(FULL_ASCII, mode='ASCII'), HALF_ASCII)
    assert_equal(jctconv.z2h(FULL_DIGIT, mode='DIGIT'), HALF_DIGIT)
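# This last snippet targets an older jctconv API (mode='ASCII' /
# mode='DIGIT'), while the snippets above use the newer boolean flags.
# A hypothetical shim for running the old calls against the new API:
def z2h_compat(text, mode=None):
    import jctconv
    return jctconv.z2h(text,
                       kana=(mode is None),
                       ascii=(mode == 'ASCII'),
                       digit=(mode == 'DIGIT'))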