def __init__(self, conversion, train):
    """Build one converter/evaluator pair per distinct conversion string.

    ``conversion`` is upper-cased, stripped of leading/trailing ``,:|;`` and
    split on ``;``.  For every distinct entry two ``Bsdconv`` handles are made:

    * the converter, with ``:SCORE:`` replaced by ``:``;
    * the evaluator, with ``:SCORE:`` replaced by
      ``:HALF:LOWER:ZH-FUZZY-TW:SCORE:COUNT:``.

    When ``train`` is true a temporary score file is created (and immediately
    unlinked), then attached to a dedicated trainer handle and to every
    evaluator.

    If any handle cannot be created, ``failed`` is set, ``errorstr`` holds
    ``Bsdconv.error()`` and the instance falls back to a single ``BYTE:BYTE``
    converter with no evaluators.
    """
    cs = conversion.upper().strip(',:|;').split(';')
    self.failed = False
    # Always defined so error() cannot raise AttributeError on success.
    self.errorstr = None
    self.conversions = []
    self.evaluaters = []
    self.trainer = None
    self.cache = {}
    self.lastc = 0
    self.cinfo = None
    self.tempfile = None
    done = set()

    if train:
        self.tempfile = mktemp('/tmp/gbsdconv.score.XXXXXX')
        # Only the open handle (index 0) is used; the path is removed at once.
        os.unlink(self.tempfile[1])
        self.trainer = Bsdconv('LOCALE:HALF:LOWER:ZH-FUZZY-TW:SCORE-TRAIN:NULL')
        self.trainer.ctl(CTL_ATTACH_SCORE, self.tempfile[0], 0)

    for c in cs:
        if c in done:
            continue
        done.add(c)
        c1 = c.replace(':SCORE:', ':')
        c2 = c.replace(':SCORE:', ':HALF:LOWER:ZH-FUZZY-TW:SCORE:COUNT:')
        h1 = Bsdconv(c1)
        h2 = Bsdconv(c2)
        if not h1 or not h2:
            self.errorstr = Bsdconv.error()
            self.failed = True
            # Reset BOTH lists and stop: continuing to append would leave
            # conversions[i] and evaluaters[i] referring to different entries.
            self.conversions = [Bsdconv('BYTE:BYTE')]
            self.evaluaters = []
            break
        if train:
            h2.ctl(CTL_ATTACH_SCORE, self.tempfile[0], 0)
        self.conversions.append(h1)
        self.evaluaters.append(h2)
def _print_group2(self, g2):
    """Print a Markdown-style report for layer-2 group ``g2``.

    Emits the group heading, one bullet per attribute (the value converted
    with ``bsdconv:utf-8`` followed by the raw value in parentheses), then
    delegates to ``_print_group1`` for every member of the group.
    """
    to_utf8 = Bsdconv("bsdconv:utf-8")
    members_index, attrs_index = self.layers[1][0], self.layers[1][1]

    print("#Layer 2 Group {0}\n".format(g2))
    print("##Attributes")
    attrs = attrs_index.get(g2)
    for cat in attrs:
        raw = attrs[cat]
        rendered = to_utf8.conv(p01(raw))
        print("* {0}: {1} ({2})".format(cat, rendered, raw))

    print("##Member")
    for member in members_index.data.get(g2):
        self._print_group1(member)
def transliterate(self, k, cat):
    """Return a replacement for ``k`` under category ``cat``, or ``k`` itself.

    ``k`` is first converted with ``bsdconv:<cat>``; if the ``OERR`` counter is
    zero afterwards, ``k`` is returned unchanged.  Otherwise the layers are
    walked in order: each layer maps the current key to its group through
    ``layer[0].rdata`` and looks the category up in that group's attributes
    (``layer[1]``).  The first truthy attribute value wins.

    ``k`` is returned unchanged when no layer supplies a value, including when
    a key has no group or a group has no attributes (the original raised
    ``AttributeError`` on ``None`` in those cases).
    """
    c = Bsdconv("bsdconv:{0}".format(cat))
    c.conv(p01(k))
    if not c.counter("OERR"):
        return k

    gk = k
    for layer in self.layers:
        # Climb one level: member key -> group key of this layer.
        gk = layer[0].rdata.get(gk)
        if gk is None:
            # Not a member of any group at this level; nothing further to try.
            break
        attrs = layer[1].get(gk)
        t = attrs.get(cat, None) if attrs else None
        if t:
            return t
    return k
def _print_group1(self, g):
    """Print a Markdown-style report for layer-1 group ``g``.

    Lists every attribute and every member of the group; each value is shown
    converted with ``bsdconv:utf-8`` followed by the raw value in parentheses.
    """
    to_utf8 = Bsdconv("bsdconv:utf-8")
    members_index, attrs_index = self.layers[0][0], self.layers[0][1]

    print("###Layer 1 Group {0}\n".format(g))
    print("####Attributes")
    attrs = attrs_index.get(g)
    for cat in attrs:
        raw = attrs[cat]
        rendered = to_utf8.conv(p01(raw))
        print(" * {0}: {1} ({2})".format(cat, rendered, raw))
    print("")

    print("####Member")
    members = members_index.data.get(g)
    for member in members:
        rendered = to_utf8.conv(p01(member))
        print(" * {0} ({1})".format(rendered, member))
    print("")
#!/usr/bin/env python
"""Convert a file with bsdconv.

Usage: <script> CONVERSION INPUT_FILE OUTPUT_FILE
"""
import sys
from bsdconv import Bsdconv

if len(sys.argv) != 4:
    sys.stderr.write("usage: {0} CONVERSION INPUT_FILE OUTPUT_FILE\n".format(sys.argv[0]))
    sys.exit(2)

p = Bsdconv(sys.argv[1])
if not p:
    # Report on stderr and exit non-zero so callers can detect the failure.
    sys.stderr.write("{0}\n".format(Bsdconv.error()))
    del p
    sys.exit(1)

p.conv_file(sys.argv[2], sys.argv[3])
print(p)
print(p.counter())
del p
class Bsdconvs(object):
    """Try several bsdconv conversions and use the best-scoring one.

    ``conversions[i]`` performs the real conversion and ``evaluaters[i]`` is
    its scoring twin; both lists are always index-aligned.  Each operation
    first scores the input with every evaluator (see ``weighted_score``),
    remembers the winner in ``lastc`` and then runs the matching converter.
    """

    # Class-level default: error() is safe even when nothing has failed.
    errorstr = None

    def __init__(self, conversion, train):
        """Build one converter/evaluator pair per distinct conversion string.

        ``conversion`` is upper-cased, stripped of leading/trailing ``,:|;``
        and split on ``;``.  When ``train`` is true a temporary score file is
        created (and unlinked) and attached to a trainer handle and to every
        evaluator.

        On failure to create any handle, ``failed`` is set, ``errorstr`` holds
        ``Bsdconv.error()`` and the instance falls back to a single
        ``BYTE:BYTE`` converter with no evaluators.
        """
        cs = conversion.upper().strip(',:|;').split(';')
        self.failed = False
        self.errorstr = None
        self.conversions = []
        self.evaluaters = []
        self.trainer = None
        self.cache = {}
        self.lastc = 0
        self.cinfo = None
        self.tempfile = None
        done = set()

        if train:
            self.tempfile = mktemp('/tmp/gbsdconv.score.XXXXXX')
            os.unlink(self.tempfile[1])
            self.trainer = Bsdconv('LOCALE:HALF:LOWER:ZH-FUZZY-TW:SCORE-TRAIN:NULL')
            self.trainer.ctl(CTL_ATTACH_SCORE, self.tempfile[0], 0)

        for c in cs:
            if c in done:
                continue
            done.add(c)
            c1 = c.replace(':SCORE:', ':')
            c2 = c.replace(':SCORE:', ':HALF:LOWER:ZH-FUZZY-TW:SCORE:COUNT:')
            h1 = Bsdconv(c1)
            h2 = Bsdconv(c2)
            if not h1 or not h2:
                self.errorstr = Bsdconv.error()
                self.failed = True
                # Reset both lists and stop; appending further pairs would
                # misalign conversions[] and evaluaters[].
                self.conversions = [Bsdconv('BYTE:BYTE')]
                self.evaluaters = []
                break
            if train:
                h2.ctl(CTL_ATTACH_SCORE, self.tempfile[0], 0)
            self.conversions.append(h1)
            self.evaluaters.append(h2)

    def __bool__(self):
        """Return True when construction succeeded."""
        return not self.failed

    def __nonzero__(self):
        """Python 2 spelling of ``__bool__``."""
        return self.__bool__()

    def __str__(self):
        """Return the conversion strings joined by ``;``.

        Each string is taken from between the first pair of double quotes in
        ``str()`` of the underlying handle.
        """
        return ';'.join([str(x).split('"')[1] for x in self.conversions])

    def weighted_score(self, i):
        """Return the per-item score for counter mapping ``i``.

        Computes ``(SCORE - 10 * (IERR + OERR)) / COUNT`` as a float; missing
        counters count as 0 and a zero ``COUNT`` yields 0.
        """
        ierr = i.get("IERR", 0)
        oerr = i.get("OERR", 0)
        score = i.get("SCORE", 0)
        count = i.get("COUNT", 0)
        if count == 0:
            return 0
        return float(score - (ierr + oerr) * 10) / float(count)

    def _select_best(self, score_of):
        """Store in ``lastc`` the index of the evaluator with the top score.

        ``score_of(evaluator)`` must return a comparable score.  The earliest
        evaluator wins ties; ``lastc`` is left untouched when there are no
        evaluators.
        """
        best = None
        for k, c in enumerate(self.evaluaters):
            score = score_of(c)
            if k == 0 or score > best:
                self.lastc = k
                best = score

    def _file_score(self, c, path):
        """Run evaluator ``c`` over the file at ``path`` and return its score."""
        c.testconv_file(path)
        return self.weighted_score(c.counter())

    def score_train(self, s):
        """Feed ``s`` to the trainer, if training is enabled."""
        if self.trainer:
            self.trainer.testconv(s)

    def score_clear(self):
        """Start over with a fresh score file for trainer and evaluators."""
        if self.trainer:
            self.tempfile = mktemp("/tmp/gbsdconv.score.XXXXXX")
            os.unlink(self.tempfile[1])
            self.trainer.ctl(CTL_ATTACH_SCORE, self.tempfile[0], 0)
            for c in self.evaluaters:
                c.ctl(CTL_ATTACH_SCORE, self.tempfile[0], 0)

    def conv(self, s):
        """Convert ``s`` with the conversion that scores best on ``s``."""
        self.cinfo = None

        def score_of(c):
            c.testconv(s)
            return self.weighted_score(c.counter())

        self._select_best(score_of)
        return self.conversions[self.lastc].conv(s)

    def conv_list(self, a):
        """Convert every item of ``a`` with the single best conversion.

        The best conversion is the one with the highest summed score over all
        items.  Afterwards ``counter()`` reports the counters summed over all
        items; for an empty ``a`` no summed counters are kept.
        """
        self.cinfo = None

        def score_of(c):
            total = 0
            for s in a:
                c.testconv(s)
                total += self.weighted_score(c.counter())
            return total

        self._select_best(score_of)

        ret = []
        totals = None
        for s in a:
            chosen = self.conversions[self.lastc]
            ret.append(chosen.conv(s))
            n = chosen.counter()
            if totals is None:
                totals = dict(n)
            else:
                # Merge tolerantly: a counter may appear in only some items.
                for key in n:
                    totals[key] = totals.get(key, 0) + n[key]
        self.cinfo = totals
        return ret

    def conv_file(self, ifile, ofile):
        """Convert ``ifile`` into ``ofile`` with the best-scoring conversion."""
        self.cinfo = None
        self._select_best(lambda c: self._file_score(c, ifile))
        self.conversions[self.lastc].conv_file(ifile, ofile)
        return

    def testconv_file(self, str):
        """Score the file at path ``str`` and keep the winner's counters.

        Nothing is written; ``counter()`` afterwards returns the counters of
        the best evaluator.  (The parameter name shadows the builtin but is
        kept for keyword-argument compatibility.)
        """
        self._select_best(lambda c: self._file_score(c, str))
        if self.evaluaters:
            self.cinfo = self.evaluaters[self.lastc].counter()
        else:
            self.cinfo = None
        return

    def counter(self):
        """Return the counters of the most recent operation."""
        if self.cinfo:
            return self.cinfo
        return self.conversions[self.lastc].counter()

    def error(self):
        """Return the error recorded at construction, or None if none."""
        return self.errorstr
}], ["utf-8:count:null", "123Б測試", { "COUNT": 6 }], ["utf-8:count#blah:null", "123Б測試", { "BLAH": 6 }], ["utf-8:count#for=lala&for=cjk:null", "123Б測a試bc", { "COUNT": 2 }], ] passed = True for c, i, o in iotest: p = Bsdconv(c) if not p: print(Bsdconv.error()) print("Test failed at %s" % repr([c, i, o])) del p passed = False continue r = p.conv(i) if o != r: print("Test failed at %s" % repr([c, i, o])) print("expected(%d): %s" % (len(o), repr(o))) print("result(%d): %s" % (len(r), repr(r))) passed = False del p for c, d, i in countertest:
async def comp_handler(request):
    """Run query parameter ``q`` through ``bsdconv:zh-comp:utf-8``.

    Responds with JSON ``{"result": <converted text>}``.
    """
    converter = Bsdconv("bsdconv:zh-comp:utf-8")
    converted = converter.conv(request.GET["q"])
    payload = {"result": converted.decode("utf-8")}
    return web.json_response(payload)
#!/usr/bin/env python
#mkbonus.py src_list char_list phrase_list
import sys
import re
from bsdconv import Bsdconv

# Output files: argv[2] receives the character list, argv[3] the phrase list.
clist = open(sys.argv[2], "w")
plist = open(sys.argv[3], "w")

# Converters used by the script.
sc = Bsdconv("utf-8:score#with=cjk:null")
bcv = Bsdconv("utf-8:insert#after=002c:bsdconv-keyword,bsdconv")
bcv_zhtw = Bsdconv("utf-8:zhtw:insert#after=002c:bsdconv-keyword,bsdconv")

sep = re.compile(r"\s+")

f = open(sys.argv[1])
for l in f:
    l = l.strip()
    if not l:
        continue
    if l.startswith("#"):
        # Comment lines are copied verbatim to both outputs.
        # NOTE(review): there is no `continue` here, so the comment line is
        # also processed as data below - confirm whether that is intended.
        comment = l + "\n"
        clist.write(comment)
        plist.write(comment)
    # The first whitespace-separated field is the character or phrase.
    a = sep.split(l)
    p = a[0]
    ln = len(p.decode("utf-8"))
    if ln > 1:
        # More than one character: treated as a phrase.
        bonus = 6
        p = bcv_zhtw.conv(p).rstrip(",")
#!/usr/bin/env python
"""Convert standard input with bsdconv in 1024-unit chunks.

Usage: <script> CONVERSION < input

The converted stream is written to stdout, followed by a separator line and
the conversion counters.
"""
import sys
from bsdconv import Bsdconv

if len(sys.argv) < 2:
    sys.stderr.write("usage: {0} CONVERSION\n".format(sys.argv[0]))
    sys.exit(2)

p = Bsdconv(sys.argv[1])
if not p:
    # Report on stderr and exit non-zero so callers can detect the failure.
    sys.stderr.write("{0}\n".format(Bsdconv.error()))
    del p
    sys.exit(1)

# Use the binary layer where it exists (Python 3) so data passes through
# undecoded; on Python 2 the plain streams are already byte-oriented.
stdin = getattr(sys.stdin, "buffer", sys.stdin)
stdout = getattr(sys.stdout, "buffer", sys.stdout)

p.init()
s = stdin.read(1024)
while s:
    # Write chunks back-to-back.  The original `print(x,)` emitted a tuple or
    # bytes repr plus a newline per chunk, which corrupted the stream.
    stdout.write(p.conv_chunk(s))
    s = stdin.read(1024)
# `s` is empty here; conv_chunk_last is still called with it, as before.
stdout.write(p.conv_chunk_last(s))
stdout.write(b"\n")
stdout.flush()

print('====================================')
print(p.counter())
del p
# -*- coding: utf-8 -*-
# python nfkc_gen.py '⁰¹²³'|sort|uniq
import sys
from bsdconv import Bsdconv

# Converter applying the "nfkc" phase between UTF-8 input and UTF-8 output.
nfkc = Bsdconv("utf-8:nfkc:utf-8")

# Decode argv[1] so the loop walks characters, not bytes.
text = sys.argv[1].decode("utf-8")
for char in text:
    original = char.encode("utf-8")
    normalized = nfkc.conv(original)
    # Only characters that the conversion changes are reported.
    if original != normalized:
        print("{}\t{}".format(normalized, original))
#!/usr/bin/env python
"""Convert standard input with bsdconv in 1024-unit chunks.

Usage: <script> CONVERSION < input

The converted stream is written to stdout, followed by a separator line and
the conversion counters.
"""
import sys
from bsdconv import Bsdconv

if len(sys.argv) < 2:
    sys.stderr.write("usage: {0} CONVERSION\n".format(sys.argv[0]))
    sys.exit(2)

p = Bsdconv(sys.argv[1])
if not p:
    # Report on stderr and exit non-zero so callers can detect the failure.
    sys.stderr.write("{0}\n".format(Bsdconv.error()))
    del p
    sys.exit(1)

# Use the binary layer where it exists (Python 3) so data passes through
# undecoded; on Python 2 the plain streams are already byte-oriented.
stdin = getattr(sys.stdin, "buffer", sys.stdin)
stdout = getattr(sys.stdout, "buffer", sys.stdout)

p.init()
s = stdin.read(1024)
while s:
    # Write chunks back-to-back.  The original `print(x, )` emitted a tuple or
    # bytes repr plus a newline per chunk, which corrupted the stream.
    stdout.write(p.conv_chunk(s))
    s = stdin.read(1024)
# `s` is empty here; conv_chunk_last is still called with it, as before.
stdout.write(p.conv_chunk_last(s))
stdout.write(b"\n")
stdout.flush()

print('====================================')
print(p.counter())
del p
#!/usr/bin/env python
"""Convert a file with bsdconv.

Usage: <script> CONVERSION INPUT_FILE OUTPUT_FILE
"""
import sys
from bsdconv import Bsdconv

if len(sys.argv) != 4:
    sys.stderr.write("usage: {0} CONVERSION INPUT_FILE OUTPUT_FILE\n".format(sys.argv[0]))
    sys.exit(2)

p = Bsdconv(sys.argv[1])
if not p:
    # Report on stderr and exit non-zero so callers can detect the failure.
    sys.stderr.write("{0}\n".format(Bsdconv.error()))
    del p
    sys.exit(1)

p.conv_file(sys.argv[2], sys.argv[3])
print(p)
print(p.counter())
del p
#!/usr/bin/env python
"""Feed a file through the bsdconv score trainer.

Usage: <script> INPUT_FILE

The score data lives in an unlinked temporary file; the converter output is
directed to "characters_list.txt".
"""
import os
import sys
from bsdconv import Bsdconv

# Only the open handle (a[0]) is used; the path (a[1]) is removed at once.
a = Bsdconv.mktemp("score.XXXXXX")
os.unlink(a[1])
clist = Bsdconv.fopen("characters_list.txt", "w+")

p = Bsdconv("utf-8:score-train:null")
if not p:
    # Report on stderr and exit non-zero so callers can detect the failure.
    sys.stderr.write("{0}\n".format(Bsdconv.error()))
    del p
    sys.exit(1)

p.ctl(Bsdconv.CTL_ATTACH_SCORE, a[0], 0)
p.ctl(Bsdconv.CTL_ATTACH_OUTPUT_FILE, clist, 0)

p.init()
# `with` guarantees the input file is closed even if a conversion call raises.
with open(sys.argv[1]) as f:
    s = f.read(1024)
    while s:
        p.conv_chunk(s)
        s = f.read(1024)
    # `s` is empty here; conv_chunk_last is still called with it, as before.
    p.conv_chunk_last(s)
#!/usr/bin/env python
from bsdconv import Bsdconv

# Apply the conversion-string editing helpers one after another, printing the
# result of each step.  Every step takes the previous result as its input.
conversion = "utf-8:utf-8,ascii"
steps = (
    ("insert_phase", "upper", "INTER", 1),
    ("replace_phase", "full", "INTER", 1),
    ("replace_codec", "big5", 2, 1),
    ("insert_codec", "ascii", 0, 1),
)
for helper, codec, position, index in steps:
    # String positions name Bsdconv constants and are resolved at call time.
    if isinstance(position, str):
        position = getattr(Bsdconv, position)
    conversion = getattr(Bsdconv, helper)(conversion, codec, position, index)
    print(conversion)
#!/usr/bin/env python
"""Convert all of standard input with bsdconv in one call.

Usage: <script> CONVERSION < input

Prints the converted text, a separator line and the conversion counters.
"""
import sys
from bsdconv import Bsdconv

if len(sys.argv) < 2:
    sys.stderr.write("usage: {0} CONVERSION\n".format(sys.argv[0]))
    sys.exit(2)

p = Bsdconv(sys.argv[1])
if not p:
    # Report on stderr and exit non-zero so callers can detect the failure.
    sys.stderr.write("{0}\n".format(Bsdconv.error()))
    del p
    sys.exit(1)

s = sys.stdin.read()
print(p.conv(s))
print('====================================')
print(p.counter())
del p
#!/usr/bin/env python
from bsdconv import Bsdconv

# Each step edits the conversion string produced by the previous step and
# prints the result.
conversion = "utf-8:utf-8,ascii"

# Step 1: insert_phase with "upper".
conversion = Bsdconv.insert_phase(conversion, "upper", Bsdconv.INTER, 1)
print(conversion)

# Step 2: replace_phase with "full".
conversion = Bsdconv.replace_phase(conversion, "full", Bsdconv.INTER, 1)
print(conversion)

# Step 3: replace_codec with "big5".
conversion = Bsdconv.replace_codec(conversion, "big5", 2, 1)
print(conversion)

# Step 4: insert_codec with "ascii".
conversion = Bsdconv.insert_codec(conversion, "ascii", 0, 1)
print(conversion)
#!/usr/bin/env python
"""Feed a file through the bsdconv score trainer.

Usage: <script> INPUT_FILE

The score data lives in an unlinked temporary file; the converter output is
directed to "characters_list.txt".
"""
import os
import sys
from bsdconv import Bsdconv

# Only the open handle (a[0]) is used; the path (a[1]) is removed at once.
a = Bsdconv.mktemp("score.XXXXXX")
os.unlink(a[1])
clist = Bsdconv.fopen("characters_list.txt", "w+")

p = Bsdconv("utf-8:score-train:null")
if not p:
    # Report on stderr and exit non-zero so callers can detect the failure.
    sys.stderr.write("{0}\n".format(Bsdconv.error()))
    del p
    sys.exit(1)

p.ctl(Bsdconv.CTL_ATTACH_SCORE, a[0], 0)
p.ctl(Bsdconv.CTL_ATTACH_OUTPUT_FILE, clist, 0)

p.init()
# `with` guarantees the input file is closed even if a conversion call raises.
with open(sys.argv[1]) as f:
    s = f.read(1024)
    while s:
        p.conv_chunk(s)
        s = f.read(1024)
    # `s` is empty here; conv_chunk_last is still called with it, as before.
    p.conv_chunk_last(s)
import sys
import re
import itertools
from bsdconv import Bsdconv
from pyquery import PyQuery as pq


def bsdconv01(dt):
    """Return hex string ``dt`` in bsdconv's "01"-prefixed form.

    Surrounding whitespace and leading zeros are dropped, the digits are
    upper-cased and left-padded to an even length, and "01" is prepended.
    An input consisting only of zeros yields "0100" rather than a bare "01",
    so the value zero keeps one data byte.  An empty input still yields "01".
    """
    digits = dt.strip()
    dt = digits.lstrip("0").upper()
    if digits and not dt:
        # Every digit was a zero: keep a single zero byte.
        return "0100"
    if len(dt) & 1:
        return "010" + dt
    return "01" + dt


bcv = Bsdconv("UTF-8:INSERT#AFTER=002C:BSDCONV-KEYWORD,BSDCONV")
bcv1252 = Bsdconv("CP1252:INSERT#AFTER=002C:BSDCONV-KEYWORD,BSDCONV")

# Read the document through a context manager so the handle is not leaked.
with open(sys.argv[1]) as _src:
    d = pq(_src.read())

fonttype = {}  # font type -> {"hidden": ..., "inherit": ...} attributes
fontdata = {}  # font type -> {legacy value: unicode value}

for font in d.find("font"):
    font = pq(font)
    ftype = font.attr("type")
    fonttype[ftype] = {"hidden": font.attr("hidden"), "inherit": font.attr("inherit")}
    # Maps from both the <global> and <tounicode> sections are collected.
    for m in itertools.chain(font.find("global").find("map"), font.find("tounicode").find("map")):
        m = pq(m)
        fontdata.setdefault(ftype, {})[m.attr("legacy")] = m.attr("unicode")
#!/usr/bin/env python
import sys
from bsdconv import Bsdconv

# Print the module_check result for "_utf-8" in the FROM and INTER phases.
for phase_name in ("FROM", "INTER"):
    print(Bsdconv.module_check(getattr(Bsdconv, phase_name), "_utf-8"))

# Print a heading and the module list for each phase, in this order.
listings = (
    ('Filter:', "FILTER"),
    ('From:', "FROM"),
    ('Inter:', "INTER"),
    ('To:', "TO"),
)
for heading, phase_name in listings:
    print(heading)
    print(Bsdconv.modules_list(getattr(Bsdconv, phase_name)))
#!/usr/bin/env python
import sys
from bsdconv import Bsdconv

# Print the module_check result for "_utf-8" in the FROM and INTER phases.
module_name = "_utf-8"
print(Bsdconv.module_check(Bsdconv.FROM, module_name))
print(Bsdconv.module_check(Bsdconv.INTER, module_name))

# Print a heading and the module list for each phase, in this order.
for heading, phase_name in (('Filter:', "FILTER"), ('From:', "FROM"),
                            ('Inter:', "INTER"), ('To:', "TO")):
    print(heading)
    phase = getattr(Bsdconv, phase_name)
    print(Bsdconv.modules_list(phase))
async def decomp_handler(request):
    """Run query parameter ``q`` through the ``zh-decomp`` conversion.

    The converter output is a comma-separated string; surrounding commas are
    stripped and the pieces are returned as a JSON array.
    """
    splitter = Bsdconv("utf-8:zh-decomp:split:bsdconv-keyword,bsdconv")
    raw = splitter.conv(request.GET["q"])
    text = raw.decode("utf-8")
    pieces = text.strip(",").split(",")
    return web.json_response(pieces)
# -*- coding: utf-8 -*-
# python nfkc_gen.py '⁰¹²³'|sort|uniq
import sys
from bsdconv import Bsdconv

# Converter applying the "nfkc" phase between UTF-8 input and UTF-8 output.
nfkc = Bsdconv("utf-8:nfkc:utf-8")

# Encode each character of the decoded argument back to UTF-8 and pair it
# with its converted form.
encoded_chars = (ch.encode("utf-8") for ch in sys.argv[1].decode("utf-8"))
for source in encoded_chars:
    target = nfkc.conv(source)
    if source == target:
        # Unchanged by the conversion: nothing to report.
        continue
    print("{}\t{}".format(target, source))
async def info_handler(request):
    """Describe every character of query parameter ``q``.

    Each character is converted on its own with ``utf-8:cns11643:bsdconv``.
    The JSON array returned holds one object per character with keys:

    * ``u`` - the character itself;
    * ``p`` - characters 2-3 of the converted string, leading zeros removed
      (presumably the CNS 11643 plane - confirm);
    * ``c`` - the remainder of the converted string from index 4.
    """
    cv = Bsdconv("utf-8:cns11643:bsdconv")
    chars = list(request.GET["q"])

    entries = []
    for ch in chars:
        code = cv.conv(ch).decode("utf-8")
        entries.append({
            "u": ch,
            "p": code[2:4].lstrip('0'),
            "c": code[4:],
        })
    return web.json_response(entries)
["_utf-8#overlong,ascii,3f:insert#after=002c:bsdconv-keyword,bsdconv", "\xc1\xbf,\xe0\x9f\xbf,\xf0\x8f\xbf\xbf,\xf8\x87\xbf\xbf\xbf,\xfc\x83\xbf\xbf\xbf\xbf", "017F,012C,0107FF,012C,01FFFF,012C,013F,013F,013F,013F,013F,012C,013F,013F,013F,013F,013F,013F,"], ["_utf-8,ascii,3f:insert#after=002c:bsdconv-keyword,bsdconv", "\xc0\x80,\xe0\x80\x80,\xf0\x80\x80\x80,\xf8\x80\x80\x80\x80,\xfc\x80\x80\x80\x80\x80", "013F,013F,012C,013F,013F,013F,012C,013F,013F,013F,013F,012C,013F,013F,013F,013F,013F,012C,013F,013F,013F,013F,013F,013F,"], ["_utf-8#nul&overlong&super,ascii,3f:insert#after=002c:bsdconv-keyword,bsdconv", "\xc0\x80,\xe0\x80\x80,\xf0\x80\x80\x80,\xf8\x80\x80\x80\x80,\xfc\x80\x80\x80\x80\x80", "0100,012C,0100,012C,0100,012C,0100,012C,0100,"], ] countertest=[ ["utf-8:width:null","123Б測試",{"FULL":2,"AMBI":1,"HALF":3}], ["utf-8:count:null","123Б測試",{"COUNT":6}], ["utf-8:count#blah:null","123Б測試",{"BLAH":6}], ["utf-8:count#for=lala&for=cjk:null","123Б測a試bc",{"COUNT":2}], ] passed=True for c, i, o in iotest: p=Bsdconv(c) if not p: print(Bsdconv.error()) print("Test failed at %s" % repr([c, i, o])) del p passed=False continue r=p.conv(i) if o != r: print("Test failed at %s" % repr([c, i, o])) print("expected(%d): %s" % (len(o), repr(o))) print("result(%d): %s" % (len(r), repr(r))) passed=False del p for c, d, i in countertest:
#!/usr/bin/env python
#mkbonus.py src_list char_list phrase_list
import sys
import re
from bsdconv import Bsdconv

# Output files: argv[2] receives the character list, argv[3] the phrase list.
clist = open(sys.argv[2], "w")
plist = open(sys.argv[3], "w")

# Converters used by the script.
sc = Bsdconv("utf-8:score#default:null")
bcv = Bsdconv("utf-8:insert#after=002c:bsdconv-keyword,bsdconv")
bcv_zhtw = Bsdconv("utf-8:zhtw:insert#after=002c:bsdconv-keyword,bsdconv")

sep = re.compile(r"\s+")

f = open(sys.argv[1])
for l in f:
    l = l.strip()
    if not l:
        continue
    if l.startswith("#"):
        # Comment lines are copied verbatim to both outputs.
        # NOTE(review): there is no `continue` here, so the comment line is
        # also processed as data below - confirm whether that is intended.
        comment = l + "\n"
        clist.write(comment)
        plist.write(comment)
    # The first whitespace-separated field is the character or phrase.
    a = sep.split(l)
    p = a[0]
    ln = len(p.decode("utf-8"))
    if ln > 1:
        # More than one character: treated as a phrase.
        bonus = 6
        p = bcv_zhtw.conv(p).rstrip(",")
class Crawler(object):
    """Crawl articles from a telnet BBS board through an emulated terminal.

    Bytes read from the telnet connection are converted to UTF-8 with the
    shared ``convert`` handle and fed to a pyte 80x24 screen; articles are
    scraped from the rendered screen lines and stored via ``Teacher``.

    Note: constructing an instance connects, logs in, enters the board,
    prompts on stdin for the first article number and crawls up to the
    latest article.
    """

    # Shared converter from the raw terminal byte stream to UTF-8.
    convert = Bsdconv("ansi-control,byte:big5-defrag:byte,ansi-control|skip,big5:utf-8,bsdconv_raw")

    def __init__(self, host):
        """Connect to ``host`` on port 3456 and crawl the board."""
        self.host = host
        self.delay = 0
        self.conn = Telnet(host, 3456)
        self.screen = pyte.Screen(80, 24)
        self.stream = pyte.Stream()
        self.screen.mode.discard(pyte.modes.LNM)
        self.stream.attach(self.screen)
        # Reading the property drains pending output into the screen.
        self.display
        self.login()
        self.enter_board('NCTU-Teacher')
        # int() makes the prompt work whether input() returns a number
        # (Python 2) or a string (Python 3).
        first = int(input('n - ' + self.last_id + ': '))
        for i in range(first, int(self.last_id) + 1):
            self.get_article(i)

    @property
    def display(self):
        """Wait for data, feed it to the screen and return the screen text.

        Polls the connection until something is read, sleeping
        ``self.delay`` seconds between polls.
        """
        s = self.conn.read_very_eager()
        while not s:
            s = self.conn.read_very_eager()
            time.sleep(self.delay)
        s = self.convert.conv(s)
        self.stream.feed(s.decode('utf-8'))
        return self.screen_shot

    @property
    def screen_shot(self):
        """Return the current screen lines joined by newlines, as UTF-8."""
        return "\n".join(self.screen.display).encode("utf-8")

    def close(self):
        """Close the telnet connection."""
        self.conn.close()

    def send_enter(self, count=1):
        """Send a carriage return ``count`` times.

        Returns the resulting screen text only when ``count`` is 1;
        otherwise returns None.
        """
        for i in range(count):
            s = self.send('\r')
        if count == 1:
            return s

    def send(self, s):
        """Write ``s`` to the connection and return the updated screen text."""
        self.conn.write(s)
        ret = self.display
        return ret

    def login(self):
        """Send the login sequence (user name, then the follow-up answers)."""
        username = '******'
        self.conn.write(username + '\r')
        self.conn.write('\rYY\r')
        self.send_enter(2)

    def enter_board(self, board):
        '''
        Save current board name in self.board and latest article id in
        self.last_id (the digits at the start of the cursor line).
        '''
        self.send('OBOC')
        self.send('s{}'.format(board))
        self.send_enter(2)
        line = self.screen.cursor.y
        match = re.search(r'(?P<last_id>^\d+) ', self.screen.display[line].strip())
        # Take the named group: group() alone would also include the
        # trailing space matched after the digits.
        self.last_id = match.group('last_id')
        self.board = board

    def get_article(self, num=None):
        """Open article ``num``, page through it and save the collected lines.

        Does nothing when ``num`` is falsy.
        """
        if not num:
            return
        self.send('{}\rOC'.format(num))
        raw_article = self.screen.display[:-1]
        status_line = self.screen.display[-1]
        if status_line.find('[Y/n]') != -1:
            self.send('n')
        # Scroll until the status line reports the end of the article,
        # collecting the newly revealed line each time.
        while status_line.find('(100%)') == -1:
            self.send('OB')
            status_line = self.screen.display[-1]
            raw_article.append(self.screen.display[-2])
        self.save_article(num, raw_article)

    def term_comm(self, feed=None, wait=None):
        """Optionally send ``feed``, read pending output, return screen text.

        When ``wait`` is truthy one blocking read is done first.  Unless
        ``wait`` equals False, a further non-blocking read follows after a
        0.1 second pause.
        """
        if feed is not None:
            self.conn.write(feed)
            if wait:
                s = self.conn.read_some()
                s = self.convert.conv_chunk(s)
                self.stream.feed(s.decode("utf-8"))
        # Equality (not identity) is kept on purpose so that 0 still behaves
        # like False here, exactly as before.
        if wait != False:
            time.sleep(0.1)
            s = self.conn.read_very_eager()
            s = self.convert.conv_chunk(s)
            self.stream.feed(s.decode("utf-8"))
        ret = "\n".join(self.screen.display).encode("utf-8")
        return ret

    def save_article(self, num, content):
        '''
        Parse the header lines of an article and insert or update it.

        :param num: article id on the BBS
        :param content: a list of screen lines; lines 0-2 are the author,
            title and time headers, the rest is the article body
        '''
        chinese_keyword = {
            'board': '看板',
        }
        author_line = content[0].encode('utf-8').split()
        if chinese_keyword['board'] not in author_line:
            return
        _i = author_line.index(chinese_keyword['board'])
        author = ' '.join(author_line[1:_i])

        title_line = content[1].encode('utf-8').split()[1:]
        title = ' '.join(title_line)

        time_line = content[2].encode('utf-8').split()[1:]
        # Named pub_time so the `time` module is not shadowed in this method.
        pub_time = ' '.join(time_line)
        if not pub_time.find('(') == -1:
            pub_time = pub_time[pub_time.find('(') + 1:pub_time.find(')')]
        parts = pub_time.split()
        parts.pop(1)
        pub_time = ' '.join(parts)
        print(pub_time)

        article = '\n'.join(content[3:]).encode('utf-8')
        try:
            post = Teacher.get(bbs_id=num)
            post.content = article
            post.save()
            logger.info('Update: {id}'.format(id=num))
        except Teacher.DoesNotExist:
            post = Teacher.create(author=author,
                                  title=title,
                                  pub_time=pub_time,
                                  content=article,
                                  bbs_id=num
                                  )
            logger.info('Insert: {id}'.format(id=num))