Beispiel #1
0
	def __init__(self, conversion, train):
		"""Create one converter/evaluater pair per candidate conversion.

		conversion -- conversion strings joined by ';'. The whole string is
		              upper-cased and stripped of leading/trailing ',:|;'
		              before being split; repeated candidates are ignored.
		train      -- when true, a SCORE-TRAIN handle is created as well and
		              every evaluater is attached to the same score handle.

		If any handle cannot be created, self.failed is set, the bsdconv
		error text is kept in self.errorstr and self.conversions is replaced
		by a single BYTE:BYTE handle.
		"""
		cs=conversion.upper().strip(',:|;').split(';')
		self.failed=False
		# Defined up front: it used to be assigned on the failure path only,
		# so reading it after a successful build raised AttributeError.
		self.errorstr=None
		self.conversions=[]
		self.evaluaters=[]
		self.trainer=None
		self.cache={}
		self.lastc=0
		self.cinfo=None
		self.tempfile=None
		done=set()
		if train:
			# presumably mktemp returns (handle, path): element 1 is unlinked
			# right away and element 0 is what ctl() receives -- TODO confirm
			self.tempfile=mktemp('/tmp/gbsdconv.score.XXXXXX')
			os.unlink(self.tempfile[1])
			self.trainer=Bsdconv('LOCALE:HALF:LOWER:ZH-FUZZY-TW:SCORE-TRAIN:NULL')
			self.trainer.ctl(CTL_ATTACH_SCORE, self.tempfile[0], 0)
		for c in cs:
			if c in done:
				continue
			done.add(c)
			# c1 is the real conversion with the ':SCORE:' marker removed;
			# c2 swaps the marker for the scoring/counting phases.
			c1=c.replace(':SCORE:',':')
			c2=c.replace(':SCORE:',':HALF:LOWER:ZH-FUZZY-TW:SCORE:COUNT:')
			h1=Bsdconv(c1)
			h2=Bsdconv(c2)
			if not h1 or not h2:
				self.errorstr=Bsdconv.error()
				self.failed=True
				# NOTE(review): the loop keeps going after a failure, so later
				# candidates are appended behind this fallback handle.
				self.conversions=[Bsdconv('BYTE:BYTE')]
			else:
				if train:
					h2.ctl(CTL_ATTACH_SCORE, self.tempfile[0], 0)
				self.conversions.append(h1)
				self.evaluaters.append(h2)
Beispiel #2
0
	def _print_group2(self, g2):
		"""Print a layer-2 group: heading, its attributes, then each member.

		Attribute values are shown both converted through "bsdconv:utf-8"
		and in their stored form. Members are delegated to _print_group1.
		"""
		to_utf8 = Bsdconv("bsdconv:utf-8")

		heading = "#Layer 2 Group {0}\n".format(g2)
		print(heading)

		print("##Attributes")
		attrs = self.layers[1][1].get(g2)
		for name in attrs:
			raw = attrs[name]
			readable = to_utf8.conv(p01(raw))
			print("* {0}: {1} ({2})".format(name, readable, raw))

		print("##Member")
		members = self.layers[1][0].data.get(g2)
		for member in members:
			self._print_group1(member)
Beispiel #3
0
	def transliterate(self, k, cat):
		"""Return a replacement for k in category cat, or k itself.

		k is returned unchanged when converting it with "bsdconv:<cat>"
		reports no OERR count. Otherwise the layers are searched in order
		for a truthy attribute value stored under cat; the first one found
		is returned, and k is returned if none is found.
		"""
		probe = Bsdconv("bsdconv:{0}".format(cat))
		probe.conv(p01(k))
		if not probe.counter("OERR"):
			return k

		key = k
		for layer in self.layers:
			key = layer[0].rdata.get(key)
			found = layer[1].get(key).get(cat, None)
			if found:
				return found
			# NOTE(review): the key is mapped through this layer's rdata a
			# second time before moving on -- confirm this is intended.
			key = layer[0].rdata.get(key, None)
		return k
Beispiel #4
0
	def _print_group1(self, g):
		"""Print a layer-1 group: heading, its attributes, then its members.

		Every value is shown both converted through "bsdconv:utf-8" and in
		its stored form; a blank line follows each of the two lists.
		"""
		to_utf8 = Bsdconv("bsdconv:utf-8")

		heading = "###Layer 1 Group {0}\n".format(g)
		print(heading)

		print("####Attributes")
		attrs = self.layers[0][1].get(g)
		for name in attrs:
			raw = attrs[name]
			readable = to_utf8.conv(p01(raw))
			print("  * {0}: {1} ({2})".format(name, readable, raw))

		print("")

		print("####Member")
		members = self.layers[0][0].data.get(g)
		for member in members:
			readable = to_utf8.conv(p01(member))
			print("  * {0} ({1})".format(readable, member))

		print("")
Beispiel #5
0
#!/usr/bin/env python
import sys
from bsdconv import Bsdconv

# argv[1] is the conversion string; argv[2] and argv[3] are handed to conv_file().
p = Bsdconv(sys.argv[1])
if p:
    # Usable handle: convert the file, then show the handle and its counters.
    p.conv_file(sys.argv[2], sys.argv[3])
    print(p)
    print(p.counter())
    del p
else:
    # The handle is falsy when it could not be created; report why and stop.
    print(Bsdconv.error())
    del p
    sys.exit()
Beispiel #6
0
class Bsdconvs(object):
	"""Try several bsdconv conversions and use whichever scores best.

	Two handles are kept per candidate: self.conversions holds the handle
	that performs the real conversion and self.evaluaters holds a scoring
	variant whose counters are fed to weighted_score(). self.lastc is the
	index chosen by the most recent selection.
	"""

	# Class-level default so error() is safe to call when nothing failed.
	errorstr=None

	def __init__(self, conversion, train):
		"""Create one converter/evaluater pair per candidate conversion.

		conversion -- conversion strings joined by ';'. The whole string is
		              upper-cased and stripped of leading/trailing ',:|;'
		              before being split; repeated candidates are ignored.
		train      -- when true, a SCORE-TRAIN handle is created as well and
		              every evaluater is attached to the same score handle.

		If any handle cannot be created, self.failed is set, the bsdconv
		error text is kept in self.errorstr and self.conversions is replaced
		by a single BYTE:BYTE handle.
		"""
		cs=conversion.upper().strip(',:|;').split(';')
		self.failed=False
		self.conversions=[]
		self.evaluaters=[]
		self.trainer=None
		self.cache={}
		self.lastc=0
		self.cinfo=None
		self.tempfile=None
		done=set()
		if train:
			# presumably mktemp returns (handle, path): element 1 is unlinked
			# right away and element 0 is what ctl() receives -- TODO confirm
			self.tempfile=mktemp('/tmp/gbsdconv.score.XXXXXX')
			os.unlink(self.tempfile[1])
			self.trainer=Bsdconv('LOCALE:HALF:LOWER:ZH-FUZZY-TW:SCORE-TRAIN:NULL')
			self.trainer.ctl(CTL_ATTACH_SCORE, self.tempfile[0], 0)
		for c in cs:
			if c in done:
				continue
			done.add(c)
			# c1 is the real conversion with the ':SCORE:' marker removed;
			# c2 swaps the marker for the scoring/counting phases.
			c1=c.replace(':SCORE:',':')
			c2=c.replace(':SCORE:',':HALF:LOWER:ZH-FUZZY-TW:SCORE:COUNT:')
			h1=Bsdconv(c1)
			h2=Bsdconv(c2)
			if not h1 or not h2:
				self.errorstr=Bsdconv.error()
				self.failed=True
				# NOTE(review): the loop keeps going after a failure, so later
				# candidates are appended behind this fallback handle and the
				# two lists stop lining up -- confirm callers always test the
				# object for truth before converting.
				self.conversions=[Bsdconv('BYTE:BYTE')]
			else:
				if train:
					h2.ctl(CTL_ATTACH_SCORE, self.tempfile[0], 0)
				self.conversions.append(h1)
				self.evaluaters.append(h2)

	def __bool__(self):
		"""True when every handle was created successfully."""
		return not self.failed

	def __nonzero__(self):
		"""Python 2 spelling of __bool__."""
		return self.__bool__()

	def __str__(self):
		"""Join the conversion strings of all converters with ';'.

		Each string is taken from between the first pair of double quotes
		in str(handle).
		"""
		return ';'.join([str(x).split('"')[1] for x in self.conversions])

	def weighted_score(self, i):
		"""Turn a counter dict into a per-COUNT score.

		Returns (SCORE - 10 * (IERR + OERR)) / COUNT as a float, or 0 when
		COUNT is zero or missing. Missing counters are treated as 0.
		"""
		ierr=i.get("IERR", 0)
		oerr=i.get("OERR", 0)
		score=i.get("SCORE", 0)
		count=i.get("COUNT", 0)
		if count==0:
			return 0
		return float(score - (ierr + oerr) * 10) / float(count)

	def _select_best(self, score_of):
		"""Set self.lastc to the index of the best-scoring evaluater.

		score_of is called once per evaluater, in order. Ties keep the
		earlier candidate; with no evaluaters self.lastc is left untouched.
		"""
		best=None
		for k,c in enumerate(self.evaluaters):
			score=score_of(c)
			if k==0 or score>best:
				self.lastc=k
				best=score

	def score_train(self, s):
		"""Run s through the training handle, if training is enabled."""
		if self.trainer:
			self.trainer.testconv(s)

	def score_clear(self):
		"""Attach the trainer and every evaluater to a fresh score handle."""
		if self.trainer:
			self.tempfile=mktemp("/tmp/gbsdconv.score.XXXXXX")
			os.unlink(self.tempfile[1])
			self.trainer.ctl(CTL_ATTACH_SCORE, self.tempfile[0], 0)
			for c in self.evaluaters:
				c.ctl(CTL_ATTACH_SCORE, self.tempfile[0], 0)

	def conv(self, s):
		"""Convert s with the candidate that scores best on s."""
		self.cinfo=None
		def score_of(c):
			c.testconv(s)
			return self.weighted_score(c.counter())
		self._select_best(score_of)
		return self.conversions[self.lastc].conv(s)

	def conv_list(self, a):
		"""Convert every item of a with the single best candidate for the list.

		Candidates are ranked by the sum of their per-item scores. Returns
		the list of converted items; self.cinfo holds the counters summed
		over all items, or None when a is empty.
		"""
		# Reset first, as conv() and conv_file() do, so an empty list cannot
		# leave the totals of an earlier call behind for counter().
		self.cinfo=None
		def score_of(c):
			total=0
			for s in a:
				c.testconv(s)
				total+=self.weighted_score(c.counter())
			return total
		self._select_best(score_of)
		ret=[]
		for s in a:
			ret.append(self.conversions[self.lastc].conv(s))
			n=self.conversions[self.lastc].counter()
			if self.cinfo is None:
				self.cinfo=n
			else:
				# .get() tolerates a counter that first shows up on a later item
				self.cinfo={x:n[x]+self.cinfo.get(x, 0) for x in n}
		return ret

	def conv_file(self, ifile, ofile):
		"""Convert ifile into ofile with the candidate that scores best on ifile."""
		self.cinfo=None
		def score_of(c):
			c.testconv_file(ifile)
			return self.weighted_score(c.counter())
		self._select_best(score_of)
		self.conversions[self.lastc].conv_file(ifile, ofile)
		return

	def testconv_file(self, str):
		"""Score the file named by str and keep the winning evaluater's counters.

		Nothing is written; the counters end up in self.cinfo.
		"""
		def score_of(c):
			c.testconv_file(str)
			return self.weighted_score(c.counter())
		self._select_best(score_of)
		self.cinfo=self.evaluaters[self.lastc].counter()
		return

	def counter(self):
		"""Return the stored counters, else those of the last chosen converter."""
		if self.cinfo:
			return self.cinfo
		return self.conversions[self.lastc].counter()

	def error(self):
		"""Return the bsdconv error text recorded by __init__, or None."""
		return self.errorstr
Beispiel #7
0
    }],
    ["utf-8:count:null", "123Б測試", {
        "COUNT": 6
    }],
    ["utf-8:count#blah:null", "123Б測試", {
        "BLAH": 6
    }],
    ["utf-8:count#for=lala&for=cjk:null", "123Б測a試bc", {
        "COUNT": 2
    }],
]

# Overall result flag; the loop below only ever flips it to False.
passed = True

# Each iotest entry unpacks to (conversion string, input, expected output).
for c, i, o in iotest:
    p = Bsdconv(c)
    if not p:
        # The handle is falsy when it could not be created: report the
        # bsdconv error, count the case as failed and move on.
        print(Bsdconv.error())
        print("Test failed at %s" % repr([c, i, o]))
        del p
        passed = False
        continue
    r = p.conv(i)
    if o != r:
        # Show expected and actual output together with their lengths.
        print("Test failed at %s" % repr([c, i, o]))
        print("expected(%d): %s" % (len(o), repr(o)))
        print("result(%d): %s" % (len(r), repr(r)))
        passed = False
    del p

for c, d, i in countertest:
Beispiel #8
0
async def comp_handler(request):
    """Convert the "q" query value with "bsdconv:zh-comp:utf-8".

    Responds with JSON of the form {"result": <converted text>}.
    """
    converter = Bsdconv("bsdconv:zh-comp:utf-8")
    converted = converter.conv(request.GET["q"])
    payload = {"result": converted.decode("utf-8")}
    return web.json_response(payload)
Beispiel #9
0
#!/usr/bin/env python

#mkbonus.py src_list char_list phrase_list

import sys
import re
from bsdconv import Bsdconv

# Output files named by argv[2] and argv[3] (char_list and phrase_list in the
# usage line at the top of the script).
clist=open(sys.argv[2], "w")
plist=open(sys.argv[3], "w")

# Conversion handles created up front for use while reading the source list.
sc=Bsdconv("utf-8:score#with=cjk:null")
bcv=Bsdconv("utf-8:insert#after=002c:bsdconv-keyword,bsdconv")
bcv_zhtw=Bsdconv("utf-8:zhtw:insert#after=002c:bsdconv-keyword,bsdconv")

# Fields on a line are separated by runs of whitespace.
sep=re.compile(r"\s+")

f=open(sys.argv[1])
for l in f:
	l = l.strip()
	# Blank lines are skipped.
	if l == "":
		continue
	# Lines starting with "#" are copied verbatim to both output files.
	# NOTE(review): there is no `continue` here, so such a line also falls
	# through to the field handling below -- confirm that is intended.
	if l.startswith("#"):
		clist.write(l+"\n")
		plist.write(l+"\n")
	a = sep.split(l)
	p = a[0]
	# Length of the first field after decoding it as UTF-8 (str.decode is
	# the Python 2 byte-string API).
	ln = len(p.decode("utf-8"))
	if ln > 1:
		bonus = 6
		# Convert the field and drop the trailing "," from the result.
		p = bcv_zhtw.conv(p).rstrip(",")
#!/usr/bin/env python
import sys
from bsdconv import Bsdconv

# argv[1] is the conversion string.
p=Bsdconv(sys.argv[1])
if not p:
	# The handle is falsy when it could not be created; report why and stop.
	print(Bsdconv.error())
	del p
	sys.exit()
# Chunked use of the handle: init() once, conv_chunk() per block read from
# stdin, then conv_chunk_last() after the loop.
p.init()
s=sys.stdin.read(1024)
while s:
	# NOTE(review): the trailing comma is inside the call. With the print
	# function this still ends every chunk with a newline; as a Python 2
	# print statement it prints a one-element tuple -- confirm the intent.
	print(p.conv_chunk(s),)
	s=sys.stdin.read(1024)

# s is empty here, because that is what ended the loop.
print(p.conv_chunk_last(s))
print('====================================')
print(p.counter())
del p
Beispiel #11
0
# -*- coding: utf-8 -*-
# python nfkc_gen.py '⁰¹²³'|sort|uniq
import sys
from bsdconv import Bsdconv

# UTF-8 in, "nfkc" phase, UTF-8 out.
nfkc = Bsdconv("utf-8:nfkc:utf-8")
# str.decode is the Python 2 byte-string API; i is the argument as decoded text.
i = sys.argv[1].decode("utf-8")
for c in i:
	# Convert one character at a time, re-encoded as UTF-8.
	c = c.encode("utf-8")
	d = nfkc.conv(c)
	# Characters the conversion leaves unchanged are not reported.
	if c==d:
		continue
	# One line per changed character: converted form, TAB, original form.
	print("{}\t{}".format(d, c))
Beispiel #12
0
#!/usr/bin/env python
import sys
from bsdconv import Bsdconv

# argv[1] is the conversion string.
p = Bsdconv(sys.argv[1])
if not p:
    # The handle is falsy when it could not be created; report why and stop.
    print(Bsdconv.error())
    del p
    sys.exit()
# Chunked use of the handle: init() once, conv_chunk() per block read from
# stdin, then conv_chunk_last() after the loop.
p.init()
s = sys.stdin.read(1024)
while s:
    # NOTE(review): the trailing comma is inside the call. With the print
    # function this still ends every chunk with a newline; as a Python 2
    # print statement it prints a one-element tuple -- confirm the intent.
    print(p.conv_chunk(s), )
    s = sys.stdin.read(1024)

# s is empty here, because that is what ended the loop.
print(p.conv_chunk_last(s))
print('====================================')
print(p.counter())
del p
#!/usr/bin/env python
import sys
from bsdconv import Bsdconv

# argv[1] is the conversion string; argv[2] and argv[3] are handed to conv_file().
p=Bsdconv(sys.argv[1])
if p:
	# Usable handle: convert the file, then show the handle and its counters.
	p.conv_file(sys.argv[2], sys.argv[3])
	print(p)
	print(p.counter())
	del p
else:
	# The handle is falsy when it could not be created; report why and stop.
	print(Bsdconv.error())
	del p
	sys.exit()
Beispiel #14
0
#!/usr/bin/env python
import os
import sys
from bsdconv import Bsdconv

# Bsdconv.mktemp returns a pair: element 1 is the path unlinked here and
# element 0 is what gets attached to the handle below.
a=Bsdconv.mktemp("score.XXXXXX")
os.unlink(a[1])
# Opened through Bsdconv.fopen so it can be passed to ctl() below.
clist=Bsdconv.fopen("characters_list.txt","w+")

p=Bsdconv("utf-8:score-train:null")
if not p:
	# The handle is falsy when it could not be created; report why and stop.
	print(Bsdconv.error())
	del p
	sys.exit()

# Attach the score handle and the output file to the conversion.
p.ctl(Bsdconv.CTL_ATTACH_SCORE, a[0], 0)
p.ctl(Bsdconv.CTL_ATTACH_OUTPUT_FILE, clist, 0)

# Feed the file named by argv[1] through the handle in 1024-sized reads.
p.init()
f=open(sys.argv[1])
s=f.read(1024)
while s:
	# The trailing comma only wraps the discarded result in a tuple.
	p.conv_chunk(s),
	s=f.read(1024)

# s is empty here, because that is what ended the loop.
p.conv_chunk_last(s)
f.close()
#!/usr/bin/env python
from bsdconv import Bsdconv

# Four conversion-string edits in a row; each step takes the previous result
# (sin=sout) as its input and prints the new string.
sin="utf-8:utf-8,ascii"
sout=Bsdconv.insert_phase(sin, "upper", Bsdconv.INTER, 1)
print(sout)

sin=sout
sout=Bsdconv.replace_phase(sin, "full", Bsdconv.INTER, 1)
print(sout)

sin=sout
sout=Bsdconv.replace_codec(sin, "big5", 2, 1)
print(sout)

sin=sout
sout=Bsdconv.insert_codec(sin, "ascii", 0, 1)
print(sout)
#!/usr/bin/env python
import sys
from bsdconv import Bsdconv

# argv[1] is the conversion string; the input is everything on stdin.
p=Bsdconv(sys.argv[1])
if p:
	# Usable handle: convert stdin in one go, then print the counters.
	s=sys.stdin.read()
	print(p.conv(s))
	print('====================================')
	print(p.counter())
	del p
else:
	# The handle is falsy when it could not be created; report why and stop.
	print(Bsdconv.error())
	del p
	sys.exit()
Beispiel #17
0
#!/usr/bin/env python
from bsdconv import Bsdconv

# Four conversion-string edits in a row; each step takes the previous result
# (sin = sout) as its input and prints the new string.
sin = "utf-8:utf-8,ascii"
sout = Bsdconv.insert_phase(sin, "upper", Bsdconv.INTER, 1)
print(sout)

sin = sout
sout = Bsdconv.replace_phase(sin, "full", Bsdconv.INTER, 1)
print(sout)

sin = sout
sout = Bsdconv.replace_codec(sin, "big5", 2, 1)
print(sout)

sin = sout
sout = Bsdconv.insert_codec(sin, "ascii", 0, 1)
print(sout)
Beispiel #18
0
#!/usr/bin/env python
import os
import sys
from bsdconv import Bsdconv

# Bsdconv.mktemp returns a pair: element 1 is the path unlinked here and
# element 0 is what gets attached to the handle below.
a = Bsdconv.mktemp("score.XXXXXX")
os.unlink(a[1])
# Opened through Bsdconv.fopen so it can be passed to ctl() below.
clist = Bsdconv.fopen("characters_list.txt", "w+")

p = Bsdconv("utf-8:score-train:null")
if not p:
    # The handle is falsy when it could not be created; report why and stop.
    print(Bsdconv.error())
    del p
    sys.exit()

# Attach the score handle and the output file to the conversion.
p.ctl(Bsdconv.CTL_ATTACH_SCORE, a[0], 0)
p.ctl(Bsdconv.CTL_ATTACH_OUTPUT_FILE, clist, 0)

# Feed the file named by argv[1] through the handle in 1024-sized reads.
p.init()
f = open(sys.argv[1])
s = f.read(1024)
while s:
    # The trailing comma only wraps the discarded result in a tuple.
    p.conv_chunk(s),
    s = f.read(1024)

# s is empty here, because that is what ended the loop.
p.conv_chunk_last(s)
f.close()
Beispiel #19
0
import sys
import re
import itertools
from bsdconv import Bsdconv
from pyquery import PyQuery as pq

def bsdconv01(dt):
	"""Prefix a hex string with "01", padded to an even number of digits.

	Surrounding whitespace and leading zeros are removed and the rest is
	upper-cased. An odd number of remaining digits gets one extra "0"
	between the "01" prefix and the digits.
	"""
	digits=dt.strip().lstrip("0").upper()
	prefix="010" if len(digits) % 2 else "01"
	return prefix+digits

# Conversion handles for UTF-8 and CP1252 input.
bcv = Bsdconv("UTF-8:INSERT#AFTER=002C:BSDCONV-KEYWORD,BSDCONV")
bcv1252 = Bsdconv("CP1252:INSERT#AFTER=002C:BSDCONV-KEYWORD,BSDCONV")

# Parse the document named by argv[1]; the context manager closes the file
# as soon as its text has been read instead of leaking the handle.
with open(sys.argv[1]) as src:
	d = pq(src.read())

# fonttype: font type -> its "hidden" and "inherit" attributes.
# fontdata: font type -> {legacy value: unicode value} taken from <map> elements.
fonttype = {}
fontdata = {}

for font in d.find("font"):
	font = pq(font)
	ftype = font.attr("type")
	fonttype[ftype] = {"hidden": font.attr("hidden"), "inherit": font.attr("inherit")}
	# Maps under <global> are visited before those under <tounicode>, so a
	# <tounicode> entry overwrites a <global> entry with the same legacy key.
	for m in itertools.chain(font.find("global").find("map"), font.find("tounicode").find("map")):
		m = pq(m)
		# An entry for ftype appears only once the font has at least one map.
		fontdata.setdefault(ftype, {})[m.attr("legacy")]=m.attr("unicode")
Beispiel #20
0
#!/usr/bin/env python
import sys
from bsdconv import Bsdconv

# Print what module_check() reports for "_utf-8" as a FROM and an INTER module.
for phase in (Bsdconv.FROM, Bsdconv.INTER):
    print(Bsdconv.module_check(phase, "_utf-8"))

# Print the module list of each phase under its heading.
for heading, phase in (
    ('Filter:', Bsdconv.FILTER),
    ('From:', Bsdconv.FROM),
    ('Inter:', Bsdconv.INTER),
    ('To:', Bsdconv.TO),
):
    print(heading)
    print(Bsdconv.modules_list(phase))
#!/usr/bin/env python
import sys
from bsdconv import Bsdconv

# Print what module_check() reports for "_utf-8" as a FROM and an INTER module.
for phase in (Bsdconv.FROM, Bsdconv.INTER):
	print(Bsdconv.module_check(phase,"_utf-8"))

# Print the module list of each phase under its heading.
for heading, phase in (
	('Filter:', Bsdconv.FILTER),
	('From:', Bsdconv.FROM),
	('Inter:', Bsdconv.INTER),
	('To:', Bsdconv.TO),
):
	print(heading)
	print(Bsdconv.modules_list(phase))

Beispiel #22
0
async def decomp_handler(request):
    """Convert the "q" query value with the zh-decomp/split conversion.

    The converted text is stripped of leading and trailing commas, split
    on ",", and the resulting list is returned as the JSON response.
    """
    splitter = Bsdconv("utf-8:zh-decomp:split:bsdconv-keyword,bsdconv")
    converted = splitter.conv(request.GET["q"])
    text = converted.decode("utf-8")
    parts = text.strip(",").split(",")
    return web.json_response(parts)
Beispiel #23
0
# -*- coding: utf-8 -*-
# python nfkc_gen.py '⁰¹²³'|sort|uniq
import sys
from bsdconv import Bsdconv

# UTF-8 in, "nfkc" phase, UTF-8 out.
nfkc = Bsdconv("utf-8:nfkc:utf-8")
# str.decode is the Python 2 byte-string API; i is the argument as decoded text.
i = sys.argv[1].decode("utf-8")
for c in i:
    # Convert one character at a time, re-encoded as UTF-8.
    c = c.encode("utf-8")
    d = nfkc.conv(c)
    # Characters the conversion leaves unchanged are not reported.
    if c == d:
        continue
    # One line per changed character: converted form, TAB, original form.
    print("{}\t{}".format(d, c))
Beispiel #24
0
async def info_handler(request):
    """Convert each character of "q" with "utf-8:cns11643:bsdconv".

    Responds with a JSON list holding one object per character: "u" is
    the character, "p" is characters 2-3 of the converted text with
    leading zeros removed, and "c" is everything from index 4 onward.
    """
    cv = Bsdconv("utf-8:cns11643:bsdconv")
    rows = []
    for u in list(request.GET["q"]):
        code = cv.conv(u).decode("utf-8")
        rows.append({"u": u, "p": code[2:4].lstrip('0'), "c": code[4:]})
    return web.json_response(rows)
Beispiel #25
0
	["_utf-8#overlong,ascii,3f:insert#after=002c:bsdconv-keyword,bsdconv", "\xc1\xbf,\xe0\x9f\xbf,\xf0\x8f\xbf\xbf,\xf8\x87\xbf\xbf\xbf,\xfc\x83\xbf\xbf\xbf\xbf", "017F,012C,0107FF,012C,01FFFF,012C,013F,013F,013F,013F,013F,012C,013F,013F,013F,013F,013F,013F,"],
	["_utf-8,ascii,3f:insert#after=002c:bsdconv-keyword,bsdconv", "\xc0\x80,\xe0\x80\x80,\xf0\x80\x80\x80,\xf8\x80\x80\x80\x80,\xfc\x80\x80\x80\x80\x80", "013F,013F,012C,013F,013F,013F,012C,013F,013F,013F,013F,012C,013F,013F,013F,013F,013F,012C,013F,013F,013F,013F,013F,013F,"],
	["_utf-8#nul&overlong&super,ascii,3f:insert#after=002c:bsdconv-keyword,bsdconv", "\xc0\x80,\xe0\x80\x80,\xf0\x80\x80\x80,\xf8\x80\x80\x80\x80,\xfc\x80\x80\x80\x80\x80", "0100,012C,0100,012C,0100,012C,0100,012C,0100,"],
]

# Counter test cases: [conversion string, input text, dict of counter names and values].
countertest=[
	["utf-8:width:null","123Б測試",{"FULL":2,"AMBI":1,"HALF":3}],
	["utf-8:count:null","123Б測試",{"COUNT":6}],
	["utf-8:count#blah:null","123Б測試",{"BLAH":6}],
	["utf-8:count#for=lala&for=cjk:null","123Б測a試bc",{"COUNT":2}],
]

# Overall result flag; the loop below only ever flips it to False.
passed=True

# Each iotest entry unpacks to (conversion string, input, expected output).
for c, i, o in iotest:
	p=Bsdconv(c)
	if not p:
		# The handle is falsy when it could not be created: report the
		# bsdconv error, count the case as failed and move on.
		print(Bsdconv.error())
		print("Test failed at %s" % repr([c, i, o]))
		del p
		passed=False
		continue
	r=p.conv(i)
	if o != r:
		# Show expected and actual output together with their lengths.
		print("Test failed at %s" % repr([c, i, o]))
		print("expected(%d): %s" % (len(o), repr(o)))
		print("result(%d): %s" % (len(r), repr(r)))
		passed=False
	del p

for c, d, i in countertest:
Beispiel #26
0
#!/usr/bin/env python

#mkbonus.py src_list char_list phrase_list

import sys
import re
from bsdconv import Bsdconv

# Output files named by argv[2] and argv[3] (char_list and phrase_list in the
# usage line at the top of the script).
clist=open(sys.argv[2], "w")
plist=open(sys.argv[3], "w")

# Conversion handles created up front for use while reading the source list.
sc=Bsdconv("utf-8:score#default:null")
bcv=Bsdconv("utf-8:insert#after=002c:bsdconv-keyword,bsdconv")
bcv_zhtw=Bsdconv("utf-8:zhtw:insert#after=002c:bsdconv-keyword,bsdconv")

# Fields on a line are separated by runs of whitespace.
sep=re.compile(r"\s+")

f=open(sys.argv[1])
for l in f:
	l = l.strip()
	# Blank lines are skipped.
	if l == "":
		continue
	# Lines starting with "#" are copied verbatim to both output files.
	# NOTE(review): there is no `continue` here, so such a line also falls
	# through to the field handling below -- confirm that is intended.
	if l.startswith("#"):
		clist.write(l+"\n")
		plist.write(l+"\n")
	a = sep.split(l)
	p = a[0]
	# Length of the first field after decoding it as UTF-8 (str.decode is
	# the Python 2 byte-string API).
	ln = len(p.decode("utf-8"))
	if ln > 1:
		bonus = 6
		# Convert the field and drop the trailing "," from the result.
		p = bcv_zhtw.conv(p).rstrip(",")
Beispiel #27
0
class Crawler(object):
    """Telnet-driven BBS crawler that stores the articles of one board.

    Constructing an instance does all the work: it connects, logs in,
    enters the 'NCTU-Teacher' board, asks on stdin for the first article
    number and then fetches every article from there up to the newest one.

    Incoming bytes are converted by the shared ``convert`` handle and fed
    to a pyte screen; the methods read that emulated screen rather than
    the raw stream.
    """

    # One conversion handle shared by every instance.
    convert = Bsdconv("ansi-control,byte:big5-defrag:byte,ansi-control|skip,big5:utf-8,bsdconv_raw")

    def __init__(self, host):
        """Connect to host on port 3456 and crawl the board (see class docstring)."""
        self.host = host
        self.delay = 0
        self.conn = Telnet(host, 3456)
        self.screen = pyte.Screen(80, 24)
        self.stream = pyte.Stream()
        self.screen.mode.discard(pyte.modes.LNM)
        self.stream.attach(self.screen)
        # Reading the property is what pulls the first screen off the wire.
        self.display
        self.login()
        self.enter_board('NCTU-Teacher')
        # int() makes the prompt work where input() returns text as well.
        # NOTE(review): under Python 2, input() evaluates what is typed;
        # raw_input() would be the safe spelling there.
        first = int(input('n - ' + self.last_id + ': '))
        for i in range(first, int(self.last_id) + 1):
            self.get_article(i)

    @property
    def display(self):
        """Wait for data, feed it to the terminal emulator, return the screen.

        Polls the connection until it yields something, sleeping
        self.delay seconds between polls.
        """
        s = self.conn.read_very_eager()
        while not s:
            s = self.conn.read_very_eager()
            time.sleep(self.delay)
        s = self.convert.conv(s)
        self.stream.feed(s.decode('utf-8'))
        return self.screen_shot

    @property
    def screen_shot(self):
        """Current screen lines joined by newlines, encoded as UTF-8."""
        return "\n".join(self.screen.display).encode("utf-8")

    def close(self):
        """Close the telnet connection."""
        self.conn.close()

    def send_enter(self, count=1):
        """Send a carriage return count times.

        The resulting screen is returned only when count is 1; otherwise
        the return value is None.
        """
        for i in range(count):
            s = self.send('\r')
            if count == 1:
                return s

    def send(self, s):
        """Write s to the connection and return the screen that follows."""
        self.conn.write(s)
        ret = self.display
        return ret

    def login(self):
        """Send the login keystrokes and press enter twice."""
        username = '******'
        self.conn.write(username + '\r')
        self.conn.write('\rYY\r')
        self.send_enter(2)

    def enter_board(self, board):
        '''
        Save current board name in self.board
        and latest article_id in self.last_id
        '''
        self.send('OBOC')
        self.send('s{}'.format(board))
        self.send_enter(2)
        line = self.screen.cursor.y
        # Take the named group: the full match also includes the space that
        # follows the digits, which used to end up in last_id.
        match = re.search(r'(?P<last_id>^\d+) ', self.screen.display[line].strip())
        self.last_id = match.group('last_id')
        self.board = board

    def get_article(self, num=None):
        """Open article num, page through it and hand the lines to save_article.

        Does nothing when num is falsy. Paging continues until the bottom
        screen line contains '(100%)'.
        """
        if not num:
            return

        self.send('{}\rOC'.format(num))
        raw_artcle = self.screen.display[:-1]

        status_line = self.screen.display[-1]
        if status_line.find('[Y/n]') != -1:
            self.send('n')
        while status_line.find('(100%)') == -1:
            self.send('OB')
            status_line = self.screen.display[-1]
            # Each step adds the line just above the status line.
            raw_artcle.append(self.screen.display[-2])
        self.save_article(num, raw_artcle)

    def term_comm(self, feed=None, wait=None):
        """Optionally write feed, read what comes back, return the screen.

        feed -- data to write first; nothing is written when it is None.
        wait -- truthy: also do one read right after writing;
                False: skip the delayed eager read;
                None (default): only the delayed eager read is done.

        `self` was missing from the signature although the body uses it,
        so every call failed with NameError.
        """
        if feed is not None:
            self.conn.write(feed)
            if wait:
                s = self.conn.read_some()
                s = self.convert.conv_chunk(s)
                self.stream.feed(s.decode("utf-8"))
        if wait != False:
            time.sleep(0.1)
            s = self.conn.read_very_eager()
            s = self.convert.conv_chunk(s)
            self.stream.feed(s.decode("utf-8"))
        ret = "\n".join(self.screen.display).encode("utf-8")
        return ret

    def save_article(self, num, content):
        '''
        Parse the header lines of an article and store it as a Teacher row.

        :param num: article number, stored as bbs_id
        :param content: a list get from screen

        Nothing is stored when the first line lacks the board keyword.
        An existing row with the same bbs_id only gets its content replaced.
        '''
        chinese_keyword = {
            'board': '看板',
        }

        author_line = content[0].encode('utf-8').split()
        if chinese_keyword['board'] not in author_line:
            return
        _i = author_line.index(chinese_keyword['board'])
        author = ' '.join(author_line[1:_i])

        title_line = content[1].encode('utf-8').split()[1:]
        title = ' '.join(title_line)

        # Named pub_time so the local no longer shadows the time module.
        time_line = content[2].encode('utf-8').split()[1:]
        pub_time = ' '.join(time_line)
        if pub_time.find('(') != -1:
            # Keep only what sits between the parentheses.
            pub_time = pub_time[pub_time.find('(') + 1:pub_time.find(')')]
        # The second whitespace-separated token is dropped.
        fields = pub_time.split()
        fields.pop(1)
        pub_time = ' '.join(fields)
        # Parenthesised form prints the same text under Python 2 and 3.
        print(pub_time)

        article = '\n'.join(content[3:]).encode('utf-8')

        try:
            post = Teacher.get(bbs_id=num)
            post.content = article
            post.save()
            logger.info('Update: {id}'.format(id=num))
        except Teacher.DoesNotExist:
            post = Teacher.create(author=author,
                title=title,
                pub_time=pub_time,
                content=article,
                bbs_id=num
            )
            logger.info('Insert: {id}'.format(id=num))