예제 #1
0
파일: emt_old.py 프로젝트: emtext/emtext
def extract(raw_uni):
    """Yield the first field of every parsed line that ``check`` scores above 0.4.

    ``raw_uni`` is handed to ``parse_old.Parser().parserByDensity``. For each
    returned line, field 1 of the line itself, of the preceding line and of
    the following line (in that order) are passed to ``check`` as a
    three-element list; when the result is greater than 0.4, field 0 of the
    line is yielded.

    NOTE(review): field 1 is presumably a density score and field 0 the line
    text (going by ``parserByDensity``) -- confirm against ``parse_old``.
    """
    rows = [entry[:4] for entry in parse_old.Parser().parserByDensity(raw_uni)]

    # Pad both ends so the first and last real lines see a neighbour value of 0.
    sentinel = ['', 0, 0, 0]
    padded = [sentinel]
    padded.extend(rows)
    padded.append(sentinel)

    # Every feature row is built up front, before ``check`` is first called.
    candidates = [
        ([cur[1], prev[1], nxt[1]], cur[0])
        for prev, cur, nxt in zip(padded, padded[1:], padded[2:])
    ]

    for features, content in candidates:
        if check(features) > 0.4:
            yield content
예제 #2
0
파일: emt.py 프로젝트: emtext/emtext
def extract(raw_uni):
    """Yield the stripped, decoded text of lines that ``check`` scores above 0.4.

    ``raw_uni`` is encoded as UTF-8 and passed to ``parse.extract_text``. For
    every returned item the ratio ``len(item.text) / item.bytes`` is computed.
    The ratios of the line itself, of the preceding line and of the following
    line (in that order) are passed to ``check`` as a three-element list; when
    the result is greater than 0.4, the line's text -- stripped and decoded as
    UTF-8 with undecodable bytes ignored -- is yielded.

    NOTE(review): an item whose ``bytes`` attribute is 0 raises
    ZeroDivisionError here; confirm whether ``parse.extract_text`` can
    produce such items.
    """
    def measure(node):
        # [bytes attribute, text length, text] for one parsed item.
        return [node.bytes, len(node.text), node.text]

    measured = map(measure, parse.extract_text(raw_uni.encode("utf-8")))

    rows = [
        [text.strip().decode("utf-8", "ignore"), float(length) / size, length, size]
        for size, length, text in measured
    ]

    # Pad both ends so the first and last real lines see a neighbour ratio of 0.
    sentinel = ["", 0, 0, 0]
    padded = [sentinel]
    padded.extend(rows)
    padded.append(sentinel)

    # Every feature row is built up front, before ``check`` is first called.
    candidates = [
        ([cur[1], prev[1], nxt[1]], cur[0])
        for prev, cur, nxt in zip(padded, padded[1:], padded[2:])
    ]

    for ratios, content in candidates:
        if check(ratios) > 0.4:
            yield content