def extract(raw_uni):
    """Lazily yield the first field of every parsed line that `check` accepts.

    `raw_uni` is handed to `parse_old.Parser().parserByDensity`; only the
    first four fields of each returned line are kept.  Each line is then
    scored by `check` on a three-item list built from field 1 of the line
    itself, of the line before it and of the line after it (presumably a
    density value, going by the parser method name -- TODO confirm).  Lines
    scoring strictly above 0.4 have their field 0 yielded.

    This is a generator: nothing is parsed until iteration starts.

    NOTE(review): another `extract` is defined right after this one; if both
    live in the same module the later definition replaces this one.
    """
    parsed = parse_old.Parser().parserByDensity(raw_uni)

    # Pad both ends with a neutral row so the first and last real lines
    # still have a "previous" and a "next" neighbour (field 1 == 0).
    sentinel = ['', 0, 0, 0]
    padded = [sentinel] + [entry[:4] for entry in parsed] + [sentinel]

    # One row per real line: [own, previous, next, payload].
    windows = [
        [current[1], before[1], after[1], current[0]]
        for before, current, after in zip(padded, padded[1:], padded[2:])
    ]

    for window in windows:
        score = check(window[:3])
        if score > 0.4:
            yield window[-1]
def extract(raw_uni):
    """Lazily yield the decoded text of every extracted block `check` accepts.

    `raw_uni` is UTF-8 encoded and passed to `parse.extract_text`.  For each
    returned item the ratio `len(item.text) / item.bytes` is computed, and
    `check` is called with a three-item list holding that ratio for the item
    itself, for the item before it and for the item after it.  Items scoring
    strictly above 0.4 have their stripped, UTF-8-decoded text (undecodable
    bytes ignored) yielded.

    This is a generator: nothing is extracted until iteration starts.

    NOTE(review): `.decode` is called on the stripped text, so `item.text`
    is presumably a byte string (Python 2 `str`) -- TODO confirm.
    NOTE(review): an item whose `bytes` is 0 would raise ZeroDivisionError
    here; verify whether `parse.extract_text` can produce one.
    NOTE(review): an earlier `extract` precedes this one; if both live in the
    same module this definition is the one that takes effect.
    """

    def measure(item):
        # [size in bytes, text length, raw text]
        return [item.bytes, len(item.text), item.text]

    measured = map(measure, parse.extract_text(raw_uni.encode("utf-8")))

    # Row layout: [decoded text, length/bytes ratio, text length, byte size].
    rows = [
        [text.strip().decode("utf-8", "ignore"), float(length) / size, length, size]
        for size, length, text in measured
    ]

    # Pad both ends with a neutral row so the first and last real items
    # still have a "previous" and a "next" neighbour (ratio == 0).
    sentinel = ["", 0, 0, 0]
    padded = [sentinel] + rows + [sentinel]

    # One row per real item: [own ratio, previous ratio, next ratio, text].
    windows = [
        [current[1], before[1], after[1], current[0]]
        for before, current, after in zip(padded, padded[1:], padded[2:])
    ]

    for window in windows:
        score = check(window[:3])
        if score > 0.4:
            yield window[-1]