Example #1
0
        l2 = len(tgt_line)

        # Order the pair so that l1 holds the smaller value and l2 the larger.
        # (l1 is assigned above this excerpt — presumably len(src_line); confirm.)
        if l1 > l2:
            l1, l2 = l2, l1

        assert l1 <= l2

        # Ignore this entry when the smaller value is below args.min.
        if l1 < args.min:
            continue

        if args.relative:
            # Relative mode: the larger/smaller ratio must exceed args.d.
            # NOTE(review): l1 == 0 raises ZeroDivisionError here unless
            # args.min > 0 rules it out, and if this runs under Python 2
            # without `from __future__ import division` the ratio is
            # truncated to an integer — confirm both against the caller.
            rel = l2/l1
            if rel <= args.d:
                continue
        else:
            # Absolute mode: the difference larger - smaller must exceed args.d.
            if l2-l1 <= args.d:
                continue

        # Only entries that passed the checks above are recorded.
        lines.append(linenr)

        # args.v > 0 prints the line number; args.v > 1 additionally prints
        # both sides, joined with single spaces.
        if args.v > 0:
            sys.stdout.write("line: %s\n" %(linenr))
        if args.v > 1:
            sys.stdout.write(" src: %s\n" %(" ".join(src_line)))
            sys.stdout.write(" tgt: %s\n" %(" ".join(tgt_line)))

    # Report how many line numbers were collected.
    sys.stdout.write("found %s lines\n" %(len(lines)))

    # When args.write is set, pass the collected numbers and args.write
    # on to write_numbers (defined outside this excerpt).
    if args.write:
        write_numbers(lines, args.write)
Example #2
0
    # Wrap the standard streams so text is decoded from / encoded to UTF-8.
    in_stream = codecs.getreader("utf-8")(sys.stdin)  # read from stdin
    out_stream = codecs.getwriter("utf-8")(sys.stdout)  # write to stdout

    # Raw string so that "\s" reaches the regex engine as written.
    re_whitespace = re.compile(r"\s+")

    # Numbers (0-based, as produced by enumerate) of the lines that contain
    # more than three characters not found in `chars`.
    strange_lines = []
    for linenr, line in enumerate(in_stream):
        line = line.strip()
        if args.no_ws:
            # Pattern.sub takes (replacement, string). The arguments used to
            # be swapped, so `line` was used as the replacement template for
            # the string " ": whitespace was never normalised and a backslash
            # in the input could raise a regex error.
            line = re_whitespace.sub(" ", line)
        # A list (not a set) so that repeated characters count individually
        # towards the threshold below.
        strange_chars = [c for c in line if c not in chars]
        if len(strange_chars) > 3:
            if args.verbose:
                out_stream.write(u"line %s offending characters:" % (linenr))
                if args.uniq:
                    # Report each offending character only once.
                    strange_chars = set(strange_chars)
                for c in strange_chars:
                    out_stream.write(u" %s (%s)" % (c, repr(c)))
                out_stream.write(u"\n")
                out_stream.write(line + u"\n")
            strange_lines.append(linenr)

    sys.stdout.write("found %s lines\n" % (len(strange_lines)))
    # When args.write is set, pass the collected numbers and args.write on
    # to write_numbers (defined outside this excerpt).
    if args.write:
        write_numbers(strange_lines, args.write)
#    chars.update(set(u""))

    # Typographic double quotation marks are accepted as ordinary characters.
    quotation_marks = set(u"“”")
    chars.update(quotation_marks)

    # UTF-8 decoding reader on stdin, UTF-8 encoding writer on stdout.
    make_reader = codecs.getreader("utf-8")
    make_writer = codecs.getwriter("utf-8")
    in_stream = make_reader(sys.stdin)
    out_stream = make_writer(sys.stdout)

    # Numbers (0-based, from enumerate) of lines with more than args.n
    # characters outside `chars`.
    strange_lines = []
    for linenr, line in enumerate(in_stream):
        line = line.strip()

        # Partition the characters in one pass, preserving their order.
        normal_chars = []
        strange_chars = []
        for c in line:
            if c in chars:
                normal_chars.append(c)
            else:
                strange_chars.append(c)

        if len(strange_chars) > args.n:
            if args.verbose:
                report = u"line %s: %s offending characters: |%s|\n" % (
                    linenr, len(strange_chars), u"|".join(strange_chars))
                out_stream.write(report)
                out_stream.write(line + u"\n")
            strange_lines.append(linenr)

        # Every line is echoed with only its accepted characters, and the
        # stream is flushed after each line.
        cleaned = u"".join(normal_chars)
        out_stream.write("%s\n" % cleaned)
        out_stream.flush()

    # No "found N lines" summary is written in this variant.
    if args.write:
        write_numbers(strange_lines, args.write)