Beispiel #1
0
def convert_to_utf8(src, output=None, override=False):
    """Re-encode the file at ``src`` as UTF-8.

    The source encoding is taken from ``chardet`` when its reported
    confidence is above 0.90, otherwise from ``UnicodeDammit``; either
    result is passed through ``fix_encoding`` before the bytes are decoded.

    Destination selection:

    * ``override`` true: the source file itself is rewritten.
    * ``output`` equal to the source directory: written into ``output``
      under the original file name.
    * otherwise: written as ``_<name>`` inside ``output``, or beside the
      source when ``output`` is not given.

    NOTE(review): the second case appears to resolve to the source path, so
    the source would be overwritten even though ``override`` is false --
    confirm that this is intended.

    A missing destination directory is created.
    """
    src = upath.abspath(src)
    print(src)
    base_name = os.path.basename(src)
    output = upath.abspath(output) if output else None
    src_dir = upath.abspath(os.path.dirname(src))

    if override:
        dst = src
    elif src_dir == output:
        dst = os.path.join(output, base_name)
    else:
        dst = os.path.join(output or src_dir, '_%s' % base_name)

    dst_dir = os.path.dirname(dst)
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    with open(src, 'rb') as reader:
        raw = reader.read()
        detected = chardet.detect(raw)
        # Trust chardet only when it is confident; otherwise fall back to
        # UnicodeDammit's guess.
        encoding = (detected['encoding'] if detected['confidence'] > 0.90
                    else UnicodeDammit(raw).original_encoding)
        encoding = fix_encoding(encoding)
        text = to_text(raw, encoding=encoding)
        print('convert_to_utf8() encoding:%s file:%s' % (encoding, src))
        with codecs.open(dst, 'w', 'utf-8') as writer:
            writer.write(text)
Beispiel #2
0
def convert_to_utf8(src, output=None, override=False):
    """Re-encode the file at ``src`` as UTF-8.

    The source encoding is taken from ``chardet`` when its reported
    confidence is above 0.90, otherwise from ``UnicodeDammit``; either
    result is passed through ``fix_encoding`` before the bytes are decoded.

    Destination selection:

    * ``override`` true: the source file itself is rewritten.
    * ``output`` equal to the source directory: written into ``output``
      under the original file name.
    * otherwise: written as ``_<name>`` inside ``output``, or beside the
      source when ``output`` is not given.

    NOTE(review): the second case appears to resolve to the source path, so
    the source would be overwritten even though ``override`` is false --
    confirm that this is intended.

    A missing destination directory is created.
    """
    src = upath.abspath(src)
    print(src)
    base_name = os.path.basename(src)
    output = upath.abspath(output) if output else None
    src_dir = upath.abspath(os.path.dirname(src))

    if override:
        dst = src
    elif src_dir == output:
        dst = os.path.join(output, base_name)
    else:
        dst = os.path.join(output or src_dir, '_%s' % base_name)

    dst_dir = os.path.dirname(dst)
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    with open(src, 'rb') as reader:
        raw = reader.read()
        detected = chardet.detect(raw)
        # Trust chardet only when it is confident; otherwise fall back to
        # UnicodeDammit's guess.
        encoding = (detected['encoding'] if detected['confidence'] > 0.90
                    else UnicodeDammit(raw).original_encoding)
        encoding = fix_encoding(encoding)
        text = to_text(raw, encoding=encoding)
        print('convert_to_utf8() encoding:%s file:%s' % (encoding, src))
        with codecs.open(dst, 'w', 'utf-8') as writer:
            writer.write(text)
Beispiel #3
0
def manual_convert():
    """Re-encode every file under ``sys.argv[1]`` from GB18030 to UTF-8.

    Command line: ``<root> [--convert]``.

    * Without ``--convert`` (dry run) the UTF-8 text is written to a sibling
      file named ``_<name>`` and the original is left untouched.
    * With ``--convert`` each file is overwritten in place.

    A failure on one file prints a traceback, appends the file's path (one
    per line) to ``../convert.log`` and processing continues with the next
    file.
    """
    root = upath.abspath(sys.argv[1])
    dry_run = not (len(sys.argv) == 3 and sys.argv[2] == '--convert')
    # Opened with ``with`` so the log is flushed and closed when the walk
    # finishes or is interrupted.
    with codecs.open('../convert.log', 'w', 'utf-8') as log:
        for curdir, _subdirs, filenames in os.walk(root, topdown=True):
            for name in filenames:
                try:
                    src = os.path.join(curdir, name)
                    print('Converting %s' % src)
                    with open(src, 'rb') as f:
                        data = f.read()
                    if dry_run:
                        dst = os.path.join(curdir, '_%s' % name)
                    else:
                        dst = src
                    utf8_data = compat.to_text(data, encoding='gb18030')
                    with codecs.open(dst, 'w', 'utf-8') as f:
                        f.write(utf8_data)
                except Exception:
                    traceback.print_exc()
                    # One path per line; without the newline the entries
                    # run together and the log is unreadable.
                    log.write('%s\n' % src)
Beispiel #4
0
def manual_convert():
    """Re-encode every file under ``sys.argv[1]`` from GB18030 to UTF-8.

    Command line: ``<root> [--convert]``.

    * Without ``--convert`` (dry run) the UTF-8 text is written to a sibling
      file named ``_<name>`` and the original is left untouched.
    * With ``--convert`` each file is overwritten in place.

    A failure on one file prints a traceback, appends the file's path (one
    per line) to ``../convert.log`` and processing continues with the next
    file.
    """
    root = upath.abspath(sys.argv[1])
    dry_run = not (len(sys.argv) == 3 and sys.argv[2] == '--convert')
    # Opened with ``with`` so the log is flushed and closed when the walk
    # finishes or is interrupted.
    with codecs.open('../convert.log', 'w', 'utf-8') as log:
        for curdir, _subdirs, filenames in os.walk(root, topdown=True):
            for name in filenames:
                try:
                    src = os.path.join(curdir, name)
                    print('Converting %s' % src)
                    with open(src, 'rb') as f:
                        data = f.read()
                    if dry_run:
                        dst = os.path.join(curdir, '_%s' % name)
                    else:
                        dst = src
                    utf8_data = compat.to_text(data, encoding='gb18030')
                    with codecs.open(dst, 'w', 'utf-8') as f:
                        f.write(utf8_data)
                except Exception:
                    traceback.print_exc()
                    # One path per line; without the newline the entries
                    # run together and the log is unreadable.
                    log.write('%s\n' % src)
Beispiel #5
0
def first_line_as_title():
    """Rename each file under ``sys.argv[1]`` after its first short line.

    Command line: ``<root> [-z]``.  Without ``-z`` the new names are only
    printed; with ``-z`` the files are actually renamed.

    The title is the first line that, once stripped, is non-empty and at
    most 25 characters long, with a fixed title prefix (see the ``replace``
    call below) removed.  The file is moved to ``<title>.txt`` in its own
    directory.  Files that contain no such line, or whose title equals the
    current file name, are reported as skipped.  Files are read as UTF-8;
    any error on one file prints a traceback and the walk continues.
    """
    root = upath.abspath(sys.argv[1])
    dry_run = not (len(sys.argv) == 3 and sys.argv[2] == '-z')
    for curdir, _subdirs, filenames in os.walk(root, topdown=True):
        for name in filenames:
            src = os.path.join(curdir, name)
            if not os.path.isfile(src):
                continue
            print('Fix %s' % src)
            try:
                new_name = ''
                with codecs.open(src, 'r', 'utf-8') as f:
                    # Iterate instead of looping on readline(): readline()
                    # returns '' forever at EOF, so a file without a
                    # qualifying line would otherwise never terminate.
                    for line in f:
                        candidate = line.strip()
                        if candidate and len(candidate) <= 25:
                            new_name = candidate
                            break
                new_name = new_name.replace('标  题: ', '')
                if new_name and new_name != name:
                    print('New name: %s' % new_name)
                    if not dry_run:
                        dst = os.path.join(curdir, '%s.txt' % new_name)
                        shutil.move(src, dst)
                        print('Rename to: %s' % dst)
                else:
                    print('Skip %s' % src)
            except Exception:
                traceback.print_exc()
Beispiel #6
0
def first_line_as_title():
    """Rename each file under ``sys.argv[1]`` after its first short line.

    Command line: ``<root> [-z]``.  Without ``-z`` the new names are only
    printed; with ``-z`` the files are actually renamed.

    The title is the first line that, once stripped, is non-empty and at
    most 25 characters long, with a fixed title prefix (see the ``replace``
    call below) removed.  The file is moved to ``<title>.txt`` in its own
    directory.  Files that contain no such line, or whose title equals the
    current file name, are reported as skipped.  Files are read as UTF-8;
    any error on one file prints a traceback and the walk continues.
    """
    root = upath.abspath(sys.argv[1])
    dry_run = not (len(sys.argv) == 3 and sys.argv[2] == '-z')
    for curdir, _subdirs, filenames in os.walk(root, topdown=True):
        for name in filenames:
            src = os.path.join(curdir, name)
            if not os.path.isfile(src):
                continue
            print('Fix %s' % src)
            try:
                new_name = ''
                with codecs.open(src, 'r', 'utf-8') as f:
                    # Iterate instead of looping on readline(): readline()
                    # returns '' forever at EOF, so a file without a
                    # qualifying line would otherwise never terminate.
                    for line in f:
                        candidate = line.strip()
                        if candidate and len(candidate) <= 25:
                            new_name = candidate
                            break
                new_name = new_name.replace('标  题: ', '')
                if new_name and new_name != name:
                    print('New name: %s' % new_name)
                    if not dry_run:
                        dst = os.path.join(curdir, '%s.txt' % new_name)
                        shutil.move(src, dst)
                        print('Rename to: %s' % dst)
                else:
                    print('Skip %s' % src)
            except Exception:
                traceback.print_exc()
Beispiel #7
0
def smart_convert():
    """Convert the file or directory named by ``sys.argv[1]`` to UTF-8.

    * A single file is passed to ``convert_to_utf8`` with its defaults.
    * For a directory, every file directly inside it is converted in place
      (``override=True``).  Subdirectories are not descended into.
    """
    target = upath.abspath(sys.argv[1])
    if upath.isfile(target):
        convert_to_utf8(target)
        return
    for name in upath.listdir(target):
        path = os.path.join(target, name)
        # convert_to_utf8 opens its argument as a file, so handing it a
        # subdirectory would raise and abort the remaining conversions.
        if not upath.isfile(path):
            continue
        convert_to_utf8(path, override=True)
Beispiel #8
0
def smart_convert():
    """Convert the file or directory named by ``sys.argv[1]`` to UTF-8.

    * A single file is passed to ``convert_to_utf8`` with its defaults.
    * For a directory, every file directly inside it is converted in place
      (``override=True``).  Subdirectories are not descended into.
    """
    target = upath.abspath(sys.argv[1])
    if upath.isfile(target):
        convert_to_utf8(target)
        return
    for name in upath.listdir(target):
        path = os.path.join(target, name)
        # convert_to_utf8 opens its argument as a file, so handing it a
        # subdirectory would raise and abort the remaining conversions.
        if not upath.isfile(path):
            continue
        convert_to_utf8(path, override=True)
Beispiel #9
0
def move_noname_files():
    """Collect ``<alphanumerics>.txt`` files from a tree into one directory.

    Command line: ``<root> <output> [-z]``.

    Walks ``root`` and selects files whose name starts with one or more
    ASCII letters/digits followed by ``.txt`` (the match is anchored at the
    start of the name only).  With ``-z`` each selected file is moved to
    ``<output>/<name>``; without it the planned move is only printed.
    ``output`` is created if missing, in both modes.

    NOTE(review): files that share a name in different directories map to
    the same destination path.
    """
    root = upath.abspath(sys.argv[1])
    output = upath.abspath(sys.argv[2])
    dry_run = not (len(sys.argv) == 4 and sys.argv[3] == '-z')
    if not os.path.exists(output):
        os.makedirs(output)
    p = re.compile(r'[a-zA-Z0-9]+\.txt')
    for curdir, _subdirs, filenames in os.walk(root, topdown=True):
        for name in filenames:
            src = os.path.join(curdir, name)
            dst = os.path.join(output, name)
            if not os.path.isfile(src):
                continue
            if src == dst:
                # Already in the output directory.
                continue
            if not p.match(name):
                continue
            if dry_run:
                # Report the planned move so a dry run is actually useful
                # as a preview (same wording as move_by_pattern).
                print('Demo %s %s' % (src, dst))
            else:
                print('Moving %s %s' % (src, dst))
                shutil.move(src, dst)
Beispiel #10
0
def move_some_files():
    """Collect small files from a tree into one directory.

    Command line: ``<root> <output> [-z]``.

    Walks ``root`` and selects files smaller than 30000 bytes.  With ``-z``
    each selected file is moved to ``<output>/<name>``; without it the
    planned move is only printed.  ``output`` is created if missing, in
    both modes.

    NOTE(review): files that share a name in different directories map to
    the same destination path.
    """
    root = upath.abspath(sys.argv[1])
    output = upath.abspath(sys.argv[2])
    dry_run = not (len(sys.argv) == 4 and sys.argv[3] == '-z')
    if not os.path.exists(output):
        os.makedirs(output)
    for curdir, _subdirs, filenames in os.walk(root, topdown=True):
        for name in filenames:
            src = os.path.join(curdir, name)
            dst = os.path.join(output, name)
            if not os.path.isfile(src):
                continue
            if src == dst:
                # Already in the output directory.
                continue
            file_size = os.path.getsize(src)
            if file_size >= 30000:
                continue
            if dry_run:
                # Report the planned move so a dry run is actually useful
                # as a preview.
                print('Demo %s %s size=%s' % (src, dst, file_size))
            else:
                print('Moving %s %s size=%s' % (src, dst, file_size))
                shutil.move(src, dst)
Beispiel #11
0
def move_some_files():
    """Collect small files from a tree into one directory.

    Command line: ``<root> <output> [-z]``.

    Walks ``root`` and selects files smaller than 30000 bytes.  With ``-z``
    each selected file is moved to ``<output>/<name>``; without it the
    planned move is only printed.  ``output`` is created if missing, in
    both modes.

    NOTE(review): files that share a name in different directories map to
    the same destination path.
    """
    root = upath.abspath(sys.argv[1])
    output = upath.abspath(sys.argv[2])
    dry_run = not (len(sys.argv) == 4 and sys.argv[3] == '-z')
    if not os.path.exists(output):
        os.makedirs(output)
    for curdir, _subdirs, filenames in os.walk(root, topdown=True):
        for name in filenames:
            src = os.path.join(curdir, name)
            dst = os.path.join(output, name)
            if not os.path.isfile(src):
                continue
            if src == dst:
                # Already in the output directory.
                continue
            file_size = os.path.getsize(src)
            if file_size >= 30000:
                continue
            if dry_run:
                # Report the planned move so a dry run is actually useful
                # as a preview.
                print('Demo %s %s size=%s' % (src, dst, file_size))
            else:
                print('Moving %s %s size=%s' % (src, dst, file_size))
                shutil.move(src, dst)
Beispiel #12
0
def move_noname_files():
    """Collect ``<alphanumerics>.txt`` files from a tree into one directory.

    Command line: ``<root> <output> [-z]``.

    Walks ``root`` and selects files whose name starts with one or more
    ASCII letters/digits followed by ``.txt`` (the match is anchored at the
    start of the name only).  With ``-z`` each selected file is moved to
    ``<output>/<name>``; without it the planned move is only printed.
    ``output`` is created if missing, in both modes.

    NOTE(review): files that share a name in different directories map to
    the same destination path.
    """
    root = upath.abspath(sys.argv[1])
    output = upath.abspath(sys.argv[2])
    dry_run = not (len(sys.argv) == 4 and sys.argv[3] == '-z')
    if not os.path.exists(output):
        os.makedirs(output)
    p = re.compile(r'[a-zA-Z0-9]+\.txt')
    for curdir, _subdirs, filenames in os.walk(root, topdown=True):
        for name in filenames:
            src = os.path.join(curdir, name)
            dst = os.path.join(output, name)
            if not os.path.isfile(src):
                continue
            if src == dst:
                # Already in the output directory.
                continue
            if not p.match(name):
                continue
            if dry_run:
                # Report the planned move so a dry run is actually useful
                # as a preview (same wording as move_by_pattern).
                print('Demo %s %s' % (src, dst))
            else:
                print('Moving %s %s' % (src, dst))
                shutil.move(src, dst)
Beispiel #13
0
def move_by_pattern():
    """Move files whose name matches a regular expression into one directory.

    Command line: ``<root> <output> <regex> [-z]``.

    Walks ``root`` and, for every file whose name contains a match for
    ``regex``, either moves it to ``<output>/<name>`` (with ``-z``) or just
    prints the planned move.  ``output`` is created if it does not exist.
    """
    args = sys.argv
    source_root = upath.abspath(args[1])
    target_dir = upath.abspath(args[2])
    name_re = re.compile(compat.to_text(args[3]))
    really_move = len(args) == 5 and args[4] == '-z'

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for folder, _dirs, files in os.walk(source_root, topdown=True):
        for fname in files:
            src = os.path.join(folder, fname)
            dst = os.path.join(target_dir, fname)
            # Ignore non-files and anything already at its destination.
            if not os.path.isfile(src) or src == dst:
                continue
            if name_re.search(fname) is None:
                continue
            if really_move:
                print('Moving %s %s' % (src, dst))
                shutil.move(src, dst)
            else:
                print('Demo %s %s' % (src, dst))
Beispiel #14
0
def move_by_pattern():
    """Move files whose name matches a regular expression into one directory.

    Command line: ``<root> <output> <regex> [-z]``.

    Walks ``root`` and, for every file whose name contains a match for
    ``regex``, either moves it to ``<output>/<name>`` (with ``-z``) or just
    prints the planned move.  ``output`` is created if it does not exist.
    """
    args = sys.argv
    source_root = upath.abspath(args[1])
    target_dir = upath.abspath(args[2])
    name_re = re.compile(compat.to_text(args[3]))
    really_move = len(args) == 5 and args[4] == '-z'

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for folder, _dirs, files in os.walk(source_root, topdown=True):
        for fname in files:
            src = os.path.join(folder, fname)
            dst = os.path.join(target_dir, fname)
            # Ignore non-files and anything already at its destination.
            if not os.path.isfile(src) or src == dst:
                continue
            if name_re.search(fname) is None:
                continue
            if really_move:
                print('Moving %s %s' % (src, dst))
                shutil.move(src, dst)
            else:
                print('Demo %s %s' % (src, dst))
Beispiel #15
0
def remove_links():
    """Strip dotted, host-like tokens from ``.txt`` files under a tree.

    Command line: ``<root>``.

    Every ``.txt`` file (extension compared case-insensitively) is read as
    UTF-8; each run of the form ``word.word.word`` made of ASCII letters
    and digits is deleted, and the file is rewritten only when something
    was removed.

    NOTE(review): an earlier revision compiled ``RE_URL`` here but never
    applied it -- confirm whether full URLs were meant to be stripped too.
    """
    root = upath.abspath(sys.argv[1])
    # Explicit ranges on purpose: the shorthand ``A-z`` also covers the six
    # punctuation characters that sit between 'Z' and 'a' in ASCII.
    dotted = re.compile(r'[A-Za-z0-9]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+')
    for curdir, _subdirs, filenames in os.walk(root, topdown=True):
        for name in filenames:
            src = os.path.join(curdir, name)
            if not os.path.isfile(src):
                continue
            _, ext = os.path.splitext(name)
            if ext.lower() != '.txt':
                continue
            with codecs.open(src, 'r', 'utf8') as f:
                content = f.read()
            content2 = dotted.sub('', content)
            # Removal can only shorten the text, so a length drop means at
            # least one token was stripped.
            if len(content2) < len(content):
                print('Processing %s (%s %s)' %
                      (src, len(content), len(content2)))
                with codecs.open(src, 'w', 'utf8') as f:
                    f.write(content2)
Beispiel #16
0
def remove_invalid_chars():
    """Delete every U+E4C6 character from ``.txt`` files under a tree.

    Command line: ``<root>``.

    U+E4C6 (UTF-8 bytes EE 93 86) shows up as a boxed question mark.  Each
    ``.txt`` file (extension compared case-insensitively) is read as UTF-8
    and rewritten only when at least one such character was removed.
    """
    root = upath.abspath(sys.argv[1])
    for curdir, _subdirs, filenames in os.walk(root, topdown=True):
        for name in filenames:
            src = os.path.join(curdir, name)
            if not os.path.isfile(src):
                continue
            _, ext = os.path.splitext(name)
            if ext.lower() != '.txt':
                continue
            with codecs.open(src, 'r', 'utf8') as f:
                content = f.read()
            content2 = content.replace('\uE4C6', '')
            # Removal can only shorten the text, so a length drop means at
            # least one character was stripped.
            if len(content2) < len(content):
                print('Processing %s (%s %s)' %
                      (src, len(content), len(content2)))
                with codecs.open(src, 'w', 'utf8') as f:
                    f.write(content2)
Beispiel #17
0
def remove_invalid_chars():
    """Delete every U+E4C6 character from ``.txt`` files under a tree.

    Command line: ``<root>``.

    U+E4C6 (UTF-8 bytes EE 93 86) shows up as a boxed question mark.  Each
    ``.txt`` file (extension compared case-insensitively) is read as UTF-8
    and rewritten only when at least one such character was removed.
    """
    root = upath.abspath(sys.argv[1])
    for curdir, _subdirs, filenames in os.walk(root, topdown=True):
        for name in filenames:
            src = os.path.join(curdir, name)
            if not os.path.isfile(src):
                continue
            _, ext = os.path.splitext(name)
            if ext.lower() != '.txt':
                continue
            with codecs.open(src, 'r', 'utf8') as f:
                content = f.read()
            content2 = content.replace('\uE4C6', '')
            # Removal can only shorten the text, so a length drop means at
            # least one character was stripped.
            if len(content2) < len(content):
                print('Processing %s (%s %s)' %
                      (src, len(content), len(content2)))
                with codecs.open(src, 'w', 'utf8') as f:
                    f.write(content2)
Beispiel #18
0
def remove_links():
    """Strip dotted, host-like tokens from ``.txt`` files under a tree.

    Command line: ``<root>``.

    Every ``.txt`` file (extension compared case-insensitively) is read as
    UTF-8; each run of the form ``word.word.word`` made of ASCII letters
    and digits is deleted, and the file is rewritten only when something
    was removed.

    NOTE(review): an earlier revision compiled ``RE_URL`` here but never
    applied it -- confirm whether full URLs were meant to be stripped too.
    """
    root = upath.abspath(sys.argv[1])
    # Explicit ranges on purpose: the shorthand ``A-z`` also covers the six
    # punctuation characters that sit between 'Z' and 'a' in ASCII.
    dotted = re.compile(r'[A-Za-z0-9]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+')
    for curdir, _subdirs, filenames in os.walk(root, topdown=True):
        for name in filenames:
            src = os.path.join(curdir, name)
            if not os.path.isfile(src):
                continue
            _, ext = os.path.splitext(name)
            if ext.lower() != '.txt':
                continue
            with codecs.open(src, 'r', 'utf8') as f:
                content = f.read()
            content2 = dotted.sub('', content)
            # Removal can only shorten the text, so a length drop means at
            # least one token was stripped.
            if len(content2) < len(content):
                print('Processing %s (%s %s)' %
                      (src, len(content), len(content2)))
                with codecs.open(src, 'w', 'utf8') as f:
                    f.write(content2)
Beispiel #19
0
def enca_convert():
    """Run ``enca`` or ``iconv`` on every file under ``sys.argv[1]``.

    Command line: ``<root> [-z]``.

    * Without ``-z``: runs ``enca <file>`` and prints its output.
    * With ``-z``: runs ``iconv -f GB18030 -t UTF-8 -c <file>`` and prints
      its output.

    NOTE(review): the ``iconv`` output is only printed, never written back
    to the file -- confirm whether the converted text was meant to be
    saved.

    A failing command prints a traceback and appends the file's path (one
    per line) to ``../enca.log``; the walk then continues.
    """
    root = upath.abspath(sys.argv[1])
    dry_run = not (len(sys.argv) == 3 and sys.argv[2] == '-z')
    # Opened with ``with`` so the log is flushed and closed when the walk
    # finishes or is interrupted.
    with codecs.open('../enca.log', 'w', 'utf-8') as log:
        for curdir, _subdirs, filenames in os.walk(root, topdown=True):
            for name in filenames:
                try:
                    f = os.path.join(curdir, name)
                    if dry_run:
                        cmd = ['enca', f]
                    else:
                        cmd = 'iconv -f GB18030 -t UTF-8 -c'.split()
                        cmd.append(f)
                    print(cmd)
                    r = subprocess.check_output(cmd)
                    r = compat.to_text(r)
                    print('%s - %s' % (f, r))
                except Exception:
                    traceback.print_exc()
                    # One path per line; without the newline the entries
                    # run together and the log is unreadable.
                    log.write('%s\n' % f)
Beispiel #20
0
def enca_convert():
    """Run ``enca`` or ``iconv`` on every file under ``sys.argv[1]``.

    Command line: ``<root> [-z]``.

    * Without ``-z``: runs ``enca <file>`` and prints its output.
    * With ``-z``: runs ``iconv -f GB18030 -t UTF-8 -c <file>`` and prints
      its output.

    NOTE(review): the ``iconv`` output is only printed, never written back
    to the file -- confirm whether the converted text was meant to be
    saved.

    A failing command prints a traceback and appends the file's path (one
    per line) to ``../enca.log``; the walk then continues.
    """
    root = upath.abspath(sys.argv[1])
    dry_run = not (len(sys.argv) == 3 and sys.argv[2] == '-z')
    # Opened with ``with`` so the log is flushed and closed when the walk
    # finishes or is interrupted.
    with codecs.open('../enca.log', 'w', 'utf-8') as log:
        for curdir, _subdirs, filenames in os.walk(root, topdown=True):
            for name in filenames:
                try:
                    f = os.path.join(curdir, name)
                    if dry_run:
                        cmd = ['enca', f]
                    else:
                        cmd = 'iconv -f GB18030 -t UTF-8 -c'.split()
                        cmd.append(f)
                    print(cmd)
                    r = subprocess.check_output(cmd)
                    r = compat.to_text(r)
                    print('%s - %s' % (f, r))
                except Exception:
                    traceback.print_exc()
                    # One path per line; without the newline the entries
                    # run together and the log is unreadable.
                    log.write('%s\n' % f)
Beispiel #21
0
def move_from_subdirs():
    """Flatten a tree by moving files from subdirectories into its root.

    Command line: ``<root> [-z]``.

    Every file found below ``root`` is reported (files directly in ``root``
    are left alone) and, with ``-z``, moved to ``<root>/<name>``.  Without
    ``-z`` nothing is moved.

    A failed move prints a traceback and appends the source path (one per
    line) to ``../fix_title.log``; the walk then continues.

    NOTE(review): files that share a name in different subdirectories map
    to the same destination path.
    """
    root = upath.abspath(sys.argv[1])
    dry_run = not (len(sys.argv) == 3 and sys.argv[2] == '-z')
    # Opened with ``with`` so the log is flushed and closed when the walk
    # finishes or is interrupted.
    with codecs.open('../fix_title.log', 'w', 'utf-8') as log:
        print(root)
        for curdir, _subdirs, filenames in os.walk(root, topdown=True):
            if curdir == root:
                # Files already at the top level need no move.
                continue
            for name in filenames:
                src = os.path.join(curdir, name)
                if not os.path.isfile(src):
                    continue
                dst = os.path.join(root, name)
                print('File %s -> %s' % (src, dst))
                if dry_run:
                    continue
                try:
                    shutil.move(src, dst)
                    print('Move %s -> %s' % (src, dst))
                except Exception:
                    traceback.print_exc()
                    # One path per line; without the newline the entries
                    # run together and the log is unreadable.
                    log.write('%s\n' % src)
Beispiel #22
0
def move_from_subdirs():
    """Flatten a tree by moving files from subdirectories into its root.

    Command line: ``<root> [-z]``.

    Every file found below ``root`` is reported (files directly in ``root``
    are left alone) and, with ``-z``, moved to ``<root>/<name>``.  Without
    ``-z`` nothing is moved.

    A failed move prints a traceback and appends the source path (one per
    line) to ``../fix_title.log``; the walk then continues.

    NOTE(review): files that share a name in different subdirectories map
    to the same destination path.
    """
    root = upath.abspath(sys.argv[1])
    dry_run = not (len(sys.argv) == 3 and sys.argv[2] == '-z')
    # Opened with ``with`` so the log is flushed and closed when the walk
    # finishes or is interrupted.
    with codecs.open('../fix_title.log', 'w', 'utf-8') as log:
        print(root)
        for curdir, _subdirs, filenames in os.walk(root, topdown=True):
            if curdir == root:
                # Files already at the top level need no move.
                continue
            for name in filenames:
                src = os.path.join(curdir, name)
                if not os.path.isfile(src):
                    continue
                dst = os.path.join(root, name)
                print('File %s -> %s' % (src, dst))
                if dry_run:
                    continue
                try:
                    shutil.move(src, dst)
                    print('Move %s -> %s' % (src, dst))
                except Exception:
                    traceback.print_exc()
                    # One path per line; without the newline the entries
                    # run together and the log is unreadable.
                    log.write('%s\n' % src)