コード例 #1
0
ファイル: converter.py プロジェクト: krzwolk/yalign
def convert_dir(src_path, dst_path):
    """Convert the Word documents found under ``src_path`` via ``to_text``.

    Every path yielded by ``walk_dir(src_path)`` is examined. Paths whose
    extension is not exactly ``.doc`` or ``.docx`` (the comparison is
    case-sensitive) are logged and skipped. Each distinct source directory
    is given its own output directory, obtained from
    ``get_counter_dir(dst_path, n)`` where ``n`` counts source directories
    from 0 in order of first appearance; that directory is created with
    ``os.makedirs``. Inside an output directory the converted files are
    named ``0``, ``1``, ... in conversion order.

    :src_path: root directory that is walked for input documents
    :dst_path: root directory handed to ``get_counter_dir``
    :returns: dict mapping each converted source path to its output path

    """
    converted = {}
    out_dir_for = {}
    files_in_dir = Counter()
    # The logged number is the position among *all* walked files, so skipped
    # files still consume a number.
    for number, src in enumerate(walk_dir(src_path), 1):
        extension = os.path.splitext(src)[1]
        if extension not in ('.doc', '.docx'):
            log.info('Skipping unsupported file "%s"', src)
            continue
        log.info('Converting to plaintext, file %s', number)

        src_dir = os.path.dirname(src)
        if src_dir not in out_dir_for:
            # One entry is added per created directory, so the current size
            # of the map is the index of the next output directory.
            new_dir = get_counter_dir(dst_path, len(out_dir_for))
            os.makedirs(new_dir)
            out_dir_for[src_dir] = new_dir
        out_dir = out_dir_for[src_dir]

        # A Counter yields 0 for an unseen key, which gives the first name.
        index = files_in_dir[out_dir]
        files_in_dir[out_dir] = index + 1
        dst = os.path.join(out_dir, str(index))

        to_text(src, dst)
        converted[src] = dst
    return converted
コード例 #2
0
ファイル: originalkeeper.py プロジェクト: krzwolk/yalign
def keep_originals(src_map, path):
    """Copy each original file next to the file that was produced from it.

    ``src_map`` is inverted so that a produced path leads back to its
    original. For every path yielded by ``walk_dir(path)`` the original is
    copied to ``<produced path>__original__<original extension>``, where the
    two pieces come from the ``path`` and ``ext`` attributes of the
    corresponding ``FilePath`` objects.

    A walked path that is not a value of ``src_map`` raises ``KeyError``.

    :src_map: dict mapping original path -> produced path
    :path: directory that is walked for produced files
    :returns: None

    """
    original_of = {produced: original
                   for original, produced in src_map.items()}
    for number, produced in enumerate(walk_dir(path), 1):
        log.info('Copy original %s for "%s"', number, produced)
        target = FilePath.from_path(produced)
        source = FilePath.from_path(original_of[produced])
        backup_name = '{}__original__{}'.format(target.path, source.ext)
        shutil.copyfile(source.path, backup_name)
コード例 #3
0
ファイル: detokenizer.py プロジェクト: krzwolk/yalign
def convert_dir_in_place(detokenizer_script, path):
    """Run a Perl script over every file under ``path``, rewriting each file.

    For each path yielded by ``walk_dir(path)`` the file is opened in binary
    mode and passed as standard input to ``perl <detokenizer_script>``. The
    captured standard output then replaces the file's contents. The file is
    only reopened for writing after the subprocess has finished, so a failing
    script (``subprocess.CalledProcessError``) leaves that file untouched.

    :detokenizer_script: path of the Perl script to execute
    :path: directory that is walked for files to process
    :returns: None

    """
    command = ['perl', detokenizer_script]
    for number, file_path in enumerate(walk_dir(path), 1):
        log.info('Detokenizing file %s, "%s"', number, file_path)
        with open(file_path, 'rb') as source:
            output = subprocess.check_output(command, stdin=source)
        with open(file_path, 'wb') as target:
            target.write(output)
コード例 #4
0
ファイル: langdetect.py プロジェクト: krzwolk/yalign
def convert_dir_in_place(path):
    """Append the detected language to the name of every file under ``path``.

    Each path yielded by ``walk_dir(path)`` is read in full, its text is
    passed to ``langdetect.detect`` and the file is renamed to
    ``<old name>.<detected language>``. Both the old and the new name are
    remembered, and a path that has already been recorded is skipped.

    :path: directory that is walked for files to rename
    :returns: dict mapping each original path to its new path

    """
    renamed = {}
    seen = set()
    for number, src in enumerate(walk_dir(path), 1):
        if src in seen:
            continue
        log.info('Detecting language of file number %s "%s"', number, src)
        # Close the file before renaming it.
        with open(src) as handle:
            contents = handle.read()
        target = src + '.' + langdetect.detect(contents)
        os.rename(src, target)
        seen.update((src, target))
        renamed[src] = target
    return renamed