def convert_dir(src_path, dst_path):
    """Convert every .doc/.docx file found under a directory.

    Files are discovered with ``walk_dir(src_path)``. Each distinct source
    directory is assigned its own destination directory, obtained from
    ``get_counter_dir(dst_path, n)`` where ``n`` counts the destination
    directories created so far. Inside a destination directory the outputs
    are named "0", "1", "2", ... in the order the files are encountered.

    :src_path: root directory that is walked for input files
    :dst_path: root under which the numbered destination directories are
        created
    :returns: dict mapping each converted source path to the destination
        path that was handed to ``to_text``
    """
    supported_exts = ('.doc', '.docx')
    converted = {}
    dst_dir_for = {}           # source directory -> destination directory
    files_in_dir = Counter()   # destination directory -> files written so far
    dirs_created = 0

    for index, src in enumerate(walk_dir(src_path)):
        # The comparison is case-sensitive, exactly as the extension is
        # spelled on disk.
        extension = os.path.splitext(src)[1]
        if extension not in supported_exts:
            log.info('Skipping unsupported file "%s"', src)
            continue

        # The logged number is the position in the walk, so skipped files
        # leave gaps in the sequence.
        log.info('Converting to plaintext, file %s', index + 1)

        src_dir = os.path.dirname(src)
        if src_dir not in dst_dir_for:
            # First file seen from this source directory: allocate and
            # create the next numbered destination directory.
            new_dir = get_counter_dir(dst_path, dirs_created)
            os.makedirs(new_dir)
            dirs_created += 1
            dst_dir_for[src_dir] = new_dir
        out_dir = dst_dir_for[src_dir]

        dst = os.path.join(out_dir, str(files_in_dir[out_dir]))
        files_in_dir[out_dir] += 1

        # presumably to_text writes the plain-text rendering of src to dst
        # -- TODO confirm against its definition
        to_text(src, dst)
        converted[src] = dst

    return converted
def keep_originals(src_map, path):
    """Copy each original source file next to its converted counterpart.

    For every file yielded by ``walk_dir(path)`` the matching original is
    looked up through the inverse of ``src_map`` and copied to
    ``<converted path>__original__<original extension>``.

    :src_map: dict mapping original source path -> converted path
    :path: directory walked for converted files; every file found there
        must appear as a value in ``src_map``
    :returns: None
    :raises KeyError: if a walked file is not a value of ``src_map``
    """
    originals = {dst: src for src, dst in src_map.items()}

    # Snapshot the walk before copying anything: the copies below are written
    # into the very tree being walked, and a lazily evaluated walk could
    # otherwise yield a freshly created "__original__" file, which has no
    # entry in the mapping and would raise KeyError.
    converted_paths = list(walk_dir(path))

    for i, next_path in enumerate(converted_paths):
        log.info('Copy original %s for "%s"', i+1, next_path)
        # presumably FilePath exposes the full path as .path and the file
        # extension as .ext -- TODO confirm against its definition
        dst_path = FilePath.from_path(next_path)
        src_path = FilePath.from_path(originals[next_path])
        orig_path = '{}__original__{}'.format(dst_path.path, src_path.ext)
        shutil.copyfile(src_path.path, orig_path)
def convert_dir_in_place(detokenizer_script, path):
    """Run a Perl detokenizer over every file under a directory, in place.

    Each file yielded by ``walk_dir(path)`` is fed to
    ``perl <detokenizer_script>`` on standard input, and the file is then
    overwritten with whatever the script printed to standard output.

    :detokenizer_script: path of the Perl script to execute
    :path: directory whose files are rewritten
    :returns: None
    """
    # NOTE(review): this file also contains a one-argument function with the
    # same name; if both live in the same module the later definition
    # replaces this one -- confirm they belong to separate modules.
    command = ['perl', detokenizer_script]

    for index, file_path in enumerate(walk_dir(path)):
        log.info('Detokenizing file %s, "%s"', index + 1, file_path)

        # The whole output is collected before the file is reopened for
        # writing, so a failing script leaves the file untouched.
        with open(file_path, 'rb') as source:
            output = subprocess.check_output(command, stdin=source)

        with open(file_path, 'wb') as target:
            target.write(output)
def convert_dir_in_place(path):
    """Append the detected language code to the name of every file.

    Each file yielded by ``walk_dir(path)`` is read, its text is passed to
    ``langdetect.detect`` and the file is renamed to ``<name>.<lang>``.

    :path: directory whose files are renamed
    :returns: dict mapping each original path to its new path
    """
    # NOTE(review): this file also contains a two-argument function with the
    # same name; if both live in the same module this definition replaces
    # the earlier one -- confirm they belong to separate modules.
    renamed = {}
    seen = set()

    for index, src in enumerate(walk_dir(path)):
        # Renaming happens inside the tree being walked, so a renamed file
        # may be yielded again under its new name; skip anything already
        # handled.
        if src in seen:
            continue

        log.info('Detecting language of file number %s "%s"', index + 1, src)

        with open(src) as handle:
            text = handle.read()
        lang = langdetect.detect(text)

        target = src + '.' + lang
        os.rename(src, target)

        seen.update((src, target))
        renamed[src] = target

    return renamed