def select(in_dir, out_dir):
    """Write the gold standard and OCR text files found in a directory.

    Looks directly inside ``in_dir`` for ``*.gs.txt`` and ``*.ocr.txt``
    files, sorts each group by path, makes the paths absolute and passes
    each one through ``cwl_file``. The result is written to stdout as a
    JSON object with the keys ``ocr_files`` and ``gs_files``.

    Args:
        in_dir: Directory that is searched for the text files.
        out_dir: Not used by this function.
    """
    def _collect(pattern):
        # The pattern receives the directory and the platform separator.
        matches = glob.glob(pattern.format(in_dir, os.sep))
        matches.sort()
        return [cwl_file(os.path.abspath(path)) for path in matches]

    gs_files = _collect('{}{}*.gs.txt')
    ocr_files = _collect('{}{}*.ocr.txt')

    payload = {'ocr_files': ocr_files, 'gs_files': gs_files}
    click.get_text_stream('stdout').write(json.dumps(payload))
def ls_chunk(in_dir, chunks, name):
    """Write the files that belong to one named chunk.

    ``chunks`` is parsed as JSON and the entry stored under ``name`` is
    used as a list of file names (an empty list when ``name`` is
    missing). Every name is joined onto ``in_dir``, made absolute and
    passed through ``cwl_file``. The result is written to stdout as a
    JSON object with the single key ``out_files``.

    Args:
        in_dir: Directory the file names are relative to.
        chunks: Readable file-like object containing the JSON division.
        name: Key of the chunk to list.
    """
    division = json.load(chunks)

    files_out = []
    for file_name in division.get(name, []):
        full_path = os.path.abspath(os.path.join(in_dir, file_name))
        files_out.append(cwl_file(full_path))

    out_stream = click.get_text_stream('stdout')
    out_stream.write(json.dumps({'out_files': files_out}))
def command(in_dir, datadivision, name, out_dir):
    """Write the files selected by a data division.

    Calls ``create_dirs`` on ``out_dir``, parses ``datadivision`` as
    JSON and asks ``get_files`` for the files of ``name``. Each returned
    file is passed through ``cwl_file`` and the list is written to
    stdout as a JSON object with the single key ``out_files``.

    Args:
        in_dir: Input directory handed to ``get_files``.
        datadivision: Readable file-like object containing the JSON
            data division.
        name: Name of the part of the division to select.
        out_dir: Path handed to ``create_dirs``.
    """
    create_dirs(out_dir)
    division = json.load(datadivision)

    files_out = []
    for path in get_files(in_dir, division, name):
        files_out.append(cwl_file(path))

    serialized = json.dumps({'out_files': files_out})
    click.get_text_stream('stdout').write(serialized)
def ls_chunk(in_dir, chunks, name):
    """List the files of the chunk called ``name`` as JSON on stdout.

    The JSON document read from ``chunks`` maps chunk names to lists of
    file names; a missing ``name`` yields an empty list. Each file name
    is resolved against ``in_dir``, converted to an absolute path and
    passed through ``cwl_file``. The output is a JSON object of the
    form ``{"out_files": [...]}``.

    Args:
        in_dir: Directory containing the chunk's files.
        chunks: Open file-like object with the JSON chunk division.
        name: Name of the chunk whose files are listed.
    """
    def _to_cwl(file_name):
        joined = os.path.join(in_dir, file_name)
        return cwl_file(os.path.abspath(joined))

    selected = json.load(chunks).get(name, [])
    result = {'out_files': list(map(_to_cwl, selected))}

    click.get_text_stream('stdout').write(json.dumps(result))
def remove_empty_files(in_dir, out_dir):
    """Write the OCR/gold standard file pairs in which neither is empty.

    The ``*.ocr.txt`` and ``*.gs.txt`` files directly inside ``in_dir``
    are sorted and paired by position. Both files of a pair are read as
    UTF-8; the pair is kept only when both texts are non-empty. Kept
    paths are passed through ``cwl_file`` and written to stdout as a
    JSON object with the keys ``ocr`` and ``gs``.

    Args:
        in_dir: Directory that is searched for the text files.
        out_dir: Not used by this function.
    """
    ocr_paths = sorted(glob.glob('{}{}*.ocr.txt'.format(in_dir, os.sep)))
    gs_paths = sorted(glob.glob('{}{}*.gs.txt'.format(in_dir, os.sep)))

    kept = {'ocr': [], 'gs': []}

    # NOTE(review): pairing is purely positional, so this presumably
    # relies on every OCR file having a gold standard counterpart with
    # the same base name -- confirm against the callers.
    for ocr_path, gs_path in zip(ocr_paths, gs_paths):
        with codecs.open(ocr_path, 'r', encoding='utf-8') as handle:
            ocr_text = handle.read()
        with codecs.open(gs_path, 'r', encoding='utf-8') as handle:
            gs_text = handle.read()

        if not ocr_text or not gs_text:
            continue

        kept['ocr'].append(cwl_file(ocr_path))
        kept['gs'].append(cwl_file(gs_path))

    click.get_text_stream('stdout').write(json.dumps(kept))
def command(dir_in):
    """Write the FoLiA XML files found in the newspaper subdirectories.

    For each of a fixed list of subdirectories of ``dir_in``, every
    directory entry whose path ends in ``.folia.xml`` is passed through
    ``cwl_file``. The collected list is written to stdout as a JSON
    object with the single key ``out_files``.

    Args:
        dir_in: Directory containing the newspaper subdirectories.

    Raises:
        OSError: If one of the subdirectories cannot be listed.
    """
    subdirs = ['ad1951', 'nrc1950', 't1950', 'tr1950', 'vk1951']

    files_out = []
    for subdir in subdirs:
        folder = os.path.join(dir_in, subdir)
        candidates = (os.path.join(folder, entry)
                      for entry in os.listdir(folder))
        files_out.extend(cwl_file(candidate)
                         for candidate in candidates
                         if candidate.endswith('.folia.xml'))

    out_stream = click.get_text_stream('stdout')
    out_stream.write(json.dumps({'out_files': files_out}))
def sac2gs_and_ocr(in_dir, out_dir):
    """Merge per-year input files into gold standard and OCR text files.

    For every year from 1864 up to and including 1899, the files that
    ``get_files`` returns for the subdirectory ``in_dir/<year>`` are
    read as UTF-8 and appended to ``out_dir/<year>-<language>-<type>.txt``.
    The type is ``ocr`` when the base name ends in ``ocr`` and ``gs``
    otherwise; the language is ``fr`` when the base name contains ``fr``
    and ``de`` otherwise.

    Each distinct output file is passed through ``cwl_file`` once and
    listed under ``gs_de``, ``ocr_de``, ``gs_fr`` or ``ocr_fr`` in the
    JSON object written to stdout.

    Args:
        in_dir: Directory containing one subdirectory per year.
        out_dir: Directory the merged text files are written to.
    """
    result = {'gs_de': [], 'ocr_de': [], 'gs_fr': [], 'ocr_fr': []}
    registered = set()

    for year in range(1864, 1900):
        try:
            for in_path in get_files(os.path.join(in_dir, str(year))):
                base_name = os.path.basename(in_path)
                typ = 'ocr' if base_name.endswith('ocr') else 'gs'
                language = 'fr' if 'fr' in base_name else 'de'

                with codecs.open(in_path, encoding='utf-8') as source:
                    text = source.read()

                out_name = '{}-{}-{}.txt'.format(year, language, typ)
                out_file = os.path.join(out_dir, out_name)
                # NOTE(review): create_dirs is given the file path;
                # presumably it creates the parent directory -- confirm.
                create_dirs(out_file)

                # Append mode: several inputs for the same year, language
                # and type end up concatenated in one output file.
                with codecs.open(out_file, 'a', encoding='utf-8') as target:
                    target.write(text)

                if out_file in registered:
                    continue
                label = '{}_{}'.format(typ, language)
                result[label].append(cwl_file(out_file))
                registered.add(out_file)
        except OSError:
            # Any OSError while handling a year abandons the rest of
            # that year and moves on to the next one.
            pass

    click.get_text_stream('stdout').write(json.dumps(result))
def test_cwl_file():
    """cwl_file returns a dict with class 'File' and the given path."""
    path = '/path/test.txt'
    expected = {'class': 'File', 'path': path}

    assert cwl_file(path) == expected