Esempio n. 1
0
def select(in_dir, out_dir):
    """Print the ``*.gs.txt`` and ``*.ocr.txt`` files in ``in_dir`` as JSON.

    Files directly inside ``in_dir`` that match each pattern are listed in
    sorted order, converted to absolute paths and wrapped with ``cwl_file``.
    The result is written to stdout as a single JSON object with the keys
    ``ocr_files`` and ``gs_files``.

    ``out_dir`` is accepted but not used by this function.
    """
    def _wrapped_matches(template):
        # Expand the template to "<in_dir><sep><glob>" and list the hits in
        # sorted order.
        hits = glob.glob(template.format(in_dir, os.sep))
        hits.sort()
        return [cwl_file(os.path.abspath(hit)) for hit in hits]

    gs_files = _wrapped_matches('{}{}*.gs.txt')
    ocr_files = _wrapped_matches('{}{}*.ocr.txt')

    payload = {'ocr_files': ocr_files, 'gs_files': gs_files}
    click.get_text_stream('stdout').write(json.dumps(payload))
Esempio n. 2
0
def ls_chunk(in_dir, chunks, name):
    """Print the files of one named chunk as JSON.

    ``chunks`` is an open file-like object containing a JSON object; the
    entry stored under ``name`` is read as a list of file names (an unknown
    ``name`` yields an empty list). Each file name is joined onto ``in_dir``,
    made absolute and wrapped with ``cwl_file``. The result is written to
    stdout as ``{"out_files": [...]}``.
    """
    division = json.load(chunks)

    files_out = []
    for file_name in division.get(name, []):
        full_path = os.path.join(in_dir, file_name)
        files_out.append(cwl_file(os.path.abspath(full_path)))

    out_stream = click.get_text_stream('stdout')
    out_stream.write(json.dumps({'out_files': files_out}))
Esempio n. 3
0
def command(in_dir, datadivision, name, out_dir):
    """Print the files selected by ``get_files`` for ``name`` as JSON.

    ``out_dir`` is passed to ``create_dirs`` before anything else happens.
    ``datadivision`` is an open file-like object holding JSON; its parsed
    content is handed to ``get_files`` together with ``in_dir`` and ``name``.
    Every path returned is wrapped with ``cwl_file`` and the list is written
    to stdout as ``{"out_files": [...]}``.
    """
    create_dirs(out_dir)

    division = json.load(datadivision)

    files_out = []
    for path in get_files(in_dir, division, name):
        files_out.append(cwl_file(path))

    out_stream = click.get_text_stream('stdout')
    out_stream.write(json.dumps({'out_files': files_out}))
Esempio n. 4
0
def ls_chunk(in_dir, chunks, name):
    """Write the CWL file entries for the chunk called ``name`` to stdout.

    The JSON document read from the open file object ``chunks`` maps chunk
    names to lists of file names. The list for ``name`` (empty when the name
    is absent) is resolved against ``in_dir``, turned into absolute paths,
    wrapped with ``cwl_file`` and emitted as ``{"out_files": [...]}``.
    """
    def _as_cwl(rel_name):
        # Resolve relative to in_dir, then normalise to an absolute path.
        return cwl_file(os.path.abspath(os.path.join(in_dir, rel_name)))

    listing = json.load(chunks)
    payload = {'out_files': [_as_cwl(entry) for entry in listing.get(name, [])]}

    click.get_text_stream('stdout').write(json.dumps(payload))
Esempio n. 5
0
def remove_empty_files(in_dir, out_dir):
    """Print the OCR/GS file pairs in which both files are non-empty.

    ``*.ocr.txt`` and ``*.gs.txt`` files directly inside ``in_dir`` are each
    sorted and then paired positionally, so the two sorted listings must line
    up one-to-one for the pairs to be meaningful. Both files of a pair are
    read as UTF-8; the pair is kept only when neither text is empty.

    No file is deleted or written. The surviving paths are wrapped with
    ``cwl_file`` and written to stdout as ``{"ocr": [...], "gs": [...]}``.
    ``out_dir`` is accepted but not used by this function.
    """
    def _read_text(path):
        with codecs.open(path, 'r', encoding='utf-8') as handle:
            return handle.read()

    ocr_paths = sorted(glob.glob('{}{}*.ocr.txt'.format(in_dir, os.sep)))
    gs_paths = sorted(glob.glob('{}{}*.gs.txt'.format(in_dir, os.sep)))

    kept_ocr = []
    kept_gs = []
    for ocr_path, gs_path in zip(ocr_paths, gs_paths):
        # Both files are always read, even when the first one is empty.
        ocr_text = _read_text(ocr_path)
        gs_text = _read_text(gs_path)

        if not ocr_text or not gs_text:
            continue

        kept_ocr.append(cwl_file(ocr_path))
        kept_gs.append(cwl_file(gs_path))

    out_stream = click.get_text_stream('stdout')
    out_stream.write(json.dumps({'ocr': kept_ocr, 'gs': kept_gs}))
Esempio n. 6
0
def command(dir_in):
    """Print all ``.folia.xml`` files of the known newspaper directories.

    For each of the hard-coded newspaper sub-directories of ``dir_in`` the
    directory entries ending in ``.folia.xml`` are wrapped with ``cwl_file``
    and collected. The result is written to stdout as
    ``{"out_files": [...]}``.

    Entries are visited in sorted order per newspaper: ``os.listdir`` returns
    names in arbitrary order, so sorting makes the output deterministic
    across runs and file systems.

    Raises:
        OSError: if one of the newspaper directories cannot be listed.
    """
    files_out = []

    newspapers = ['ad1951', 'nrc1950', 't1950', 'tr1950', 'vk1951']

    for newspaper in newspapers:
        path = os.path.join(dir_in, newspaper)
        for entry in sorted(os.listdir(path)):
            if entry.endswith('.folia.xml'):
                files_out.append(cwl_file(os.path.join(path, entry)))

    stdout_text = click.get_text_stream('stdout')
    stdout_text.write(json.dumps({'out_files': files_out}))
Esempio n. 7
0
def sac2gs_and_ocr(in_dir, out_dir):
    """Merge per-year input files into one text file per year/language/type.

    For every year from 1864 up to and including 1899, the files returned by
    ``get_files`` for ``<in_dir>/<year>`` are classified by base name:

    * type is ``ocr`` when the base name ends with ``ocr``, otherwise ``gs``;
    * language is ``fr`` when the base name contains ``fr``, otherwise ``de``.

    Each file's UTF-8 text is appended to
    ``<out_dir>/<year>-<language>-<type>.txt``. Output files are opened in
    append mode, so content already present in such a file is kept and
    extended. Every distinct output file is reported once, wrapped with
    ``cwl_file``, under the key ``<type>_<language>`` of the JSON object
    written to stdout.

    An ``OSError`` raised anywhere while a year is being processed silently
    ends the processing of that year; the remaining years are still handled.
    """
    result = {'gs_de': [], 'ocr_de': [], 'gs_fr': [], 'ocr_fr': []}

    # Output paths that have already been added to ``result``.
    reported = set()

    for year in range(1864, 1900):
        try:
            for in_path in get_files(os.path.join(in_dir, str(year))):
                base_name = os.path.basename(in_path)
                typ = 'ocr' if base_name.endswith('ocr') else 'gs'
                language = 'fr' if 'fr' in base_name else 'de'

                with codecs.open(in_path, encoding='utf-8') as source:
                    text = source.read()

                out_name = '{}-{}-{}.txt'.format(year, language, typ)
                out_file = os.path.join(out_dir, out_name)
                create_dirs(out_file)
                with codecs.open(out_file, 'a', encoding='utf-8') as target:
                    target.write(text)

                if out_file not in reported:
                    label = '{}_{}'.format(typ, language)
                    result[label].append(cwl_file(out_file))
                    reported.add(out_file)
        except OSError:
            # Best effort: skip the rest of this year and continue.
            pass

    out_stream = click.get_text_stream('stdout')
    out_stream.write(json.dumps(result))
Esempio n. 8
0
def test_cwl_file():
    """``cwl_file`` wraps a path in a dict with class ``File``."""
    path = '/path/test.txt'
    expected = {'class': 'File', 'path': path}

    assert cwl_file(path) == expected