Beispiel #1
0
def read_atf(data_path):
    '''Read and segment the atf data export.

    Opens ``cdliatf_unblocked.atf`` under ``data_path`` as UTF-8, splits
    it into records with ``atf2cts.segmentor`` and yields one tuple per
    record that carries a plausible CDLI id. Records without one are
    reported on stdout and skipped.

    Yields:
        ``(cdli_id, language, atf)`` tuples. ``cdli_id`` is ``'P'``
        followed by decimal digits. ``language`` is the text following
        ``lang`` on the first ``#atf`` line that mentions it, or ``None``
        when there is no such line or nothing follows ``lang``. ``atf``
        is the record text as produced by the segmentor.
    '''
    filename = os.path.join(data_path, 'cdliatf_unblocked.atf')
    # Hold the file in a context manager so the handle is released as soon
    # as the generator is exhausted or closed, rather than leaking it.
    with io.open(filename, encoding='utf-8') as fp:
        for atf in atf2cts.segmentor(fp):
            # Parse out the CDLI id code.
            if atf.startswith('&P'):
                # Drop the '&' sigil and any trailing garbage.
                cdli_id = atf[1:8]
            elif atf.startswith('&'):
                # Handle broken entries with whitespace around the id.
                # A record holding nothing but the sigil has no second
                # token; fall through to the error report in that case.
                tokens = atf.split()
                cdli_id = tokens[1][0:7] if len(tokens) > 1 else ''
                print('Warning: whitespace at the start of &-line.')
            else:
                cdli_id = ''
            # Check if we found what looks like a cdli id.
            if not cdli_id.startswith('P') or not cdli_id[1:].isdecimal():
                print("Error: ATF record doesn't start with a CDLI id!")
                # An empty record has no first line to show.
                print(next(iter(atf.splitlines()), ''))
                continue
            # Parse out the language header, if any.
            language = None
            for line in atf.splitlines():
                if line.startswith('#atf') and 'lang' in line:
                    part = line.split('lang')
                    # Skip spurious equal signs.
                    # These are invalid syntax, but occur sometimes.
                    if part[1].strip() == '=':
                        del part[1]
                    # Nothing may remain after dropping the equal sign;
                    # leave the language unset instead of indexing past
                    # the end of the list.
                    if len(part) > 1:
                        language = part[1].strip()
                    break
            yield (cdli_id, language, atf)
Beispiel #2
0
def test_segmentor(count):
    '''Check that concatenated records are segmented apart again.

    The test file is joined to itself ``count`` times with two newlines
    between copies, and the segmentor must hand back the same number of
    records.'''
    with io.open(test_filename, encoding='utf-8') as source:
        sample = source.read()
    # An empty fixture would make the count check meaningless.
    assert sample

    stream = io.StringIO('\n\n'.join(repeat(sample, count)))
    segments = list(atf2cts.segmentor(stream))
    assert len(segments) == count
Beispiel #3
0
def test_segmentor_single():
    '''Check that the test file segments into exactly one record.'''
    with io.open(test_filename, encoding='utf-8') as source:
        # Materialise the segments while the file is still open.
        segments = list(atf2cts.segmentor(source))
    assert len(segments) == 1