def read_atf(data_path):
    '''Read and segment the atf data export.

    Opens 'cdliatf_unblocked.atf' inside `data_path` as UTF-8 and hands
    the open file to atf2cts.segmentor.

    This is a generator. For every item produced by the segmentor that
    starts with something that looks like a CDLI id ('P' followed by
    decimal digits) it yields a tuple (cdli_id, language, atf):

    cdli_id  -- the id without the leading '&' sigil.
    language -- the text following 'lang' on the first '#atf' line that
                mentions it, or None if there is no such line.
    atf      -- the unmodified record text.

    Records without a usable id are reported on stdout and skipped.
    The file is closed when the generator is exhausted or closed.'''
    filename = os.path.join(data_path, 'cdliatf_unblocked.atf')
    # A context manager makes sure the export file does not stay open
    # after iteration ends.
    with io.open(filename, encoding='utf-8') as fp:
        for atf in atf2cts.segmentor(fp):
            lines = atf.splitlines()

            # Parse out the CDLI id code.
            cdli_id = ''
            if atf.startswith('&P'):
                # Drop the '&' sigil and any trailing garbage.
                cdli_id = atf[1:8]
            elif atf.startswith('&'):
                # Handle broken entries with whitespace around the id.
                tokens = atf.split()
                # A record with nothing after the sigil has no second
                # token; leave the id empty so it is rejected below
                # instead of raising IndexError.
                if len(tokens) > 1:
                    cdli_id = tokens[1][0:7]
                    print('Warning: whitespace at the start of &-line.')

            # Check if we found what looks like a cdli id.
            if not cdli_id.startswith('P') or not cdli_id[1:].isdecimal():
                print("Error: ATF record doesn't start with a CDLI id!")
                # An empty record has no first line to show.
                if lines:
                    print(lines[0])
                continue

            # Parse out the language header, if any.
            language = None
            for line in lines:
                if line.startswith('#atf') and 'lang' in line:
                    parts = line.split('lang')
                    # Skip spurious equal signs.
                    # These are invalid syntax, but occur sometimes.
                    # NOTE(review): only a segment that is a bare '='
                    # is skipped; a value like '= akk' keeps its '='.
                    if parts[1].strip() == '=':
                        del parts[1]
                    # Nothing may be left after dropping the '='; treat
                    # that as an empty language rather than crashing.
                    language = parts[1].strip() if len(parts) > 1 else ''
                    break

            yield (cdli_id, language, atf)
def test_segmentor(count):
    '''Check that concatenated records are split back apart.

    The contents of the test file are repeated `count` times, joined
    with a blank-line separator, and fed to the segmentor, which must
    produce exactly `count` items.'''
    with io.open(test_filename, encoding='utf-8') as source:
        record = source.read()
    # An empty test file would make the count check meaningless.
    assert record

    joined = '\n\n'.join(repeat(record, count))
    segments = list(atf2cts.segmentor(io.StringIO(joined)))
    assert len(segments) == count
def test_segmentor_single():
    '''The unmodified test file must segment into exactly one item.'''
    with io.open(test_filename, encoding='utf-8') as source:
        # Materialize the segments while the file is still open.
        segments = list(atf2cts.segmentor(source))
    assert len(segments) == 1