def test_json_serialization(name):
    """
    Parses ATF and verifies the output of the to_json() method.
    """
    afile = AtfFile(sample_file(name))
    js = afile.to_json()
    result = json.loads(js)
    assert result
    noskipjs = afile.to_json(skip_empty=False, sort_keys=True)
    result = json.loads(noskipjs)
    assert result
    # Keeping empty keys can only make the serialization longer.
    assert len(noskipjs) >= len(js)

def consider_file(name, code, description):
    """
    Parses ATF and checks that the CDLI ID and text description match.
    """
    afile = AtfFile(sample_file(name))
    assert afile.text.code == code
    assert afile.text.description == description

def test_create():
    """
    Parses belsunu.atf and checks that the &-line was parsed correctly.
    """
    afile = AtfFile(belsunu())
    assert afile.text.code == "X001001"
    assert afile.text.description == "JCS 48, 089"

def test_text_designation(name, code, description):
    """
    Parses ATF and checks that the CDLI ID and text description match.
    """
    afile = AtfFile(sample_file(name))
    assert afile.text.code == code
    assert afile.text.description == description

def test_composite():
    """
    Parses anzu.atf (a composite sample) and checks that the separate
    text elements were parsed correctly.
    """
    afile = AtfFile(anzu())
    assert afile.text.texts[0].code == "X002001"
    assert afile.text.texts[0].description == "SB Anzu 1"
    assert afile.text.texts[1].code == "Q002770"
    assert afile.text.texts[1].description == "SB Anzu 2"

def test_line_words():
    """
    Gets a sample line of words with unicode chars and tests serialization.
    1. [MU] 1.03-KAM {iti}AB GE₆ U₄ 2-KAM
    """
    atf_file = AtfFile(belsunu())
    uline = atf_file.text.children[0].children[0].children[0]
    uwords = uline.words
    gold = [u'[MU]', u'1.03-KAM', u'{iti}AB', u'GE\u2086', u'U\u2084',
            u'2-KAM']
    assert uwords == gold

def test_line_lemmas():
    """
    Gets a sample line of lemmas with unicode chars and tests serialization.
    šatti[year]N; n; Ṭebetu[1]MN; mūša[at night]AV; ūm[day]N; n
    """
    atf_file = AtfFile(belsunu())
    uline = atf_file.text.children[0].children[0].children[0]
    ulemmas = uline.lemmas
    gold = [u' \u0161atti[year]N', u'n', u'\u1e6cebetu[1]MN',
            u'm\u016b\u0161a[at night]AV', u'\u016bm[day]N', u'n']
    assert ulemmas == gold

def __init__(self, **kwargs):
    self.texts = []
    self.failures = 0
    self.successes = 0
    self.atftype = kwargs['atftype']
    # Use .get() so a missing 'source' yields None instead of a KeyError;
    # the original read kwargs['source'] unconditionally before checking
    # whether the key was present.
    self.source = kwargs.get('source')
    if self.source:
        # Walk the source tree and parse every .atf file, keeping a
        # tally of successes and failures.
        for dirpath, _, files in os.walk(self.source):
            for file in files:
                if file.endswith('.atf'):
                    try:
                        path = os.path.join(dirpath, file)
                        print("Parsing file", path, "... ", end="")
                        content = codecs.open(path,
                                              encoding='utf-8-sig').read()
                        self.texts.append(AtfFile(content, self.atftype))
                        self.successes += 1
                        print("OK")
                    except (SyntaxError, IndexError, AttributeError,
                            UnicodeDecodeError) as e:
                        self.texts.append(None)
                        self.failures += 1
                        print("Failed with message: '{}'".format(e))

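# A minimal usage sketch, assuming this initializer belongs to pyoracc's
# Corpus class (the class name and the 'data/atf' path are assumptions,
# not part of the snippet above):
#
#     corpus = Corpus(atftype='cdli', source='data/atf')
#     print('{} parsed OK, {} failed'.format(corpus.successes,
#                                            corpus.failures))
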
def convert(atf_text):
    """
    Create a TEI representation of a file-like object containing ATF.
    """
    # Parse the ATF input string.
    atf = AtfFile(atf_text, 'cdli', False)
    if verbose:
        print("Parsed {} -- {}".format(atf.text.code, atf.text.description))
    # Construct a TEI Document to hold the converted text.
    doc = tei.Document()
    doc.language = atf.text.language
    doc.header = tei.Header()
    doc.header.title = atf.text.description
    doc.header.cdli_code = atf.text.code
    # Traverse the parse tree, recording lines under labels.
    translations = {}
    objects = [item for item in atf.text.children
               if isinstance(item, OraccObject)]
    edition = tei.Edition()
    doc.parts.append(edition)
    for item in objects:
        part = tei.TextPart(item.objecttype)
        edition.append(part)
        for section in item.children:
            if isinstance(section, OraccObject):
                try:
                    name = section.name
                except AttributeError:
                    name = section.objecttype
                div = tei.TextPart(name)
                part.append(div)
            elif isinstance(section, Translation):
                # Handle in another pass.
                continue
            else:
                print('Skipping unknown section type',
                      type(section).__name__)
                continue
            for obj in section.children:
                if isinstance(obj, Line):
                    text = normalize_transliteration(obj.words)
                    line = tei.Line(obj.label, text)
                    div.append(line)
                    # Older pyoracc parses interlinear translations
                    # as notes. Remember them for serialization below.
                    for note in obj.notes:
                        if note.content.startswith('tr.'):
                            lang, text = note.content.split(':', maxsplit=1)
                            _, lang = lang.split('.')
                            # tr.ts is used for normalization, so mark
                            # this with the primary object's language.
                            if lang == 'ts':
                                lang = atf.text.language
                            tr_line = Line(obj.label)
                            tr_line.words = text.strip().split()
                            if lang not in translations:
                                translations[lang] = []
                            translations[lang].append(tr_line)
                elif isinstance(obj, (State, Ruling)):
                    text = str(obj).strip()
                    # Strip the initial '$' off the ATF representation.
                    text = text[1:].strip()
                    div.append(tei.Note(text))
                else:
                    print('Skipping unknown section child type',
                          type(obj).__name__)
                    continue
    # Add accumulated interlinear translations to the document.
    for lang, tr_lines in translations.items():
        translation = tei.Translation()
        translation.language = lang
        doc.parts.append(translation)
        for tr_line in tr_lines:
            text = ' '.join(tr_line.words)
            line = tei.Line(tr_line.label, text)
            translation.append(line)
    # Traverse the tree again, recording any parallel translation sections.
    # pyoracc only supports these for English.
    translation = tei.Translation()
    translation.language = 'eng'
    translation_empty = True
    for item in objects:
        part = tei.TextPart(item.objecttype)
        translation.append(part)
        for section in item.children:
            # Skip anything which is not a translation for this pass.
            if not isinstance(section, Translation):
                continue
            for surface in section.children:
                if isinstance(surface, OraccObject):
                    div = tei.TextPart(surface.objecttype)
                    part.append(div)
                    for obj in surface.children:
                        if isinstance(obj, Line):
                            text = ' '.join(obj.words)
                            line = tei.Line(obj.label, text)
                            div.append(line)
                            translation_empty = False
                        else:
                            print('Skipping unknown section child type',
                                  type(obj).__name__)
                            continue
    if not translation_empty:
        doc.parts.append(translation)
    return doc

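# A usage sketch for this variant ('sample.atf' is a hypothetical path;
# serializing the returned tei.Document is left to the surrounding package,
# since its output API is not shown here):
def demo_convert(path='sample.atf'):
    import io
    # Read the ATF source, stripping any BOM, and build the TEI document.
    with io.open(path, encoding='utf-8-sig') as f:
        doc = convert(f.read())
    return doc
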
def convert(atf_text):
    """
    Create a TEI representation of a file-like object containing ATF.
    """
    atf = AtfFile(atf_text, 'cdli', False)
    if verbose:
        print("Parsed {} -- {}".format(atf.text.code, atf.text.description))
    result = '''<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
  <teiHeader>
    <fileDesc>
      <titleStmt>
        <title>{description}</title>
      </titleStmt>
      <publicationStmt>
        <p>Converted from ATF by atf2tei.</p>
      </publicationStmt>
      <sourceDesc>
        <idno type="CDLI">{code}</idno>
      </sourceDesc>
    </fileDesc>
    <encodingDesc>
      <refsDecl n="CTS">
        <cRefPattern n="line"
                     matchPattern="(\\w+)\\.(\\w+)\\.(\\w+)"
                     replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']/tei:div[@n='$2']/tei:l[@n='$3'])">
          <p>This pointer pattern extracts a specific line.</p>
        </cRefPattern>
        <cRefPattern n="surface"
                     matchPattern="(\\w+)\\.(\\w+)"
                     replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']/tei:div[@n='$2'])">
          <p>This pointer pattern extracts an inscribed surface.</p>
        </cRefPattern>
        <cRefPattern n="object"
                     matchPattern="(\\w+)"
                     replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1'])">
          <p>This pointer pattern extracts a specific artefact,
             usually a tablet.</p>
        </cRefPattern>
      </refsDecl>
    </encodingDesc>
  </teiHeader>
'''.format(description=escape(atf.text.description),
           code=escape(atf.text.code))
    urn = f'urn:cts:cdli:test.{atf.text.code}'
    result += f'<text n="{urn}"'
    if atf.text.language:
        result += f' xml:lang="{atf.text.language}"'
    result += '>\n'
    result += '<body>\n'
    translations = {}
    objects = [item for item in atf.text.children
               if isinstance(item, OraccObject)]
    result += '  <div type="edition">\n'
    for item in objects:
        result += f'    <div type="textpart" n="{item.objecttype}">\n'
        for section in item.children:
            if isinstance(section, OraccObject):
                result += '      <div type="textpart"' \
                          f' n="{section.objecttype}">\n'
            elif isinstance(section, Translation):
                # Handle in another pass.
                continue
            else:
                result += '      <div>\n' \
                          f'<!-- {type(section).__name__}: {section} -->\n'
            for line in section.children:
                if isinstance(line, Line):
                    text = normalize_transliteration(line.words)
                    result += f'        <l n="{line.label}">{text}</l>\n'
                    # Older pyoracc parses interlinear translations
                    # as notes. Remember them for serialization below.
                    for note in line.notes:
                        if note.content.startswith('tr.'):
                            lang, text = note.content.split(':', maxsplit=1)
                            _, lang = lang.split('.')
                            # tr.ts is used for normalization, so mark
                            # this with the primary object's language.
                            if lang == 'ts':
                                lang = atf.text.language
                            tr_line = Line(line.label)
                            tr_line.words = text.strip().split()
                            if lang not in translations:
                                translations[lang] = []
                            translations[lang].append(tr_line)
                else:
                    result += '        ' \
                              f'<!-- {type(line).__name__}: {line} -->\n'
            result += '      </div>\n'
        result += '    </div>\n'
    result += '  </div>\n'
    result += '  <div type="translation">\n'
    for item in objects:
        result += f'    <div type="textpart" n="{item.objecttype}">\n'
        for section in item.children:
            # Skip anything which is not a translation for this pass.
            if not isinstance(section, Translation):
                continue
            for surface in section.children:
                result += '      <div type="textpart"' \
                          f' n="{surface.objecttype}">\n'
                if isinstance(surface, OraccObject):
                    for line in surface.children:
                        if isinstance(line, Line):
                            text = ' '.join(line.words)
                            result += '        ' \
                                      f'<l n="{line.label}">{text}</l>\n'
                        else:
                            result += '        <!-- ' \
                                      f'{type(line).__name__}: {line} -->\n'
                result += '      </div>\n'
        result += '    </div>\n'
    result += '  </div>\n'
    for lang, translation in translations.items():
        result += f'  <div type="translation" xml:lang="{lang}">\n'
        for line in translation:
            text = ' '.join(line.words)
            result += f'    <l n="{line.label}">{escape(text)}</l>\n'
        result += '  </div>\n'
    result += '''</body>
</text>
</TEI>'''
    return result

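# A usage sketch for this string-building variant (the file names are
# hypothetical): since convert() returns a complete TEI XML document as a
# string, the result can be written straight to disk.
def demo_convert_to_file(src='belsunu.atf', dst='belsunu.xml'):
    import io
    # Read the ATF source, stripping any BOM, then write the converted XML.
    with io.open(src, encoding='utf-8-sig') as f:
        xml = convert(f.read())
    with io.open(dst, 'w', encoding='utf-8') as out:
        out.write(xml)
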
def consider_composite(name, code):
    """
    Parses ATF and checks that the CDLI ID matches.
    """
    afile = AtfFile(sample_file(name))
    assert afile.text.texts[0].code == code

def parse(any_str):
    """
    Parse an input string, which may be a single line or the contents
    of a whole file.
    """
    return AtfFile(any_str)

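# A minimal usage sketch for parse(). The three-line fragment below is a
# hypothetical example in the style of the project's samples, not one of
# them; stricter parser configurations may need a fuller document.
def demo_parse():
    atf = (u'&X001001 = JCS 48, 089\n'
           u'#project: cdli\n'
           u'#atf: lang akk\n')
    parsed = parse(atf)
    print(parsed.text.code, '--', parsed.text.description)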