def test_line_word():
    """
    Serialize a line holding a single non-ASCII word and verify the result.

    The word is U+2086; the expected serialization is the label followed
    by a period, a tab, and the word itself.
    """
    word = u"\u2086"
    expected = "1.\t" + u"\u2086"

    line = Line("1")
    line.words.append(word)

    assert line.serialize() == expected
def p_translationlabelledline(self, p):
    """translationlabeledline : translationlabel NEWLINE
                              | translationrangelabel NEWLINE
                              | translationlabel CLOSER
                              | translationrangelabel CLOSER
                              """
    # NOTE: the docstring above is the grammar production for this rule
    # (yacc-style parser convention), not documentation -- do not reword it.
    #
    # Whichever alternative matched, the first symbol carries the label;
    # the trailing NEWLINE/CLOSER token is discarded. The result is a new
    # Line built from that label.
    label = p[1]
    p[0] = Line(label)
def convert(atf_text):
    """
    Create a TEI representation of ATF input.

    The input is parsed with ``AtfFile(atf_text, 'cdli', False)`` and the
    resulting tree is walked twice:

    1. Every ``OraccObject`` child of the text becomes a text part of an
       edition. Lines are added with normalized transliteration, and
       ``State``/``Ruling`` items are added as notes. Notes on a line whose
       content starts with ``tr.`` are treated as interlinear translations
       and collected per language.
    2. ``Translation`` sections are gathered into a single English
       translation part, which is attached only if it contains at least
       one line.

    Args:
        atf_text: The ATF source handed to ``AtfFile``.

    Returns:
        A ``tei.Document`` holding the header, the edition, and any
        translations found.
    """

    # Parse the ATF input string.
    atf = AtfFile(atf_text, 'cdli', False)
    if verbose:
        print("Parsed {} -- {}".format(atf.text.code, atf.text.description))

    # Construct a TEI Document to hold the converted text.
    doc = tei.Document()
    doc.language = atf.text.language
    doc.header = tei.Header()
    doc.header.title = atf.text.description
    doc.header.cdli_code = atf.text.code

    # Traverse the parse tree, recording lines under labels.
    translations = {}
    objects = [
        item for item in atf.text.children
        if isinstance(item, OraccObject)
    ]
    edition = tei.Edition()
    doc.parts.append(edition)
    for item in objects:
        part = tei.TextPart(item.objecttype)
        edition.append(part)
        for section in item.children:
            if isinstance(section, OraccObject):
                # Prefer an explicit name, falling back to the object type.
                try:
                    name = section.name
                except AttributeError:
                    name = section.objecttype
                div = tei.TextPart(name)
                part.append(div)
            elif isinstance(section, Translation):
                # Handle in another pass.
                continue
            else:
                print('Skipping unknown section type',
                      type(section).__name__)
                continue
            for obj in section.children:
                if isinstance(obj, Line):
                    text = normalize_transliteration(obj.words)
                    div.append(tei.Line(obj.label, text))
                    # Older pyoracc parses interlinear translations
                    # as notes. Remember them for serialization below.
                    for note in obj.notes:
                        if not note.content.startswith('tr.'):
                            continue
                        lang, text = note.content.split(':', maxsplit=1)
                        _, lang = lang.split('.')
                        # tr.ts is used for normalization, so mark
                        # this with the primary object's language.
                        if lang == 'ts':
                            # This was previously `==`, a comparison whose
                            # result was discarded, so the language was
                            # never actually replaced.
                            lang = atf.text.language
                        tr_line = Line(obj.label)
                        tr_line.words = text.strip().split()
                        translations.setdefault(lang, []).append(tr_line)
                elif isinstance(obj, (State, Ruling)):
                    text = str(obj).strip()
                    # Strip the initial '$' off the ATF representation.
                    text = text[1:].strip()
                    div.append(tei.Note(text))
                else:
                    print('Skipping unknown section child type',
                          type(obj).__name__)
                    continue

    # Add accumulated interlinear translations to the document.
    for lang, tr_lines in translations.items():
        translation = tei.Translation()
        translation.language = lang
        doc.parts.append(translation)
        for tr_line in tr_lines:
            text = ' '.join(tr_line.words)
            translation.append(tei.Line(tr_line.label, text))

    # Traverse the tree again, recording any parallel translation sections.
    # pyoracc only supports these for English.
    translation = tei.Translation()
    translation.language = 'eng'
    translation_empty = True
    for item in objects:
        part = tei.TextPart(item.objecttype)
        translation.append(part)
        for section in item.children:
            # Skip anything which is not a translation for this pass.
            if not isinstance(section, Translation):
                continue
            for surface in section.children:
                if isinstance(surface, OraccObject):
                    div = tei.TextPart(surface.objecttype)
                    part.append(div)
                    for obj in surface.children:
                        if isinstance(obj, Line):
                            text = ' '.join(obj.words)
                            div.append(tei.Line(obj.label, text))
                            translation_empty = False
                        else:
                            # Print the bare type name; the braces used
                            # before built a one-element set, so the
                            # message read "{'Name'}".
                            print('Skipping unknown section child type',
                                  type(obj).__name__)
                            continue

    # Only attach the parallel translation if it actually has content.
    if not translation_empty:
        doc.parts.append(translation)
    return doc
def convert(atf_text):
    """
    Create a TEI XML string from ATF input.

    The input is parsed with ``AtfFile(atf_text, 'cdli', False)``. The
    output consists of a fixed TEI header carrying the text's description
    and CDLI code, followed by:

    * an ``edition`` div with one text part per ``OraccObject`` and one
      ``<l>`` per line of normalized transliteration,
    * a ``translation`` div built from ``Translation`` sections, and
    * one ``translation`` div per language for interlinear translations,
      which older pyoracc exposes as line notes starting with ``tr.``.

    Items of any other type are recorded as XML comments.

    Args:
        atf_text: The ATF source handed to ``AtfFile``.

    Returns:
        The complete TEI document as a string.
    """
    atf = AtfFile(atf_text, 'cdli', False)
    if verbose:
        print("Parsed {} -- {}".format(atf.text.code, atf.text.description))

    # Collect output fragments and join them once at the end instead of
    # repeatedly concatenating onto an ever-growing string.
    out = []
    out.append('''<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc>
  <titleStmt>
    <title>{description}</title>
  </titleStmt>
  <publicationStmt>
    <p>Converted from ATF by atf2tei.</p>
  </publicationStmt>
  <sourceDesc>
    <idno type="CDLI">{code}</idno>
  </sourceDesc>
</fileDesc>
<encodingDesc>
  <refsDecl n="CTS">
    <cRefPattern n="line"
                 matchPattern="(\\w+)\\.(\\w+)\\.(\\w+)"
                 replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'$1\']/tei:div[@n=\'$2\']/tei:l[@n=\'$3\'])">
      <p>This pointer pattern extracts a specific line.</p>
    </cRefPattern>
    <cRefPattern n="surface"
                 matchPattern="(\\w+)\\.(\\w+)"
                 replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'$1\']/tei:div[@n=\'$2\'])">
      <p>This pointer pattern extracts an inscribed surface.</p>
    </cRefPattern>
    <cRefPattern n="object"
                 matchPattern="(\\w+)"
                 replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'$1\'])">
      <p>This pointer pattern extracts a specific artefact, usually a tablet.</p>
    </cRefPattern>
  </refsDecl>
</encodingDesc>
</teiHeader>
'''.format(description=escape(atf.text.description),
           code=escape(atf.text.code)))

    urn = f'urn:cts:cdli:test.{atf.text.code}'
    out.append(f'<text n="{urn}"')
    if atf.text.language:
        out.append(f' xml:lang="{atf.text.language}"')
    out.append('>\n')
    out.append('<body>\n')

    # Interlinear translations found while walking the edition, by language.
    translations = {}
    objects = [item for item in atf.text.children
               if isinstance(item, OraccObject)]

    # First pass: the transliterated edition.
    out.append('  <div type="edition">\n')
    for item in objects:
        out.append(f'  <div type="textpart" n="{item.objecttype}">\n')
        for section in item.children:
            if isinstance(section, OraccObject):
                out.append('    <div type="textpart"'
                           f' n="{section.objecttype}">\n')
            elif isinstance(section, Translation):
                # Handle in another pass.
                continue
            else:
                out.append('    <div>\n'
                           f'<!-- {type(section).__name__}: {section} -->\n')
            for line in section.children:
                if isinstance(line, Line):
                    # NOTE(review): the normalized text is written as-is;
                    # confirm normalize_transliteration never yields
                    # characters that need XML escaping.
                    text = normalize_transliteration(line.words)
                    out.append(f'      <l n="{line.label}">{text}</l>\n')
                    # Older pyoracc parses interlinear translations
                    # as notes. Remember them for serialization below.
                    for note in line.notes:
                        if not note.content.startswith('tr.'):
                            continue
                        lang, text = note.content.split(':', maxsplit=1)
                        _, lang = lang.split('.')
                        # tr.ts is used for normalization, so mark
                        # this with the primary object's language.
                        if lang == 'ts':
                            # This was previously `==`, a comparison whose
                            # result was discarded, so the language was
                            # never actually replaced.
                            lang = atf.text.language
                        tr_line = Line(line.label)
                        tr_line.words = text.strip().split()
                        translations.setdefault(lang, []).append(tr_line)
                else:
                    out.append(
                        f'      <!-- {type(line).__name__}: {line} -->\n')
            out.append('    </div>\n')
        out.append('  </div>\n')
    out.append('  </div>\n')

    # Second pass: parallel translation sections.
    out.append('  <div type="translation">\n')
    for item in objects:
        out.append(f'  <div type="textpart" n="{item.objecttype}">\n')
        for section in item.children:
            # Skip anything which is not a translation for this pass.
            if not isinstance(section, Translation):
                continue
            for surface in section.children:
                out.append('  <div type="textpart" '
                           f'n="{surface.objecttype}">\n')
                if isinstance(surface, OraccObject):
                    for line in surface.children:
                        if isinstance(line, Line):
                            # Escape free text, as is done for the
                            # interlinear translations below, so that
                            # '&' or '<' cannot break the XML.
                            text = escape(' '.join(line.words))
                            out.append('      '
                                       f'<l n="{line.label}">{text}</l>\n')
                        else:
                            out.append('      <!-- '
                                       f'{type(line).__name__}: {line} -->\n')
                out.append('    </div>\n')
        out.append('  </div>\n')
    out.append('  </div>\n')

    # Finally, the interlinear translations gathered in the first pass.
    for lang, translation in translations.items():
        out.append(f'  <div type="translation" xml:lang="{lang}">\n')
        for line in translation:
            text = ' '.join(line.words)
            out.append(f'    <l n="{line.label}">{escape(text)}</l>\n')
        out.append('  </div>\n')

    out.append('''
</body>
</text>
</TEI>''')
    return ''.join(out)
def p_multilingual_sequence(self, p):
    "multilingual_sequence : MULTILINGUAL ID "
    # NOTE: the string above is the grammar production for this rule
    # (yacc-style parser convention), not documentation -- do not reword it.
    #
    # The ID token still carries its leading percent sign; drop that first
    # character and use the remainder as the label of a new Line.
    identifier = p[2]
    p[0] = Line(identifier[1:])
def p_scorelabel(self, p):
    "line_sequence : SCORELABEL ID"
    # NOTE: the string above is the grammar production for this rule
    # (yacc-style parser convention), not documentation -- do not reword it.
    #
    # Start a new Line labelled with the SCORELABEL token and seed its
    # word list with the ID that follows.
    label, first_word = p[1], p[2]
    line = Line(label)
    line.words.append(first_word)
    p[0] = line
def p_linelabel(self, p):
    "line_sequence : LINELABEL ID"
    # NOTE: the string above is the grammar production for this rule
    # (yacc-style parser convention), not documentation -- do not reword it.
    #
    # Start a new Line labelled with the LINELABEL token and seed its
    # word list with the ID that follows.
    label, first_word = p[1], p[2]
    line = Line(label)
    line.words.append(first_word)
    p[0] = line