def assert_token_serialization(token: Token, serialized: dict) -> None:
    """Assert that the token dumps to ``serialized`` plus the shared token
    properties and that loading that dict yields an equal token."""
    with_common_properties = {
        "erasure": ErasureState.NONE.name,
        "value": token.value,
        "cleanValue": token.clean_value,
        "enclosureType": [type_.name for type_ in token.enclosure_type],
        **serialized,
    }
    assert OneOfTokenSchema().dump(token) == with_common_properties
    assert OneOfTokenSchema().load(with_common_properties) == token
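# Illustrative usage sketch (added; not one of the original tests): callers pass only
# the token-specific fields and the helper above fills in the shared "value",
# "cleanValue", "enclosureType", and "erasure" entries. The expected dict mirrors the
# shape used in test_reading below; the concrete reading and the sub-index default
# are assumptions.
def test_reading_serialization_sketch() -> None:
    reading = Reading.of_name("kur")
    assert_token_serialization(
        reading,
        {
            "type": "Reading",
            "name": "kur",
            "nameParts": OneOfTokenSchema().dump(reading.name_parts, many=True),
            "subIndex": 1,  # assumed default sub-index of Reading.of_name
            "modifiers": [],
            "flags": [],
            "sign": None,
        },
    )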
class ControlLineSchema(LineBaseSchema):
    prefix = fields.String(required=True)
    content = fields.Function(
        lambda obj: [OneOfTokenSchema().dump(ValueToken.of(obj.content))],
        lambda value: OneOfTokenSchema().load(value, many=True),
        required=True,
    )

    @post_load
    def make_line(self, data, **kwargs) -> ControlLine:
        return ControlLine(
            data["prefix"], " ".join(token.value for token in data["content"])
        )
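# A minimal round-trip sketch for ControlLineSchema (added; hedged): it assumes
# ControlLine takes positional prefix and content strings, as make_line above
# implies, and that the schema accepts the keys it adds on dump.
#
#     line = ControlLine("#", "note")
#     data = ControlLineSchema().dump(line)
#     # data["content"] is a single serialized ValueToken wrapping "note"
#     assert ControlLineSchema().load(data) == line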
def test_greek_word(
    word: GreekWord,
    expected: str,
    language: Language,
    lemmatizable: bool,
    alignable: bool,
) -> None:
    assert word.value == expected
    assert word.clean_value == expected.translate(str.maketrans("", "", "[]()<>#?!"))
    assert word.language == language
    assert word.normalized is False
    assert word.lemmatizable is lemmatizable
    assert word.alignable is alignable

    serialized = {
        "type": "GreekWord",
        "parts": OneOfTokenSchema().dump(word.parts, many=True),
        "uniqueLemma": [],
        "alignment": None,
        "variant": None,
        "lemmatizable": lemmatizable,
        "alignable": alignable,
        "normalized": word.normalized,
        "language": language.name,
    }
    assert_token_serialization(word, serialized)
class DollarLineSchema(LineBaseSchema):
    prefix = fields.Constant("$")
    content = fields.Function(
        lambda obj: [OneOfTokenSchema().dump(ValueToken.of(f" {obj.display_value}"))],
        lambda value: value,
    )
class ParallelLineSchema(LineBaseSchema):
    prefix = fields.Constant("//")
    content = fields.Function(
        lambda obj: [OneOfTokenSchema().dump(ValueToken.of(obj.display_value))],
        lambda value: value,
    )
    display_value = fields.String(data_key="displayValue")
    has_cf = fields.Boolean(data_key="hasCf", required=True)
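# Note (added): in DollarLineSchema and ParallelLineSchema above, the load side of
# "content" simply passes the incoming value through, so the field is effectively a
# dump-only display representation built from display_value.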
class NoteLineSchema(LineBaseSchema):
    prefix = fields.Constant("#note: ")
    content = fields.Function(
        lambda obj: OneOfTokenSchema().dump(
            [ValueToken.of(part.value) for part in obj.parts], many=True
        ),
        lambda value: value,
    )
    parts = fields.List(fields.Nested(OneOfNoteLinePartSchema), required=True)

    @post_load
    def make_line(self, data, **kwargs) -> NoteLine:
        return NoteLine(data["parts"])
def test_logogram(
    name_parts,
    sub_index,
    modifiers,
    flags,
    sign,
    surrogate,
    expected_value,
    expected_clean_value,
    expected_name,
) -> None:
    logogram = Logogram.of(name_parts, sub_index, modifiers, flags, sign, surrogate)

    expected_parts = (*name_parts, sign) if sign else name_parts
    assert logogram.value == expected_value
    assert logogram.clean_value == expected_clean_value
    assert (
        logogram.get_key()
        == f"Logogram⁝{expected_value}⟨{'⁚'.join(token.get_key() for token in expected_parts)}⟩"
    )
    assert logogram.name_parts == name_parts
    assert logogram.name == expected_name
    assert logogram.modifiers == tuple(modifiers)
    assert logogram.flags == tuple(flags)
    assert logogram.lemmatizable is False
    assert logogram.sign == sign
    assert logogram.surrogate == tuple(surrogate)

    serialized = {
        "type": "Logogram",
        "name": expected_name,
        "nameParts": OneOfTokenSchema().dump(name_parts, many=True),
        "subIndex": sub_index,
        "modifiers": modifiers,
        "flags": [flag.value for flag in flags],
        "surrogate": OneOfTokenSchema().dump(surrogate, many=True),
        "sign": sign and OneOfTokenSchema().dump(sign),
    }
    assert_token_serialization(logogram, serialized)
def test_number(
    name_parts,
    modifiers,
    flags,
    sign,
    expected_value,
    expected_clean_value,
    expected_name,
) -> None:
    number = Number.of(name_parts, modifiers, flags, sign)

    expected_sub_index = 1
    expected_parts = (*name_parts, sign) if sign else name_parts
    assert number.value == expected_value
    assert number.clean_value == expected_clean_value
    assert (
        number.get_key()
        == f"Number⁝{expected_value}⟨{'⁚'.join(token.get_key() for token in expected_parts)}⟩"
    )
    assert number.name_parts == name_parts
    assert number.name == expected_name
    assert number.sub_index == expected_sub_index
    assert number.modifiers == tuple(modifiers)
    assert number.flags == tuple(flags)
    assert number.lemmatizable is False
    assert number.sign == sign

    serialized = {
        "type": "Number",
        "name": expected_name,
        "nameParts": OneOfTokenSchema().dump(name_parts, many=True),
        "modifiers": modifiers,
        "subIndex": expected_sub_index,
        "flags": [flag.value for flag in flags],
        "sign": sign and OneOfTokenSchema().dump(sign),
    }
    assert_token_serialization(number, serialized)
def test_reading(
    name_parts,
    sub_index,
    modifiers,
    flags,
    sign,
    expected_value,
    expected_clean_value,
    expected_name,
) -> None:
    reading = Reading.of(name_parts, sub_index, modifiers, flags, sign)

    expected_parts = (*name_parts, sign) if sign else name_parts
    assert reading.value == expected_value
    assert reading.clean_value == expected_clean_value
    assert (
        reading.get_key()
        == f"Reading⁝{expected_value}⟨{'⁚'.join(token.get_key() for token in expected_parts)}⟩"
    )
    assert reading.name_parts == name_parts
    assert reading.name == expected_name
    assert reading.modifiers == tuple(modifiers)
    assert reading.flags == tuple(flags)
    assert reading.lemmatizable is False
    assert reading.sign == sign

    serialized = {
        "type": "Reading",
        "name": expected_name,
        "nameParts": OneOfTokenSchema().dump(name_parts, many=True),
        "subIndex": sub_index,
        "modifiers": modifiers,
        "flags": [flag.value for flag in flags],
        "sign": sign and OneOfTokenSchema().dump(sign),
    }
    assert_token_serialization(reading, serialized)
class TranslationLineSchema(LineBaseSchema):
    prefix = fields.String()
    content = fields.Function(
        lambda obj: [
            OneOfTokenSchema().dump(
                ValueToken.of("".join(part.value for part in obj.parts))
            )
        ],
        lambda value: value,
    )
    parts = fields.List(fields.Nested(OneOfNoteLinePartSchema), required=True)
    language = fields.String(required=True)
    extent = fields.Nested(ExtentSchema, required=True, allow_none=True)

    @post_load
    def make_line(self, data, **kwargs) -> TranslationLine:
        return TranslationLine(data["parts"], data["language"], data["extent"])
def test_linguistic_gloss():
    parts = [Reading.of_name("kur"), Joiner.hyphen(), Reading.of_name("kur")]
    gloss = LinguisticGloss.of(parts)

    expected_value = f"{{{{{''.join(part.value for part in parts)}}}}}"
    expected_clean_value = f"{{{{{''.join(part.clean_value for part in parts)}}}}}"
    expected_parts = f"⟨{'⁚'.join(part.get_key() for part in parts)}⟩"

    assert gloss.value == expected_value
    assert gloss.clean_value == expected_clean_value
    assert gloss.get_key() == f"LinguisticGloss⁝{expected_value}{expected_parts}"
    assert gloss.parts == tuple(parts)
    assert gloss.lemmatizable is False

    serialized = {
        "type": "LinguisticGloss",
        "parts": OneOfTokenSchema().dump(parts, many=True),
    }
    assert_token_serialization(gloss, serialized)
def test_determinative():
    parts = [Reading.of_name("kur"), Joiner.hyphen(), Reading.of_name("kur")]
    determinative = Determinative.of(parts)

    expected_value = f"{{{''.join(part.value for part in parts)}}}"
    expected_clean_value = f"{{{''.join(part.clean_value for part in parts)}}}"
    expected_parts = f"⟨{'⁚'.join(part.get_key() for part in parts)}⟩"

    assert determinative.value == expected_value
    assert determinative.clean_value == expected_clean_value
    assert determinative.get_key() == f"Determinative⁝{expected_value}{expected_parts}"
    assert determinative.parts == tuple(parts)
    assert determinative.lemmatizable is False

    serialized = {
        "type": "Determinative",
        "parts": OneOfTokenSchema().dump(parts, many=True),
    }
    assert_token_serialization(determinative, serialized)
def test_akkadian_word(word: AkkadianWord, expected: str, lemmatizable: bool) -> None:
    assert word.value == expected
    assert word.clean_value == expected.translate(str.maketrans("", "", "[]()<>#?!"))
    assert word.lemmatizable is lemmatizable
    assert word.alignable is lemmatizable

    serialized = {
        "type": "AkkadianWord",
        "parts": OneOfTokenSchema().dump(word.parts, many=True),
        "modifiers": [modifier.value for modifier in word.modifiers],
        "uniqueLemma": [],
        "alignment": None,
        "variant": None,
        "lemmatizable": lemmatizable,
        "alignable": lemmatizable,
        "normalized": True,
        "language": "AKKADIAN",
    }
    assert_token_serialization(word, serialized)
def test_variant():
    reading = Reading.of([ValueToken.of("sa"), BrokenAway.open(), ValueToken.of("l")])
    divider = Divider.of(":")
    variant = Variant.of(reading, divider)

    expected_value = "sa[l/:"
    assert variant.value == expected_value
    assert variant.clean_value == "sal/:"
    assert variant.tokens == (reading, divider)
    assert variant.parts == variant.tokens
    assert (
        variant.get_key()
        == f"Variant⁝{expected_value}⟨{'⁚'.join(token.get_key() for token in variant.tokens)}⟩"
    )
    assert variant.lemmatizable is False

    serialized = {
        "type": "Variant",
        "tokens": OneOfTokenSchema().dump([reading, divider], many=True),
    }
    assert_token_serialization(variant, serialized)
def test_lone_determinative(language):
    value = "{mu}"
    parts = [Determinative.of([Reading.of_name("mu")])]
    lone_determinative = LoneDeterminative.of(parts, language)

    equal = LoneDeterminative.of(parts, language)
    other_language = LoneDeterminative.of(parts, Language.UNKNOWN)
    other_parts = LoneDeterminative.of(
        [Determinative.of([Reading.of_name("bu")])], language
    )

    assert lone_determinative.value == value
    assert lone_determinative.lemmatizable is False
    assert lone_determinative.language == language
    assert lone_determinative.normalized is False
    assert lone_determinative.unique_lemma == tuple()

    serialized = {
        "type": "LoneDeterminative",
        "uniqueLemma": [],
        "normalized": False,
        "language": lone_determinative.language.name,
        "lemmatizable": lone_determinative.lemmatizable,
        "alignable": lone_determinative.lemmatizable,
        "erasure": ErasureState.NONE.name,
        "parts": OneOfTokenSchema().dump(parts, many=True),
    }
    assert_token_serialization(lone_determinative, serialized)

    assert lone_determinative == equal
    assert hash(lone_determinative) == hash(equal)

    for not_equal in [other_language, other_parts]:
        assert lone_determinative != not_equal
        assert hash(lone_determinative) != hash(not_equal)

    assert lone_determinative != ValueToken.of(value)
def create(include_documents: bool) -> Tuple[Chapter, dict]:
    references = (ReferenceFactory.build(with_document=include_documents),)
    manuscript = ManuscriptFactory.build(references=references)
    first_manuscript_line = ManuscriptLineFactory.build(manuscript_id=manuscript.id)
    second_manuscript_line = ManuscriptLineFactory.build(manuscript_id=manuscript.id)
    line = LineFactory.build(
        variants=(
            LineVariantFactory.build(
                manuscripts=(first_manuscript_line, second_manuscript_line),
                parallel_lines=(ParallelComposition(False, "name", LineNumber(1)),),
            ),
        )
    )
    chapter = ChapterFactory.build(
        manuscripts=(manuscript,),
        uncertain_fragments=(MuseumNumber.of("K.1"),),
        lines=(line,),
    )
    dto = {
        "textId": {
            "genre": chapter.text_id.genre.value,
            "category": chapter.text_id.category,
            "index": chapter.text_id.index,
        },
        "classification": chapter.classification.value,
        "stage": chapter.stage.value,
        "version": chapter.version,
        "name": chapter.name,
        "order": chapter.order,
        "signs": list(chapter.signs),
        "record": RecordSchema().dump(chapter.record),
        "parserVersion": chapter.parser_version,
        "manuscripts": ApiManuscriptSchema(
            exclude=[] if include_documents else ["joins"]
        ).dump(chapter.manuscripts, many=True),
        "uncertainFragments": [str(number) for number in chapter.uncertain_fragments],
        "lines": [
            {
                "number": line.number.label,
                "variants": [
                    {
                        "reconstruction": "".join(
                            [
                                convert_to_atf(None, variant.reconstruction),
                                f"\n{variant.note.atf}" if variant.note else "",
                                *[
                                    f"\n{parallel_line.atf}"
                                    for parallel_line in variant.parallel_lines
                                ],
                            ]
                        ),
                        "reconstructionTokens": OneOfTokenSchema().dump(
                            variant.reconstruction, many=True
                        ),
                        "intertext": "".join(part.value for part in variant.intertext),
                        "manuscripts": [
                            {
                                "manuscriptId": manuscript_line.manuscript_id,
                                "labels": [
                                    label.to_value()
                                    for label in manuscript_line.labels
                                ],
                                "number": manuscript_line.line.line_number.atf[:-1]
                                if isinstance(manuscript_line.line, TextLine)
                                else "",
                                "atf": "\n".join(
                                    [
                                        manuscript_line.line.atf[
                                            len(manuscript_line.line.line_number.atf)
                                            + 1 :
                                        ]
                                        if isinstance(manuscript_line.line, TextLine)
                                        else "",
                                        *[
                                            line.atf
                                            for line in manuscript_line.paratext
                                        ],
                                    ]
                                ).strip(),
                                "atfTokens": (
                                    OneOfLineSchema().dump(manuscript_line.line)[
                                        "content"
                                    ]
                                ),
                                "omittedWords": list(manuscript_line.omitted_words),
                            }
                            for manuscript_line in variant.manuscripts
                        ],
                    }
                    for variant in line.variants
                ],
                "isSecondLineOfParallelism": line.is_second_line_of_parallelism,
                "isBeginningOfSection": line.is_beginning_of_section,
                "translation": "\n".join(
                    translation.atf for translation in line.translation
                ),
            }
            for line in chapter.lines
        ],
    }
    return chapter, dto
    ParallelFragment,
    ParallelText,
)
from ebl.transliteration.domain.sign_tokens import Reading
from ebl.transliteration.domain.text_line import TextLine
from ebl.transliteration.domain.tokens import ErasureState, ValueToken
from ebl.transliteration.domain.word_tokens import LoneDeterminative, Word
from ebl.transliteration.domain.translation_line import Extent, TranslationLine
from ebl.transliteration.domain.atf import Surface

LINES = [
    (
        CompositeAtLine(atf.Composite.MILESTONE, "o"),
        {
            "prefix": "@",
            "content": [OneOfTokenSchema().dump(ValueToken.of("m=locator o"))],
            "type": "CompositeAtLine",
            "composite": "MILESTONE",
            "text": "o",
            "number": None,
            "displayValue": "m=locator o",
        },
    ),
    (
        CompositeAtLine(atf.Composite.MILESTONE, "o", 1),
        {
            "prefix": "@",
            "content": [OneOfTokenSchema().dump(ValueToken.of("m=locator o 1"))],
            "type": "CompositeAtLine",
            "composite": "MILESTONE",
            "text": "o",
def to_dict(chapter: Chapter, include_documents=False):
    return {
        "textId": {
            "genre": chapter.text_id.genre.value,
            "category": chapter.text_id.category,
            "index": chapter.text_id.index,
        },
        "classification": chapter.classification.value,
        "stage": chapter.stage.value,
        "version": chapter.version,
        "name": chapter.name,
        "order": chapter.order,
        "signs": list(chapter.signs),
        "record": RecordSchema().dump(chapter.record),
        "parserVersion": chapter.parser_version,
        "manuscripts": [
            {
                "id": manuscript.id,
                "siglumDisambiguator": manuscript.siglum_disambiguator,
                "museumNumber": (
                    (str(manuscript.museum_number) if manuscript.museum_number else "")
                    if include_documents
                    else manuscript.museum_number
                    and MuseumNumberSchema().dump(manuscript.museum_number)
                ),
                "accession": manuscript.accession,
                "periodModifier": manuscript.period_modifier.value,
                "period": manuscript.period.long_name,
                "provenance": manuscript.provenance.long_name,
                "type": manuscript.type.long_name,
                "notes": manuscript.notes,
                "colophon": TextSchema().dump(manuscript.colophon),
                "unplacedLines": TextSchema().dump(manuscript.unplaced_lines),
                "references": (
                    ApiReferenceSchema if include_documents else ReferenceSchema
                )().dump(manuscript.references, many=True),
            }
            for manuscript in chapter.manuscripts
        ],
        "uncertainFragments": MuseumNumberSchema().dump(UNCERTAIN_FRAGMENTS, many=True),
        "lines": [
            {
                "number": OneOfLineNumberSchema().dump(line.number),
                "variants": [
                    {
                        "reconstruction": OneOfTokenSchema().dump(
                            variant.reconstruction, many=True
                        ),
                        "note": variant.note and NoteLineSchema().dump(variant.note),
                        "parallelLines": ParallelLineSchema().dump(
                            variant.parallel_lines, many=True
                        ),
                        "intertext": OneOfNoteLinePartSchema().dump(
                            variant.intertext, many=True
                        ),
                        "manuscripts": [
                            {
                                "manuscriptId": manuscript_line.manuscript_id,
                                "labels": [
                                    label.to_value()
                                    for label in manuscript_line.labels
                                ],
                                "line": OneOfLineSchema().dump(manuscript_line.line),
                                "paratext": OneOfLineSchema().dump(
                                    manuscript_line.paratext, many=True
                                ),
                                "omittedWords": list(manuscript_line.omitted_words),
                            }
                            for manuscript_line in variant.manuscripts
                        ],
                    }
                    for variant in line.variants
                ],
                "isSecondLineOfParallelism": line.is_second_line_of_parallelism,
                "isBeginningOfSection": line.is_beginning_of_section,
                "translation": TranslationLineSchema().dump(
                    line.translation, many=True
                ),
            }
            for line in chapter.lines
        ],
    }
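# Hedged usage sketch (added): in the serialization tests this expected dict is
# typically compared against the corpus schema output, along the lines of
#
#     chapter = ChapterFactory.build()
#     assert ChapterSchema().dump(chapter) == to_dict(chapter)
#
# ChapterSchema is an assumed name here; the actual schema under test is not shown
# in this excerpt.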