def assert_token_serialization(token: Token, serialized: dict) -> None:
    with_common_properties = {
        "erasure": ErasureState.NONE.name,
        "value": token.value,
        "cleanValue": token.clean_value,
        "enclosureType": [type_.name for type_ in token.enclosure_type],
        **serialized,
    }
    assert OneOfTokenSchema().dump(token) == with_common_properties
    assert OneOfTokenSchema().load(with_common_properties) == token
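# A minimal usage sketch of the helper above (not from the original tests):
# ValueToken.of comes from ebl.transliteration.domain.tokens, and the sketch
# assumes a plain ValueToken serializes with only the polymorphic "type"
# discriminator on top of the common properties merged in by the helper.
def test_value_token_serialization() -> None:
    token = ValueToken.of("kur")
    assert_token_serialization(token, {"type": "ValueToken"})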
Example 2
class ControlLineSchema(LineBaseSchema):
    prefix = fields.String(required=True)
    content = fields.Function(
        lambda obj: [OneOfTokenSchema().dump(ValueToken.of(obj.content))],
        lambda value: OneOfTokenSchema().load(value, many=True),
        required=True,
    )

    @post_load
    def make_line(self, data, **kwargs) -> ControlLine:
        return ControlLine(data["prefix"],
                           " ".join(token.value for token in data["content"]))
def test_greek_word(
    word: GreekWord,
    expected: str,
    language: Language,
    lemmatizable: bool,
    alignable: bool,
) -> None:
    assert word.value == expected
    assert word.clean_value == expected.translate(str.maketrans("", "", "[]()<>#?!"))
    assert word.language == language
    assert word.normalized is False
    assert word.lemmatizable is lemmatizable
    assert word.alignable is alignable

    serialized = {
        "type": "GreekWord",
        "parts": OneOfTokenSchema().dump(word.parts, many=True),
        "uniqueLemma": [],
        "alignment": None,
        "variant": None,
        "lemmatizable": lemmatizable,
        "alignable": alignable,
        "normalized": word.normalized,
        "language": language.name,
    }
    assert_token_serialization(word, serialized)
class DollarLineSchema(LineBaseSchema):
    prefix = fields.Constant("$")
    content = fields.Function(
        lambda obj: [OneOfTokenSchema().dump(ValueToken.of(f" {obj.display_value}"))],
        lambda value: value,
    )
Example 5
class ParallelLineSchema(LineBaseSchema):
    prefix = fields.Constant("//")
    content = fields.Function(
        lambda obj: [OneOfTokenSchema().dump(ValueToken.of(obj.display_value))],
        lambda value: value,
    )
    display_value = fields.String(data_key="displayValue")
    has_cf = fields.Boolean(data_key="hasCf", required=True)
Example 6
class NoteLineSchema(LineBaseSchema):
    prefix = fields.Constant("#note: ")
    content = fields.Function(
        lambda obj: OneOfTokenSchema().dump(
            [ValueToken.of(part.value) for part in obj.parts], many=True),
        lambda value: value,
    )
    parts = fields.List(fields.Nested(OneOfNoteLinePartSchema), required=True)

    @post_load
    def make_line(self, data, **kwargs) -> NoteLine:
        return NoteLine(data["parts"])
def test_logogram(
    name_parts,
    sub_index,
    modifiers,
    flags,
    sign,
    surrogate,
    expected_value,
    expected_clean_value,
    expected_name,
) -> None:
    logogram = Logogram.of(name_parts, sub_index, modifiers, flags, sign,
                           surrogate)

    expected_parts = (*name_parts, sign) if sign else name_parts
    assert logogram.value == expected_value
    assert logogram.clean_value == expected_clean_value
    assert (
        logogram.get_key() ==
        f"Logogram⁝{expected_value}⟨{'⁚'.join(token.get_key() for token in expected_parts)}⟩"
    )
    assert logogram.name_parts == name_parts
    assert logogram.name == expected_name
    assert logogram.modifiers == tuple(modifiers)
    assert logogram.flags == tuple(flags)
    assert logogram.lemmatizable is False
    assert logogram.sign == sign
    assert logogram.surrogate == tuple(surrogate)

    serialized = {
        "type": "Logogram",
        "name": expected_name,
        "nameParts": OneOfTokenSchema().dump(name_parts, many=True),
        "subIndex": sub_index,
        "modifiers": modifiers,
        "flags": [flag.value for flag in flags],
        "surrogate": OneOfTokenSchema().dump(surrogate, many=True),
        "sign": sign and OneOfTokenSchema().dump(sign),
    }
    assert_token_serialization(logogram, serialized)
def test_number(
    name_parts,
    modifiers,
    flags,
    sign,
    expected_value,
    expected_clean_value,
    expected_name,
) -> None:
    number = Number.of(name_parts, modifiers, flags, sign)

    expected_sub_index = 1
    expected_parts = (*name_parts, sign) if sign else name_parts
    assert number.value == expected_value
    assert number.clean_value == expected_clean_value
    assert (
        number.get_key() ==
        f"Number⁝{expected_value}⟨{'⁚'.join(token.get_key() for token in expected_parts)}⟩"
    )
    assert number.name_parts == name_parts
    assert number.name == expected_name
    assert number.sub_index == expected_sub_index
    assert number.modifiers == tuple(modifiers)
    assert number.flags == tuple(flags)
    assert number.lemmatizable is False
    assert number.sign == sign

    serialized = {
        "type": "Number",
        "name": expected_name,
        "nameParts": OneOfTokenSchema().dump(name_parts, many=True),
        "modifiers": modifiers,
        "subIndex": expected_sub_index,
        "flags": [flag.value for flag in flags],
        "sign": sign and OneOfTokenSchema().dump(sign),
    }
    assert_token_serialization(number, serialized)
def test_reading(
    name_parts,
    sub_index,
    modifiers,
    flags,
    sign,
    expected_value,
    expected_clean_value,
    expected_name,
) -> None:
    reading = Reading.of(name_parts, sub_index, modifiers, flags, sign)

    expected_parts = (*name_parts, sign) if sign else name_parts
    assert reading.value == expected_value
    assert reading.clean_value == expected_clean_value
    assert (
        reading.get_key() ==
        f"Reading⁝{expected_value}⟨{'⁚'.join(token.get_key() for token in expected_parts)}⟩"
    )
    assert reading.name_parts == name_parts
    assert reading.name == expected_name
    assert reading.modifiers == tuple(modifiers)
    assert reading.flags == tuple(flags)
    assert reading.lemmatizable is False
    assert reading.sign == sign

    serialized = {
        "type": "Reading",
        "name": expected_name,
        "nameParts": OneOfTokenSchema().dump(name_parts, many=True),
        "subIndex": sub_index,
        "modifiers": modifiers,
        "flags": [flag.value for flag in flags],
        "sign": sign and OneOfTokenSchema().dump(sign),
    }
    assert_token_serialization(reading, serialized)
Example 10
class TranslationLineSchema(LineBaseSchema):
    prefix = fields.String()
    content = fields.Function(
        lambda obj: [
            OneOfTokenSchema().dump(
                ValueToken.of("".join(part.value for part in obj.parts)))
        ],
        lambda value: value,
    )

    parts = fields.List(fields.Nested(OneOfNoteLinePartSchema), required=True)
    language = fields.String(required=True)
    extent = fields.Nested(ExtentSchema, required=True, allow_none=True)

    @post_load
    def make_line(self, data, **kwargs) -> TranslationLine:
        return TranslationLine(data["parts"], data["language"], data["extent"])
def test_linguistic_gloss():
    parts = [Reading.of_name("kur"), Joiner.hyphen(), Reading.of_name("kur")]
    gloss = LinguisticGloss.of(parts)

    expected_value = f"{{{{{''.join(part.value for part in parts)}}}}}"
    expected_clean_value = f"{{{{{''.join(part.clean_value for part in parts)}}}}}"
    expected_parts = f"⟨{'⁚'.join(part.get_key() for part in parts)}⟩"
    assert gloss.value == expected_value
    assert gloss.clean_value == expected_clean_value
    assert gloss.get_key() == f"LinguisticGloss⁝{expected_value}{expected_parts}"
    assert gloss.parts == tuple(parts)
    assert gloss.lemmatizable is False

    serialized = {
        "type": "LinguisticGloss",
        "parts": OneOfTokenSchema().dump(parts, many=True),
    }
    assert_token_serialization(gloss, serialized)
def test_determinative():
    parts = [Reading.of_name("kur"), Joiner.hyphen(), Reading.of_name("kur")]
    determinative = Determinative.of(parts)

    expected_value = f"{{{''.join(part.value for part in parts)}}}"
    expected_clean_value = f"{{{''.join(part.clean_value for part in parts)}}}"
    expected_parts = f"⟨{'⁚'.join(part.get_key() for part in parts)}⟩"
    assert determinative.value == expected_value
    assert determinative.clean_value == expected_clean_value
    assert determinative.get_key() == f"Determinative⁝{expected_value}{expected_parts}"
    assert determinative.parts == tuple(parts)
    assert determinative.lemmatizable is False

    serialized = {
        "type": "Determinative",
        "parts": OneOfTokenSchema().dump(parts, many=True),
    }
    assert_token_serialization(determinative, serialized)
def test_akkadian_word(word: AkkadianWord, expected: str,
                       lemmatizable: bool) -> None:
    assert word.value == expected
    assert word.clean_value == expected.translate(
        str.maketrans("", "", "[]()<>#?!"))
    assert word.lemmatizable is lemmatizable
    assert word.alignable is lemmatizable

    serialized = {
        "type": "AkkadianWord",
        "parts": OneOfTokenSchema().dump(word.parts, many=True),
        "modifiers": [modifier.value for modifier in word.modifiers],
        "uniqueLemma": [],
        "alignment": None,
        "variant": None,
        "lemmatizable": lemmatizable,
        "alignable": lemmatizable,
        "normalized": True,
        "language": "AKKADIAN",
    }
    assert_token_serialization(word, serialized)
def test_variant():
    reading = Reading.of([ValueToken.of("sa"), BrokenAway.open(), ValueToken.of("l")])
    divider = Divider.of(":")
    variant = Variant.of(reading, divider)

    expected_value = "sa[l/:"
    assert variant.value == expected_value
    assert variant.clean_value == "sal/:"
    assert variant.tokens == (reading, divider)
    assert variant.parts == variant.tokens
    assert (
        variant.get_key()
        == f"Variant⁝{expected_value}⟨{'⁚'.join(token.get_key() for token in variant.tokens)}⟩"
    )
    assert variant.lemmatizable is False

    serialized = {
        "type": "Variant",
        "tokens": OneOfTokenSchema().dump([reading, divider], many=True),
    }
    assert_token_serialization(variant, serialized)
def test_lone_determinative(language):
    value = "{mu}"
    parts = [Determinative.of([Reading.of_name("mu")])]
    lone_determinative = LoneDeterminative.of(parts, language)

    equal = LoneDeterminative.of(parts, language)
    other_language = LoneDeterminative.of(parts, Language.UNKNOWN)
    other_parts = LoneDeterminative.of(
        [Determinative.of([Reading.of_name("bu")])], language)

    assert lone_determinative.value == value
    assert lone_determinative.lemmatizable is False
    assert lone_determinative.language == language
    assert lone_determinative.normalized is False
    assert lone_determinative.unique_lemma == tuple()

    serialized = {
        "type": "LoneDeterminative",
        "uniqueLemma": [],
        "normalized": False,
        "language": lone_determinative.language.name,
        "lemmatizable": lone_determinative.lemmatizable,
        "alignable": lone_determinative.lemmatizable,
        "erasure": ErasureState.NONE.name,
        "parts": OneOfTokenSchema().dump(parts, many=True),
    }
    assert_token_serialization(lone_determinative, serialized)

    assert lone_determinative == equal
    assert hash(lone_determinative) == hash(equal)

    for not_equal in [other_language, other_parts]:
        assert lone_determinative != not_equal
        assert hash(lone_determinative) != hash(not_equal)

    assert lone_determinative != ValueToken.of(value)
def create(include_documents: bool) -> Tuple[Chapter, dict]:
    references = (ReferenceFactory.build(with_document=include_documents),)
    manuscript = ManuscriptFactory.build(references=references)

    first_manuscript_line = ManuscriptLineFactory.build(manuscript_id=manuscript.id)
    second_manuscript_line = ManuscriptLineFactory.build(manuscript_id=manuscript.id)
    line = LineFactory.build(
        variants=(
            LineVariantFactory.build(
                manuscripts=(first_manuscript_line, second_manuscript_line),
                parallel_lines=(ParallelComposition(False, "name", LineNumber(1)),),
            ),
        )
    )

    chapter = ChapterFactory.build(
        manuscripts=(manuscript,),
        uncertain_fragments=(MuseumNumber.of("K.1"),),
        lines=(line,),
    )
    dto = {
        "textId": {
            "genre": chapter.text_id.genre.value,
            "category": chapter.text_id.category,
            "index": chapter.text_id.index,
        },
        "classification": chapter.classification.value,
        "stage": chapter.stage.value,
        "version": chapter.version,
        "name": chapter.name,
        "order": chapter.order,
        "signs": list(chapter.signs),
        "record": RecordSchema().dump(chapter.record),
        "parserVersion": chapter.parser_version,
        "manuscripts": ApiManuscriptSchema(
            exclude=[] if include_documents else ["joins"]
        ).dump(chapter.manuscripts, many=True),
        "uncertainFragments": [str(number) for number in chapter.uncertain_fragments],
        "lines": [
            {
                "number": line.number.label,
                "variants": [
                    {
                        "reconstruction": "".join(
                            [
                                convert_to_atf(None, variant.reconstruction),
                                f"\n{variant.note.atf}" if variant.note else "",
                                *[
                                    f"\n{parallel_line.atf}"
                                    for parallel_line in variant.parallel_lines
                                ],
                            ]
                        ),
                        "reconstructionTokens": OneOfTokenSchema().dump(
                            variant.reconstruction, many=True
                        ),
                        "intertext": "".join(part.value for part in variant.intertext),
                        "manuscripts": [
                            {
                                "manuscriptId": manuscript_line.manuscript_id,
                                "labels": [
                                    label.to_value() for label in manuscript_line.labels
                                ],
                                "number": manuscript_line.line.line_number.atf[:-1]
                                if isinstance(manuscript_line.line, TextLine)
                                else "",
                                "atf": "\n".join(
                                    [
                                        manuscript_line.line.atf[
                                            len(manuscript_line.line.line_number.atf)
                                            + 1 :
                                        ]
                                        if isinstance(manuscript_line.line, TextLine)
                                        else "",
                                        *[
                                            line.atf
                                            for line in manuscript_line.paratext
                                        ],
                                    ]
                                ).strip(),
                                "atfTokens": (
                                    OneOfLineSchema().dump(manuscript_line.line)[
                                        "content"
                                    ]
                                ),
                                "omittedWords": list(manuscript_line.omitted_words),
                            }
                            for manuscript_line in variant.manuscripts
                        ],
                    }
                    for variant in line.variants
                ],
                "isSecondLineOfParallelism": line.is_second_line_of_parallelism,
                "isBeginningOfSection": line.is_beginning_of_section,
                "translation": "\n".join(
                    translation.atf for translation in line.translation
                ),
            }
            for line in chapter.lines
        ],
    }

    return chapter, dto
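# Hedged usage sketch of the factory above (not from the original source):
# ApiChapterSchema is an assumed name for the serializer whose output the dto
# mirrors; in the original suite the dto may instead be compared against an
# API response. pytest is assumed to be imported.
@pytest.mark.parametrize("include_documents", [True, False])
def test_serialize_chapter(include_documents: bool) -> None:
    chapter, expected = create(include_documents)
    assert ApiChapterSchema().dump(chapter) == expected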
from ebl.transliteration.domain.parallel_line import (
    ParallelFragment,
    ParallelText,
)
from ebl.transliteration.domain.sign_tokens import Reading
from ebl.transliteration.domain.text_line import TextLine
from ebl.transliteration.domain.tokens import ErasureState, ValueToken
from ebl.transliteration.domain.word_tokens import LoneDeterminative, Word
from ebl.transliteration.domain.translation_line import Extent, TranslationLine
from ebl.transliteration.domain.atf import Surface

LINES = [
    (
        CompositeAtLine(atf.Composite.MILESTONE, "o"),
        {
            "prefix": "@",
            "content": [OneOfTokenSchema().dump(ValueToken.of("m=locator o"))],
            "type": "CompositeAtLine",
            "composite": "MILESTONE",
            "text": "o",
            "number": None,
            "displayValue": "m=locator o",
        },
    ),
    (
        CompositeAtLine(atf.Composite.MILESTONE, "o", 1),
        {
            "prefix": "@",
            "content": [OneOfTokenSchema().dump(ValueToken.of("m=locator o 1"))],
            "type": "CompositeAtLine",
            "composite": "MILESTONE",
            "text": "o",
def to_dict(chapter: Chapter, include_documents=False):
    return {
        "textId": {
            "genre": chapter.text_id.genre.value,
            "category": chapter.text_id.category,
            "index": chapter.text_id.index,
        },
        "classification": chapter.classification.value,
        "stage": chapter.stage.value,
        "version": chapter.version,
        "name": chapter.name,
        "order": chapter.order,
        "signs": list(chapter.signs),
        "record": RecordSchema().dump(chapter.record),
        "parserVersion": chapter.parser_version,
        "manuscripts": [
            {
                "id": manuscript.id,
                "siglumDisambiguator": manuscript.siglum_disambiguator,
                "museumNumber": (
                    (str(manuscript.museum_number) if manuscript.museum_number else "")
                    if include_documents
                    else manuscript.museum_number
                    and MuseumNumberSchema().dump(manuscript.museum_number)
                ),
                "accession": manuscript.accession,
                "periodModifier": manuscript.period_modifier.value,
                "period": manuscript.period.long_name,
                "provenance": manuscript.provenance.long_name,
                "type": manuscript.type.long_name,
                "notes": manuscript.notes,
                "colophon": TextSchema().dump(manuscript.colophon),
                "unplacedLines": TextSchema().dump(manuscript.unplaced_lines),
                "references": (
                    ApiReferenceSchema if include_documents else ReferenceSchema
                )().dump(manuscript.references, many=True),
            }
            for manuscript in chapter.manuscripts
        ],
        "uncertainFragments": MuseumNumberSchema().dump(UNCERTAIN_FRAGMENTS, many=True),
        "lines": [
            {
                "number": OneOfLineNumberSchema().dump(line.number),
                "variants": [
                    {
                        "reconstruction": OneOfTokenSchema().dump(
                            variant.reconstruction, many=True
                        ),
                        "note": variant.note and NoteLineSchema().dump(variant.note),
                        "parallelLines": ParallelLineSchema().dump(
                            variant.parallel_lines, many=True
                        ),
                        "intertext": OneOfNoteLinePartSchema().dump(
                            variant.intertext, many=True
                        ),
                        "manuscripts": [
                            {
                                "manuscriptId": manuscript_line.manuscript_id,
                                "labels": [
                                    label.to_value()
                                    for label in manuscript_line.labels
                                ],
                                "line": OneOfLineSchema().dump(manuscript_line.line),
                                "paratext": OneOfLineSchema().dump(
                                    manuscript_line.paratext, many=True
                                ),
                                "omittedWords": list(manuscript_line.omitted_words),
                            }
                            for manuscript_line in variant.manuscripts
                        ],
                    }
                    for variant in line.variants
                ],
                "isSecondLineOfParallelism": line.is_second_line_of_parallelism,
                "isBeginningOfSection": line.is_beginning_of_section,
                "translation": TranslationLineSchema().dump(
                    line.translation, many=True
                ),
            }
            for line in chapter.lines
        ],
    }