def make_token(self, data, **kwargs):
    """Build a Reading token from deserialized *data* (schema post-load hook).

    Reconstructs the Reading from its serialized fields and then restores
    the enclosure type and erasure state that were stored alongside it.
    """
    reading = Reading.of(
        data["name_parts"],
        data["sub_index"],
        data["modifiers"],
        data["flags"],
        data["sign"],
    )
    reading = reading.set_enclosure_type(frozenset(data["enclosure_type"]))
    return reading.set_erasure(data["erasure"])
def test_text_line_of_iterable(code: str, language: Language) -> None:
    """TextLine.of_iterable propagates languages and enclosure types to tokens.

    Words after a language shift get that shift's language; tokens inside a
    broken-away span are marked with EnclosureType.BROKEN_AWAY.
    """
    source_tokens = [
        Word.of([Reading.of_name("first")]),
        LanguageShift.of(code),
        Word.of([Reading.of_name("second")]),
        LanguageShift.of("%sb"),
        LoneDeterminative.of([Determinative.of([Reading.of_name("third")])]),
        Word.of([BrokenAway.open(), Reading.of_name("fourth")]),
        UnknownNumberOfSigns.of(),
        BrokenAway.close(),
    ]
    broken = frozenset({EnclosureType.BROKEN_AWAY})
    expected = (
        Word.of([Reading.of_name("first")], DEFAULT_LANGUAGE),
        LanguageShift.of(code),
        Word.of([Reading.of_name("second")], language),
        LanguageShift.of("%sb"),
        LoneDeterminative.of(
            [Determinative.of([Reading.of_name("third")])], Language.AKKADIAN
        ),
        Word.of(
            [
                BrokenAway.open(),
                Reading.of(
                    (ValueToken(broken, ErasureState.NONE, "fourth"),)
                ).set_enclosure_type(broken),
            ],
            DEFAULT_LANGUAGE,
        ),
        UnknownNumberOfSigns(broken, ErasureState.NONE),
        BrokenAway.close().set_enclosure_type(broken),
    )

    line = TextLine.of_iterable(LINE_NUMBER, source_tokens)

    assert line.line_number == LINE_NUMBER
    assert line.content == expected
    key_parts = "⁚".join(part.get_key() for part in expected)
    assert line.key == f"TextLine⁞{line.atf}⟨{key_parts}⟩"
    assert line.atf == f"1. first {code} second %sb {{third}} [fourth ...]"
def expected_transliteration(language: Language) -> Sequence[Token]:
    """Return the token sequence expected after transliteration in *language*.

    The final word lies inside a broken-away span, so it and the following
    tokens carry EnclosureType.BROKEN_AWAY.
    """
    broken = frozenset({EnclosureType.BROKEN_AWAY})
    return (
        Word.of([Reading.of_name("bu")], language),
        LanguageShift.of("%es"),
        Word.of(
            [
                BrokenAway.open(),
                Reading.of(
                    (ValueToken(broken, ErasureState.NONE, "kur"),)
                ).set_enclosure_type(broken),
            ],
            Language.EMESAL,
        ),
        UnknownNumberOfSigns(broken, ErasureState.NONE),
        BrokenAway.close().set_enclosure_type(broken),
    )
def test_variant():
    """Variant joins its alternative tokens with '/' and is not lemmatizable."""
    first = Reading.of([ValueToken.of("sa"), BrokenAway.open(), ValueToken.of("l")])
    second = Divider.of(":")
    variant = Variant.of(first, second)

    expected_value = "sa[l/:"
    assert variant.value == expected_value
    assert variant.clean_value == "sal/:"
    assert variant.tokens == (first, second)
    assert variant.parts == variant.tokens
    key_parts = "⁚".join(token.get_key() for token in variant.tokens)
    assert variant.get_key() == f"Variant⁝{expected_value}⟨{key_parts}⟩"
    assert variant.lemmatizable is False

    serialized = {
        "type": "Variant",
        "tokens": OneOfTokenSchema().dump([first, second], many=True),
    }
    assert_token_serialization(variant, serialized)
def test_reading(
    name_parts,
    sub_index,
    modifiers,
    flags,
    sign,
    expected_value,
    expected_clean_value,
    expected_name,
) -> None:
    """Reading exposes its name parts, flags and sign and serializes consistently."""
    reading = Reading.of(name_parts, sub_index, modifiers, flags, sign)

    # The key includes the sign as a final part only when a sign is present.
    expected_parts = (*name_parts, sign) if sign else name_parts
    assert reading.value == expected_value
    assert reading.clean_value == expected_clean_value
    key_parts = "⁚".join(part.get_key() for part in expected_parts)
    assert reading.get_key() == f"Reading⁝{expected_value}⟨{key_parts}⟩"
    assert reading.name_parts == name_parts
    assert reading.name == expected_name
    assert reading.modifiers == tuple(modifiers)
    assert reading.flags == tuple(flags)
    assert reading.lemmatizable is False
    assert reading.sign == sign

    serialized = {
        "type": "Reading",
        "name": expected_name,
        "nameParts": OneOfTokenSchema().dump(name_parts, many=True),
        "subIndex": sub_index,
        "modifiers": modifiers,
        "flags": [flag.value for flag in flags],
        "sign": sign and OneOfTokenSchema().dump(sign),
    }
    assert_token_serialization(reading, serialized)
def ebl_atf_text_line__reading(self, name, sub_index, modifiers, flags, sign=None):
    """Build a Reading token from a parsed reading rule.

    NOTE(review): presumably a parse-tree transformer callback — *name* has
    ``.children``, suggesting a Lark ``Tree``; confirm against the grammar.
    """
    name_parts = tuple(name.children)
    return Reading.of(name_parts, sub_index, modifiers, flags, sign)
class LemmatizedFragmentFactory(TransliteratedFragmentFactory):
    """Factory for a fragment whose transliteration carries lemmatization.

    Mirrors the ``text`` of ``TransliteratedFragmentFactory`` but attaches
    ``unique_lemma`` word IDs to the lemmatizable words.
    """

    # Overrides the parent's ``text`` with a lemmatized variant of the same lines.
    text = Text(
        (
            # Line 1': signs, a logogram with surrogate readings, columns,
            # a divider/reading variant and a flagged number — no lemmata here.
            TextLine.of_iterable(
                LineNumber(1, True),
                (
                    Word.of([UnidentifiedSign.of()]),
                    Word.of(
                        [
                            Logogram.of_name(
                                "BA",
                                surrogate=[
                                    Reading.of_name("ku"),
                                    Joiner.hyphen(),
                                    Reading.of_name("u", 4),
                                ],
                            )
                        ]
                    ),
                    Column.of(),
                    Tabulation.of(),
                    Word.of(
                        [
                            BrokenAway.open(),
                            UnknownNumberOfSigns.of(),
                            Joiner.hyphen(),
                            Reading.of_name("ku"),
                            BrokenAway.close(),
                            Joiner.hyphen(),
                            Reading.of_name("nu"),
                            Joiner.hyphen(),
                            Reading.of_name("ši"),
                        ]
                    ),
                    Variant.of(Divider.of(":"), Reading.of_name("ku")),
                    Word.of(
                        [
                            BrokenAway.open(),
                            UnknownNumberOfSigns.of(),
                            BrokenAway.close(),
                        ]
                    ),
                    Column.of(2),
                    Divider.of(":", ("@v",), (Flag.DAMAGE,)),
                    CommentaryProtocol.of("!qt"),
                    Word.of([Number.of_name("10", flags=[Flag.DAMAGE])]),
                ),
            ),
            # Line 2': words lemmatized as ginâ I, ana I and ūsu I.
            TextLine.of_iterable(
                LineNumber(2, True),
                (
                    Word.of([BrokenAway.open(), UnknownNumberOfSigns.of()]),
                    Word.of(
                        [Logogram.of_name("GI", 6)],
                        unique_lemma=(WordId("ginâ I"),),
                    ),
                    Word.of(
                        [Reading.of_name("ana")], unique_lemma=(WordId("ana I"),)
                    ),
                    Word.of(
                        [
                            Reading.of_name("u₄"),
                            Joiner.hyphen(),
                            Reading.of_name("š[u"),
                        ],
                        unique_lemma=(WordId("ūsu I"),),
                    ),
                    Word.of([UnknownNumberOfSigns.of(), BrokenAway.close()]),
                ),
            ),
            # Line 3': partially broken words lemmatized as kīdu I, u I, bamātu I.
            TextLine.of_iterable(
                LineNumber(3, True),
                (
                    Word.of([BrokenAway.open(), UnknownNumberOfSigns.of()]),
                    Word.of(
                        unique_lemma=(WordId("kīdu I"),),
                        parts=[
                            Reading.of(
                                (
                                    ValueToken.of("k"),
                                    BrokenAway.close(),
                                    ValueToken.of("i"),
                                )
                            ),
                            Joiner.hyphen(),
                            Reading.of_name("du"),
                        ],
                    ),
                    Word.of(
                        unique_lemma=(WordId("u I"),), parts=[Reading.of_name("u")]
                    ),
                    Word.of(
                        unique_lemma=(WordId("bamātu I"),),
                        parts=[
                            Reading.of_name("ba"),
                            Joiner.hyphen(),
                            Reading.of_name("ma"),
                            Joiner.hyphen(),
                            Reading.of(
                                (
                                    ValueToken.of("t"),
                                    BrokenAway.open(),
                                    ValueToken.of("i"),
                                )
                            ),
                        ],
                    ),
                    Word.of([UnknownNumberOfSigns.of(), BrokenAway.close()]),
                ),
            ),
            # Line 6': unclear sign plus words lemmatized as mu I and tamalāku I
            # (the latter containing an in-word newline).
            TextLine.of_iterable(
                LineNumber(6, True),
                (
                    Word.of(
                        [
                            BrokenAway.open(),
                            UnknownNumberOfSigns.of(),
                            BrokenAway.close(),
                        ]
                    ),
                    Word.of([UnclearSign.of([Flag.DAMAGE])]),
                    Word.of(
                        unique_lemma=(WordId("mu I"),), parts=[Reading.of_name("mu")]
                    ),
                    Word.of(
                        unique_lemma=(WordId("tamalāku I"),),
                        parts=[
                            Reading.of_name("ta"),
                            Joiner.hyphen(),
                            Reading.of_name("ma"),
                            InWordNewline.of(),
                            Joiner.hyphen(),
                            Reading.of_name("tu", 2),
                        ],
                    ),
                ),
            ),
            # Line 7': a reading/grapheme variant followed by a normalized
            # Akkadian word lemmatized as "normalized I".
            TextLine.of_iterable(
                LineNumber(7, True),
                (
                    Word.of(
                        [
                            Variant.of(
                                Reading.of_name("šu"), CompoundGrapheme.of(["BI×IS"])
                            )
                        ]
                    ),
                    LanguageShift.normalized_akkadian(),
                    AkkadianWord.of(
                        [ValueToken.of("kur")],
                        unique_lemma=(WordId("normalized I"),),
                    ),
                ),
            ),
            # One of each dollar/at line kind, exercising non-text lines.
            StateDollarLine(
                atf.Qualification.AT_LEAST,
                1,
                ScopeContainer(atf.Surface.OBVERSE, ""),
                atf.State.MISSING,
                None,
            ),
            ImageDollarLine("1", None, "numbered diagram of triangle"),
            RulingDollarLine(atf.Ruling.SINGLE),
            LooseDollarLine("this is a loose line"),
            SealDollarLine(1),
            SealAtLine(1),
            HeadingAtLine(1),
            ColumnAtLine(ColumnLabel([atf.Status.COLLATION], 1)),
            SurfaceAtLine(
                SurfaceLabel([atf.Status.COLLATION], atf.Surface.SURFACE, "stone wig")
            ),
            ObjectAtLine(
                ObjectLabel([atf.Status.COLLATION], atf.Object.OBJECT, "stone wig")
            ),
            DiscourseAtLine(atf.Discourse.DATE),
            DivisionAtLine("paragraph", 5),
            CompositeAtLine(atf.Composite.DIV, "part", 1),
            # A note line mixing plain, emphasized and transliterated parts.
            NoteLine(
                (
                    StringPart("a note "),
                    EmphasisPart("italic"),
                    LanguagePart.of_transliteration(
                        Language.AKKADIAN, (Word.of([Reading.of_name("bu")]),)
                    ),
                )
            ),
            # Parallel lines referencing a composition, a text and a fragment.
            ParallelComposition(False, "my name", LineNumber(1)),
            ParallelText(
                True,
                TextId(CorpusGenre.LITERATURE, 1, 1),
                ChapterName(Stage.OLD_BABYLONIAN, "", "my name"),
                LineNumber(1),
                False,
            ),
            ParallelFragment(
                False, MuseumNumber.of("K.1"), True, Labels(), LineNumber(1), False
            ),
        )
    )
class TransliteratedFragmentFactory(FragmentFactory):
    """Factory for a fragment with a full (unlemmatized) transliteration.

    Supplies a multi-line ``text`` covering most token kinds, the matching
    ``signs`` string, folios, a transliteration record and a line-to-vector
    encoding.
    """

    # Transliterated text: five text lines followed by one of each
    # dollar/at/note/parallel line kind.
    text = Text(
        (
            # Line 1': signs, a logogram with surrogate readings, columns,
            # a divider/reading variant and a flagged number.
            TextLine.of_iterable(
                LineNumber(1, True),
                (
                    Word.of([UnidentifiedSign.of()]),
                    Word.of(
                        [
                            Logogram.of_name(
                                "BA",
                                surrogate=[
                                    Reading.of_name("ku"),
                                    Joiner.hyphen(),
                                    Reading.of_name("u", 4),
                                ],
                            )
                        ]
                    ),
                    Column.of(),
                    Tabulation.of(),
                    Word.of(
                        [
                            BrokenAway.open(),
                            UnknownNumberOfSigns.of(),
                            Joiner.hyphen(),
                            Reading.of_name("ku"),
                            BrokenAway.close(),
                            Joiner.hyphen(),
                            Reading.of_name("nu"),
                            Joiner.hyphen(),
                            Reading.of_name("ši"),
                        ]
                    ),
                    Variant.of(Divider.of(":"), Reading.of_name("ku")),
                    Word.of(
                        [
                            BrokenAway.open(),
                            UnknownNumberOfSigns.of(),
                            BrokenAway.close(),
                        ]
                    ),
                    Column.of(2),
                    Divider.of(":", ("@v",), (Flag.DAMAGE,)),
                    CommentaryProtocol.of("!qt"),
                    Word.of([Number.of_name("10", flags=[Flag.DAMAGE])]),
                ),
            ),
            # Line 2': broken-away spans around plain words.
            TextLine.of_iterable(
                LineNumber(2, True),
                (
                    Word.of(
                        [
                            BrokenAway.open(),
                            UnknownNumberOfSigns.of(),
                            BrokenAway.close(),
                        ]
                    ),
                    Word.of([Logogram.of_name("GI", 6)]),
                    Word.of([Reading.of_name("ana")]),
                    Word.of(
                        [
                            Reading.of_name("u", 4),
                            Joiner.hyphen(),
                            Reading.of(
                                (
                                    ValueToken.of("š"),
                                    BrokenAway.open(),
                                    ValueToken.of("u"),
                                )
                            ),
                        ]
                    ),
                    Word.of([UnknownNumberOfSigns.of(), BrokenAway.close()]),
                ),
            ),
            # Line 3': readings split by broken-away brackets mid-sign.
            TextLine.of_iterable(
                LineNumber(3, True),
                (
                    Word.of([BrokenAway.open(), UnknownNumberOfSigns.of()]),
                    Word.of(
                        [
                            Reading.of(
                                (
                                    ValueToken.of("k"),
                                    BrokenAway.close(),
                                    ValueToken.of("i"),
                                )
                            ),
                            Joiner.hyphen(),
                            Reading.of_name("du"),
                        ]
                    ),
                    Word.of([Reading.of_name("u")]),
                    Word.of(
                        [
                            Reading.of_name("ba"),
                            Joiner.hyphen(),
                            Reading.of_name("ma"),
                            Joiner.hyphen(),
                            Reading.of(
                                (
                                    ValueToken.of("t"),
                                    BrokenAway.open(),
                                    ValueToken.of("i"),
                                )
                            ),
                        ]
                    ),
                    Word.of([UnknownNumberOfSigns.of(), BrokenAway.close()]),
                ),
            ),
            # Line 6': an unclear sign and a word containing an in-word newline.
            TextLine.of_iterable(
                LineNumber(6, True),
                (
                    Word.of(
                        [
                            BrokenAway.open(),
                            UnknownNumberOfSigns.of(),
                            BrokenAway.close(),
                        ]
                    ),
                    Word.of([UnclearSign.of([Flag.DAMAGE])]),
                    Word.of([Reading.of_name("mu")]),
                    Word.of(
                        [
                            Reading.of_name("ta"),
                            Joiner.hyphen(),
                            Reading.of_name("ma"),
                            InWordNewline.of(),
                            Joiner.hyphen(),
                            Reading.of_name("tu", 2),
                        ]
                    ),
                ),
            ),
            # Line 7': reading/grapheme variant plus a normalized Akkadian word.
            TextLine.of_iterable(
                LineNumber(7, True),
                (
                    Word.of(
                        [
                            Variant.of(
                                Reading.of_name("šu"), CompoundGrapheme.of(["BI×IS"])
                            )
                        ]
                    ),
                    LanguageShift.normalized_akkadian(),
                    AkkadianWord.of([ValueToken.of("kur")]),
                ),
            ),
            # One of each dollar/at line kind, exercising non-text lines.
            StateDollarLine(
                atf.Qualification.AT_LEAST,
                1,
                ScopeContainer(atf.Surface.OBVERSE, ""),
                atf.State.MISSING,
                None,
            ),
            ImageDollarLine("1", None, "numbered diagram of triangle"),
            RulingDollarLine(atf.Ruling.SINGLE),
            LooseDollarLine("this is a loose line"),
            SealDollarLine(1),
            SealAtLine(1),
            HeadingAtLine(1),
            ColumnAtLine(ColumnLabel([atf.Status.COLLATION], 1)),
            SurfaceAtLine(
                SurfaceLabel([atf.Status.COLLATION], atf.Surface.SURFACE, "stone wig")
            ),
            ObjectAtLine(
                ObjectLabel([atf.Status.COLLATION], atf.Object.OBJECT, "stone wig")
            ),
            DiscourseAtLine(atf.Discourse.DATE),
            DivisionAtLine("paragraph", 5),
            CompositeAtLine(atf.Composite.DIV, "part", 1),
            # A note line mixing plain, emphasized and transliterated parts.
            NoteLine(
                (
                    StringPart("a note "),
                    EmphasisPart("italic"),
                    LanguagePart.of_transliteration(
                        Language.AKKADIAN, (Word.of([Reading.of_name("bu")]),)
                    ),
                )
            ),
            # Parallel lines referencing a composition, a text and a fragment.
            ParallelComposition(False, "my name", LineNumber(1)),
            ParallelText(
                True,
                TextId(CorpusGenre.LITERATURE, 1, 1),
                ChapterName(Stage.OLD_BABYLONIAN, "", "my name"),
                LineNumber(1),
                False,
            ),
            ParallelFragment(
                False, MuseumNumber.of("K.1"), True, Labels(), LineNumber(1), False
            ),
        )
    )
    # Sign readings corresponding line-by-line to the five text lines above.
    signs = (
        "X BA KU ABZ075 ABZ207a\\u002F207b\\u0020X ABZ377n1/KU ABZ377n1 ABZ411\n"
        "MI DIŠ UD ŠU\n"
        "KI DU ABZ411 BA MA TI\n"
        "X MU TA MA UD\n"
        "ŠU/|BI×IS|"
    )
    # Folio references attached to the fragment.
    folios = Folios((Folio("WGL", "3"), Folio("XXX", "3")))
    # A single transliteration record entry.
    record = Record((RecordEntry("test", RecordType.TRANSLITERATION),))
    # Line-to-vector encoding: five text lines followed by a single ruling.
    line_to_vec = (
        (
            LineToVecEncoding.TEXT_LINE,
            LineToVecEncoding.TEXT_LINE,
            LineToVecEncoding.TEXT_LINE,
            LineToVecEncoding.TEXT_LINE,
            LineToVecEncoding.TEXT_LINE,
            LineToVecEncoding.SINGLE_RULING,
        ),
    )
from ebl.corpus.domain.manuscript import Manuscript from ebl.transliteration.domain.text_id import TextId from ebl.transliteration.domain.line_number import LineNumber from ebl.transliteration.domain.sign_tokens import Reading from ebl.transliteration.domain.text_line import TextLine from ebl.transliteration.domain.tokens import ValueToken from ebl.transliteration.domain.word_tokens import Word from ebl.transliteration.domain.genre import Genre from ebl.transliteration.domain.labels import SurfaceLabel from ebl.transliteration.domain.atf import Surface from ebl.corpus.web.extant_lines import ExtantLinesSchema from ebl.transliteration.application.line_number_schemas import OneOfLineNumberSchema LABELS = (SurfaceLabel.from_label(Surface.OBVERSE), ) MANUSCRIPT_TEXT_1 = TextLine(LineNumber(2), (Word.of([Reading.of([ValueToken.of("ku")])]), )) def test_extant_lines_schema() -> None: manuscript = Manuscript(1) manuscript_line = ManuscriptLine(1, LABELS, MANUSCRIPT_TEXT_1) variant = LineVariant(tuple(), manuscripts=(manuscript_line, )) text_line = Line(LineNumber(1), (variant, )) chapter = Chapter(TextId(Genre.LITERATURE, 0, 0), manuscripts=(manuscript, ), lines=(text_line, )) assert ExtantLinesSchema().dump(chapter) == { "extantLines": { str(manuscript.siglum): { " ".join(label.to_value() for label in manuscript_line.labels): [{
) ], ), ( "1. [... r]u?-u₂-qu na-a[n-...]\n2. ši-[ku-...-ku]-nu\n3. [...]-ku", [ TextLine.of_iterable( LineNumber(1), ( Word.of( (BrokenAway.open(), UnknownNumberOfSigns.of())), Word.of(parts=[ Reading.of( ( ValueToken.of("r"), BrokenAway.close(), ValueToken.of("u"), ), flags=[atf.Flag.UNCERTAIN], ), Joiner.hyphen(), Reading.of_name("u", 2), Joiner.hyphen(), Reading.of_name("qu"), ]), Word.of(parts=[ Reading.of_name("na"), Joiner.hyphen(), Reading.of(( ValueToken.of("a"), BrokenAway.open(), ValueToken.of("n"),
fragment_repository.create(ANOTHER_LEMMATIZED_FRAGMENT) assert lemma_repository.query_lemmas("ana", False) == [["ana II"], ["ana I"]] @pytest.mark.parametrize( "parts,expected", [ ( [ Reading.of( [ValueToken.of("ana")], flags=[ Flag.DAMAGE, Flag.COLLATION, Flag.UNCERTAIN, Flag.CORRECTION, ], ) ], [["ana I"]], ), ( [ BrokenAway.open(), PerhapsBrokenAway.open(), Reading.of([ValueToken.of("ana")]), PerhapsBrokenAway.close(), BrokenAway.close(), ],
) ]), ), ( Text.of_iterable([ControlLine("$", " double ruling")]), Text.of_iterable([RulingDollarLine(atf.Ruling.DOUBLE)]), Text.of_iterable([RulingDollarLine(atf.Ruling.DOUBLE)]), ), ( Text.of_iterable([ TextLine.of_iterable( LineNumber(1), [ Word.of([ Variant.of( Reading.of([ValueToken.of("k[ur")]), Reading.of([ValueToken.of("r[a")]), ) ]), BrokenAway.close(), ], ) ]), Text.of_iterable([ TextLine.of_iterable( LineNumber(1), [ Word.of([ Variant.of( Reading.of([ ValueToken.of("k"),
{ "type": "TextLine", "prefix": "1.", "lineNumber": OneOfLineNumberSchema().dump(LineNumber(1)), "content": OneOfTokenSchema().dump( [ DocumentOrientedGloss.open(), Word.of( [ Reading.of( ( ValueToken( frozenset( {EnclosureType.DOCUMENT_ORIENTED_GLOSS} ), ErasureState.NONE, "bu", ), ) ).set_enclosure_type( frozenset({EnclosureType.DOCUMENT_ORIENTED_GLOSS}) ) ] ).set_enclosure_type( frozenset({EnclosureType.DOCUMENT_ORIENTED_GLOSS}) ), LoneDeterminative.of( [ Determinative.of( [ Reading.of(
frozenset({ EnclosureType.DOCUMENT_ORIENTED_GLOSS })), )).set_enclosure_type( frozenset({EnclosureType.DOCUMENT_ORIENTED_GLOSS})), DocumentOrientedGloss.close().set_enclosure_type( frozenset({EnclosureType.DOCUMENT_ORIENTED_GLOSS})), ), ), (( "ku[r ...]", ( Word.of((Reading.of(( ValueToken.of("ku"), BrokenAway.open(), ValueToken( frozenset({EnclosureType.BROKEN_AWAY}), ErasureState.NONE, "r", ), )), )), Word.of(( UnknownNumberOfSigns.of().set_enclosure_type( frozenset({EnclosureType.BROKEN_AWAY})), BrokenAway.close().set_enclosure_type( frozenset({EnclosureType.BROKEN_AWAY})), )).set_enclosure_type(frozenset({EnclosureType.BROKEN_AWAY})), ), )), (( "{k[ur}-X]", (Word.of((
(LINE, LINE, LINE), ( Line( LineNumber(1), (LineVariant( RECONSTRUCTION, NOTE, (ManuscriptLine( MANUSCRIPT_ID, LABELS, TextLine( LineNumber(1), (Word.of( [ Reading.of([ ValueToken.of("ku"), BrokenAway.close(), ]), Joiner.hyphen(), Reading.of_name("nu"), Joiner.hyphen(), Reading.of_name("si"), ], unique_lemma=(WordId("word"), ), alignment=0, ), ), ), ), ), ), ), IS_SECOND_LINE_OF_PARALLELISM, IS_BEGINNING_OF_SECTION, ),
("x:ti", Word.of([UnclearSign.of(), Joiner.colon(), Reading.of_name("ti")])), ( "ti-X", Word.of([ Reading.of_name("ti"), Joiner.hyphen(), UnidentifiedSign.of() ]), ), ( "r]u-u₂-qu", Word.of([ Reading.of((ValueToken.of("r"), BrokenAway.close(), ValueToken.of("u"))), Joiner.hyphen(), Reading.of_name("u", 2), Joiner.hyphen(), Reading.of_name("qu"), ]), ), ( "ru?-u₂-qu", Word.of([ Reading.of_name("ru", flags=[atf.Flag.UNCERTAIN]), Joiner.hyphen(), Reading.of_name("u", 2), Joiner.hyphen(), Reading.of_name("qu"), ]),