def test_extent_before_translation() -> None:
    with pytest.raises(ValueError):
        Text.of_iterable(
            [
                TextLine.of_iterable(LineNumber(1), [Word.of([Reading.of_name("bu")])]),
                TextLine.of_iterable(LineNumber(2), [Word.of([Reading.of_name("bu")])]),
                TranslationLine(tuple(), "en", Extent(LineNumber(1))),
            ]
        )
def test_extent_overlapping_languages() -> None:
    Text.of_iterable(
        [
            TextLine.of_iterable(LineNumber(1), [Word.of([Reading.of_name("bu")])]),
            TranslationLine(tuple(), "en", Extent(LineNumber(2))),
            TextLine.of_iterable(LineNumber(2), [Word.of([Reading.of_name("bu")])]),
            TranslationLine(tuple(), "de"),
        ]
    )
def test_extent_overlapping() -> None:
    with pytest.raises(ValueError):
        Text.of_iterable(
            [
                TextLine.of_iterable(LineNumber(1), [Word.of([Reading.of_name("bu")])]),
                TranslationLine(tuple(), extent=Extent(LineNumber(2))),
                TextLine.of_iterable(LineNumber(2), [Word.of([Reading.of_name("bu")])]),
                TranslationLine(tuple()),
            ]
        )
def parse_atf_lark(atf_):
    def parse_line_(line: str, line_number: int):
        try:
            parsed_line = parse_line(line) if line else EmptyLine()
            validate_line(parsed_line)
            return parsed_line, None
        except PARSE_ERRORS as ex:
            return None, create_transliteration_error_data(ex, line, line_number)

    def check_errors(pairs):
        errors = [error for line, error in pairs if error is not None]
        if any(errors):
            raise TransliterationError(errors)

    # Strip trailing empty lines, then parse each remaining line into
    # a (line, error) pair; empty lines in the middle become EmptyLine.
    lines = atf_.split("\n")
    lines = list(dropwhile(lambda line: line == "", reversed(lines)))
    lines.reverse()
    lines = [parse_line_(line, number) for number, line in enumerate(lines)]

    check_errors(lines)
    lines = tuple(pair[0] for pair in lines)

    text = Text(lines, f"{atf.ATF_PARSER_VERSION}")

    if pydash.duplicates(text.labels):
        raise DataError("Duplicate labels.")
    else:
        return text
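
# Usage sketch (not from the repository): illustrative ATF strings showing how
# parse_atf_lark folds per-line failures into a single TransliterationError.
# Assumes the same names are in scope as above.
text = parse_atf_lark("1. ha-am\n\n2. bu")
assert len(text.lines) == 3  # TextLine, EmptyLine, TextLine

try:
    parse_atf_lark("1. ha-am\nnot valid atf")  # assumed-invalid second line
except TransliterationError as error:
    print(error.errors)  # per-line error data (assuming the list is exposed on the exception)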
def test_update_lemmatization() -> None:
    tokens = [list(line) for line in TEXT.lemmatization.tokens]
    tokens[0][0] = LemmatizationToken(tokens[0][0].value, (WordId("nu I"),))
    lemmatization = Lemmatization(tokens)

    expected = Text(
        (
            TextLine(
                LineNumber(1),
                (
                    Word.of(
                        unique_lemma=(WordId("nu I"),),
                        parts=[
                            Reading.of_name("ha"),
                            Joiner.hyphen(),
                            Reading.of_name("am"),
                        ],
                    ),
                ),
            ),
            RulingDollarLine(atf.Ruling.SINGLE),
        ),
        TEXT.parser_version,
    )

    assert TEXT.update_lemmatization(lemmatization) == expected
@attr.s(auto_attribs=True, frozen=True)
class TransliterationUpdate:
    text: Text = Text()
    notes: str = ""
    signs: str = attr.ib(default="")

    @signs.validator
    def _check_signs(self, _attribute, value) -> None:
        questionable_lines = self._get_questionable_lines(value)
        if questionable_lines:
            raise TransliterationError(
                [
                    {"description": "Invalid value", "lineNumber": line_number}
                    for line_number in questionable_lines
                ]
            )

    def _get_questionable_lines(self, value: str) -> List[int]:
        lines = [line.atf for line in self.text.lines]
        text_lines = [line.atf for line in self.text.text_lines]
        signs = value.split("\n")

        def get_line_number(text_line_number: int) -> int:
            # Map an index into the signs (one entry per text line) back to the
            # 1-based line number within the full text.
            line = text_lines[text_line_number]
            return lines.index(line) + 1

        return [
            get_line_number(index) for index, line in enumerate(signs) if "?" in line
        ]
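
# Sketch of the validator's behavior (illustrative values, assuming
# parse_atf_lark and the classes above are in scope). Signs are given one line
# per text line; any line containing "?" is reported against the text.
TransliterationUpdate(text=parse_atf_lark("1. ku"), signs="KU")  # valid, no "?"

try:
    TransliterationUpdate(text=parse_atf_lark("1. ku"), signs="X ?")
except TransliterationError as error:
    print(error.errors)  # [{"description": "Invalid value", "lineNumber": 1}]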
def test_dump_line():
    text = Text(
        (
            TextLine.of_iterable(
                LineNumber(1),
                [
                    Word.of(
                        parts=[
                            Reading.of_name("ha"),
                            Joiner.hyphen(),
                            Reading.of_name("am"),
                        ]
                    )
                ],
            ),
            EmptyLine(),
            ControlLine("#", " comment"),
        ),
        "1.0.0",
    )

    assert TextSchema().dump(text) == {
        "lines": OneOfLineSchema().dump(text.lines, many=True),
        "parser_version": text.parser_version,
        "numberOfLines": 1,
    }
def test_parse_ruling_dollar_line(
    prefix, ruling, expected_ruling, status, expected_status, status_space, parenthesis
):
    ruling = f"{ruling} ruling"
    ruling_with_status = (
        f"{ruling} {status}" if (status and status_space) else f"{ruling}{status}"
    )
    line = f"({ruling_with_status})" if parenthesis else ruling_with_status

    assert (
        parse_atf_lark(f"{prefix}{line}").lines
        == Text.of_iterable([RulingDollarLine(expected_ruling, expected_status)]).lines
    )
def test_statistics(database, fragment_repository):
    database[COLLECTION].insert_many(
        [
            SCHEMA.dump(
                FragmentFactory.build(
                    text=Text(
                        (
                            TextLine(
                                LineNumber(1),
                                (
                                    Word.of([Reading.of_name("first")]),
                                    Word.of([Reading.of_name("line")]),
                                ),
                            ),
                            ControlLine("#", "ignore"),
                            EmptyLine(),
                        )
                    )
                )
            ),
            SCHEMA.dump(
                FragmentFactory.build(
                    text=Text(
                        (
                            ControlLine("#", "ignore"),
                            TextLine(
                                LineNumber(1), (Word.of([Reading.of_name("second")]),)
                            ),
                            TextLine(
                                LineNumber(2), (Word.of([Reading.of_name("third")]),)
                            ),
                            ControlLine("#", "ignore"),
                            TextLine(
                                LineNumber(3), (Word.of([Reading.of_name("fourth")]),)
                            ),
                        )
                    )
                )
            ),
            SCHEMA.dump(FragmentFactory.build(text=Text())),
        ]
    )

    assert fragment_repository.count_transliterated_fragments() == 2
    assert fragment_repository.count_lines() == 4
def text_with_labels():
    return Text.of_iterable(
        [
            TextLine.of_iterable(LineNumber(1), [Word.of([Reading.of_name("bu")])]),
            ColumnAtLine(ColumnLabel.from_int(1)),
            SurfaceAtLine(SurfaceLabel([], atf.Surface.SURFACE, "Stone wig")),
            ObjectAtLine(ObjectLabel([], atf.Object.OBJECT, "Stone wig")),
            TextLine.of_iterable(LineNumber(2), [Word.of([Reading.of_name("bu")])]),
        ]
    )
def test_load_line(lines):
    parser_version = "2.3.1"
    serialized_lines = OneOfLineSchema().dump(lines, many=True)

    assert TextSchema().load(
        {
            "lines": serialized_lines,
            "parser_version": parser_version,
            "numberOfLines": 1,
        }
    ) == Text.of_iterable(lines).set_parser_version(parser_version)
class InterestingFragmentFactory(FragmentFactory):
    collection = "Kuyunjik"  # pyre-ignore[15]
    publication = ""  # pyre-ignore[15]
    joins: Sequence[str] = tuple()
    text = Text()
    notes = ""
    uncurated_references = (
        UncuratedReference("7(0)"),
        UncuratedReference("CAD 51", (34, 56)),
        UncuratedReference("7(1)"),
    )
    references = tuple()
@attr.s(auto_attribs=True, frozen=True)
class Manuscript:
    id: int
    siglum_disambiguator: str = ""
    museum_number: Optional[MuseumNumber] = None
    accession: str = attr.ib(default="")
    period_modifier: PeriodModifier = PeriodModifier.NONE
    period: Period = Period.NEO_ASSYRIAN
    provenance: Provenance = attr.ib(default=Provenance.NINEVEH)
    type: ManuscriptType = ManuscriptType.LIBRARY
    notes: str = ""
    colophon: Text = Text()
    unplaced_lines: Text = Text()
    references: Sequence[Reference] = tuple()
    joins: Joins = Joins()
    is_in_fragmentarium: bool = False

    @accession.validator
    def validate_accession(self, _, value) -> None:
        if self.museum_number and value:
            raise ValueError("Accession given when museum number present.")

    @provenance.validator
    def validate_provenance(self, _, value) -> None:
        if is_invalid_standard_text(value, self.period, self.type):
            raise ValueError(
                "Manuscript must not have period and type when provenance is Standard Text."
            )
        elif is_invalid_non_standard_text(value, self.period, self.type):
            raise ValueError(
                "Manuscript must have period and type unless provenance is Standard Text."
            )

    @property
    def text_lines(self) -> Sequence[TextLine]:
        return [*self.colophon.text_lines, *self.unplaced_lines.text_lines]

    @property
    def siglum(self) -> Siglum:
        return Siglum(
            self.provenance, self.period, self.type, self.siglum_disambiguator
        )
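
# Sketch of the validators and the siglum property (illustrative values,
# assuming the classes above are in scope). The siglum is assembled from
# provenance, period, type, and the disambiguator.
manuscript = Manuscript(id=1, siglum_disambiguator="a")
print(manuscript.siglum)  # Siglum(Provenance.NINEVEH, Period.NEO_ASSYRIAN, ManuscriptType.LIBRARY, "a")

try:
    # A museum number and an accession together are rejected by validate_accession.
    Manuscript(id=2, museum_number=MuseumNumber("K", "1"), accession="A.38")
except ValueError as error:
    print(error)  # Accession given when museum number present.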
def test_query_lemmas_ignores_in_value(
    parts, expected, fragment_repository, lemma_repository
):
    fragment = FragmentFactory.build(
        text=Text.of_iterable(
            [
                TextLine.of_iterable(
                    LineNumber(1), [Word.of(parts, unique_lemma=(WordId("ana I"),))]
                )
            ]
        ),
        signs="DIŠ",
    )
    fragment_repository.create(fragment)

    assert lemma_repository.query_lemmas("ana", False) == expected
class ManuscriptFactory(factory.Factory):
    class Meta:
        model = Manuscript

    id = factory.Sequence(lambda n: n + 1)
    siglum_disambiguator = factory.Faker("word")
    museum_number = factory.Sequence(
        lambda n: MuseumNumber("M", str(n)) if pydash.is_odd(n) else None
    )
    accession = factory.Sequence(lambda n: f"A.{n}" if pydash.is_even(n) else "")
    period_modifier = factory.fuzzy.FuzzyChoice(PeriodModifier)
    period = factory.fuzzy.FuzzyChoice(set(Period) - {Period.NONE})
    provenance = factory.fuzzy.FuzzyChoice(set(Provenance) - {Provenance.STANDARD_TEXT})
    type = factory.fuzzy.FuzzyChoice(set(ManuscriptType) - {ManuscriptType.NONE})
    notes = factory.Faker("sentence")
    colophon = Transliteration.of_iterable(
        [TextLine.of_iterable(LineNumber(1, True), (Word.of([Reading.of_name("ku")]),))]
    )
    unplaced_lines = Transliteration.of_iterable(
        [TextLine.of_iterable(LineNumber(1, True), (Word.of([Reading.of_name("nu")]),))]
    )
    references = factory.List(
        [factory.SubFactory(ReferenceFactory, with_document=True)], TupleFactory
    )
def test_updating_manuscripts(
    corpus,
    text_repository,
    bibliography,
    changelog,
    signs,
    sign_repository,
    user,
    when,
) -> None:
    uncertain_fragments = (MuseumNumber.of("K.1"),)
    updated_chapter = attr.evolve(
        CHAPTER,
        manuscripts=(
            attr.evolve(
                CHAPTER.manuscripts[0],
                colophon=Transliteration.of_iterable(
                    [
                        TextLine.of_iterable(
                            LineNumber(1, True), (Word.of([Reading.of_name("ba")]),)
                        )
                    ]
                ),
                unplaced_lines=Transliteration.of_iterable(
                    [
                        TextLine.of_iterable(
                            LineNumber(1, True), (Word.of([Reading.of_name("ku")]),)
                        )
                    ]
                ),
                notes="Updated manuscript.",
            ),
        ),
        uncertain_fragments=uncertain_fragments,
        signs=("KU ABZ075 ABZ207a\\u002F207b\\u0020X\nBA\nKU",),
    )
    expect_find_and_update_chapter(
        bibliography,
        changelog,
        CHAPTER_WITHOUT_DOCUMENTS,
        updated_chapter,
        signs,
        sign_repository,
        text_repository,
        user,
        when,
    )

    manuscripts = (updated_chapter.manuscripts[0],)
    assert (
        corpus.update_manuscripts(CHAPTER.id_, manuscripts, uncertain_fragments, user)
        == updated_chapter
    )
def test_parse_normalized_akkadian_shift() -> None:
    word = "ha"
    line = f"1. {word} %n {word} %sux {word}"

    expected = Text(
        (
            TextLine.of_iterable(
                LineNumber(1),
                (
                    Word.of((Reading.of_name(word),), DEFAULT_LANGUAGE),
                    LanguageShift.normalized_akkadian(),
                    AkkadianWord.of((ValueToken.of(word),)),
                    LanguageShift.of("%sux"),
                    Word.of((Reading.of_name(word),), Language.SUMERIAN),
                ),
            ),
        )
    )

    assert parse_atf_lark(line).lines == expected.lines
def test_parse_atf_language_shifts(code: str, expected_language: Language) -> None:
    word = "ha-am"
    parts = [Reading.of_name("ha"), Joiner.hyphen(), Reading.of_name("am")]
    line = f"1. {word} {code} {word} %sb {word}"

    expected = Text(
        (
            TextLine.of_iterable(
                LineNumber(1),
                (
                    Word.of(parts, DEFAULT_LANGUAGE),
                    LanguageShift.of(code),
                    Word.of(parts, expected_language),
                    LanguageShift.of("%sb"),
                    Word.of(parts, Language.AKKADIAN),
                ),
            ),
        )
    )

    assert parse_atf_lark(line).lines == expected.lines
def test_get(client, text_repository):
    chapter = ChapterFactory.build(
        lines=tuple(),
        manuscripts=(
            ManuscriptFactory.build(references=tuple()),
            ManuscriptFactory.build(colophon=Text(), references=tuple()),
            ManuscriptFactory.build(references=tuple()),
        ),
    )
    text_repository.create_chapter(chapter)

    result = client.simulate_get(create_chapter_url(chapter, "/colophons"))

    assert result.status == falcon.HTTP_OK
    assert result.json == [
        {
            "siglum": str(manuscript.siglum),
            "text": TextSchema().dump(manuscript.colophon),
        }
        for manuscript in chapter.manuscripts
        if not manuscript.colophon.is_empty
    ]
def test_query_by_parallel_line_exists(database, fragment_repository):
    parallel_number = MuseumNumber.of("K.1")
    fragment = FragmentFactory.build(
        text=Text(
            (
                ParallelFragment(
                    False, parallel_number, True, Labels(), LineNumber(1), True
                ),
            )
        )
    )
    parallel_fragment = FragmentFactory.build(number=parallel_number)
    database[COLLECTION].insert_many(
        [
            FragmentSchema(exclude=["joins"]).dump(fragment),
            FragmentSchema(exclude=["joins"]).dump(parallel_fragment),
        ]
    )

    assert fragment_repository.query_by_museum_number(fragment.number) == fragment
def test_parse_dividers() -> None:
    line, expected_tokens = (
        r'1. :? :#! :# ::? :.@v /@19* :"@20@c ;@v@19!',
        [
            TextLine.of_iterable(
                LineNumber(1),
                (
                    Divider.of(":", tuple(), (atf.Flag.UNCERTAIN,)),
                    Divider.of(":", tuple(), (atf.Flag.DAMAGE, atf.Flag.CORRECTION)),
                    Divider.of(":", tuple(), (atf.Flag.DAMAGE,)),
                    Divider.of("::", tuple(), (atf.Flag.UNCERTAIN,)),
                    Divider.of(":.", ("@v",), tuple()),
                    Divider.of("/", ("@19",), (atf.Flag.COLLATION,)),
                    Divider.of(':"', ("@20", "@c"), tuple()),
                    Divider.of(";", ("@v", "@19"), (atf.Flag.CORRECTION,)),
                ),
            )
        ],
    )

    assert parse_atf_lark(line).lines == Text.of_iterable(expected_tokens).lines
def test_combinations(
    qualification,
    extent,
    scope,
    state,
    status,
    expected_qualification,
    expected_extent,
    expected_scope,
    expected_state,
    expected_status,
):
    line = " ".join(["$", qualification, extent, scope, state, status])
    expected_line = StateDollarLine(
        expected_qualification,
        expected_extent,
        expected_scope,
        expected_state,
        expected_status,
    )

    assert parse_atf_lark(line).lines == Text.of_iterable([expected_line]).lines
def test_of_iterable() -> None:
    assert Text.of_iterable(LINES) == Text(LINES, atf.ATF_PARSER_VERSION)
ANOTHER_LEMMATIZED_FRAGMENT = attr.evolve(
    TransliteratedFragmentFactory.build(),
    text=Text(
        (
            TextLine(
                LineNumber(1),
                (
                    Word.of(
                        [Logogram.of_name("GI", 6)], unique_lemma=(WordId("ginâ I"),)
                    ),
                    Word.of([Reading.of_name("ana")], unique_lemma=(WordId("ana II"),)),
                    Word.of([Reading.of_name("ana")], unique_lemma=(WordId("ana II"),)),
                    Word.of(
                        [
                            Reading.of_name("u", 4),
                            Joiner.hyphen(),
                            Reading.of_name("šu"),
                        ],
                        unique_lemma=(WordId("ūsu I"),),
                    ),
                    AkkadianWord.of(
                        [ValueToken.of("ana")], unique_lemma=(WordId("normalized I"),)
                    ),
                ),
            ),
        )
    ),
    signs="MI DIŠ DIŠ UD ŠU",
)
def test_set_text():
    fragment = FragmentFactory.build()
    text = Text((ParallelCompositionFactory.build(),))
    updated_fragment = fragment.set_text(text)

    assert updated_fragment.text == text
def test_text():
    fragment = FragmentFactory.build()
    assert fragment.text == Text()
def test_parse_note_line() -> None:
    markup = "this is a note @i{italic text}@akk{kur}@sux{kur}"
    atf = f"#note: {markup}"
    expected_line = NoteLine(parse_markup(markup))

    assert parse_atf_lark(atf).lines == Text.of_iterable([expected_line]).lines
def test_parse_state_dollar_line_surface_ambiguity(line, expected_line):
    assert parse_atf_lark(line).lines == Text.of_iterable([expected_line]).lines
def test_parse_state_dollar_line(prefix, parenthesis, line, expected_line):
    atf_ = f"{prefix}({line})" if parenthesis else f"{prefix}{line}"
    assert parse_atf_lark(atf_).lines == Text.of_iterable([expected_line]).lines
@attr.s(auto_attribs=True, frozen=True)
class Fragment:
    number: MuseumNumber
    accession: str = ""
    cdli_number: str = ""
    bm_id_number: str = ""
    publication: str = ""
    description: str = ""
    collection: str = ""
    script: str = ""
    museum: str = ""
    width: Measure = Measure()
    length: Measure = Measure()
    thickness: Measure = Measure()
    joins: Joins = Joins()
    record: Record = Record()
    folios: Folios = Folios()
    text: Text = Text()
    signs: str = ""
    notes: str = ""
    references: Sequence[Reference] = tuple()
    uncurated_references: Optional[Sequence[UncuratedReference]] = None
    genres: Sequence[Genre] = tuple()
    line_to_vec: Tuple[LineToVecEncodings, ...] = tuple()

    @property
    def is_lowest_join(self) -> bool:
        # A fragment with no joins is trivially its own lowest join.
        return (self.joins.lowest or self.number) == self.number

    def set_references(self, references: Sequence[Reference]) -> "Fragment":
        return attr.evolve(self, references=references)

    def set_text(self, text: Text) -> "Fragment":
        return attr.evolve(self, text=text)

    def update_lowest_join_transliteration(
        self, transliteration: TransliterationUpdate, user: User
    ) -> "Fragment":
        if transliteration.text.is_empty or self.is_lowest_join:
            return self.update_transliteration(transliteration, user)
        else:
            raise NotLowestJoinError(
                "Transliteration must be empty unless fragment is the lowest in join."
            )

    def update_transliteration(
        self, transliteration: TransliterationUpdate, user: User
    ) -> "Fragment":
        # Record the change, merge the new text into the old, and refresh the
        # derived line-to-vec encoding.
        record = self.record.add_entry(self.text.atf, transliteration.text.atf, user)
        text = self.text.merge(transliteration.text)
        return attr.evolve(
            self,
            text=text,
            notes=transliteration.notes,
            signs=transliteration.signs,
            record=record,
            line_to_vec=create_line_to_vec(text.lines),
        )

    def set_genres(self, genres_new: Sequence[Genre]) -> "Fragment":
        return attr.evolve(self, genres=tuple(genres_new))

    def update_lemmatization(self, lemmatization: Lemmatization) -> "Fragment":
        text = self.text.update_lemmatization(lemmatization)
        return attr.evolve(self, text=text)

    def get_matching_lines(self, query: TransliterationQuery) -> Lines:
        line_numbers = query.match(self.signs)
        lines = [line.atf for line in self.text.text_lines]
        return tuple(
            tuple(lines[numbers[0]:numbers[1] + 1])
            for numbers, _ in groupby(line_numbers)
        )
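
# Usage sketch tying Fragment to TransliterationUpdate (illustrative values;
# `user` stands for any User implementation assumed to be in scope). A
# non-empty transliteration is only accepted when the fragment is the lowest
# in its join.
fragment = Fragment(number=MuseumNumber("K", "1"))  # no joins recorded
assert fragment.is_lowest_join  # joins.lowest is falsy, so the number matches itself

update = TransliterationUpdate(text=parse_atf_lark("1. ku"), signs="KU")
updated = fragment.update_lowest_join_transliteration(update, user)
assert updated.signs == "KU"  # text, notes, signs, record, and line_to_vec all evolve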