class MongoAnnotationsRepository(AnnotationsRepository):
    def __init__(self, database: Database) -> None:
        self._collection = MongoCollection(database, COLLECTION)

    def create_or_update(self, annotations: Annotations) -> None:
        self._collection.replace_one(
            AnnotationsSchema().dump(annotations),
            {"fragmentNumber": str(annotations.fragment_number)},
            True,
        )

    def query_by_museum_number(self, number: MuseumNumber) -> Annotations:
        try:
            result = self._collection.find_one({"fragmentNumber": str(number)})
            return AnnotationsSchema().load(result, unknown=EXCLUDE)
        except NotFoundError:
            return Annotations(number)

    def retrieve_all_non_empty(self) -> List[Annotations]:
        result = self._collection.find_many(
            {"annotations": {"$exists": True, "$ne": []}}
        )
        return AnnotationsSchema().load(result, unknown=EXCLUDE, many=True)

    def find_by_sign(self, sign: str) -> Sequence[Annotations]:
        query = {"$regex": re.escape(sign), "$options": "i"}
        result = self._collection.aggregate(
            [
                {"$match": {"annotations.data.signName": query}},
                {
                    "$project": {
                        "fragmentNumber": 1,
                        "annotations": {
                            "$filter": {
                                "input": "$annotations",
                                "as": "annotation",
                                "cond": {"$eq": ["$$annotation.data.signName", sign]},
                            }
                        },
                    }
                },
            ]
        )
        return AnnotationsSchema().load(result, many=True, unknown=EXCLUDE)

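# Illustrative sketch (not project code): the $project/$filter stage in find_by_sign
# keeps the fragment number and only those annotations whose data.signName equals the
# queried sign. The same transformation for a single fragment document, expressed in
# plain Python, is roughly:
def filter_annotations_by_sign(fragment_document: dict, sign: str) -> dict:
    return {
        "fragmentNumber": fragment_document["fragmentNumber"],
        "annotations": [
            annotation
            for annotation in fragment_document["annotations"]
            if annotation["data"]["signName"] == sign
        ],
    }
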
class Changelog:
    def __init__(self, database):
        self._collection = MongoCollection(database, "changelog")

    def create(self, resource_type, user_profile, old, new):
        entry = create_entry(
            user_profile, resource_type, old["_id"], list(dictdiffer.diff(old, new))
        )
        return self._collection.insert_one(entry)

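# Illustrative example (not project code): dictdiffer.diff yields change tuples that
# the changelog stores alongside the user profile and resource id. The document
# values below are invented for demonstration only.
import dictdiffer

old = {"_id": "word I", "meaning": "old gloss"}
new = {"_id": "word I", "meaning": "new gloss"}
print(list(dictdiffer.diff(old, new)))
# [('change', 'meaning', ('old gloss', 'new gloss'))]
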
class MongoLemmaRepository(LemmaRepository):
    def __init__(self, database):
        self._collection = MongoCollection(database, COLLECTION)

    def query_lemmas(self, word: str, is_normalized: bool) -> Sequence[Lemma]:
        cursor = self._collection.aggregate(aggregate_lemmas(word, is_normalized))
        return [
            [WordId(unique_lemma) for unique_lemma in result["_id"]]
            for result in cursor
        ]

class MongoParallelRepository(ParallelRepository):
    _fragments: MongoCollection
    _chapters: MongoCollection

    def __init__(self, database: Database):
        self._fragments = MongoCollection(database, FRAGMENTS_COLLECTION)
        self._chapters = MongoCollection(database, CHAPTERS_COLLECTION)

    def fragment_exists(self, museum_number: MuseumNumber) -> bool:
        return self._fragments.count_documents(museum_number_is(museum_number)) > 0

    def find_implicit_chapter(self, text_id: TextId) -> ChapterName:
        try:
            chapter = next(
                self._chapters.find_many(
                    {
                        "textId.genre": text_id.genre.value,
                        "textId.category": text_id.category,
                        "textId.index": text_id.index,
                    },
                    sort=[("order", 1)],
                    projection={
                        "_id": False,
                        "stage": True,
                        "name": True,
                        "version": True,
                    },
                )
            )
            return ChapterNameSchema().load(chapter)
        except StopIteration as error:
            raise NotFoundError(f"No chapters found for text {text_id}.") from error

    def chapter_exists(self, text_id: TextId, chapter_name: ChapterName) -> bool:
        return (
            self._chapters.count_documents(
                {
                    "textId.genre": text_id.genre.value,
                    "textId.category": text_id.category,
                    "textId.index": text_id.index,
                    "stage": chapter_name.stage.value,
                    "name": chapter_name.name,
                }
            )
            > 0
        )

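# Illustrative sketch (not project code): museum_number_is is defined elsewhere in the
# module. Given the museumNumber.prefix/number/suffix index used by the fragment
# repositories below, the filter it produces presumably has roughly this shape; the
# helper name and exact serialization are assumptions.
def museum_number_is_sketch(museum_number) -> dict:
    return {
        "museumNumber.prefix": museum_number.prefix,
        "museumNumber.number": museum_number.number,
        "museumNumber.suffix": museum_number.suffix,
    }
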
class MongoDictionary:
    def __init__(self, database):
        self._collection = MongoCollection(database, COLLECTION)
        self._changelog = Changelog(database)

    def create(self, document):
        return self._collection.insert_one(document)

    def find(self, id_):
        return self._collection.find_one_by_id(id_)

    def search(self, query):
        lemma = query.split(" ")
        cursor = self._collection.find_many(
            {
                "$or": [
                    {"lemma": lemma},
                    {"forms": {"$elemMatch": {"lemma": lemma}}},
                    {"meaning": {"$regex": re.escape(query)}},
                ]
            }
        )
        return [word for word in cursor]

    def search_lemma(self, query):
        cursor = self._collection.aggregate(
            _create_lemma_search_pipeline(query),
            collation={"locale": "en", "strength": 1, "normalization": True},
        )
        return [word for word in cursor]

    def update(self, word, user):
        old_word = self.find(word["_id"])
        self._changelog.create(COLLECTION, user.profile, old_word, word)
        self._collection.update_one({"_id": word["_id"]}, {"$set": word})

class MongoWordRepository(WordRepository):
    def __init__(self, database):
        self._collection = MongoCollection(database, COLLECTION)
        self._changelog = Changelog(database)

    def create(self, document):
        return self._collection.insert_one(document)

    def query_by_id(self, id_: WordId):
        return self._collection.find_one_by_id(id_)

    def query_by_lemma_form_or_meaning(self, query: str) -> Sequence:
        lemma = query.split(" ")
        cursor = self._collection.find_many(
            {
                "$or": [
                    {"lemma": lemma},
                    {"forms": {"$elemMatch": {"lemma": lemma}}},
                    {"meaning": {"$regex": re.escape(query)}},
                ]
            }
        )
        return [word for word in cursor]

    def query_by_lemma_prefix(self, query: str) -> Sequence:
        cursor = self._collection.aggregate(
            _create_lemma_search_pipeline(query),
            collation={"locale": "en", "strength": 1, "normalization": True},
        )
        return [word for word in cursor]

    def update(self, word) -> None:
        self._collection.update_one({"_id": word["_id"]}, {"$set": word})

class MongoTextRepository(TextRepository):
    def __init__(self, database: Database):
        self._texts = MongoCollection(database, TEXTS_COLLECTION)
        self._chapters = MongoCollection(database, CHAPTERS_COLLECTION)

    def create_indexes(self) -> None:
        self._texts.create_index(
            [
                ("genre", pymongo.ASCENDING),
                ("category", pymongo.ASCENDING),
                ("index", pymongo.ASCENDING),
            ],
            unique=True,
        )
        self._chapters.create_index(
            [
                ("textId.genre", pymongo.ASCENDING),
                ("textId.category", pymongo.ASCENDING),
                ("textId.index", pymongo.ASCENDING),
            ]
        )
        self._chapters.create_index(
            [
                ("textId.genre", pymongo.ASCENDING),
                ("textId.category", pymongo.ASCENDING),
                ("textId.index", pymongo.ASCENDING),
                ("order", pymongo.ASCENDING),
            ]
        )
        self._chapters.create_index(
            [
                ("textId.genre", pymongo.ASCENDING),
                ("textId.category", pymongo.ASCENDING),
                ("textId.index", pymongo.ASCENDING),
                ("stage", pymongo.ASCENDING),
                ("name", pymongo.ASCENDING),
            ],
            unique=True,
        )

    def create(self, text: Text) -> None:
        self._texts.insert_one(TextSchema(exclude=["chapters"]).dump(text))

    def create_chapter(self, chapter: Chapter) -> None:
        self._chapters.insert_one(ChapterSchema().dump(chapter))

    def find(self, id_: TextId) -> Text:
        try:
            mongo_text = next(
                self._texts.aggregate(
                    [
                        {
                            "$match": {
                                "genre": id_.genre.value,
                                "category": id_.category,
                                "index": id_.index,
                            }
                        },
                        *join_reference_documents(),
                        *join_chapters(True),
                        {"$limit": 1},
                    ]
                )
            )
            return TextSchema().load(mongo_text)
        except StopIteration as error:
            raise text_not_found(id_) from error

    def find_chapter(self, id_: ChapterId) -> Chapter:
        try:
            chapter = self._chapters.find_one(
                chapter_id_query(id_), projection={"_id": False}
            )
            return ChapterSchema().load(chapter)
        except NotFoundError as error:
            raise chapter_not_found(id_) from error

    def find_chapter_for_display(self, id_: ChapterId) -> ChapterDisplay:
        try:
            text = self.find(id_.text_id)
            chapters = self._chapters.aggregate(aggregate_chapter_display(id_))
            return ChapterDisplaySchema().load(
                {
                    **next(chapters),
                    "textName": text.name,
                    "textHasDoi": text.has_doi,
                    "isSingleStage": not text.has_multiple_stages,
                }
            )
        except NotFoundError as error:
            raise text_not_found(id_.text_id) from error
        except StopIteration as error:
            raise chapter_not_found(id_) from error

    def find_line(self, id_: ChapterId, number: int) -> Line:
        try:
            chapters = self._chapters.aggregate(
                [
                    {"$match": chapter_id_query(id_)},
                    {"$unwind": "$lines"},
                    {"$replaceRoot": {"newRoot": "$lines"}},
                    {"$skip": number},
                ]
            )
            return LineSchema().load(next(chapters))
        except StopIteration as error:
            raise line_not_found(id_, number) from error

    def list(self) -> List[Text]:
        return TextSchema().load(
            self._texts.aggregate(
                [
                    *join_reference_documents(),
                    *join_chapters(False),
                    {
                        "$sort": {
                            "category": pymongo.ASCENDING,
                            "index": pymongo.ASCENDING,
                        }
                    },
                ]
            ),
            many=True,
        )

    def update(self, id_: ChapterId, chapter: Chapter) -> None:
        self._chapters.update_one(
            chapter_id_query(id_),
            {
                "$set": ChapterSchema(
                    only=[
                        "manuscripts",
                        "uncertain_fragments",
                        "lines",
                        "signs",
                        "parser_version",
                    ]
                ).dump(chapter)
            },
        )

    def query_by_transliteration(self, query: TransliterationQuery) -> List[Chapter]:
        return ChapterSchema().load(
            self._chapters.find_many(
                {"signs": {"$regex": query.regexp}},
                projection={"_id": False},
                limit=100,
            ),
            many=True,
        )

    def query_manuscripts_by_chapter(self, id_: ChapterId) -> List[Manuscript]:
        try:
            return ManuscriptSchema().load(
                self._chapters.find_one(
                    chapter_id_query(id_), projection={"manuscripts": True}
                )["manuscripts"],
                many=True,
            )
        except NotFoundError as error:
            raise chapter_not_found(id_) from error

    def query_manuscripts_with_joins_by_chapter(
        self, id_: ChapterId
    ) -> List[Manuscript]:
        try:
            return ManuscriptSchema().load(
                self._chapters.aggregate(
                    [
                        {"$match": chapter_id_query(id_)},
                        {"$project": {"manuscripts": True}},
                        {"$unwind": "$manuscripts"},
                        {"$replaceRoot": {"newRoot": "$manuscripts"}},
                        *join_joins(),
                        *is_in_fragmentarium("museumNumber", "isInFragmentarium"),
                    ]
                ),
                many=True,
            )
        except NotFoundError as error:
            raise chapter_not_found(id_) from error

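# Illustrative sketch (not project code): chapter_id_query is defined elsewhere. Based
# on the unique chapter index created above (textId.genre/category/index + stage + name)
# and the fields used by MongoParallelRepository.chapter_exists, the filter it builds
# presumably has roughly this shape; the attribute names on ChapterId are assumptions.
def chapter_id_query_sketch(id_) -> dict:
    return {
        "textId.genre": id_.text_id.genre.value,
        "textId.category": id_.text_id.category,
        "textId.index": id_.text_id.index,
        "stage": id_.stage.value,
        "name": id_.name,
    }
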
class MongoFragmentRepository(FragmentRepository):
    def __init__(self, database):
        self._fragments = MongoCollection(database, FRAGMENTS_COLLECTION)
        self._joins = MongoCollection(database, JOINS_COLLECTION)

    def create_indexes(self) -> None:
        self._fragments.create_index(
            [
                ("museumNumber.prefix", pymongo.ASCENDING),
                ("museumNumber.number", pymongo.ASCENDING),
                ("museumNumber.suffix", pymongo.ASCENDING),
            ],
            unique=True,
        )
        self._fragments.create_index([("accession", pymongo.ASCENDING)])
        self._fragments.create_index([("cdliNumber", pymongo.ASCENDING)])
        self._fragments.create_index([("folios.name", pymongo.ASCENDING)])
        self._fragments.create_index(
            [
                ("text.lines.content.value", pymongo.ASCENDING),
                ("text.lines.content.uniqueLemma.0", pymongo.ASCENDING),
            ]
        )
        self._fragments.create_index([("text.lines.type", pymongo.ASCENDING)])
        self._fragments.create_index([("record.type", pymongo.ASCENDING)])
        self._fragments.create_index(
            [
                ("publication", pymongo.ASCENDING),
                ("joins", pymongo.ASCENDING),
                ("collection", pymongo.ASCENDING),
            ]
        )
        self._joins.create_index(
            [
                ("fragments.museumNumber.prefix", pymongo.ASCENDING),
                ("fragments.museumNumber.number", pymongo.ASCENDING),
                ("fragments.museumNumber.suffix", pymongo.ASCENDING),
            ]
        )

    def count_transliterated_fragments(self):
        return self._fragments.count_documents(HAS_TRANSLITERATION)

    def count_lines(self):
        result = self._fragments.aggregate(
            [{"$group": {"_id": None, "total": {"$sum": "$text.numberOfLines"}}}]
        )
        try:
            return next(result)["total"]
        except StopIteration:
            return 0

    def create(self, fragment):
        return self._fragments.insert_one(
            {
                "_id": str(fragment.number),
                **FragmentSchema(exclude=["joins"]).dump(fragment),
            }
        )

    def create_join(self, joins: Sequence[Sequence[Join]]) -> None:
        self._joins.insert_one(
            {
                "fragments": [
                    {
                        **JoinSchema(exclude=["is_in_fragmentarium"]).dump(join),
                        "group": index,
                    }
                    for index, group in enumerate(joins)
                    for join in group
                ]
            }
        )

    def query_by_museum_number(self, number: MuseumNumber):
        data = self._fragments.aggregate(
            [
                {"$match": museum_number_is(number)},
                *join_reference_documents(),
                *join_joins(),
            ]
        )
        try:
            fragment_data = next(data)
            return FragmentSchema(unknown=EXCLUDE).load(fragment_data)
        except StopIteration as error:
            raise NotFoundError(f"Fragment {number} not found.") from error

    def query_by_id_and_page_in_references(self, id_: str, pages: str):
        match: dict = {"references": {"$elemMatch": {"id": id_}}}
        if pages:
            match["references"]["$elemMatch"]["pages"] = {
                "$regex": rf".*?(^|[^\d]){pages}([^\d]|$).*?"
            }
        cursor = self._fragments.find_many(match, projection={"joins": False})
        return self._map_fragments(cursor)

    def query_by_fragment_cdli_or_accession_number(self, number):
        cursor = self._fragments.find_many(
            number_is(number), projection={"joins": False}
        )
        return self._map_fragments(cursor)

    def query_random_by_transliterated(self):
        cursor = self._fragments.aggregate(
            [*aggregate_random(), {"$project": {"joins": False}}]
        )
        return self._map_fragments(cursor)

    def query_path_of_the_pioneers(self):
        cursor = self._fragments.aggregate(
            [*aggregate_path_of_the_pioneers(), {"$project": {"joins": False}}]
        )
        return self._map_fragments(cursor)

    def query_transliterated_numbers(self):
        cursor = self._fragments.find_many(
            HAS_TRANSLITERATION, projection=["museumNumber"]
        ).sort("_id", pymongo.ASCENDING)
        return MuseumNumberSchema(many=True).load(
            fragment["museumNumber"] for fragment in cursor
        )

    def query_transliterated_line_to_vec(self) -> List[LineToVecEntry]:
        cursor = self._fragments.find_many(HAS_TRANSLITERATION, {"text": False})
        return [
            LineToVecEntry(
                MuseumNumberSchema().load(fragment["museumNumber"]),
                fragment["script"],
                tuple(
                    LineToVecEncoding.from_list(line_to_vec)
                    for line_to_vec in fragment["lineToVec"]
                ),
            )
            for fragment in cursor
        ]

    def query_by_transliterated_sorted_by_date(self):
        cursor = self._fragments.aggregate(
            [*aggregate_latest(), {"$project": {"joins": False}}]
        )
        return self._map_fragments(cursor)

    def query_by_transliterated_not_revised_by_other(self):
        cursor = self._fragments.aggregate(
            [*aggregate_needs_revision(), {"$project": {"joins": False}}],
            allowDiskUse=True,
        )
        return FragmentInfoSchema(many=True).load(cursor)

    def query_by_transliteration(self, query):
        cursor = self._fragments.find_many(
            {"signs": {"$regex": query.regexp}},
            limit=100,
            projection={"joins": False},
        )
        return self._map_fragments(cursor)

    def update_transliteration(self, fragment):
        self._fragments.update_one(
            fragment_is(fragment),
            {
                "$set": FragmentSchema(
                    only=("text", "notes", "signs", "record", "line_to_vec")
                ).dump(fragment)
            },
        )

    def update_genres(self, fragment):
        self._fragments.update_one(
            fragment_is(fragment),
            {"$set": FragmentSchema(only=("genres",)).dump(fragment)},
        )

    def update_lemmatization(self, fragment):
        self._fragments.update_one(
            fragment_is(fragment),
            {"$set": FragmentSchema(only=("text",)).dump(fragment)},
        )

    def query_next_and_previous_folio(self, folio_name, folio_number, number):
        sort_ascending = {"$sort": {"key": 1}}
        sort_descending = {"$sort": {"key": -1}}

        def create_pipeline(*parts):
            return [
                {"$match": {"folios.name": folio_name}},
                {"$unwind": "$folios"},
                {
                    "$project": {
                        "name": "$folios.name",
                        "number": "$folios.number",
                        "key": {"$concat": ["$folios.number", "-", "$_id"]},
                    }
                },
                {"$match": {"name": folio_name}},
                *parts,
                {"$limit": 1},
            ]

        def get_numbers(pipeline):
            cursor = self._fragments.aggregate(pipeline)
            try:
                entry = next(cursor)
                return {
                    "fragmentNumber": entry["_id"],
                    "folioNumber": entry["number"],
                }
            except StopIteration:
                return None

        first = create_pipeline(sort_ascending)
        previous = create_pipeline(
            {"$match": {"key": {"$lt": f"{folio_number}-{number}"}}}, sort_descending
        )
        next_ = create_pipeline(
            {"$match": {"key": {"$gt": f"{folio_number}-{number}"}}}, sort_ascending
        )
        last = create_pipeline(sort_descending)

        result = {
            "previous": get_numbers(previous) or get_numbers(last),
            "next": get_numbers(next_) or get_numbers(first),
        }

        if has_none_values(result):
            raise NotFoundError("Could not retrieve any fragments")
        else:
            return result

    def query_museum_numbers(self, prefix: str, number_regex: str) -> Sequence[dict]:
        return self._fragments.find_many(
            {
                "museumNumber.prefix": prefix,
                "museumNumber.number": {"$regex": number_regex},
            },
            projection={"museumNumber": True},
        )

    def _query_next_and_previous_fragment(
        self, museum_number
    ) -> Tuple[Optional[MuseumNumber], Optional[MuseumNumber]]:
        same_museum_numbers = self.query_museum_numbers(
            museum_number.prefix, rf"{museum_number.number}[^\d]*"
        )
        preceding_museum_numbers = self.query_museum_numbers(
            museum_number.prefix, rf"{int(museum_number.number) - 1}[^\d]*"
        )
        following_museum_numbers = self.query_museum_numbers(
            museum_number.prefix, rf"{int(museum_number.number) + 1}[^\d]*"
        )
        return _find_adjacent_museum_number_from_sequence(
            museum_number,
            [
                *same_museum_numbers,
                *preceding_museum_numbers,
                *following_museum_numbers,
            ],
        )

    def query_next_and_previous_fragment(
        self, museum_number: MuseumNumber
    ) -> FragmentPagerInfo:
        if museum_number.number.isnumeric():
            prev, next = self._query_next_and_previous_fragment(museum_number)
            if prev and next:
                return FragmentPagerInfo(prev, next)
        museum_numbers_by_prefix = self._fragments.find_many(
            {"museumNumber.prefix": museum_number.prefix},
            projection={"museumNumber": True},
        )
        prev, next = _find_adjacent_museum_number_from_sequence(
            museum_number, museum_numbers_by_prefix
        )
        if not (prev and next):
            all_museum_numbers = self._fragments.find_many(
                {}, projection={"museumNumber": True}
            )
            prev, next = _find_adjacent_museum_number_from_sequence(
                museum_number, all_museum_numbers, True
            )
        return FragmentPagerInfo(cast(MuseumNumber, prev), cast(MuseumNumber, next))

    def update_references(self, fragment):
        self._fragments.update_one(
            fragment_is(fragment),
            {"$set": FragmentSchema(only=("references",)).dump(fragment)},
        )

    def _map_fragments(self, cursor):
        return FragmentSchema(unknown=EXCLUDE, many=True).load(cursor)

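# Illustrative sketch (not project code): the pager above relies on a helper,
# _find_adjacent_museum_number_from_sequence, that picks the neighbours of a value
# inside an ordered sequence of candidates; its third argument is assumed here to
# mean wrap-around at the ends. The core idea, reduced to plain strings, is roughly:
def find_adjacent(current, values, wrap_around=False):
    ordered = sorted(set(values) | {current})
    index = ordered.index(current)
    previous_value = ordered[index - 1] if (index > 0 or wrap_around) else None
    next_value = (
        ordered[(index + 1) % len(ordered)]
        if (index + 1 < len(ordered) or wrap_around)
        else None
    )
    return previous_value, next_value

# e.g. find_adjacent("K.2", ["K.1", "K.2", "K.3"]) -> ("K.1", "K.3")
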
class MongoSignRepository(SignRepository):
    def __init__(self, database: Database):
        self._collection = MongoCollection(database, COLLECTION)

    def create(self, sign: Sign) -> str:
        return self._collection.insert_one(SignSchema().dump(sign))

    def find(self, name: SignName) -> Sign:
        data = self._collection.find_one_by_id(name)
        return cast(Sign, SignSchema(unknown=EXCLUDE).load(data))

    def search(self, reading: str, sub_index: Optional[int] = None) -> Optional[Sign]:
        sub_index_query = {"$exists": False} if sub_index is None else sub_index
        try:
            data = self._collection.find_one(
                {
                    "values": {
                        "$elemMatch": {"value": reading, "subIndex": sub_index_query}
                    }
                }
            )
            return cast(Sign, SignSchema(unknown=EXCLUDE).load(data))
        except NotFoundError:
            return None

    def search_by_id(self, query: str) -> Sequence[Sign]:
        cursor = self._collection.aggregate(
            [{"$match": {"_id": {"$regex": re.escape(query), "$options": "i"}}}]
        )
        return SignSchema().load(cursor, unknown=EXCLUDE, many=True)

    def search_all(self, reading: str, sub_index: int) -> Sequence[Sign]:
        cursor = self._collection.find_many(
            {"values": {"$elemMatch": {"value": reading, "subIndex": sub_index}}}
        )
        return SignSchema().load(cursor, unknown=EXCLUDE, many=True)

    def search_by_lists_name(self, name: str, number: str) -> Sequence[Sign]:
        cursor = self._collection.find_many(
            {"lists": {"$elemMatch": {"name": name, "number": number}}}
        )
        return SignSchema().load(cursor, unknown=EXCLUDE, many=True)

    def search_include_homophones(self, reading: str) -> Sequence[Sign]:
        cursor = self._collection.aggregate(
            [
                {"$match": {"values.value": reading}},
                {"$unwind": "$values"},
                {
                    "$addFields": {
                        "subIndexCopy": {
                            "$cond": [
                                {"$eq": ["$values.value", reading]},
                                {"$ifNull": ["$values.subIndex", float("inf")]},
                                float("inf"),
                            ]
                        }
                    }
                },
                {
                    "$group": {
                        "_id": "$_id",
                        "lists": {"$first": "$lists"},
                        "unicode": {"$first": "$unicode"},
                        "mesZl": {"$first": "$mesZl"},
                        "LaBaSi": {"$first": "$LaBaSi"},
                        "Logograms": {"$push": "$Logograms"},
                        "Fossey": {"$push": "$Fossey"},
                        "values": {"$push": "$values"},
                        "subIndexCopy": {"$min": "$subIndexCopy"},
                    }
                },
                {"$sort": {"subIndexCopy": 1}},
            ]
        )
        return SignSchema().load(cursor, unknown=EXCLUDE, many=True)

    def search_composite_signs(self, reading: str, sub_index: int) -> Sequence[Sign]:
        cursor = self._collection.aggregate(
            [
                {
                    "$match": {
                        "values": {
                            "$elemMatch": {"value": reading, "subIndex": sub_index}
                        }
                    }
                },
                {
                    "$lookup": {
                        "from": "signs",
                        "let": {"leftId": "$_id"},
                        "pipeline": [
                            {
                                "$match": {
                                    "$expr": {
                                        "$regexMatch": {
                                            "input": "$_id",
                                            "regex": {
                                                "$concat": [
                                                    r".*(^|[\.\+×&%@x|\(\)])",
                                                    {
                                                        "$trim": {
                                                            "input": "$$leftId",
                                                            "chars": "|",
                                                        }
                                                    },
                                                    r"($|[\.\+×&%@x|\(\)])",
                                                ]
                                            },
                                        }
                                    }
                                }
                            }
                        ],
                        "as": "joined",
                    }
                },
                {"$unwind": "$joined"},
                {"$replaceRoot": {"newRoot": "$joined"}},
            ]
        )
        return SignSchema().load(cursor, unknown=EXCLUDE, many=True)

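# Illustrative sketch (not project code): search_composite_signs builds a server-side
# regex that matches the found sign's name as a component of a composite sign id such
# as "|KU.AN|". The same check expressed with Python's re module is roughly:
import re

def is_component(composite_id: str, sign_id: str) -> bool:
    component = re.escape(sign_id.strip("|"))
    pattern = rf".*(^|[\.\+×&%@x|\(\)]){component}($|[\.\+×&%@x|\(\)])"
    return re.match(pattern, composite_id) is not None

# e.g. is_component("|KU.AN|", "KU") -> True
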
class MongoBibliographyRepository(BibliographyRepository):
    def __init__(self, database):
        self._collection = MongoCollection(database, COLLECTION)

    def create(self, entry) -> str:
        mongo_entry = create_mongo_entry(entry)
        return self._collection.insert_one(mongo_entry)

    def query_by_id(self, id_: str) -> dict:
        data = self._collection.find_one_by_id(id_)
        return create_object_entry(data)

    def update(self, entry) -> None:
        mongo_entry = create_mongo_entry(entry)
        self._collection.replace_one(mongo_entry)

    def query_by_author_year_and_title(
        self, author: Optional[str], year: Optional[int], title: Optional[str]
    ) -> Sequence[dict]:
        match: Dict[str, Any] = {}

        def pad_trailing_zeroes(year: int) -> int:
            padded_year = str(year).ljust(4, "0")
            return int(padded_year)

        if author:
            match["author.0.family"] = author
        if year:
            match["issued.date-parts.0.0"] = {
                "$gte": pad_trailing_zeroes(year),
                "$lt": pad_trailing_zeroes(year + 1),
            }
        if title:
            match["$expr"] = {
                "$eq": [{"$substrCP": ["$title", 0, len(title)]}, title]
            }
        return [
            create_object_entry(data)
            for data in self._collection.aggregate(
                [
                    {"$match": match},
                    {
                        "$addFields": {
                            "primaryYear": {
                                "$arrayElemAt": [
                                    {"$arrayElemAt": ["$issued.date-parts", 0]},
                                    0,
                                ]
                            }
                        }
                    },
                    {
                        "$sort": {
                            "author.0.family": 1,
                            "primaryYear": 1,
                            "title": 1,
                        }
                    },
                    {"$project": {"primaryYear": 0}},
                ],
                collation={"locale": "en", "strength": 1, "normalization": True},
            )
        ]

    def query_by_container_title_and_collection_number(
        self,
        container_title_short: Optional[str],
        collection_number: Optional[str],
    ) -> Sequence[dict]:
        match: Dict[str, Any] = {}
        if container_title_short:
            match["container-title-short"] = container_title_short
        if collection_number:
            match["collection-number"] = collection_number
        return [
            create_object_entry(data)
            for data in self._collection.aggregate(
                [
                    {"$match": match},
                    {
                        "$addFields": {
                            "primaryYear": {
                                "$arrayElemAt": [
                                    {"$arrayElemAt": ["$issued.date-parts", 0]},
                                    0,
                                ]
                            }
                        }
                    },
                    {
                        "$sort": {
                            "author.0.family": 1,
                            "primaryYear": 1,
                            "collection-title": 1,
                        }
                    },
                    {"$project": {"primaryYear": 0}},
                ],
                collation={"locale": "en", "strength": 1, "normalization": True},
            )
        ]

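# Worked example (illustrative): pad_trailing_zeroes turns a partial year into the
# lower bound of a numeric range, so a query for year=19 matches entries issued
# anywhere in 1900-1999:
#   pad_trailing_zeroes(19)      -> int("1900") == 1900
#   pad_trailing_zeroes(19 + 1)  -> int("2000") == 2000
#   => {"$gte": 1900, "$lt": 2000}
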
def collection(database):
    return MongoCollection(database, "collection")

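# Illustrative wiring sketch (not part of the repositories): every class above takes a
# pymongo Database and wraps its collections in MongoCollection. The connection string
# and database name "ebl" below are assumptions for demonstration only.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
database = client["ebl"]
word_repository = MongoWordRepository(database)
sign_repository = MongoSignRepository(database)
fragment_repository = MongoFragmentRepository(database)
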