def insert_documents(data): """ Insert all provided documents. Checks if the document has been manually changed before - if it has, and the new document does not match, it is marked as a conflict """ factory = ModelFactory.get_instance() factory.set_db() temp_col = Config.get_mongo_collection("temp_scraped") manual_col = Config.get_mongo_collection("manual") unknown_col = Config.get_mongo_collection("unknown") prod_col = Config.get_mongo_collection("prod") conflict_col = Config.get_mongo_collection("conflicts") print("Starting insertion of {} documents".format(len(data))) pbar = ProgressBar() for i, doc in enumerate(pbar(data)): factory.post_document(doc, temp_col) print("Successfully inserted {} documents".format(i + 1)) manual_docs = factory.get_collection(manual_col).find() conflicts = [] for manual_doc in manual_docs: if "id" in manual_doc: idx = manual_doc["id"] else: continue # Mark corresponding entry in temp collection as manually changed factory.get_database() \ .get_collection(temp_col) \ .update_one({"id": idx}, {"$set": {"manually_changed": True}}) prod_doc = next(factory.get_collection(prod_col).find({"id": idx}), None) temp_doc = next(factory.get_collection(temp_col).find({"id": idx}), None) if prod_doc and temp_doc: if not temp_doc["content"] == prod_doc["content"]: title = temp_doc["content"]["title"] conflicts.append({"id": idx, "title": title}) print("Conflicts: {}".format(conflicts)) factory.get_collection(conflict_col).create_index([("title", 1)], unique=True) for conflict in conflicts: try: factory.post_document(conflict, conflict_col) except pymongo.errors.DuplicateKeyError: # In case there are dupliacte, unsolved conflicts pass # Update production collection db = factory.get_database() try: db.get_collection(prod_col).rename("old_prod") except pymongo.errors.OperationFailure: # If the prod collection does not exist pass try: db.get_collection(temp_col).rename(prod_col) except Exception as e: print("Failed to update production db collection") print(e) db.get_collection("old_prod").rename(prod_col) finally: db.get_collection("old_prod").drop() db.get_collection(temp_col).drop() # Update all indexes factory.set_index(prod_col) factory.set_index(manual_col) factory.set_index(temp_col) # Removes duplicates factory.get_collection(unknown_col).create_index([("query_text", 1)], unique=True) return conflicts
import json import pymongo from chatbot.model.model_factory import ModelFactory fact = ModelFactory.get_instance() fact.set_db() def test_get_document(): path = "chatbot/model/test/test_data/test_data_model_factory.json" with open(path, "r") as f: data = json.load(f) fact.get_database().drop_collection("test") try: fact.post_document(data[0], "test") fact.post_document(data[1], "test") fact.get_collection("test").create_index( [("keywords", pymongo.TEXT), ("content.keywords.keyword", pymongo.TEXT)], default_language="norwegian") # Need to create index for manual incase the collection does not exist # yet fact.set_index("manual") # Test first document doc = fact.get_document("emne test", prod_col="test") assert doc[0]["content"] == data[0]["content"]