Example #1
0
def test_insert_document_and_check_conflict():
    """
    Insert a document that conflicts with prod and verify the whole conflict flow.

    The test loads the in_progress fixture, changes the first text of the first document so its
    content is guaranteed to differ from what is stored, and passes it to insert_documents.
    It then checks that:
      * insert_documents reports the expected conflict id and title,
      * the modified document is now the one stored in the prod collection,
      * util.check_manually_changed returns the manual document rather than the prod document
        (expected first text: "El manual changos").

    NOTE(review): relies on database state that is not set up here (a matching entry in the
    manual collection and an older version in prod) - confirm the fixtures that provide it.
    """
    doc_id = "295cc564fe771fbb92b3278a6eee2d5cbcae2606-3"
    correct_conflicts = [{
        "conflict_id": doc_id,
        "title": " Velkommen til Trondheim kommune"
    }]

    # Use a context manager so the fixture file handle is closed even if parsing fails.
    with open("model/test/test_data/test_data_in_progress.json") as f:
        serialized_data = json.load(f)

    # Add a random number to the text to make sure the new document really has changed.
    random_text = "Inserted_document_website_change: " + str(
        random.randint(1, 100000))
    serialized_data[0]["content"]["texts"][0] = random_text

    # Check if the document was a conflict.
    conflict_ids = insert_documents(serialized_data)
    assert correct_conflicts[0]["conflict_id"] == conflict_ids[0][
        "conflict_id"]
    assert correct_conflicts[0]["title"] == conflict_ids[0]["title"]

    # Fetch the document from the prod collection.
    factory = ModelFactory.get_instance()
    util.set_db(factory)
    document = next(
        factory.get_database().get_collection("prod").find({"id": doc_id}),
        None)
    # Fail with a clear assertion instead of a TypeError when the document is missing.
    assert document is not None
    assert document["content"]["texts"][0] == random_text

    # Get the manually changed document.
    manually_changed_doc = util.check_manually_changed(factory, document)
    # Check if we actually got the manually changed document.
    assert manually_changed_doc["content"]["texts"][0] == "El manual changos"
Example #2
0
def insert_documents(data):
    """
    Stage freshly scraped documents, detect conflicts with manual entries and promote the
    staged data to production.

    How we use MongoDB - there are 3 different collections:
        One for manual entries called "manual"
        One for production called "prod"
        One for the in_progress collection called "in_progress"

    All scraped data is added to "in_progress". Then every entry in "manual" is used to query
    both "prod" and "in_progress" by ID. If the contents of the two matches differ, something
    changed since the last run and someone needs to be alerted that the manual entry must be
    updated, so the ID is recorded as a conflict. When this is done the old "prod" is kept as
    the backup "prod2" and "in_progress" becomes the new "prod".

    :param data: a non-empty list of serialized documents that should be inserted.
    :return: a list of dicts {"conflict_id": <document id>, "title": <in_progress title>},
        one per manual entry whose prod and in_progress contents differ.
    :raises ValueError: if data is empty. Nothing is modified in that case; promoting an
        empty staging collection would replace prod with nothing.
    """
    # Reject empty input before touching the database. Previously this case dropped
    # "in_progress" and then crashed with a NameError on the unbound loop counter.
    if not data:
        raise ValueError("insert_documents requires at least one document")

    factory = ModelFactory.get_instance()

    util.set_db(factory)

    # Start from a clean staging collection.
    factory.get_database().drop_collection("in_progress")

    print('Starting insertion of {} documents'.format(len(data)))
    pbar = ProgressBar()
    for doc in pbar(data):
        factory.post_document(doc, "in_progress")
    print('Successfully inserted {} documents'.format(len(data)))

    manual_documents = factory.get_collection("manual").find()

    # These are the IDs of the documents that are changed in manual and have been changed since
    # last time.
    conflict_ids = []
    for manual_document in manual_documents:
        # Manual entries without an "id" cannot be matched against scraped data.
        if "id" not in manual_document:
            continue
        doc_id = manual_document["id"]

        # Flag the staged document as having a manual override.
        # NOTE(review): Collection.update is deprecated since PyMongo 3.0 and removed in 4.0;
        # switch to update_one when the driver is upgraded.
        factory.get_database().get_collection("in_progress").update(
            {"id": doc_id}, {"$set": {
                "manually_changed": True
            }})

        prod_match = factory.get_collection("prod").find({"id": doc_id})
        in_progress_match = factory.get_collection("in_progress").find(
            {"id": doc_id})

        prod_match_doc = next(prod_match, None)
        in_progress_doc = next(in_progress_match, None)

        # Only a document present in both collections can be compared.
        if (prod_match_doc and in_progress_doc
                and prod_match_doc['content'] != in_progress_doc['content']):
            conflict_ids.append({
                "conflict_id": doc_id,
                "title": in_progress_doc["content"]["title"]
            })

    print("Conflict IDs are", conflict_ids)
    # Set ID to be unique.
    factory.get_collection("conflict_ids").create_index([("conflict_id", 1)],
                                                        unique=True)
    # Insert all the conflict ids into our collection.
    for conflict in conflict_ids:
        try:
            factory.post_document(conflict, "conflict_ids")
        except pymongo.errors.DuplicateKeyError:
            # Then we already know this is a conflict ID and should not be added again to the list.
            pass

    # Delete the backup prod and rename prod to prod2 and then rename in_progress to prod.
    factory.get_database().drop_collection("prod2")
    try:
        factory.get_database().get_collection("prod").rename("prod2")
    except pymongo.errors.OperationFailure:
        # Deliberately ignored; presumably "prod" does not exist yet on the first run.
        pass
    factory.get_database().get_collection("in_progress").rename("prod")

    util.set_index("in_progress", factory)
    util.set_index("prod", factory)
    util.set_index("manual", factory)
    # Set query_text to be unique.
    factory.get_collection("unknown_queries").create_index([("query_text", 1)],
                                                           unique=True)

    return conflict_ids
Example #3
0
import model.db_util as db_util
import api.flask.flask_util as flask_util
from model.ModelFactory import ModelFactory
from model.keyword_gen import lemmatize_content_keywords
import json
from flask import request, Blueprint

# Flask blueprint that the website API routes in this module are registered on.
web_api = Blueprint('Website API', __name__, template_folder='templates')

# Module-level model factory, obtained once at import time and used by the route handlers.
factory = ModelFactory.get_instance()

# Runs at import time; presumably points the factory at the configured database -
# confirm against db_util.set_db.
db_util.set_db(factory)


@web_api.route("/v1/web/conflict_ids", methods=["GET"])
def get_all_conflict_ids():
    """
    List every entry stored in the "conflict_ids" collection.

    :return: a JSON-encoded list of {"id": "...", "title": "..."} objects
    """
    stored_conflicts = factory.get_collection("conflict_ids").find()
    # Expose the stored "conflict_id" field under the shorter "id" key.
    payload = [
        {"id": entry["conflict_id"], "title": entry["title"]}
        for entry in stored_conflicts
    ]
    return json.dumps(payload)


@web_api.route("/v1/web/content/", methods=["GET"])
def get_content():
    """
    :return: the content of the prod document and manual document (if we have it)