def clean_multi():
    """
    Deletes the multilingual data from the db.

    Drops the multilingual parallels collection, then, for every file
    document that lists available languages, resets each of its segments'
    'parallel_ids_multi' to an empty list and clears the file's
    'available_lang' list.
    """
    db = get_database()
    db.delete_collection(COLLECTION_PARALLELS_MULTI)
    files_col = db.collection(COLLECTION_FILES)
    segments_col = db.collection(COLLECTION_SEGMENTS)
    for file_doc in files_col.all():
        # Skip files that carry no multilingual data.
        if "available_lang" not in file_doc or len(file_doc["available_lang"]) == 0:
            continue
        for seg_key in file_doc["segment_keys"]:
            segment_doc = segments_col.get(seg_key)
            try:
                if segment_doc:
                    segment_doc["parallel_ids_multi"] = []
                    segments_col.update(segment_doc)
            except (KeyError, AttributeError) as e:
                print(
                    "Could not remove multilingual parallels from segment. Error: ",
                    e)
            except DocumentInsertError as e:
                print(
                    f"Could not remove multilingual segment {seg_key}. Error: ",
                    e)
        file_doc["available_lang"] = []
        files_col.update(file_doc)
# Example #2
def clean_all_lang_db(current_lang):
    """
    Delete every document that belongs to *current_lang* from the db.

    :param current_lang: language code whose documents are removed
    """
    print("Cleaning data for language", current_lang)
    db = get_database()

    # (collection name, field holding the language) pairs, processed in
    # the same order the deletions originally ran.
    targets = (
        (COLLECTION_SEGMENTS, "lang"),
        (COLLECTION_PARALLELS, "src_lang"),
        (COLLECTION_PARALLELS_SORTED_BY_FILE, "lang"),
        (COLLECTION_MENU_CATEGORIES, "language"),
        (COLLECTION_MENU_COLLECTIONS, "language"),
        (COLLECTION_FILES_PARALLEL_COUNT, "language"),
        (COLLECTION_FILES, "language"),
    )
    for collection_name, lang_field in targets:
        db.collection(collection_name).delete_match({lang_field: current_lang})

    print("Cleaning data done.")
# Example #3
def clean_totals_collection_db():
    """
    Clear the categories_parallel_count collection.

    Drops the collection and immediately recreates it empty.
    """
    database = get_database()
    database.delete_collection(COLLECTION_CATEGORIES_PARALLEL_COUNT)
    database.create_collection(COLLECTION_CATEGORIES_PARALLEL_COUNT)
    print("totals collection cleaned.")
# Example #4
def load_menu_files(c):
    """
    Load menu categories and collections, then build their graph.

    :param c: invoke.py context object (unused by the body)
    """
    print("Loading menu collections...")
    database = get_database()
    # Loaders run in their original order.
    for loader in (load_all_menu_categories,
                   load_all_menu_collections,
                   create_collections_categories_graph):
        loader(database)

    print("Menu data loading completed!")
# Example #5
def clean_search_index_db():
    """
    Clear all the search index views and collections.

    Each deletion is attempted independently: previously a single failed
    collection delete aborted the whole cleanup (and view deletion ran
    inside the same try), leaving later collections and views untouched.
    """
    db = get_database()
    for name in INDEX_COLLECTION_NAMES:
        try:
            if db.has_collection(name):
                db.delete_collection(name)
        except CollectionDeleteError as e:
            print("Error deleting collection %s: " % name, e)
    for name in INDEX_VIEW_NAMES:
        try:
            db.delete_view(name)
        except CollectionDeleteError as e:
            print("Error deleting collection %s: " % name, e)
    clean_analyzers(db)
    print("search index cleaned.")
# Example #6
def load_parallel_counts(source_name: str, target_name: str,
                         total_length_count: list):
    """
    Store the totals document for one (source, target) collection pair.

    Does nothing when *total_length_count* is empty.

    :param source_name: key of the source collection
    :param target_name: key of the target collection
    :param total_length_count: aggregated per-category counts
    """
    if not total_length_count:
        return
    db = get_database()
    collection = db.collection(COLLECTION_CATEGORIES_PARALLEL_COUNT)
    try:
        # NOTE(review): the index is declared on every call — presumably
        # idempotent in the driver; confirm.
        collection.add_hash_index(["sourcecollection"], unique=False)
        collection.insert({
            "_key": source_name + "_" + target_name,
            "sourcecollection": source_name,
            "targetcollection": target_name,
            "totallengthcount": total_length_count,
        })
    except (DocumentInsertError, IndexCreateError) as e:
        print("Could not load file. Error: ", e)
# Example #7
def clean_all_collections_db():
    """
    Clear all the database collections completely.

    Deletes every document and edge collection, then the
    collections/categories graph. The first failing delete stops the run
    and is reported.
    """
    db = get_database()
    current_name = ""
    try:
        # Document collections first, then edge collections.
        for current_name in list(COLLECTION_NAMES) + list(EDGE_COLLECTION_NAMES):
            db.delete_collection(current_name)
        db.delete_graph(GRAPH_COLLECTIONS_CATEGORIES)
    except CollectionDeleteError as e:
        print("Error deleting collection %s: " % current_name, e)
    except GraphDeleteError as e:
        print("couldn't remove graph. It probably doesn't exist.", e)

    print("all collections cleaned.")
def load_multilingual_file(filepath):
    """
    Load one gzipped JSON file of multilingual parallels into the db.

    Registers the target language on the matching file document, inserts
    the parallels into the multilingual parallels collection and links
    them to their segments.

    :param filepath: path to a ``.json.gz`` file holding a list of
        parallel dicts (each with ``id``, ``root_segnr``, ``tgt_lang``)
    """
    db = get_database()
    db_multi_collection = db.collection(COLLECTION_PARALLELS_MULTI)
    db_segments_collection = db.collection(COLLECTION_SEGMENTS)

    print("Loading", filepath)
    with gzip.open(filepath, 'r') as current_file:
        json_data = json.load(current_file)

    # Nothing to do for an empty file.
    if not json_data:
        return

    # root_segnr entries look like "<filename>:<segment>"; the whole file
    # shares one source filename and one target language.
    filename = json_data[0]['root_segnr'][0].split(':')[0]
    tgt_lang = json_data[0]['tgt_lang']
    update_filename(filename, tgt_lang, db)

    # Arango needs an explicit _key; reuse the parallel's own id.
    for parallel in json_data:
        parallel["_key"] = parallel["id"]
    try:
        db_multi_collection.insert_many(json_data)
    except (DocumentInsertError, IndexCreateError) as e:
        # was an f-string with no placeholders (lint F541)
        print("Could not save multilingual parallels. Error: ", e)
    add_multi_parallels_to_segments(json_data, db_segments_collection)
# Example #9
def clean_pali(c):
    """
    Clear all the pali data from the database.

    NOTE(review): the body deletes every collection and the
    collections/categories graph — identical to clean_all_collections_db,
    not limited to pali data; confirm intent.

    :param c: invoke.py context object
    """
    db = get_database()
    current_name = ""
    try:
        for group in (COLLECTION_NAMES, EDGE_COLLECTION_NAMES):
            for current_name in group:
                db.delete_collection(current_name)
        db.delete_graph(GRAPH_COLLECTIONS_CATEGORIES)
    except CollectionDeleteError as e:
        print("Error deleting collection %s: " % current_name, e)
    except GraphDeleteError as e:
        print("couldn't remove graph. It probably doesn't exist.", e)

    print("all collections cleaned.")
# Example #10
def create_search_index(
    c,
    index_url_skt=DEFAULT_SOURCE_URL + "/search_index_sanskrit.json.gz",
    index_url_pli=DEFAULT_SOURCE_URL + "/search_index_pali.json.gz",
    index_url_tib=DEFAULT_SOURCE_URL + "/search_index_tibetan.json.gz",
    index_url_chn=DEFAULT_SOURCE_URL + "/search_index_chn.json.gz",
):
    """
    Load index data for search index from path defined in .env.

    Creates the analyzers and index collections, loads each language's
    index file, then builds the search views.
    """
    db = get_database()
    create_analyzers(db)
    for collection_name in INDEX_COLLECTION_NAMES:
        db.create_collection(collection_name)
    # Loaders run in the original order: Sanskrit, Pali, Chinese, Tibetan.
    load_search_index_skt(index_url_skt, db)
    load_search_index_pli(index_url_pli, db)
    load_search_index_chn(index_url_chn, db)
    load_search_index_tib(index_url_tib, db)
    create_search_views(db)
    print("Search index data loading completed.")
# Example #11
def clean_segment_collections_db():
    """
    Clear the segment database collections completely.

    Deletes the files/segments and files/parallels graphs, then empties
    the segment-related collections. Each deletion is attempted
    independently: previously one failure aborted all remaining cleanup,
    and the error message always blamed GRAPH_FILES_SEGMENTS even when a
    different object failed.
    """
    db = get_database()
    for graph_name in (GRAPH_FILES_SEGMENTS, GRAPH_FILES_PARALLELS):
        try:
            db.delete_graph(graph_name)
        except GraphDeleteError:
            # Report the graph that actually failed.
            print(
                f"couldn't remove graph: {graph_name}. It probably doesn't exist."
            )
    for name in (
        COLLECTION_SEGMENTS,
        COLLECTION_PARALLELS,
        COLLECTION_FILES,
        COLLECTION_FILES_PARALLEL_COUNT,
    ):
        try:
            empty_collection(name, db)
        except CollectionDeleteError:
            print(
                f"couldn't remove collection: {name}. It probably doesn't exist."
            )
    print("segment collections cleaned.")
# Example #12
def create_collections(
    c, collections=COLLECTION_NAMES, edge_collections=EDGE_COLLECTION_NAMES
):
    """
    Create empty collections in database

    :param c: invoke.py context object
    :param collections: Array of collection names to be created
    :param edge_collections: Array of edge collection names to be created
    """
    db = get_database()
    for name in collections:
        try:
            db.create_collection(name)
        except CollectionCreateError as e:
            print(f"Error creating collection {name}: ", e)
    for name in edge_collections:
        try:
            db.create_collection(name, edge=True)
        except CollectionCreateError as e:
            # Name the failing collection, matching the branch above
            # (the old message omitted it).
            print(f"Error creating edge collection {name}: ", e)
    print(f"created {collections} collections")
# Example #13
def load_segments_and_parallels_data_from_menu_file(menu_file_json, lang: str,
                                                    root_url: str) -> None:
    """
    Fetch one menu file's gzipped data and load its segments/parallels.

    :param menu_file_json: menu entry dict; its 'filename' names the file
    :param lang: language folder under *root_url*
    :param root_url: base URL of the data source
    """
    file_url = f"{root_url}{lang}/{menu_file_json['filename']}.json.gz"
    db = get_database()

    # Only gzipped files are supported.
    if not file_url.endswith("gz"):
        print(f"{file_url} is not a gzip file. Ignoring.")
        return

    segments, parallels = get_segments_and_parallels_from_gzipped_remote_file(
        file_url)

    if segments:
        keys, length_count, file_length_count = load_segments(
            segments, parallels, db)
        load_files_collection(menu_file_json, keys, lang, db)
        load_file_parallel_counts(menu_file_json, length_count,
                                  file_length_count, db)

    if parallels:
        load_parallels(parallels, db)
        load_parallels_sorted(parallels, db, menu_file_json['filename'])
# Example #14
def clean_menu_collections_db():
    """
    Clear the menu database collections completely.

    Deletes the collections/categories graph, then empties the menu
    document collections and the menu edge collections. Each deletion is
    attempted independently: previously a single failure aborted all
    remaining cleanup.
    """
    db = get_database()
    try:
        db.delete_graph(GRAPH_COLLECTIONS_CATEGORIES)
    except GraphDeleteError:
        print(
            f"couldn't remove object {GRAPH_COLLECTIONS_CATEGORIES}. It probably doesn't exist."
        )
    for name in (
        COLLECTION_MENU_COLLECTIONS,
        COLLECTION_MENU_CATEGORIES,
        COLLECTION_LANGUAGES,
    ):
        try:
            empty_collection(name, db)
        except CollectionDeleteError:
            print(f"couldn't remove object {name}. It probably doesn't exist.")
    for name in (
        EDGE_COLLECTION_LANGUAGE_HAS_COLLECTIONS,
        EDGE_COLLECTION_COLLECTION_HAS_CATEGORIES,
        EDGE_COLLECTION_CATEGORY_HAS_FILES,
    ):
        try:
            empty_collection(name, db, edge=True)
        except CollectionDeleteError:
            print(f"couldn't remove object {name}. It probably doesn't exist.")
    print("menu data collections cleaned.")
# Example #15
def add_sources(c):
    """
    Load source information into the database.

    :param c: invoke.py context object (unused by the body)
    """
    database = get_database()
    print("adding source information")
    load_sources(database, DEFAULT_SOURCE_URL)
# Example #16
def add_indices(c):
    """
    Create the database indices.

    :param c: invoke.py context object (unused by the body)
    """
    database = get_database()
    print("Creating Indices")
    create_indices(database)
    print("Creation of indices done.")
# Example #17
def calculate_parallel_totals() -> None:
    """
    Aggregate parallel counts between every pair of same-language collections.

    Goes over all the data and groups it into totals for the visual view:
    for each source collection, and for every collection of the same
    language, sums the per-category parallel counts over all files of each
    source category and stores the result via load_parallel_counts.

    This takes some time to run on the full dataset.
    """
    db = get_database()
    collection_query_cursor = db.aql.execute(
        menu_queries.QUERY_CATEGORIES_PER_COLLECTION)
    collections = [doc for doc in collection_query_cursor]

    # for each collection, the totals to each other collection of that same language are calculated
    for col in collections:
        language = col["language"]
        source_collection = col["collection"]
        # "categories" holds a list of single-entry dicts; merge them into
        # one {category_key: display_name} mapping (see the .items() loop
        # below, which unpacks key and name).
        source_col_dict = {}
        for source_cat in col["categories"]:
            source_col_dict.update(source_cat)

        language_collection_list = get_collection_list_for_language(
            language, collections)

        for target_collection in language_collection_list:
            # {category_key: display_name} for the target collection.
            selected_category_dict = get_categories_for_language_collection(
                target_collection, collections)

            # One [source_label, target_label, count] triple per
            # (source category, target category) pair.
            counted_parallels = []
            for category, cat_name in source_col_dict.items():
                all_files_cursor = db.aql.execute(
                    menu_queries.QUERY_FILES_PER_CATEGORY,
                    batch_size=100000,
                    bind_vars={
                        "category": category,
                        "language": language
                    },
                )
                all_files = [doc for doc in all_files_cursor]
                add_category_totals_to_db(
                    all_files,
                    category,
                    target_collection,
                    selected_category_dict,
                    language,
                )

                # Sum each target category's count over all files of this
                # source category; categories absent from a file count as 0.
                total_par_list = {}
                for filename in all_files:
                    # NOTE(review): each file doc carries a per-category
                    # count dict under "totallengthcount" — confirm schema.
                    parallel_count = filename["totallengthcount"]
                    for categoryname in selected_category_dict:
                        if categoryname not in total_par_list.keys():
                            if categoryname not in parallel_count.keys():
                                total_par_list[categoryname] = 0
                            else:
                                total_par_list[categoryname] = parallel_count[
                                    categoryname]
                        elif categoryname in parallel_count.keys():
                            total_par_list[categoryname] += parallel_count[
                                categoryname]

                # Labels are "<display name> (<key>)"; the target name is
                # rstripped and joined with "_(" as in the stored format.
                for key, value in total_par_list.items():
                    counted_parallels.append([
                        cat_name + " (" + category + ")",
                        selected_category_dict[key].rstrip() + "_(" + key +
                        ")",
                        value,
                    ])

            load_parallel_counts(source_collection, target_collection,
                                 counted_parallels)