# Beispiel #1 ("Example #1", value 0) — stray extraction residue, kept as a comment
 def start_loading(self, project):
     """Start loading *project* asynchronously on a background thread.

     Wires the loader's lifecycle signals through to this object's own
     signals, then starts the worker thread at idle priority.
     """
     # Load self.filename into the cache first; result is discarded, so this
     # presumably pre-warms the cache for the loader — TODO confirm.
     cache.load(self.filename)
     self.project_loader = ProjectLoader(project)
     # Re-emit the loader's signals through this object's matching signals.
     self.project_loader.started.connect(self.started)
     self.project_loader.progress.connect(self.progress)
     self.project_loader.loaded.connect(self.loaded)
     # IdlePriority keeps the loading thread from starving the UI thread.
     self.project_loader.start(QtCore.QThread.IdlePriority)
def __calculate_top_n_dice_neighbours_for_index(temporary_keyword_cache_id,
                                                n, index_id, index_directory,
                                                posting_lists, trainingdata_handler,
                                                training_data_filters):
    """Compute the first n dice neighbours for one index and cache the result.

    For every (category, reference word group id) in ``posting_lists`` the
    dice neighbours against the index identified by ``index_id`` are computed
    and truncated to the first ``n`` entries; the resulting nested dict is
    written to the raw-keyword-by-index cache under
    ``temporary_keyword_cache_id``.
    """
    # One empty neighbour list per (category, reference word group).
    top_dice_neighbours = {
        category: {group_id: [] for group_id in groups}
        for category, groups in posting_lists.items()
    }

    index = cache.load(index_directory, index_id)
    print("Index id "+ index_id)

    n_categories_left = len(posting_lists)
    for category, posting_lists_category in posting_lists.items():
        #TODO remove
        print("Calculate keyword for category "+category)
        n_posts = sum(len(postings) for postings in posting_lists_category.values())
        print("Number of posts for category:" + str(n_posts))
        print("Number of categories left "+ str(n_categories_left))
        n_categories_left -= 1
        for group_id, group_postings in posting_lists_category.items():
            neighbours = __get_all_dice_neighbours(group_postings, index, trainingdata_handler, training_data_filters)
            top_dice_neighbours[category][group_id] = neighbours[:n]

    cache.write(__RAW_KEYWORD_BY_INDEX_CACHE,temporary_keyword_cache_id, top_dice_neighbours)
def get_dice_keywords(keyword_specification):
    """Produce dice-coefficient based keywords for ``keyword_specification``.

    Raw (unfiltered) keywords are loaded from the raw-keyword cache when
    available, otherwise computed and cached. Reference-word filters are
    then applied, dice scores stripped, and the new reference words merged
    with the given ones.

    Returns a dict with keys ``"reference_words"`` and ``"context_words"``.
    """
    training_dataset_spec = keyword_specification["training_dataset"]
    given_reference_words = keyword_specification["given_reference_words"]
    seed_id = keyword_specification["seed_id"]

    # Fetch the unfiltered keywords from cache, computing them on a miss.
    raw_keywords_id = keyword_setup_id_generator.get_no_filtered_keywords_id(seed_id, training_dataset_spec)
    if cache.in_cache(__RAW_KEYWORD_CACHE, raw_keywords_id):
        raw_keywords = cache.load(__RAW_KEYWORD_CACHE, raw_keywords_id)
    else:
        raw_keywords = __calculate_raw_dice_keywords(keyword_specification)
        cache.write(__RAW_KEYWORD_CACHE, raw_keywords_id, raw_keywords)

    # Apply the reference-word filters against the index.
    index_spec = {
        "index_directory": training_dataset_spec["index_directory"],
        "index_id": training_dataset_spec["index_id"],
    }
    new_reference_words = keyword_filters.apply_reference_word_filters(
        keyword_specification["reference_word_filter"],
        given_reference_words,
        raw_keywords["raw_reference_words"],
        index_spec)

    # Strip the dice coefficients and merge with the user-provided seeds.
    new_reference_words = __remove_dice_coefficients(new_reference_words)
    new_context_words = __remove_dice_coefficients(raw_keywords["raw_context_words"])
    reference_words = __merge_new_and_given_reference_words(given_reference_words, new_reference_words)

    return {"reference_words":reference_words,"context_words":new_context_words}
def freqent_term_filter(new_reference_words, index_directory, indices_id, frequence_limit):
    """Apply the frequent-term filter cumulatively over every index.

    Each index in ``indices_id`` is loaded from ``index_directory`` and the
    filter applied with that index's document count and ``frequence_limit``;
    the progressively filtered reference words are returned.

    (Name typo "freqent" is kept — it is the public interface.)
    """
    filtered = new_reference_words
    for index_id in indices_id:
        loaded = cache.load(index_directory, index_id)
        filtered = __frequent_term_filtering(filtered, loaded["index"],
                                             loaded["n_documents"], frequence_limit)
    return filtered
def fix_olympic_games_2(keywords_no_filter_id):
    """Repair a cached raw keyword setup in place.

    Renames the "olympic games" category to "olympic_games" and removes any
    multi-word keyword from each category's '0' slot, then writes the fixed
    structure back to the raw-keyword cache.

    Returns the (possibly fixed) keywords, or None when nothing is cached
    under ``keywords_no_filter_id``.
    """
    # Bug fix: the original fell through on a cache miss and raised
    # UnboundLocalError at `return keywords`; return None explicitly instead.
    if not cache.in_cache(__RAW_KEYWORD_CACHE, keywords_no_filter_id):
        return None

    keywords = cache.load(__RAW_KEYWORD_CACHE, keywords_no_filter_id)
    for keyword_type in list(keywords.keys()):
        if "olympic games" in keywords[keyword_type]:
            print("Fix olympic game in keyword directory " + keywords_no_filter_id)
            keywords[keyword_type]["olympic_games"] = keywords[keyword_type].pop("olympic games")
        for category in keywords[keyword_type]:
            # Keep only single-word keywords in the '0' slot.
            keywords[keyword_type][category]['0'] = [
                keyword for keyword in keywords[keyword_type][category]['0']
                if " " not in keyword
            ]
    cache.write(__RAW_KEYWORD_CACHE, keywords_no_filter_id, keywords)
    return keywords
def fix_olympic_games_1(keywords_id):
    """Repair a cached (final) keyword setup in place.

    For both "reference_words" and "context_words": renames the
    "olympic games" category to "olympic_games" and removes any multi-word
    keyword from each category's '0' slot, then writes the fixed structure
    back to the "keywords" cache.

    Returns the (possibly fixed) keywords, or None when nothing is cached
    under ``keywords_id``.
    """
    # Bug fix: the original fell through on a cache miss and raised
    # UnboundLocalError at `return keywords`; return None explicitly instead.
    if not cache.in_cache("keywords", keywords_id):
        return None

    keywords = cache.load("keywords", keywords_id)
    for keyword_type in ("reference_words", "context_words"):
        if "olympic games" in keywords[keyword_type]:
            print("Fix olympic game in keyword directory " + keywords_id)
            keywords[keyword_type]["olympic_games"] = keywords[keyword_type].pop("olympic games")
        for category in keywords[keyword_type]:
            # Keep only single-word keywords in the '0' slot.
            keywords[keyword_type][category]['0'] = [
                keyword for keyword in keywords[keyword_type][category]['0']
                if " " not in keyword
            ]
    cache.write("keywords", keywords_id, keywords)
    return keywords
def prepare_keywords(experiment_spec,keyword_cache_directory, index_cache_directory):
    """Ensure the keyword setup for ``experiment_spec`` exists in the keyword cache.

    Manual setups are copied verbatim from their specification; otherwise a
    cached setup is reused when present, and a new one is generated through
    ``keyword_factory`` and written to ``keyword_cache_directory``.

    Returns None; the result is the cache side effect.
    """
    keyword_spec = experiment_spec["keywords"]
    keyword_setup_id = keyword_spec["setup_id"]
    keyword_id = keyword_setup_id_generator.get_keyword_setup_id(
        keyword_setup_id, experiment_spec["training_dataset"])

    # Manual keyword setups are stored straight from their specification file.
    if keyword_spec["keyword_generate_algorithm"] == "manual":
        keywords = specification_handler.get_specification("keyword_setups", keyword_setup_id)
        cache.write(keyword_cache_directory, keyword_id, keywords)
        print("Manual keyword now stored in cache")
        return

    # Reuse an already-generated setup when one is cached.
    if cache.in_cache(keyword_cache_directory, keyword_id):
        print("Keyword stored in cache: "+keyword_id)
        keywords = cache.load(keyword_cache_directory, keyword_id)
        keyword_factory.print_keyword_setup_to_json(keyword_id, keywords)
        return

    # Bug fix: the original wrote index_directory/index_id straight into
    # experiment_spec["training_dataset"], mutating the caller's dict as a
    # side effect; build the generation spec on a shallow copy instead.
    training_data_spec = dict(experiment_spec["training_dataset"])
    training_data_spec["index_directory"] = index_cache_directory
    training_data_spec["index_id"] = get_all_index_indices(training_data_spec)

    keyword_seed_id = keyword_spec["seed_id"]
    given_reference_words = specification_handler.get_specification(
        __KEYWORD_SEEDS_DIRECTORY_SPECIFICATION, keyword_seed_id)
    given_reference_words = seed_words_to_index_terms(given_reference_words)

    create_new_keywords_spec = {
        "seed_id": keyword_seed_id,
        "training_dataset": training_data_spec,
        "given_reference_words": given_reference_words,
        "keyword_generate_algorithm": keyword_spec["keyword_generate_algorithm"],
        "parameters": keyword_spec["parameters"],
        "reference_word_filter": keyword_spec["reference_word_filter"],
    }
    keywords = keyword_factory.get_keywords(create_new_keywords_spec, keyword_id)
    cache.write(keyword_cache_directory, keyword_id, keywords)
def __get_top_n_dice_neighbours(n, posting_lists, index_directory, indices_id, trainingdata_handler, training_data_filters, raw_keyword_id):
    """Return the top ``n`` dice neighbours per (category, reference word group),
    aggregated over every index in ``indices_id``.

    Per-index results are cached under a temporary id so an interrupted run
    can resume without recomputation.
    """
    def cache_id_for(index_id):
        # Temporary per-index cache key for the raw keyword results.
        return "temporary_cached_keywords" + "_"+raw_keyword_id + "_"+index_id

    # Ensure every index has its per-index neighbours cached.
    for index_id in indices_id:
        if not cache.in_cache(__RAW_KEYWORD_BY_INDEX_CACHE, cache_id_for(index_id)):
            __calculate_top_n_dice_neighbours_for_index(cache_id_for(index_id), n,
                                                        index_id, index_directory,
                                                        posting_lists, trainingdata_handler,
                                                        training_data_filters)
        print("All keyword for index calculated "+ index_id)

    # Load every per-index result back from the cache.
    top_dice_neighbours_by_index = {
        index_id: cache.load(__RAW_KEYWORD_BY_INDEX_CACHE, cache_id_for(index_id))
        for index_id in indices_id
    }

    # Merge the per-index neighbour lists into one structure.
    top_dice_neighbours = {
        category: {group_id: [] for group_id in posting_lists[category]}
        for category in posting_lists
    }
    for index_id in indices_id:
        per_index = top_dice_neighbours_by_index[index_id]
        for category in per_index:
            for group_id in per_index[category]:
                top_dice_neighbours[category][group_id].extend(per_index[category][group_id])

    # Sort merged neighbours by dice coefficient (entry[1]) and keep the n best.
    for groups in top_dice_neighbours.values():
        for group_id in groups:
            groups[group_id].sort(key=itemgetter(1), reverse=True)
            groups[group_id] = groups[group_id][:n]
    return top_dice_neighbours
def get_seed_words_posting_lists(seed_words_spec, index_directory, indices_id):
    """Build merged posting lists keyed by category and reference word group id.

    Every index in ``indices_id`` is loaded in turn; for each reference word
    found in that index, its posting list is merged into the group's
    accumulated posting list.
    """
    posting_lists = {category: {} for category in seed_words_spec}

    for index_id in indices_id:
        loaded = cache.load(index_directory, index_id)
        print("merging postining_list for index:")
        print(loaded["index_type"])
        index = loaded["index"]
        for category, reference_word_groups in seed_words_spec.items():
            for group_id, reference_words in reference_word_groups.items():
                merged = posting_lists[category].setdefault(group_id, [])
                for reference_word in reference_words:
                    reference_term = n_gram_handler.string_to_index_term(reference_word)
                    if reference_term in index:
                        merged = index_operations.get_merged_posting_lists(merged, index[reference_term])
                posting_lists[category][group_id] = merged
    return posting_lists