def get_gold_standard_categorization(self):
    """Build the reference categorization from the category directories.

    For every name returned by ``__get_all_category_directory_names`` the
    files found under that category's path are listed through
    ``dataset_handler.get_names_of_files_in_directory``.

    Returns:
        A dict keyed by ``n_gram_handler.string_to_index_term(category)``
        whose values are whatever the directory listing returned for that
        category. If two category names yield the same index term, the one
        processed last wins.
    """
    categorization = {}
    for directory_name in self.__get_all_category_directory_names():
        documents = dataset_handler.get_names_of_files_in_directory(
            self.__get_category_path(directory_name)
        )
        # Key by index term, not by the raw directory name.
        index_term = n_gram_handler.string_to_index_term(directory_name)
        categorization[index_term] = documents
    return categorization
def get_seed_words_posting_lists(seed_words_spec, index_directory, indices_id):
    """Collect merged posting lists for every reference word group.

    Args:
        seed_words_spec: Mapping of category -> mapping of reference word
            group id -> iterable of reference words.
        index_directory: Location handed to ``cache.load`` for each index.
        indices_id: Iterable of index ids to load and merge, in order.

    Returns:
        A dict of category -> {group id -> postings}. The postings of a
        group are produced by folding ``index_operations.
        get_merged_posting_lists`` over every reference word of the group
        that occurs in each loaded index. Group entries are created only
        while an index is being processed, so with no index ids every
        category maps to an empty dict.
    """
    # One (initially empty) group map per category.
    postings_by_category = {category: {} for category in seed_words_spec}

    for index_id in indices_id:
        loaded = cache.load(index_directory, index_id)
        print("merging postining_list for index:")
        print(loaded["index_type"])
        term_index = loaded["index"]

        for category in seed_words_spec:
            word_groups = seed_words_spec[category]
            category_postings = postings_by_category[category]

            for group_id in word_groups:
                # A group starts out empty the first time it is seen.
                category_postings.setdefault(group_id, [])

                for word in word_groups[group_id]:
                    term = n_gram_handler.string_to_index_term(word)
                    if term not in term_index:
                        continue
                    # Fold this term's postings into what the group has so far.
                    category_postings[group_id] = (
                        index_operations.get_merged_posting_lists(
                            category_postings[group_id], term_index[term]
                        )
                    )

    return postings_by_category
def seed_words_to_index_terms(given_seed_words):
    """Replace every seed word with its index term, in place.

    Args:
        given_seed_words: Mapping of category -> mapping of group id ->
            iterable of words. It is modified by this call.

    Returns:
        The same ``given_seed_words`` object, where each group now holds a
        new list of ``n_gram_handler.string_to_index_term(word)`` results.
    """
    for category in given_seed_words:
        groups = given_seed_words[category]
        for group_id in groups:
            # Rebinding an existing key is safe while iterating the dict.
            groups[group_id] = [
                n_gram_handler.string_to_index_term(word)
                for word in groups[group_id]
            ]
    return given_seed_words