def __get_all_dice_neighbours(posting_list, index, trainingdata_handler, document_filters):
    """Collect and rank the index terms that co-occur with a posting list.

    Every document referenced by ``posting_list`` is loaded, filtered and
    converted to index terms. Each term that is present in the index is
    scored once with ``__calculate_dice_coefficient`` against
    ``posting_list``; only terms with a strictly positive score are kept.

    Args:
        posting_list: Iterable of postings; element ``0`` of each posting is
            used (stringified) as the document file id.
        index: Index dictionary providing ``"index_type"`` and ``"index"``
            (the mapping from index term to its postings).
        trainingdata_handler: Object exposing
            ``get_training_data_file_string(document_file_id)``.
        document_filters: Filters forwarded unchanged to
            ``preprocessing_filters.apply_filters_to_document``.

    Returns:
        A list of ``(index_term, dice_coefficient)`` tuples sorted by
        coefficient in descending order.
    """
    index_type = index["index_type"]
    index_map = index["index"]
    to_index_term = n_gram_handler.get_to_index_term_function(index_type)
    to_freq_dist = n_gram_handler.get_to_freq_dist_function(index_type)

    scores = {}
    for post in posting_list:
        file_id = str(post[0])
        text = trainingdata_handler.get_training_data_file_string(file_id)
        terms = preprocessing_filters.apply_filters_to_document(text, document_filters)
        candidates = [to_index_term(term) for term in to_freq_dist(terms)]

        for candidate in candidates:
            # Skip terms that already have a score and terms unknown to the index.
            if candidate in scores or candidate not in index_map:
                continue
            coefficient = __calculate_dice_coefficient(posting_list, candidate, index_map)
            if coefficient > 0:
                scores[candidate] = coefficient

    # sorted() is stable, so ties keep the order in which terms were first scored.
    return sorted(scores.items(), key=itemgetter(1), reverse=True)
def create_index(index_specification):
    """Build an inverted index for the dataset named in a specification.

    Args:
        index_specification: Mapping that must contain ``"dataset_id"``,
            ``"index_type"`` and ``"filters"``. All of its entries are copied
            into the returned index as meta information.

    Returns:
        A dictionary holding, in this order: ``"id"`` (from
        ``get_index_id``), every entry of ``index_specification``,
        ``"index"`` (index term -> list of ``(document_id, frequency)``
        postings), ``"n_terms"``, ``"n_documents"`` and ``"max_frequency"``.
    """
    dataset_id = index_specification["dataset_id"]
    index_type = index_specification["index_type"]
    filter_names = index_specification["filters"]

    # Meta information: the computed id first, then a copy of the specification
    # (a specification entry named "id" therefore overrides the computed one).
    index = {"id": get_index_id(index_specification)}
    index.update((key, index_specification[key]) for key in index_specification)

    # Training data handler and the helpers matching the index type.
    training_dataset_handler = TrainingDatasetHandler(dataset_id)
    to_index_term = n_gram_handler.get_to_index_term_function(index_type)
    to_freq_dist = n_gram_handler.get_to_freq_dist_function(index_type)

    # Build the postings, one (document_id, frequency) entry per term and document.
    postings_by_term = {}
    index["index"] = postings_by_term
    n_documents = 0
    for n_documents, document_data in enumerate(training_dataset_handler, start=1):
        document_id = document_data[0]
        document = document_data[1]
        document_terms = preprocessing_filters.apply_filters_to_document(document, filter_names)
        freq_dist = to_freq_dist(document_terms)
        for document_term in freq_dist:
            term_postings = postings_by_term.setdefault(to_index_term(document_term), [])
            term_postings.append((document_id, freq_dist[document_term]))

    index["n_terms"] = len(postings_by_term)
    index["n_documents"] = n_documents
    index["max_frequency"] = __get_frequency_of_most_common_term(postings_by_term)
    return index