def KNearestNeighbourController(training_paths, training_labels, test_paths=None, test_labels=None, k=1, tf_idf=False):
    """
    Controller function to co-ordinate the K-Nearest neighbour algorithm.

    Runs in one of two modes, selected by ``test_paths``:

    * Validation run (``test_paths`` is ``None`` or empty): each training file
      is held out in turn, classified against the remaining training vectors,
      and its known and calculated labels are printed.
    * Test run: each file in ``test_paths`` is classified against the training
      vectors and its known and calculated labels are printed.

    Args:
        training_paths: Passed to ``readTextFromFilePaths`` to load the
            training texts.
        training_labels: Mapping queried with ``.get(file_name, "Unknown")``
            for the known label of a training file; also handed to the voting
            step.
        test_paths: Optional iterable of "/"-separated path strings. The last
            path component is used as the test file's name.
        test_labels: Optional mapping from test file name to its known label.
            When omitted, every known label is reported as "Unknown".
        k: Number of neighbours forwarded to the neighbour search and the
            voting step.
        tf_idf: When true, the vectors are passed through ``tfIdfTransform``
            before the neighbour search.

    Returns:
        None. All results are printed.
    """
    # Read the text of every training file once, up front
    training_text_dict = readTextFromFilePaths(training_paths)

    # Validation run: keep one file aside at a time and classify it against the rest
    if not test_paths:
        for validation_file in training_text_dict:
            print("Validation being done on: {0}".format(validation_file))

            # Vectors are rebuilt on every pass: with tf_idf off, the pop() below
            # removes the held-out entry from this very dict.
            vectors, _ = createTrainingVectors(training_text_dict)

            # Performing tf-idf transform on all the vectors (held-out file included)
            tfidf_transform_dict = tfIdfTransform(vectors) if tf_idf else vectors

            # Separating the validation vector from the training ones
            validation_vector = tfidf_transform_dict.pop(validation_file)
            training_vectors = tfidf_transform_dict

            # Finding the nearest neighbours by passing in a manual k
            nearest_neighbours = findNearestNeighbour(training_vectors, validation_vector, k)

            # Finding the genre with the concept of voting by neighbours
            genre = detectClassOfTestSet(nearest_neighbours, training_labels, k)
            print(
                "Known label: {0} Calculated Label: {1}".format(training_labels.get(validation_file, "Unknown"), genre)
            )

    # Final test run: classify every test file against the full training set
    else:
        # test_labels is optional; without this fallback the .get() below would
        # raise AttributeError on None.
        known_labels = test_labels if test_labels is not None else {}

        # Creating the training vectors from the training dictionary
        vectors, zero_vector = createTrainingVectors(training_text_dict)

        # Finding the genre of a test file one file at a time
        for test_file_path in test_paths:
            test_file = test_file_path.split("/")[-1]
            print("\nTest file is : {0}".format(test_file))

            # Reading the text of the test file on its own
            test_text_dict = readTextFromFilePaths([test_file_path])

            # Running the k nearest neighbours algorithm
            genre = runKNearestNeighbours(vectors, test_text_dict, test_file, training_labels, zero_vector, k, tf_idf)
            print("Known label: {0} Calculated Label: {1}".format(known_labels.get(test_file, "Unknown"), genre))

            # runKNearestNeighbours stores the test vector in `vectors`. With tf-idf on
            # it pops from the transformed dict instead, so the entry may still be
            # present here (presumably tfIdfTransform returns a new dict - verify).
            # Drop it so it is not treated as training data for the next test file.
            if tf_idf:
                vectors.pop(test_file, None)
def runKNearestNeighbours(
    training_vectors, tokenized_test_file, test_file, training_labels, zero_vector, k, tf_idf=False
):
    """
    Classify a single test file against the training vectors.

    The test vector is built with ``createTestVectors`` from
    ``tokenized_test_file[test_file]`` and ``zero_vector``, and stored in
    ``training_vectors`` under ``test_file``. It is then separated out again
    before the neighbour search.

    Note on side effects: ``training_vectors`` is modified in place. With
    ``tf_idf`` off, the test entry is popped from that same dict before
    returning. With ``tf_idf`` on, the pop is applied to whatever
    ``tfIdfTransform`` returns instead.

    Args:
        training_vectors: Dict of vectors keyed by file name.
        tokenized_test_file: Mapping indexed by ``test_file`` to obtain the
            test file's content.
        test_file: Key identifying the test file.
        training_labels: Labels handed to ``detectClassOfTestSet``.
        zero_vector: Second argument to ``createTestVectors``.
        k: Number of neighbours for the search and the vote.
        tf_idf: When true, run ``tfIdfTransform`` over all vectors first.

    Returns:
        Whatever ``detectClassOfTestSet`` returns for the nearest neighbours.
    """
    # Add the test file's vector alongside the training ones so that any
    # transform sees all of them together
    training_vectors[test_file] = createTestVectors(tokenized_test_file[test_file], zero_vector)

    # Optionally re-weight every vector with tf-idf
    if tf_idf:
        candidates = tfIdfTransform(training_vectors)
    else:
        candidates = training_vectors

    # Pull the test vector back out; what is left is the set to search
    query_vector = candidates.pop(test_file)

    # Neighbour lookup followed by the vote among those neighbours
    neighbours = findNearestNeighbour(candidates, query_vector, k)
    return detectClassOfTestSet(neighbours, training_labels, k)