def main(input_filepath, output_folder, k):
    """Cluster the TF-IDF scores with K-Means and write the result files.

    Reads the TF-IDF score table from ``input_filepath``, fits a ``KMeans``
    model with ``k`` clusters, writes a per-``k`` report file and then
    creates or updates the shared plot-results table.

    Args:
        input_filepath: Location of the TF-IDF scores CSV. Its first column
            is dropped before the remaining values are used for fitting.
        output_folder: Destination prefix. It is concatenated directly with
            the report / table filenames, so it must already end with a
            path separator.
        k: Number of clusters; also inserted into the report filename as
            ``<name>-<k><ext>``.
    """
    logger = logging.getLogger(__name__)
    logger.info(
        'Training the K-Means clustering algorithm based on the TF-IDF scores')

    # Get the models/tf-idf-scores.csv file
    dataset = pd.read_csv(input_filepath)
    # Lazy %-formatting also works when input_filepath is not a plain str.
    logger.info('Loaded data file %s with %d rows',
                input_filepath, len(dataset))

    # Drop the first column and keep the remaining values for fitting
    x = dataset.drop(dataset.columns[0], axis=1).values
    vector_dict = generate_vector_dict(dataset)

    # Number of clusters and max. number of iterations
    km = KMeans(k=k, max_iterations=500)
    km.fit(x)
    clusters = km.get_clusters(vector_dict)

    # Based on the value of K used, change the destination filename.
    # os.path.splitext only inspects the final path component, so a dot in
    # the folder part or a report name without an extension cannot produce
    # a broken path (a bare rsplit('.') could).
    report_root, report_ext = os.path.splitext(
        output_folder + MODEL_REPORT_FILENAME)
    output_filepath = report_root + '-' + str(k) + report_ext

    # Calculate SSE and MSC
    sse_score = km.get_sse_score()
    logger.info('SSE Score: %s', sse_score)
    msc_score = km.get_msc_avg()
    logger.info('MSC Score: %s', msc_score)

    # Generate the results report
    generate_report(clusters, sse_score, msc_score, output_filepath)
    logger.info('Created report file on %s', output_filepath)

    # Generate / update the results table for future plots
    plot_table_path = output_folder + PLOT_TABLE_FILENAME
    if os.path.isfile(plot_table_path):
        # Reuse the existing table, indexed by its 'K Size' column
        results_table = pd.read_csv(plot_table_path)
        results_table.set_index('K Size', inplace=True)
    else:
        # No table yet: start from a freshly created one
        results_table = create_plot_results_table()

    k_means_results = update_plot_results_table(
        results_table, (k, sse_score, msc_score))
    k_means_results.to_csv(plot_table_path, encoding='utf-8')
    logger.info('Updated report table on %s', plot_table_path)