def main():
    '''
    Runs cross validation on the input Twitter data.

    Parses the command-line arguments, loads the data through a DataManager,
    divides it into folds, and for every candidate number of topics k trains
    one LDA model per fold and scores it on the held-out fold. One row per k
    (with the likelihood averaged over the folds) is appended to the CSV file
    at args.results_path, and the k with the highest average likelihood is
    printed at the end.
    '''
    args = parser.parse_args()

    # Extract the data for LDA and divide into folds.
    dm = DataManager(args.train_path, 'twitter')
    if settings.DEBUG:
        print("Loading data...")

    # Time the process of loading in the data.
    start = time.perf_counter()
    # Load the data (possibly from the cache, if it exists).
    dm.load_data(args.cache_path)
    # The number of folds is passed in as a command-line arg.
    dm.divide_into_folds(args.num_folds)
    end = time.perf_counter()
    if settings.DEBUG:
        print(
            f"Preparing the data (loading, dividing into folds) took {end-start:0.4f} seconds."
        )

    # Initialize the best k and best likelihood.
    best_k = None
    best_likelihood = -float("inf")

    # Get the list of topic numbers to try as a command line arg too.
    possible_k_values = args.topic_numbers

    # Only a brand-new results file needs the header row. This must be decided
    # before opening, because opening in append mode creates the file.
    write_header = not os.path.exists(args.results_path)

    # Append so that results from earlier runs are kept (opening an existing
    # file with "w" would truncate it). newline="" is what the csv module
    # expects for files it writes to; the with-block closes the file even if
    # training or scoring raises.
    with open(args.results_path, "a", newline="") as fout:
        out_writer = csv.writer(fout)
        if write_header:
            out_writer.writerow([
                "Model", "k", "Average Likelihood", "Number of Documents",
                "Source"
            ])

        # Run cross validation once for each parameter value.
        for k in possible_k_values:
            if settings.DEBUG:
                print(f"Trying k={k} components...")

            # One likelihood score per validation fold.
            likelihoods = []
            for i in range(dm.get_num_folds()):
                if settings.DEBUG:
                    print(f"    Iteration {i+1}/{dm.get_num_folds()}")

                # Update the validation fold.
                dm.set_validation(i)

                # Retrieve the training data and validation set.
                train, validate = get_data_for_LDA(dm)

                start = time.perf_counter()
                # Train the model with the param choice.
                lda_model = run_LDA_for_CV(train, k)
                # Score the trained model on the validation set.
                likelihood = lda_model.score(validate)
                # NOTE: the timer is stopped after scoring, so the reported
                # "training" time below covers both training and scoring.
                end = time.perf_counter()

                if settings.DEBUG:
                    print(f"        likelihood = {likelihood}")
                if settings.DEBUG:
                    print(f"        Training took {end-start:0.4f} seconds.")

                likelihoods.append(likelihood)

            avg_likelihood = sum(likelihoods) / len(likelihoods)
            out_writer.writerow([
                "LDA", k, avg_likelihood,
                len(dm.get_all_fold_data()), settings.TWITTER_DIR
            ])

            if settings.DEBUG:
                print(f"    avg_likelihood = {avg_likelihood}")

            # Keep track of the best-scoring number of topics seen so far.
            if avg_likelihood > best_likelihood:
                best_likelihood = avg_likelihood
                best_k = k

        print(
            f"Best average likelihood found was {best_likelihood} with parameter value k={best_k}"
        )