コード例 #1
0
def main():
    '''
    Runs cross validation on the input Twitter data.

    Reads the following command-line arguments from the module-level
    ``parser``: ``train_path``, ``cache_path``, ``num_folds``,
    ``topic_numbers`` and ``results_path``.

    For every candidate number of topics ``k`` an LDA model is trained once
    per fold, scored on that fold's validation set, and the average score is
    appended as one row to the CSV file at ``results_path``. Existing rows in
    that file are preserved; the header row is only written when the file is
    new (or empty). The best ``k`` found is printed at the end.
    '''
    args = parser.parse_args()

    # Extract the data for LDA and divide into folds
    dm = DataManager(args.train_path, 'twitter')
    if settings.DEBUG: print("Loading data...")

    # Time the process of loading in the data.
    start = time.perf_counter()

    # Load the data (possibly from the cache, if it exists)
    dm.load_data(args.cache_path)
    # The number of folds is passed in as a command-line arg
    dm.divide_into_folds(args.num_folds)
    end = time.perf_counter()
    if settings.DEBUG:
        print(
            f"Preparing the data (loading, dividing into folds) took {end-start:0.4f} seconds."
        )

    # Track the best k and the best average likelihood seen so far.
    best_k = None
    best_likelihood = -float("inf")

    # Get the list of topic numbers to try as a command line arg too.
    possible_k_values = args.topic_numbers

    # Decide about the header BEFORE opening: opening in append mode creates
    # the file, so the existence check would always succeed afterwards.
    needs_header = (
        not os.path.exists(args.results_path)
        or os.path.getsize(args.results_path) == 0
    )

    # Append so results from earlier runs are kept (mode "w" would truncate
    # them). newline="" is what the csv module expects for files it writes.
    # The context manager closes the file even if training raises.
    with open(args.results_path, "a", newline="") as fout:
        out_writer = csv.writer(fout)
        if needs_header:
            out_writer.writerow([
                "Model", "k", "Average Likelihood", "Number of Documents",
                "Source"
            ])

        # Run cross validation once for each parameter value
        for k in possible_k_values:

            if settings.DEBUG: print(f"Trying k={k} components...")

            # One likelihood per validation fold.
            likelihoods = []
            num_folds = dm.get_num_folds()
            for i in range(num_folds):
                if settings.DEBUG:
                    print(f"    Iteration {i+1}/{num_folds}")

                # Update the validation fold.
                dm.set_validation(i)

                # Retrieve the training data and validation set.
                train, validate = get_data_for_LDA(dm)
                start = time.perf_counter()
                # Train the model with the param choice.
                lda_model = run_LDA_for_CV(train, k)
                # Score the trained model on the validation set.
                likelihood = lda_model.score(validate)
                end = time.perf_counter()
                if settings.DEBUG: print(f"        likelihood = {likelihood}")
                if settings.DEBUG:
                    print(f"        Training took {end-start:0.4f} seconds.")

                likelihoods.append(likelihood)

            avg_likelihood = sum(likelihoods) / len(likelihoods)
            out_writer.writerow([
                "LDA", k, avg_likelihood,
                len(dm.get_all_fold_data()), settings.TWITTER_DIR
            ])
            if settings.DEBUG: print(f"    avg_likelihood = {avg_likelihood}")

            if avg_likelihood > best_likelihood:
                best_likelihood = avg_likelihood
                best_k = k

    print(
        f"Best average likelihood found was {best_likelihood} with parameter value k={best_k}"
    )