# Code example #1
0
def main(input_filepath, output_folder, k):
    """
    Train K-Means on the TF-IDF scores and write the result reports.

    Reads the CSV at ``input_filepath``, fits ``KMeans`` with ``k`` clusters
    on every column except the first, writes a report whose filename carries
    the value of ``k``, and creates or updates the results table that is
    kept for future plots.

    Args:
        input_filepath: Location of the TF-IDF scores CSV, received as a
            command-line Path argument.
        output_folder: Prefix prepended verbatim to the report and table
            filenames.
            NOTE(review): no path separator is inserted here, so this
            presumably has to end with one -- confirm against the caller.
        k: Number of clusters handed to ``KMeans``; also embedded in the
            report filename and stored in the results table.
    """
    logger = logging.getLogger(__name__)
    logger.info(
        'Training the K-Means clustering algorithm based on the TF-IDF scores')

    # Get the models/tf-idf-scores.csv file
    dataset = pd.read_csv(input_filepath)
    # Lazy %-style arguments render the same text as the old concatenation
    # but do not raise if the path is not a plain str.
    logger.info('Loaded data file %s with %s rows',
                input_filepath, len(dataset))

    # Removes the first column and formats it like a list
    x = dataset.drop(dataset.columns[0], axis=1).values
    vector_dict = generate_vector_dict(dataset)

    # Number of clusters and max. number of iterations
    km = KMeans(k=k, max_iterations=500)
    km.fit(x)
    clusters = km.get_clusters(vector_dict)

    # Based on the value of K used, change the destination filename.
    # splitext only looks at the last path component, so a dot inside the
    # folder part (e.g. './out/') or a filename without an extension no
    # longer corrupts the path or raises IndexError.
    report_root, report_ext = os.path.splitext(
        output_folder + MODEL_REPORT_FILENAME)
    output_filepath = report_root + '-' + str(k) + report_ext

    # Calculate SSE and MSC
    sse_score = km.get_sse_score()
    logger.info('SSE Score: %s', sse_score)
    msc_score = km.get_msc_avg()
    logger.info('MSC Score: %s', msc_score)

    # Generate the results report
    generate_report(clusters, sse_score, msc_score, output_filepath)
    logger.info('Created report file on %s', output_filepath)

    # Generate / Update the results table for future plots
    plot_table_path = output_folder + PLOT_TABLE_FILENAME
    if os.path.isfile(plot_table_path):
        # Reuse the existing file, indexed by its 'K Size' column
        results_table = pd.read_csv(plot_table_path)
        results_table.set_index('K Size', inplace=True)
    else:
        # No table yet: start from a freshly created one
        results_table = create_plot_results_table()

    # Both branches record this run the same way, so do it once here
    k_means_results = update_plot_results_table(
        results_table, (k, sse_score, msc_score))
    k_means_results.to_csv(plot_table_path, encoding='utf-8')
    logger.info('Updated report table on %s', plot_table_path)