# Imports needed to run this example as a standalone script
import logging

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.font_manager import FontProperties
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

import text_clustering_funs


def main(input_file, text, target, output_file=None, num_components=2, plot_output=False):
    # Plotting assumes a 2-D projection, so require exactly two components:
    if plot_output:
        assert num_components == 2, 'plot_output requires num_components == 2'

    # Load Data
    logging.info('Loading data file: ' + input_file)
    review_data = text_clustering_funs.csv_to_lists(input_file)

    # Extract Reviews and Styles
    reviews = [d[text] for d in review_data]
    styles = [d[target] for d in review_data]

    # Perform text normalization
    logging.info('Starting Analysis') # All the user really needs to know.
    logging.debug('Performing Text Normalization') # But for debugging we want to know where we are and how long each step takes
    reviews = text_clustering_funs.normalize(reviews, ['punctuation', 'numbers', 'stopwords', 'whitespace', 'lower'])

    # Create text features from reviews (straight from the scikit-learn documentation)
    logging.debug('Creating Text Features')
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(reviews)

    # Transform data by SVD, store first X components
    logging.debug('Performing SVD and returning ' + str(num_components) + ' components.')
    svd_algo = TruncatedSVD(n_components=num_components, random_state=42)
    X_transformed = svd_algo.fit_transform(X_train_counts)

    # Plot average for each group
    if plot_output:
        logging.info('Plotting data')
        fontP = FontProperties()
        fontP.set_size('small')
        unique_styles = list(set(styles))
        colors = cm.rainbow(np.linspace(0, 1, len(unique_styles)))
        for i, style in enumerate(unique_styles):
            points = [val for ix, val in enumerate(X_transformed) if styles[ix] == style]
            avg_x = np.mean([p[0] for p in points])
            avg_y = np.mean([p[1] for p in points])
            col = colors[i]
            plt.plot(avg_x, avg_y, color=col, marker='o', ls='', label=style, markersize=20)
            plt.text(avg_x, avg_y, s=style, size='x-small')
        # And because 'old ale' is an outlier, fix the axis limits (once, outside the loop)...
        plt.ylim([-0.6, -0.1])
        plt.xlim([0.6, 1.3])
        plt.show()

    # Output Results:
    if output_file:
        logging.info('Saving results to output: ' + output_file)
        np.savetxt(output_file, X_transformed, delimiter=',')
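
A minimal sketch of how this entry point might be invoked; the column names 'text' and 'beer_style' and the output file name are assumptions for illustration, not taken from the examples:

if __name__ == '__main__':
    # Hypothetical invocation: project beer reviews onto two SVD components
    # and plot the per-style averages.
    logging.basicConfig(level=logging.INFO)
    main(input_file='beer_reviews.csv',   # CSV used by the tests below
         text='text',                     # assumed name of the review-text column
         target='beer_style',             # assumed name of the style column
         output_file='svd_components.csv',
         num_components=2,
         plot_output=True)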
Example #2
def test_csv_to_lists(): # This is kind-of a regression test...
    data = text_clustering_funs.csv_to_lists('beer_reviews.csv')
    assert(len(data) > 0)
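
For reference, csv_to_lists is the project's own helper and its body is not shown in these examples; a minimal sketch consistent with how it is used above (returning one record per row, indexable by column name) could be:

import csv

def csv_to_lists(path):
    # Hypothetical sketch, not the repository's implementation:
    # read the CSV into a list of per-row dicts keyed by the header row.
    with open(path, newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f))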