Example #1
0
def feature_bag_of_words(
    x_train, x_test=None, list_of_cols=[], keep_col=False, **algo_kwargs
):
    """
    Creates a matrix of how many times a word appears in a document.
    
    Parameters
    ----------
    x_train : DataFrame
        Training dataset, by default None
        
    x_test : DataFrame
        Testing dataset, by default None

    list_of_cols : list, optional
        A list of specific columns to apply this technique to., by default []

    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove it.

    algo_kwargs : dict, optional
        Parameters you would pass into Bag of Words constructor as a dictionary., by default {}
    
    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.

    Returns 2 Dataframes if x_test is provided. 
    """

    enc = CountVectorizer(**algo_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        enc_data = enc.fit_transform(x_train[col]).toarray()
        enc_df = pd.DataFrame(enc_data, columns=enc.get_feature_names())
        x_train = drop_replace_columns(x_train, col, enc_df, keep_col)

        if x_test is not None:
            enc_x_test = enc.transform(x_test[col]).toarray()
            enc_test_df = pd.DataFrame(enc_x_test, columns=enc.get_features_names())
            x_test = drop_replace_columns(x_test, col, enc_test_df, keep_col)

    return x_train, x_test
import pandas as pd
df = pd.read_csv('movie_data.csv', encoding='utf-8')

from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english', max_df=.1, max_features=5000)

X = count.fit_transform(df['review'].values)

from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics=10, random_state=123, learning_method='batch')

X_topics = lda.fit_transform(X)

lda.components_.shape

n_top_words = 5
feature_name = count.get_features_names()
for topic_idx, topic in enumerate(lda.components_):
    print('Topic %d:' % (topic_idx + 1))
    print(' '.join([feature_names[i] for i in topic.argsort() \ [:-n_top_words - 1:-1]]))