def feature_bag_of_words(
    x_train, x_test=None, list_of_cols=[], keep_col=False, **algo_kwargs
):
    """
    Creates a matrix of how many times a word appears in a document.

    Parameters
    ----------
    x_train : DataFrame
        Training dataset, by default None
    x_test : DataFrame
        Testing dataset, by default None
    list_of_cols : list, optional
        A list of specific columns to apply this technique to., by default []
    keep_col : bool, optional
        True if you want to keep the columns passed, otherwise remove it.
    algo_kwargs : dict, optional
        Parameters you would pass into Bag of Words constructor as a dictionary., by default {}

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.
        Returns 2 Dataframes if x_test is provided.
    """

    enc = CountVectorizer(**algo_kwargs)
    list_of_cols = _get_columns(list_of_cols, x_train)

    for col in list_of_cols:
        # Fit on the training column; the same fitted vocabulary is reused
        # for the test set below so columns line up between the two frames.
        enc_data = enc.fit_transform(x_train[col]).toarray()
        enc_df = pd.DataFrame(enc_data, columns=enc.get_feature_names())
        x_train = drop_replace_columns(x_train, col, enc_df, keep_col)

        if x_test is not None:
            # Transform only (no refit) so test uses the train vocabulary.
            enc_x_test = enc.transform(x_test[col]).toarray()
            # BUG FIX: was `enc.get_features_names()` (typo) which raised
            # AttributeError; use the same accessor as the train branch.
            # NOTE(review): on scikit-learn >= 1.2 this accessor is
            # `get_feature_names_out()` — confirm the pinned sklearn version.
            enc_test_df = pd.DataFrame(
                enc_x_test, columns=enc.get_feature_names()
            )
            x_test = drop_replace_columns(x_test, col, enc_test_df, keep_col)

    return x_train, x_test
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Load the movie-review corpus and vectorize it into raw term counts.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

count = CountVectorizer(stop_words='english', max_df=.1, max_features=5000)
X = count.fit_transform(df['review'].values)

# Fit a 10-topic LDA model on the document-term matrix.
# BUG FIX: the constructor parameter is `n_components`, not `n_topics`
# (renamed in scikit-learn 0.19 and removed in 0.21).
lda = LatentDirichletAllocation(n_components=10, random_state=123,
                                learning_method='batch')
X_topics = lda.fit_transform(X)
lda.components_.shape  # (n_components, n_features)

# Print the top words for each topic. `argsort` ascends, so the reversed
# slice [:-n_top_words - 1:-1] takes the n_top_words highest-weight terms.
n_top_words = 5
# BUG FIX: was `count.get_features_names()` (typo, AttributeError) assigned
# to `feature_name` while the loop read `feature_names` (NameError).
# NOTE(review): on scikit-learn >= 1.2 use `get_feature_names_out()`.
feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print('Topic %d:' % (topic_idx + 1))
    print(' '.join([feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]))