preprocessor=get_col('title'))) ]) vectorizer.fit(df_reduced.to_dict('records')) df_text_processed = vectorizer.transform(df_reduced.to_dict('records')) vocab = vectorizer.get_feature_names() with timer('Lda and nmf on title / description'): if os.path.exists('desc_top_50_nmf.pkl'): desc_top_50 = np.load('desc_top_50_nmf.pkl') desc_top_50 = desc_top_50[:, :50] print('desc nmf features loaded') else: print('fit desc nmf features . . . ') desc_top_50 = NMF(n_components=50, max_iter=1000).fit_transform(df_text_processed[:, :desc_features]) desc_top_50.dump('desc_top_50_nmf.pkl') if os.path.exists('title_top_50_lda.pkl'): title_top_50 = np.load('title_top_50_lda.pkl') print('title lda features loaded') else: print('fit title lda features . . .') title_top_50 = LDA(n_components=50, learning_method='online').fit_transform(df_text_processed[:, desc_features:]) title_top_50.dump('title_top_50_lda.pkl') with timer('Add title lda and desc nmf to data'): desc_nmf_features = ['desc_nmf_' + str(i) for i in range(1, 51)] title_lda_features = ['title_lda_' + str(i) for i in range(1, 51)] df_reduced_columns_before = df_reduced.columns.tolist() df_reduced = pd.concat([df_reduced, pd.DataFrame(desc_top_50, columns=[desc_nmf_features])], axis=1)