min_df =min_df, max_df =max_df, stop_words='english', analyzer='word', lowercase = lowercase) ##set filterparameter to your vectorizer filter_by=["OnlyEng", "AllLang"] #two options are available count_dialect = True #n_samples = n_samples #as u choose it. stem_vectorizer.setfilter_option(filter_by[0],count_dialect) #df_train = df_train[:75000] #df_train = df_train.sample(frac=0.04) df_train = df_train[:3000] vector_df1, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(df_train, 'Category',vectorizer=stem_vectorizer, tf_idf=True) vector_df2, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(df_train, 'Developer',vectorizer=stem_vectorizer, tf_idf=True) vector_df3, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(df_train, 'Name',vectorizer=stem_vectorizer, tf_idf=True) vector_df4, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(df_train, 'ContentRating',vectorizer=stem_vectorizer, tf_idf=True) vector_df5, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(df_train, 'Description',vectorizer=stem_vectorizer, tf_idf=True) #---------------------------------------PCA-----------------------------------------# #dist = pdist(vector_df, 'euclidean') #print dist.shape #dist = squareform(dist) #print dist.shape pca = PCA(n_components=5, copy=True) vector_df2 = pd.DataFrame(pca.fit(vector_df2).transform(vector_df2)) vector_df3 = pd.DataFrame(pca.fit(vector_df3).transform(vector_df3)) vector_df5 = pd.DataFrame(pca.fit(vector_df5).transform(vector_df5))
min_df =min_df, max_df =max_df, stop_words='english', analyzer='word', lowercase = lowercase) ##set filterparameter to your vectorizer filter_by=["OnlyEng", "AllLang"] #two options are available count_dialect = True #n_samples = n_samples #as u choose it. stem_vectorizer.setfilter_option(filter_by[0],count_dialect) #df_train = df_train[:100000] df_train = df_train[:75000] df_train = df_train.sample(frac=0.04) vector_df1, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(df_train, 'Category',vectorizer=stem_vectorizer, tf_idf=True) #vector_df2, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(df_train, 'Developer',vectorizer=stem_vectorizer, tf_idf=True) #vector_df3, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(df_train, 'Name',vectorizer=stem_vectorizer, tf_idf=True) vector_df4, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(df_train, 'ContentRating',vectorizer=stem_vectorizer, tf_idf=True) #vector_df5, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(df_train, 'Description',vectorizer=stem_vectorizer, tf_idf=True) #---------------------------------------PCA-----------------------------------------# #dist = pdist(vector_df, 'euclidean') #print dist.shape #dist = squareform(dist) #print dist.shape #pca = PCA(n_components=10, copy=True) #vector_df2_pca = pd.DataFrame(pca.fit(vector_df2).transform(vector_df2)) #vector_df3_pca = pd.DataFrame(pca.fit(vector_df3).transform(vector_df3)) #vector_df5_pca = pd.DataFrame(pca.fit(vector_df5).transform(vector_df5))