min_df =min_df,
                                         max_df =max_df,
                                         stop_words='english',
                                         analyzer='word',
                                         lowercase = lowercase)
# Hand the filter settings to the stemming vectorizer.
# Two filter modes are listed; the first one ("OnlyEng") is the one passed on.
filter_by = ["OnlyEng", "AllLang"]
count_dialect = True
stem_vectorizer.setfilter_option(filter_by[0], count_dialect)

#df_train = df_train[:75000]
#df_train = df_train.sample(frac=0.04)
# Restrict the run to the leading 3000 entries of the training data.
df_train = df_train[:3000]

# Every column goes through the same vectorizer with the same TF-IDF switch,
# so the shared keyword arguments are spelled out once.
_tfidf_kwargs = {'vectorizer': stem_vectorizer, 'tf_idf': True}

# One vectorized frame per text column.  `fmatrix` and `column_vectorizer`
# are rebound by every call, so once this block has run they hold the values
# returned for 'Description'.
vector_df1, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(
    df_train, 'Category', **_tfidf_kwargs)
vector_df2, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(
    df_train, 'Developer', **_tfidf_kwargs)
vector_df3, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(
    df_train, 'Name', **_tfidf_kwargs)
vector_df4, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(
    df_train, 'ContentRating', **_tfidf_kwargs)
vector_df5, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(
    df_train, 'Description', **_tfidf_kwargs)

#---------------------------------------PCA-----------------------------------------#
#dist = pdist(vector_df, 'euclidean')
#print dist.shape
#dist = squareform(dist)
#print dist.shape

# Project vector_df2, vector_df3 and vector_df5 onto 5 components each;
# vector_df1 and vector_df4 are left as they are.
# The one `pca` object is refit for every frame, so after this block it only
# carries the fit computed from vector_df5.
# NOTE(review): each result is wrapped in a brand-new DataFrame, so it
# presumably no longer carries the input frame's index/column labels --
# confirm before joining these frames with the untouched ones.
pca = PCA(n_components=5, copy=True)

pca.fit(vector_df2)
vector_df2 = pd.DataFrame(pca.transform(vector_df2))

pca.fit(vector_df3)
vector_df3 = pd.DataFrame(pca.transform(vector_df3))

pca.fit(vector_df5)
vector_df5 = pd.DataFrame(pca.transform(vector_df5))
                                         min_df =min_df,
                                         max_df =max_df,
                                         stop_words='english',
                                         analyzer='word',
                                         lowercase = lowercase)
# Hand the filter settings to the stemming vectorizer.
# Two filter modes are listed; the first one ("OnlyEng") is the one passed on.
filter_by = ["OnlyEng", "AllLang"]
count_dialect = True
stem_vectorizer.setfilter_option(filter_by[0], count_dialect)

#df_train = df_train[:100000]
# Take the leading 75000 entries, then keep a random 4% of them.
# NOTE(review): no random_state is passed to sample(), so the selected rows
# differ from run to run -- pass one if reproducible results are needed.
df_train = df_train[:75000].sample(frac=0.04)

# Every column goes through the same vectorizer with the same TF-IDF switch,
# so the shared keyword arguments are spelled out once.
_tfidf_kwargs = {'vectorizer': stem_vectorizer, 'tf_idf': True}

# Only 'Category' and 'ContentRating' are vectorized in this variant; the
# other columns are kept below as disabled alternatives.  `fmatrix` and
# `column_vectorizer` end up holding the values returned for 'ContentRating'.
vector_df1, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(
    df_train, 'Category', **_tfidf_kwargs)
# vector_df2, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(
#     df_train, 'Developer', **_tfidf_kwargs)
# vector_df3, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(
#     df_train, 'Name', **_tfidf_kwargs)
vector_df4, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(
    df_train, 'ContentRating', **_tfidf_kwargs)
# vector_df5, fmatrix, column_vectorizer = avm.vectorize_columnTfIdf(
#     df_train, 'Description', **_tfidf_kwargs)

#---------------------------------------PCA-----------------------------------------#
#dist = pdist(vector_df, 'euclidean')
#print dist.shape
#dist = squareform(dist)
#print dist.shape

#pca = PCA(n_components=10, copy=True)
#vector_df2_pca = pd.DataFrame(pca.fit(vector_df2).transform(vector_df2))
#vector_df3_pca = pd.DataFrame(pca.fit(vector_df3).transform(vector_df3))
#vector_df5_pca = pd.DataFrame(pca.fit(vector_df5).transform(vector_df5))