# Project the test documents into the TF-IDF space of the fitted vectorizer.
data_test = tfv.transform(data_test)

# Feature selection (TF-IDF): keep the k_best_nb features with the highest
# chi2 score against the training labels, then apply the same selection to
# the test matrix.
chi = SelectKBest(chi2, k=k_best_nb)
data_train = chi.fit_transform(data_train, labels_train)
data_test = chi.transform(data_test)

# Feature selection (CountVectorizer): a fresh selector is fitted on the raw
# counts. The result of fit_transform is used directly instead of being
# discarded and recomputed with a second transform() pass over the same data.
chi = SelectKBest(chi2, k=k_best_nb)
count_matrix = chi.fit_transform(count_matrix, labels_train)
count_test = chi.transform(count_test)

# NBmatrix: fit on the selected training counts, then transform both sets.
nbmat = NBmatrix(1.0, bina=True, n_jobs=1)
nbmat.fit(count_matrix, labels_train)
nbm_test = nbmat.transform(count_test)
nbm_data = nbmat.transform(count_matrix)

########################### Train part ########################################
# NOTE(review): every first_layer call below passes the training matrix and
# labels twice. Presumably the last two arguments are the evaluation set, in
# which case the reported scores are measured on the training data itself --
# confirm against first_layer's signature.

# First-layer models on the TF-IDF features.
proba1, basic_score1, basic_name1 = first_layer(
    basic_model1, data_train, labels_train, data_train, labels_train)

# First-layer models on the new (hand-built) features.
proba2, basic_score2, basic_name2 = first_layer(
    basic_model2, new_mat_train, labels_train, new_mat_train, labels_train)

# First-layer models on the NBmatrix features.
proba3, basic_score3, basic_name3 = first_layer(
    basic_model3, nbm_data, labels_train, nbm_data, labels_train)

# Stack the first-layer probabilities column-wise into one matrix.
proba = np.hstack([proba1, proba2, proba3])
# Remove HTML tags from the raw documents.
train = ct.removehtml(data)
# Create the dictionary (WARNING: nltk should be up to date).
data = ct.stemTokenize(train)

# TF-IDF vectorizer including n-grams up to size 2.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), binary=False)
# Count vectorizer including n-grams up to size 2.
count_vectorizer = CountVectorizer(ngram_range=(1, 2), binary=False)
# NB matrix as described by Wang & Manning.
nb_vectorizer = NBmatrix(alpha=1.0, bina=True, n_jobs=1)

# Fit each vectorizer and transform the data.
tfidf_matrix = tfidf_vectorizer.fit_transform(data)
count_matrix = count_vectorizer.fit_transform(data)
nb_matrix = nb_vectorizer.fit_transform(count_matrix, labels)

# Each report line uses a single-argument print(...) so the output is
# byte-identical under the Python 2 print statement and the Python 3 print
# function (label, one space, value).
print("%s %s" % ("size of the matrix : ", tfidf_matrix.shape))

# NOTE: the counts include bigrams (ngram_range=(1, 2)), so this is the mean
# number of n-gram occurrences per document rather than strictly "words".
average_nb_words = np.mean(count_matrix.sum(axis=1))
print("%s %s" % ("Average number of words per review : ", average_nb_words))

# Take the document count from the matrix itself instead of hard-coding
# 25000, so the sparsity figure stays correct for any corpus size.
n_docs, dic_size = count_matrix.shape
print("%s %s" % ("dictionnary size : ", dic_size))

# Fraction of zero cells: 1 - (stored non-zeros / total cells).
sparsity = 1 - float(count_matrix.nnz) / (float(n_docs) * dic_size)
print("%s %s" % ("Sparsity of the data : ", sparsity))