def build_dict_feature_imdb(double_features):
    sentences_train = []
    for ff in tqdm.tqdm(glob.glob(os.path.join(path_train_pos, '*.txt')), desc="train pos"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_train.append(f.readline().strip())
    for ff in tqdm.tqdm(glob.glob(os.path.join(path_train_neg, '*.txt')), desc="train neg"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_train.append(f.readline().strip())

    sentences_test = []
    for ff in tqdm.tqdm(glob.glob(os.path.join(path_test_pos, '*.txt')), desc="test pos"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_test.append(f.readline().strip())
    for ff in tqdm.tqdm(glob.glob(os.path.join(path_test_neg, '*.txt')), desc="test neg"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_test.append(f.readline().strip())

    if model == "svm":
        X_train, vectorizer_fitted = build_dic_svm(sentences_train, double_features)
        X_test, _ = build_dic_svm(sentences_test, double_features, vectorizer_fitted)
        # Integer division: the first half of the samples is positive, the second half negative.
        n = X_train.shape[0] // 2
        y_train = [1] * n + [0] * n
        y_test = [1] * n + [0] * n
    elif model == "cnn" or model == "lstm":
        X_train, tokenizer = build_dic_nn(sentences=sentences_train, double_features=double_features)
        X_test, _ = build_dic_nn(sentences=sentences_test, double_features=double_features, tokenizer=tokenizer)
        n = len(X_train) // 2
        y_train = [1] * n + [0] * n
        y_test = [1] * n + [0] * n

    if feature_selection:
        print("Doing feature selection")
        # k is 200,000 in every branch, so the hashing_trick / negation flags
        # currently make no difference here.
        fselect = SelectKBest(chi2, k=200000)
        X_train = fselect.fit_transform(X_train, y_train)
        X_test = fselect.transform(X_test)

    return X_train, X_test, y_train, y_test
def apply_feature_selection(X_train, y_train, X_test, features):
    if CONFIG['preprocessing']['use_feature_selection'] == 'random_forest':
        clf = RandomForestClassifier()
        clf = clf.fit(X_train.toarray(), y_train)
        features_scores = [(feature, score) for (score, feature) in
                           sorted(zip(clf.feature_importances_, features), reverse=True)]
        selected_features = features_scores[:CONFIG['preprocessing']['top_features_to_select']]
        selected_indeces = np.searchsorted(features, [f[0] for f in selected_features])
        X_train = X_train[:, selected_indeces]
        X_test = X_test[:, selected_indeces]
        return X_train, y_train, X_test, selected_features

    if CONFIG['preprocessing']['use_feature_selection'] == 'chi2':
        algorithm = chi2
    elif CONFIG['preprocessing']['use_feature_selection'] == 'ANOVA':
        algorithm = f_classif
    else:
        raise ValueError("No implementation for "
                         + str(CONFIG['preprocessing']['use_feature_selection']))

    feature_selector = SelectKBest(
        algorithm, k=CONFIG['preprocessing']['top_features_to_select'])
    # A single fit_transform is sufficient; the separate fit() call in the original was redundant.
    X_train = feature_selector.fit_transform(X_train, y_train)
    X_test = feature_selector.transform(X_test)
    features = [(feature, score) for (score, feature) in
                sorted(zip(feature_selector.scores_, features), reverse=True)]
    selected_features = features[:CONFIG['preprocessing']['top_features_to_select']]
    return X_train, y_train, X_test, selected_features
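A minimal sketch of how apply_feature_selection might be driven, assuming a CONFIG dictionary with the keys the function reads ('use_feature_selection', 'top_features_to_select'); the vectorizer and the train_texts / test_texts / y_train names below are illustrative, not from the original code.

# Hypothetical driver (assumed data and config, not from the original script).
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

CONFIG = {'preprocessing': {'use_feature_selection': 'chi2',
                            'top_features_to_select': 1000}}   # assumes the vocabulary has >= 1000 terms

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_texts)                 # train_texts / test_texts / y_train assumed to exist
X_test = vectorizer.transform(test_texts)
# Vocabulary terms in alphabetical order, as np.searchsorted in the random_forest branch expects.
features = np.array(vectorizer.get_feature_names_out())

X_train, y_train, X_test, selected = apply_feature_selection(X_train, y_train, X_test, features)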
def extract(max_gram, feat_dims, save_model=False):
    print "extract feature"
    vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=None,
                                 ngram_range=(1, max_gram), sublinear_tf=True)
    vectorizer = vectorizer.fit(reviews_train + reviews_unsup)
    feats_train_ori = vectorizer.transform(reviews_train)
    feats_test_ori = vectorizer.transform(reviews_test)
    print "size of original train features", feats_train_ori.shape

    for feat_dim in feat_dims:
        print "perform feature selection"
        fselect = SelectKBest(chi2, k=feat_dim)
        feats_train = fselect.fit_transform(feats_train_ori, labels_train)
        feats_test = fselect.transform(feats_test_ori)

        print "save features"
        np.savez("feats/%d_%d.npz" % (max_gram, feat_dim),
                 feats_train=feats_train, feats_test=feats_test,
                 labels_train=labels_train, labels_test=labels_test)

        if save_model:
            print "save models"
            with open("models/vectorizer_%d.pkl" % max_gram, "wb") as fout:
                pickle.dump(vectorizer, fout, -1)
            with open("models/fselect_%d_%d.pkl" % (max_gram, feat_dim), "wb") as fout:
                pickle.dump(fselect, fout, -1)
def build_dict_feature_spd(double_features):
    sentences_pos = []
    ff = os.path.join(dataset_path_spd, 'rt-polarity_utf8.pos')
    with io.open(ff, 'r', encoding='UTF-8') as f:
        for line in tqdm.tqdm(f, desc="sentences pos"):
            # time.sleep(0.001)
            sentences_pos.append(line)

    sentences_neg = []
    ff = os.path.join(dataset_path_spd, 'rt-polarity_utf8.neg')
    with io.open(ff, 'r', encoding='UTF-8') as f:
        for line in tqdm.tqdm(f, desc="sentences neg"):
            # time.sleep(0.001)
            sentences_neg.append(line)

    sentences = sentences_pos + sentences_neg
    y = [1] * len(sentences_pos) + [0] * len(sentences_neg)
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.2, random_state=58)

    if model == "svm":
        X_train, vectorizer = build_dic_svm(sentences_train, double_features)
        X_test, _ = build_dic_svm(sentences_test, double_features, vectorizer)
    elif model == "cnn" or model == "lstm":
        X_train, tokenizer = build_dic_nn(sentences=sentences_train, double_features=double_features)
        X_test, _ = build_dic_nn(sentences=sentences_test, double_features=double_features, tokenizer=tokenizer)

    if feature_selection:
        print("Doing feature selection")
        if hashing_trick:
            fselect = SelectKBest(chi2, k=9500)
        elif negation:
            fselect = SelectKBest(chi2, k=9500)
        else:
            fselect = SelectKBest(chi2, k=8500)
        X_train = fselect.fit_transform(X_train, y_train)
        X_test = fselect.transform(X_test)

    return X_train, X_test, y_train, y_test
def reduce_dim(vec, num_dim, method, label=None):
    """
    Dimension reduction. Two approaches are provided.
    SVD: truncated SVD maps the feature vectors into a lower-dimensional subspace.
    chi2: the chi-square independence test scores the dependence between each feature and the labels.
    """
    print "Performing dimension reduction"

    # Reduce the dimensions using truncated SVD or the chi-square independence test
    if method == "SVD":
        svd = TruncatedSVD(n_components=num_dim)
        vec = svd.fit_transform(vec)
        # test = svd.transform(vec)
    elif method == "chi2" or method == "f_classif":
        fselect = SelectKBest(chi2 if method == "chi2" else f_classif, k=num_dim)
        vec = fselect.fit_transform(vec, label)
        # test = fselect.transform(vec)

    return vec
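A minimal, hypothetical usage sketch for reduce_dim on a tiny TF-IDF matrix; the documents and labels below are made up for illustration and are not part of the original script.

# Hypothetical usage sketch (toy data, not from the original script).
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["good movie", "bad movie", "great film", "awful film"]
labels = [1, 0, 1, 0]
tfidf = TfidfVectorizer().fit_transform(docs)

vec_svd = reduce_dim(tfidf, num_dim=2, method="SVD")                   # dense output, may contain negative values
vec_chi2 = reduce_dim(tfidf, num_dim=2, method="chi2", label=labels)   # keeps the 2 highest-scoring original columns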
features = vectorizer.transform(opinions)
features_test = vectorizer.transform(opinions_test)

# In[13]:

print "Reducing dimension..."
from sklearn.feature_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2, k=10000)

# In[14]:

train_data_features = fselect.fit_transform(features, article["trend"])
test_data_features = fselect.transform(features_test)

# # Train the model

# In[128]:

print "Training..."
model1 = MultinomialNB(alpha=0.0005)
model1.fit(train_data_features, article["trend"])

model2 = SGDClassifier(loss="modified_huber", n_iter=5, random_state=0, shuffle=True)
model2.fit(train_data_features, article["trend"])
    stemmer = english_stemmer  # PorterStemmer()
    for word in txt:
        b.append(stemmer.stem(word))

    # 5. Return a list of words
    return b

clean_train_reviews = []
for txt in train['text']:
    # Join with spaces so the cleaned tokens stay separated for the vectorizer.
    clean_train_reviews.append(" ".join(cleanData(txt, True, True, True)))

clean_test_reviews = []
for txt in test['text']:
    clean_test_reviews.append(" ".join(cleanData(txt, True, True, True)))

vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=200000,
                             ngram_range=(1, 4), sublinear_tf=True)
vectorizer = vectorizer.fit(clean_train_reviews)

train_features = vectorizer.transform(clean_train_reviews)
test_features = vectorizer.transform(clean_test_reviews)

fselect = SelectKBest(chi2, k=10000)
train_features = fselect.fit_transform(train_features, train["author"])
test_features = fselect.transform(test_features)

model1 = MultinomialNB(alpha=0.001)
model1.fit(train_features, train["author"])

pred_1 = model1.predict(test_features.toarray())
print(pred_1)
clean_test_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( review ))) print "Vectorizing..." vectorizer = TfidfVectorizer( min_df=2, max_df=0.95, max_features = 200000, ngram_range = ( 1, 4 ), sublinear_tf = True ) vectorizer = vectorizer.fit(clean_train_reviews + unlabeled_clean_train_reviews) train_data_features = vectorizer.transform( clean_train_reviews ) test_data_features = vectorizer.transform( clean_test_reviews ) print "Reducing dimension..." from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif fselect = SelectKBest(chi2 , k=70000) train_data_features = fselect.fit_transform(train_data_features, train["sentiment"]) test_data_features = fselect.transform(test_data_features) print "Training..." model1 = MultinomialNB(alpha=0.0005) model1.fit( train_data_features, train["sentiment"] ) model2 = SGDClassifier(loss='modified_huber', n_iter=5, random_state=0, shuffle=True) model2.fit( train_data_features, train["sentiment"] ) p1 = model1.predict_proba( test_data_features )[:,1] p2 = model2.predict_proba( test_data_features )[:,1] print "Writing results..."
do_feature_elimination = False
if do_feature_elimination:
    estimator = RandomForestClassifier(n_estimators=2000, criterion='entropy', max_depth=None,
                                       min_samples_split=16, min_samples_leaf=1,
                                       min_weight_fraction_leaf=0.0, max_features='auto',
                                       max_leaf_nodes=None, bootstrap=False, oob_score=False,
                                       n_jobs=10, random_state=None, verbose=0,
                                       warm_start=False, class_weight=None)
    selector = RFECV(estimator, step=1, cv=5, scoring='log_loss')
    X_train = selector.fit_transform(X_train, train_labels)
    print 'after feature elimination', X_train.shape
    X_test = selector.transform(X_test)

do_feature_selection = False
if do_feature_selection:
    ch2 = SelectKBest(chi2, k=4000)
    X_train = ch2.fit_transform(X_train, train_labels)
    X_test = ch2.transform(X_test)

do_pca = False
if do_pca:
    k = 100
    add_pca_to_original = True
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    pca = PCA(n_components=k, copy=True, whiten=False)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    if add_pca_to_original:
        X_train = np.hstack((X_train, X_train_pca))
        X_test = np.hstack((X_test, X_test_pca))
def main():
    os.chdir("/Users/[email protected]/Desktop/workspace/sentiment.analysis")

    ##################### Initialization #####################

    write_to_csv = False
    tune_parameter = False
    Mix = True

    # term_vector_type = {"TFIDF", "Binary", "Int", "Word2vec", "Word2vec_pretrained"}
    # {"TFIDF", "Int", "Binary"}: Bag-of-words model with {tf-idf, word counts, presence/absence} representation
    # {"Word2vec", "Word2vec_pretrained"}: Google word2vec representation {without, with} pre-trained models
    # Specify model_name if there's a pre-trained model to be loaded
    # vector_type = "TFIDF"
    vector_type = 'Word2vec_pretrained'

    # model_name = "selftrainBad.bin"
    model_name = "wiki.fr.vec"

    # model_type = {"bin", "reg"}
    # Specify whether the pre-trained word2vec model is binary
    # model_type = "bin"

    # Parameters for word2vec
    # num_features needs to be identical to the pre-trained model
    num_features = 300      # Word vector dimensionality
    min_word_count = 5      # Minimum word count to be included for training
    num_workers = 4         # Number of threads to run in parallel
    context = 4             # Context window size
    downsampling = 1e-3     # Downsample setting for frequent words

    # training_model = {"RF", "NB", "SVM", "BT", "no"}
    training_model = "SVM"

    # feature scaling = {"standard", "signed", "unsigned", "no"}
    # Note: Scaling is needed for SVM
    scaling = "no"

    # dimension reduction = {"SVD", "chi2", "no"}
    # Note: For NB models, we cannot perform truncated SVD as it will make the input negative
    # chi2 is feature selection based on the chi2 independence test
    dim_reduce = "no"
    num_dim = 200

    ##################### End of Initialization #####################

    print('parameter settings: ')
    print('vector_type: ' + vector_type)
    print('training_model: ' + training_model)
    print('scaling: ' + scaling)
    print('dim_reduce: ' + dim_reduce)

    ########################### Main Program ###########################

    train_list = []
    test_list_t = []
    test_list_h = []
    test_list_c = []
    word2vec_input = []
    train_list2 = []
    pred = []

    language = 'french'
    train_language = 'german'
    test_language = 'french'

    trainFile = train_language + 'TrainData_100k.csv'
    trainFile2 = test_language + 'TrainData_100k.csv'
    testFile_t = test_language + 'TestData_cftwt.csv'
    testFile_h = test_language + 'TestData_cfdata.csv'
    testFile_c = test_language + 'TestData_deft.csv'
    # unlabFile = 'frenchUnlab.csv'

    train_data = pd.read_csv("data/" + trainFile, header=0, delimiter=",", quoting=0)  # , encoding='utf-8')
    if Mix == True:
        train_data2 = pd.read_csv("data/" + trainFile2, header=0, delimiter=",", quoting=0)
    test_data_t = pd.read_csv("data/" + testFile_t, header=0, delimiter=",", quoting=0)  # , encoding='utf-8')
    test_data_h = pd.read_csv("data/" + testFile_h, header=0, delimiter=",", quoting=0)  # , encoding='utf-8')
    test_data_c = pd.read_csv("data/" + testFile_c, header=0, delimiter=",", quoting=0)  # , encoding='utf-8')
    # unlab_train_data = pd.read_csv("data/" + unlabFile, header=0, delimiter=",", quoting=0)  # , encoding='utf-8')

    if vector_type == "Word2vec":
        unlab_train_data = pd.read_csv("data/frenchUnlabeledTrainData.csv", header=0, delimiter=",", quoting=0)
        tokenizer = nltk.data.load('tokenizers/punkt/' + language + '.pickle')
        logging.basicConfig(format='%(asctime)s: %(message)s', level=logging.INFO)

    ground_truth_t = test_data_t.sentiment
    ground_truth_h = test_data_h.sentiment
    ground_truth_c = test_data_c.sentiment

    # Extract words from reviews
    # xrange is faster when iterating
    if vector_type == "Word2vec" or vector_type == "Word2vec_pretrained":
        for i in xrange(0, len(train_data.review)):
            if vector_type == "Word2vec":
                # Decode utf-8 coding first
                word2vec_input.extend(review_to_doublelist(train_data.review[i].decode("utf-8"), language, tokenizer))
            # print train_data.id[i]
            train_list.append(clean_review(train_data.review[i], language, output_format="list"))
            # if i%1000 == 0:
            #     print "Cleaning training review", i

        if Mix == True:
            for i in xrange(0, len(train_data2.review)):
                # print train_data.id[i]
                train_list2.append(clean_review(train_data2.review[i], language, output_format="list"))
                # if i%1000 == 0:
                #     print "Cleaning training review", i

        if vector_type == "Word2vec":
            for i in xrange(0, len(unlab_train_data.review)):
                # print unlab_train_data.review[i]
                word2vec_input.extend(review_to_doublelist(unlab_train_data.review[i].decode("utf-8"), language, tokenizer))
                # if i%1000 == 0:
                #     print "Cleaning unlabeled training review", i

        for i in xrange(0, len(test_data_t.review)):
            test_list_t.append(clean_review(test_data_t.review[i], language, output_format="list"))
            # if i%1000 == 0:
            #     print "Cleaning test review", i

        for i in xrange(0, len(test_data_h.review)):
            test_list_h.append(clean_review(test_data_h.review[i], language, output_format="list"))
            # if i%1000 == 0:
            #     print "Cleaning test review", i

        for i in xrange(0, len(test_data_c.review)):
            test_list_c.append(clean_review(test_data_c.review[i], language, output_format="list"))
            # if i%1000 == 0:
            #     print "Cleaning test review", i

    elif vector_type != "no":
        # NOTE: this branch still refers to the old single test_data / test_list names.
        for i in xrange(0, len(train_data.review)):
            # Append raw texts rather than lists, as Count/TFIDF vectorizers take raw texts as inputs
            train_list.append(clean_review(train_data.review[i], language))
            # if i%1000 == 0:
            #     print "Cleaning training review", i

        for i in xrange(0, len(test_data.review)):
            # Append raw texts rather than lists, as Count/TFIDF vectorizers take raw texts as inputs
            test_list.append(clean_review(test_data.review[i], language))
            # if i%1000 == 0:
            #     print "Cleaning test review", i

    # Generate vectors from words
    if vector_type == "Word2vec_pretrained" or vector_type == "Word2vec":

        if vector_type == "Word2vec_pretrained":
            print "Loading the pre-trained model"
            if model_name.endswith(".bin"):   # fixed: endswith must be called, not compared
                # model = word2vec.Word2Vec.load_word2vec_format(model_name, binary=True)
                model = gensim.models.KeyedVectors.load_word2vec_format(model_name, binary=True, unicode_errors='ignore')
            else:
                # model = gensim.models.KeyedVectors.load_word2vec_format(model_name, binary=False, unicode_errors='ignore')
                train_model = gensim.models.KeyedVectors.load_word2vec_format('wiki.multi.' + train_language + '.vec',
                                                                              binary=False, unicode_errors='ignore')
                test_model = gensim.models.KeyedVectors.load_word2vec_format('wiki.multi.' + test_language + '.vec',
                                                                             binary=False, unicode_errors='ignore')

        if vector_type == "Word2vec":
            print "Training word2vec word vectors"
            model = word2vec.Word2Vec(word2vec_input, workers=num_workers,
                                      size=num_features, min_count=min_word_count,
                                      window=context, sample=downsampling)
            # If no further training and only querying is needed, this trims unnecessary memory
            model.init_sims(replace=True)
            # Save the model for later use
            word_vectors = model.wv
            model.save(model_name)

        print "Vectorizing training review"
        train_vec = gen_review_vecs(train_list, train_model, num_features)
        if Mix == True:
            train_vec2 = gen_review_vecs(train_list2, test_model, num_features)
            train_vec = np.append(train_vec, train_vec2, axis=0)
            # train_vec = np.concatenate((train_vec, train_vec2), axis=0)

        print "Vectorizing test review"
        test_vec_c = gen_review_vecs(test_list_c, test_model, num_features)
        test_vec_h = gen_review_vecs(test_list_h, test_model, num_features)
        test_vec_t = gen_review_vecs(test_list_t, test_model, num_features)

    elif vector_type != "no":
        if vector_type == "TFIDF":
            # Unit of gram is "word", only the top 5000/10000 words are extracted
            count_vec = TfidfVectorizer(analyzer="word", max_features=10000,
                                        ngram_range=(1, 2), sublinear_tf=True)
        elif vector_type == "Binary" or vector_type == "Int":
            count_vec = CountVectorizer(analyzer="word", max_features=10000,
                                        binary=(vector_type == "Binary"),
                                        ngram_range=(1, 2))

        # Returns a scipy sparse term-document matrix
        print "Vectorizing input texts"
        train_vec = count_vec.fit_transform(train_list)
        test_vec_h = count_vec.transform(test_list_h)
        test_vec_t = count_vec.transform(test_list_t)
        test_vec_c = count_vec.transform(test_list_c)

    # Dimension Reduction
    if dim_reduce == "SVD":
        print "Performing dimension reduction"
        svd = TruncatedSVD(n_components=num_dim)
        train_vec = svd.fit_transform(train_vec)
        test_vec_h = svd.transform(test_vec_h)
        test_vec_t = svd.transform(test_vec_t)
        test_vec_c = svd.transform(test_vec_c)
        print "Explained variance ratio =", svd.explained_variance_ratio_.sum()

    elif dim_reduce == "chi2":
        print "Performing feature selection based on chi2 independence test"
        fselect = SelectKBest(chi2, k=num_dim)
        train_vec = fselect.fit_transform(train_vec, train_data.sentiment)
        # Apply the fitted selector to each test set (the original transformed a single, undefined test_vec).
        test_vec_h = fselect.transform(test_vec_h)
        test_vec_t = fselect.transform(test_vec_t)
        test_vec_c = fselect.transform(test_vec_c)

    # Transform into numpy arrays
    if "numpy.ndarray" not in str(type(train_vec)):
        train_vec = train_vec.toarray()
        test_vec_h = test_vec_h.toarray()
        test_vec_t = test_vec_t.toarray()
        test_vec_c = test_vec_c.toarray()

    # Feature Scaling
    if scaling != "no":
        if scaling == "standard":
            scaler = preprocessing.StandardScaler()
        else:
            if scaling == "unsigned":
                scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
            elif scaling == "signed":
                scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))

        print "Scaling vectors"
        train_vec = scaler.fit_transform(train_vec)
        # Scale each test set (the original scaled a single, undefined test_vec).
        test_vec_h = scaler.transform(test_vec_h)
        test_vec_t = scaler.transform(test_vec_t)
        test_vec_c = scaler.transform(test_vec_c)

    # Model training
    if training_model == "RF" or training_model == "BT":
        # Initialize the Random Forest or bagged tree based on the model chosen
        rfc = RFC(n_estimators=100, oob_score=True,
                  max_features=(None if training_model == "BT" else "auto"))
        print "Training %s" % ("Random Forest" if training_model == "RF" else "bagged tree")
        rfc = rfc.fit(train_vec, train_data.sentiment)
        print "OOB Score =", rfc.oob_score_
        pred = rfc.predict(test_vec)  # NOTE: still refers to a single test_vec

    elif training_model == "NB":
        nb = naive_bayes.MultinomialNB()
        cv_score = cross_val_score(nb, train_vec, train_data.sentiment, cv=10)
        print "Training Naive Bayes"
        print "CV Score = ", cv_score.mean()
        nb = nb.fit(train_vec, train_data.sentiment)
        pred = nb.predict(test_vec)  # NOTE: still refers to a single test_vec

    elif training_model == "SVM":
        svc = svm.LinearSVC()
        # svc = svm.SVC(kernel='linear', probability=True)  # seems to take a long time to train
        print 'complete 0'
        param = {'C': [1e15, 1e13, 1e11, 1e9, 1e7, 1e5, 1e3, 1e1, 1e-1, 1e-3, 1e-5]}
        print "Training SVM"

        if tune_parameter == True:
            svc = GridSearchCV(estimator=svc, param_grid=param, cv=10)

        # The next line enables probability estimates for LinearSVC
        svc = CalibratedClassifierCV(svc)
        # print 'complete 1'

        sentiment_array = []
        for sent in train_data.sentiment:
            sentiment_array.append(sent)
        if Mix == True:
            for sent in train_data2.sentiment:
                sentiment_array.append(sent)

        svc = svc.fit(train_vec, sentiment_array)
        # svc = svc.fit(train_vec, train_data.sentiment)
        print 'complete 2'

        # pred_t = svc.predict(test_vec_t)
        # pred_h = svc.predict(test_vec_h)
        # pred_c = svc.predict(test_vec_c)
        # pred_proba_t = svc.predict_proba(test_vec_t)
        # pred1 = svc.predict_proba(test_vec)
        # print(pred1)
        # print(pred_proba_t)

        print('Accuracy on "cftwt.csv" dataset:')
        evaluate_on_testdata(test_vec_t, svc, ground_truth_t)
        print('Accuracy on "cfdata.csv" dataset:')
        evaluate_on_testdata(test_vec_h, svc, ground_truth_h)
        print('Accuracy on "deft.csv" dataset:')
        evaluate_on_testdata(test_vec_c, svc, ground_truth_c)

        print('training dataset is : ')
        if Mix:
            print "used Mixed datasets"
        print trainFile

        if tune_parameter == True:
            print "Optimized parameters:", svc.best_estimator_  # best parameters found by GridSearchCV
            print "Best CV score:", svc.best_score_

        # filename = vector_type + 'finalized_model.pkl'
        # s = pickle.dump(svc, open(filename, 'wb'))

    # Output the results
    if write_to_csv:
        output = pd.DataFrame(data={"id": test_data.id, "sentiment": pred})
        output.to_csv("data/" + vector_type + "submission.csv", index=False)
clean_test_reviews.append(" ".join(review_to_wordlist(review))) # In[ ]: vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=200000, ngram_range=(1, 4), sublinear_tf=True) vectorizer = vectorizer.fit(clean_train_reviews) train_features = vectorizer.transform(clean_train_reviews) test_features = vectorizer.transform(clean_test_reviews) fselect = SelectKBest(chi2, k=10000) train_features = fselect.fit_transform(train_features, train["Rating"]) test_features = fselect.transform(test_features) # # Machine learning # In[ ]: classifiers = [ ('RandomForestClassifierG', RandomForestClassifier(n_jobs=-1, criterion='gini')), ('RandomForestClassifierE', RandomForestClassifier(n_jobs=-1, criterion='entropy')), ('AdaBoostClassifier', AdaBoostClassifier()), ('ExtraTreesClassifier', ExtraTreesClassifier(n_jobs=-1)), ('DecisionTreeClassifier', DecisionTreeClassifier()), ('LogisticRegression', LogisticRegression()),
vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=200000,
                             ngram_range=(1, 4), sublinear_tf=True)
vectorizer = vectorizer.fit(clean_train_reviews + unlabeled_clean_train_reviews)

train_data_features = vectorizer.transform(clean_train_reviews)
test_data_features = vectorizer.transform(clean_test_reviews)

print "Reducing dimension..."

from sklearn.feature_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2, k=70000)
train_data_features = fselect.fit_transform(train_data_features, train["sentiment"])
test_data_features = fselect.transform(test_data_features)

print "Training..."

model1 = MultinomialNB(alpha=0.0005)
model1.fit(train_data_features, train["sentiment"])

model2 = SGDClassifier(loss='modified_huber', n_iter=5, random_state=0, shuffle=True)
model2.fit(train_data_features, train["sentiment"])

p1 = model1.predict_proba(test_data_features)[:, 1]
p2 = model2.predict_proba(test_data_features)[:, 1]
    predsFM = model.predict(sparse_merge_test)
    print('[{}] Predict FM completed'.format(time.time() - start_time))
else:
    for i in range(rounds):
        model.fit(sparse_merge_train, y_train)
        predsFM = model.predict(sparse_merge_test)
        print('[{}] Iteration {}/{} -- RMSLE: {}'.format(time.time() - start_time, i + 1, rounds,
                                                         rmse(predsFM, y_test)))

del model
gc.collect()
if not SUBMIT_MODE:
    print("FM_FTRL dev RMSLE:", rmse(predsFM, y_test))

fselect = SelectKBest(f_regression, k=48000)
train_features = fselect.fit_transform(sparse_merge_train, y_train)
test_features = fselect.transform(sparse_merge_test)
print('[{}] Select best completed'.format(time.time() - start_time))

del sparse_merge_train
del sparse_merge_test
gc.collect()
print('[{}] Garbage collection'.format(time.time() - start_time))

tv = TfidfVectorizer(max_features=250000,
                     ngram_range=(1, 3),
                     stop_words=None)
X_name_train = tv.fit_transform(df_train['name'])
print('[{}] Finished TFIDF vectorize `name` (1/2)'.format(time.time() - start_time))
                                       bootstrap=False, oob_score=False, n_jobs=10,
                                       random_state=None, verbose=0, warm_start=False,
                                       class_weight=None)
    selector = RFECV(estimator, step=1, cv=5, scoring='log_loss')
    X_train = selector.fit_transform(X_train, train_labels)
    print 'after feature elimination', X_train.shape
    X_test = selector.transform(X_test)

do_feature_selection = False
if do_feature_selection:
    ch2 = SelectKBest(chi2, k=4000)
    X_train = ch2.fit_transform(X_train, train_labels)
    X_test = ch2.transform(X_test)

do_pca = False
if do_pca:
    k = 100
    add_pca_to_original = True
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    pca = PCA(n_components=k, copy=True, whiten=False)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    if add_pca_to_original:
        X_train = np.hstack((X_train, X_train_pca))
        X_test = np.hstack((X_test, X_test_pca))
corpus = []
for question in df[0]:
    corpus.append(" ".join(text_to_wordlist(question, remove_stopwords=False)))

vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=5000,
                             ngram_range=(1, 4), sublinear_tf=True)
vectorizer = vectorizer.fit(corpus)
train_features = vectorizer.transform(corpus)

fselect = SelectKBest(chi2, k=1000)
train_features = fselect.fit_transform(train_features, cat)

# using XGBoost to train our model
params = {
    'objective': 'multi:softmax',
    'eval_metric': 'merror',
    'eta': 0.025,
    'max_depth': 9,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 5,
    'num_class': 5,
    'silent': 1
}

import xgboost as xgb
idx_start += N_test

print X_train.shape, y_train.shape
print X_test.shape, y_test.shape
print "start classification"

# vectorization
vectorizer = TfidfVectorizer(strip_accents="unicode", ngram_range=(1, 1))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# feature reduction
ch2 = SelectKBest(chi2, k="all")
# fit_transform fits the selector and transforms the training data in one step;
# the separate fit() call in the original was redundant.
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

# training
clf = LinearSVC()
clf.fit(X_train, y_train)

if validation_mode == "train":
    X_test = X_train
    y_test = y_train

# predict categories
predicted = clf.predict(X_test)
print numpy.mean(predicted == y_test)
print metrics.classification_report(y_test, predicted)
train_vec = count_vec.fit_transform(train_list)
test_vec = count_vec.transform(test_list)

# Dimension Reduction
if dim_reduce == "SVD":
    print "Performing dimension reduction"
    svd = TruncatedSVD(n_components=num_dim)
    train_vec = svd.fit_transform(train_vec)
    test_vec = svd.transform(test_vec)
    print "Explained variance ratio =", svd.explained_variance_ratio_.sum()

elif dim_reduce == "chi2":
    print "Performing feature selection based on chi2 independence test"
    fselect = SelectKBest(chi2, k=num_dim)
    train_vec = fselect.fit_transform(train_vec, train_data.sentiment)
    test_vec = fselect.transform(test_vec)

# Transform into numpy arrays
if "numpy.ndarray" not in str(type(train_vec)):
    train_vec = train_vec.toarray()
    test_vec = test_vec.toarray()

# Feature Scaling
if scaling != "no":
    if scaling == "standard":
        scaler = preprocessing.StandardScaler()
    else:
        if scaling == "unsigned":
test_vec = count_vec.transform(test_list)

# Dimension Reduction
print("Start Dimension Reduction...")
if dim_reduce == "SVD":
    print("Performing dimension reduction")
    svd = TruncatedSVD(n_components=num_dim)
    train_vec = svd.fit_transform(train_vec)
    test_vec = svd.transform(test_vec)
    print("Explained variance ratio =", svd.explained_variance_ratio_.sum())

elif dim_reduce == "chi2":
    print("Performing feature selection based on chi2 independence test")
    fselect = SelectKBest(chi2, k=num_dim)
    train_vec = fselect.fit_transform(train_vec, train_data_y)
    test_vec = fselect.transform(test_vec)

# Transform into numpy arrays
if "numpy.ndarray" not in str(type(train_vec)):
    train_vec = train_vec.toarray()
    test_vec = test_vec.toarray()

# Feature Scaling
if scaling != "no":
    if scaling == "standard":
        scaler = preprocessing.StandardScaler()
    else:
        if scaling == "unsigned":
            scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
print_step('Importing Data 3/13')
tfidf_train2, tfidf_test2 = load_cache('text_tfidf')

print_step('Importing Data 4/13')
tfidf_train3, tfidf_test3 = load_cache('text_char_tfidf')

print_step('Importing Data 5/13')
train = hstack((tfidf_train2, tfidf_train3)).tocsr()
print_step('Importing Data 6/13')
test = hstack((tfidf_test2, tfidf_test3)).tocsr()
print(train.shape)
print(test.shape)

print_step('SelectKBest 1/2')
fselect = SelectKBest(f_regression, k=100000)
train = fselect.fit_transform(train, target)
print_step('SelectKBest 2/2')
test = fselect.transform(test)
print(train.shape)
print(test.shape)

print_step('Importing Data 7/13')
train = hstack((tfidf_train, train)).tocsr()
print_step('Importing Data 8/13')
test = hstack((tfidf_test, test)).tocsr()
print(train.shape)
print(test.shape)

print_step('GC')
del tfidf_test
del tfidf_test2
def dimensionality_reduction(train_vec, test_vec, y_train_data):
    print("Performing feature selection based on chi2 independence test")
    fselect = SelectKBest(chi2, k=4500)
    train_vec = fselect.fit_transform(train_vec, y_train_data)
    test_vec = fselect.transform(test_vec)
    return train_vec, test_vec
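A hypothetical usage sketch for dimensionality_reduction, assuming bag-of-words train/test matrices; the train_texts / test_texts / y_train names below are illustrative, not from the original script.

# Hypothetical usage sketch (assumed data, not from the original script).
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(1, 2))
train_vec = cv.fit_transform(train_texts)   # train_texts / test_texts / y_train assumed to exist
test_vec = cv.transform(test_texts)
# SelectKBest(k=4500) assumes the fitted vocabulary contains at least 4,500 features.
train_vec, test_vec = dimensionality_reduction(train_vec, test_vec, y_train)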
print("Vectorizing input texts") train_vec = count_vec.fit_transform(train_list) test_vec = count_vec.transform(test_list) # Dimension Reduction if dim_reduce == "SVD": print("performing dimension reduction") svd = TruncatedSVD(n_components=num_dim) train_vec = svd.fit_transform(train_vec) test_vec = svd.transform(test_vec) print("Explained variance ratio =", svd.explained_variance_ratio_.sum()) elif dim_reduce == "chi2": print("performing feature selection based on chi2 independce test") fselect = SelectKBest(chi2, k=num_dim) train_vec = fselect.fit_transform(train_vec, train_data.sentiment) test_vec = fselect.transform(test_vec) # Transform into numpy arrays if "numpy.ndarray" not in str(type(train_vec)): train_vec = train_vec.toarray() test_vec = test_vec.toarray() # Feature Scaling if scaling != "no": if scaler == "standard": scaler = preprocessing.StandardScaler() else: if scaling == "unsigned": scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)) elif scaling == "signed":
from sklearn import datasets
from sklearn.feature_selection import SelectKBest, chi2

iris = datasets.load_iris()

# Score all four iris features and keep the two with the highest chi2 statistic.
k_best0 = SelectKBest(score_func=chi2, k=2)
fit = k_best0.fit(iris.data, iris.target)
print(fit.scores_)
features = fit.transform(iris.data)
print(features)

# With k=4 every feature is kept, so fit_transform returns the data unchanged.
k_best1 = SelectKBest(score_func=chi2, k=4)
newX = k_best1.fit_transform(iris.data, iris.target)
print(newX)