def feature_reduce_f_class_if(X, Y, num_features_to_keep):
    # Score features with the ANOVA F-test (f_classif)
    test = SelectKBest(score_func=f_classif, k=num_features_to_keep)
    fit = test.fit(X, Y)
    # Return the data with reduced features
    return fit.transform(X)
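# A minimal usage sketch for the helper above, assuming synthetic data from
# sklearn.datasets.make_classification (the demo data and names are not part
# of the original snippet):
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif

X_demo, Y_demo = make_classification(n_samples=100, n_features=20, random_state=0)
X_reduced = feature_reduce_f_class_if(X_demo, Y_demo, num_features_to_keep=5)
assert X_reduced.shape == (100, 5)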
def test_select_kbest_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the k best heuristic
    """
    X, Y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_classif, mode='k_best', param=5).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def feature_selection(feat_select, X, y):
    """ Implements various kinds of feature selection """
    # K-best
    if re.match('.*-best', feat_select) is not None:
        n = int(feat_select.split('-')[0])
        selector = SelectKBest(k=n)
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=UserWarning)
            features_selected = np.where(
                selector.fit(X, y).get_support())[0]
    # Random subset of features
    elif re.match('.*-randombest', feat_select) is not None:
        n = int(feat_select.split('-')[0])
        from random import shuffle
        features = list(range(X.shape[1]))  # list() so shuffle works on Python 3
        shuffle(features)
        features_selected = features[:n]

    return features_selected
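# A minimal usage sketch for the string-driven selector above, assuming the
# module-level imports it relies on (re, numpy as np, SelectKBest) are in
# place; the small random data here is an assumption for illustration:
import numpy as np
rng = np.random.RandomState(0)
Xs, ys = rng.rand(50, 10), rng.randint(0, 2, 50)
print(feature_selection('3-best', Xs, ys))        # indices of the 3 top-scoring features
print(feature_selection('3-randombest', Xs, ys))  # 3 randomly chosen feature indices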
def extract(max_gram, feat_dims, save_model=False):
    print "extract feature"
    vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=None,
                                 ngram_range=(1, max_gram), sublinear_tf=True)
    vectorizer = vectorizer.fit(reviews_train + reviews_unsup)
    feats_train_ori = vectorizer.transform(reviews_train)
    feats_test_ori = vectorizer.transform(reviews_test)
    print "size of original train features", feats_train_ori.shape

    for feat_dim in feat_dims:
        print "perform feature selection"
        fselect = SelectKBest(chi2, k=feat_dim)
        feats_train = fselect.fit_transform(feats_train_ori, labels_train)
        feats_test = fselect.transform(feats_test_ori)

        print "save features"
        np.savez("feats/%d_%d.npz" % (max_gram, feat_dim),
                 feats_train=feats_train, feats_test=feats_test,
                 labels_train=labels_train, labels_test=labels_test)

        if save_model:
            print "save models"
            with open("models/vectorizer_%d.pkl" % max_gram, "wb") as fout:
                pickle.dump(vectorizer, fout, -1)
            with open("models/fselect_%d_%d.pkl" % (max_gram, feat_dim), "wb") as fout:
                pickle.dump(fselect, fout, -1)
def feature_reduce(X, Y, num_features_to_keep):
    # Use the chi-squared method to reduce features; note that chi2 requires
    # non-negative feature values (e.g. counts or tf-idf weights)
    test = SelectKBest(score_func=chi2, k=num_features_to_keep)
    fit = test.fit(X, Y)
    # Return the data with reduced features
    return fit.transform(X)
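# A minimal usage sketch; chi2 needs non-negative inputs, so the demo data is
# a small random count matrix (an assumption, not from the source snippet):
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

rng = np.random.RandomState(0)
X_counts = rng.randint(0, 5, size=(30, 8))
y_bin = rng.randint(0, 2, size=30)
print(feature_reduce(X_counts, y_bin, num_features_to_keep=3).shape)  # (30, 3)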
def corr_matrix_of_important_words(term_doc_mat, word_list, scores,
                                   n_features_to_keep):
    selector = SelectKBest(k=n_features_to_keep).fit(term_doc_mat, scores)
    informative_words_index = selector.get_support(indices=True)
    labels = [word_list[i] for i in informative_words_index]
    data = pd.DataFrame(term_doc_mat[:, informative_words_index].todense(),
                        columns=labels)
    data['Score'] = df_one_company.Rating
    return data.corr()
def feature_selection(feat_select, X, y):
    """ Implements various kinds of feature selection """
    # K-best
    if re.match('.*-best', feat_select) is not None:
        n = feat_select.split('-')[0]
        selector = SelectKBest(k=int(n))
        features_selected = np.where(
            selector.fit(X, y).get_support())[0]

    return features_selected
def build_dict_feature_imdb(double_features):
    sentences_train = []
    for ff in tqdm.tqdm(glob.glob(os.path.join(path_train_pos, '*.txt')),
                        desc="train pos"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_train.append(f.readline().strip())
    for ff in tqdm.tqdm(glob.glob(os.path.join(path_train_neg, '*.txt')),
                        desc="train neg"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_train.append(f.readline().strip())

    sentences_test = []
    for ff in tqdm.tqdm(glob.glob(os.path.join(path_test_pos, '*.txt')),
                        desc="test pos"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_test.append(f.readline().strip())
    for ff in tqdm.tqdm(glob.glob(os.path.join(path_test_neg, '*.txt')),
                        desc="test neg"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_test.append(f.readline().strip())

    if model == "svm":
        X_train, vectorizer_fitted = build_dic_svm(sentences_train, double_features)
        X_test, _ = build_dic_svm(sentences_test, double_features, vectorizer_fitted)
        n = X_train.shape[0] // 2  # integer division so the label lists are well-formed
        y_train = [1] * n + [0] * n
        y_test = [1] * n + [0] * n
    elif model == "cnn" or model == "lstm":
        X_train, tokenizer = build_dic_nn(sentences=sentences_train,
                                          double_features=double_features)
        X_test, _ = build_dic_nn(sentences=sentences_test,
                                 double_features=double_features,
                                 tokenizer=tokenizer)
        n = len(X_train) // 2
        y_train = [1] * n + [0] * n
        y_test = [1] * n + [0] * n

    if feature_selection:
        print("Doing feature selection")
        # k was 200000 in every hashing_trick/negation branch, so the
        # distinctions collapse into a single selector
        fselect = SelectKBest(chi2, k=200000)
        X_train = fselect.fit_transform(X_train, y_train)
        X_test = fselect.transform(X_test)

    return X_train, X_test, y_train, y_test
def fit(self, k=100, percent=None):
    selector = SelectKBest(k=k)
    selector.fit(self.doc_vecs.todense(), np.asarray(self.labels))
    scores = selector.scores_
    # Sort feature indices from highest to lowest score, so slicing below
    # keeps the best-scoring words
    indices = np.argsort(scores)[::-1]

    if k is not None:
        select = k
    elif percent is not None:
        select = int(len(scores) * percent)
    else:
        raise ValueError('One of `k` or `percent` parameter must be not None.')

    indices = indices[:select]
    self._filtered_words = [self.words[i] for i in indices]
def test_select_kbest_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the k best heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='k_best', param=5).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def get_best_features(self, data, labels, k=3):
    ''' Using the scikit-learn library, narrow down the feature set. '''
    num_feat = len(data.columns)
    while num_feat > k:
        # Halve the feature count each pass until only k features remain
        num_feat = max(k, num_feat // 2)
        selector = SelectKBest(f_classif, k=num_feat)
        selector.fit(data, labels)
        chosen = selector.get_support()
        if sum(selector.pvalues_[chosen]) > 0:
            data = data[data.columns[chosen]]
        else:
            # Many of our p-vals are zero. Accept all.
            data = data[data.columns[selector.pvalues_ == 0]]
            num_feat = k
    return data.columns
def reduce_dim(vec, num_dim, method, label=None):
    """
    Dimension reduction. Two approaches are provided.
    SVD: truncated SVD maps the feature vectors into a lower-dimensional subspace.
    chi2: the chi-square independence test scores the pairwise dependence
    between features and labels.
    """
    print "Performing dimension reduction"

    # Reduce the dimensions using truncated SVD or a univariate test
    if method == "SVD":
        svd = TruncatedSVD(n_components=num_dim)
        vec = svd.fit_transform(vec)
        # test = svd.transform(vec)
    elif method == "chi2" or method == "f_classif":
        fselect = SelectKBest((chi2 if method == "chi2" else f_classif), k=num_dim)
        vec = fselect.fit_transform(vec, label)
        # test = fselect.transform(vec)

    return vec
def apply_feature_selection(X_train, y_train, X_test, features):
    if CONFIG['preprocessing']['use_feature_selection'] == 'random_forest':
        clf = RandomForestClassifier()
        clf = clf.fit(X_train.toarray(), y_train)
        features_scores = [(feature, score) for (score, feature) in sorted(
            zip(clf.feature_importances_, features), reverse=True)]
        selected_features = features_scores[:CONFIG['preprocessing']['top_features_to_select']]
        selected_indices = np.searchsorted(features, [f[0] for f in selected_features])
        X_train = X_train[:, selected_indices]
        X_test = X_test[:, selected_indices]
        return X_train, y_train, X_test, selected_features

    if CONFIG['preprocessing']['use_feature_selection'] == 'chi2':
        algorithm = chi2
    elif CONFIG['preprocessing']['use_feature_selection'] == 'ANOVA':
        algorithm = f_classif
    else:
        raise ValueError("No implementation for "
                         + str(CONFIG['preprocessing']['use_feature_selection']))

    feature_selector = SelectKBest(
        algorithm, k=CONFIG['preprocessing']['top_features_to_select'])
    # fit_transform fits the selector and reduces X_train in a single step
    X_train = feature_selector.fit_transform(X_train, y_train)
    X_test = feature_selector.transform(X_test)
    features = [(feature, score) for (score, feature) in
                sorted(zip(feature_selector.scores_, features), reverse=True)]
    selected_features = features[:CONFIG['preprocessing']['top_features_to_select']]
    return X_train, y_train, X_test, selected_features
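# A minimal usage sketch with a hypothetical CONFIG: the real structure comes
# from the surrounding project, so only the two keys read above are mocked,
# along with small random non-negative data (chi2 requires non-negativity):
import numpy as np

CONFIG = {'preprocessing': {'use_feature_selection': 'chi2',
                            'top_features_to_select': 5}}
rng = np.random.RandomState(0)
Xtr, Xte = rng.rand(40, 10), rng.rand(10, 10)
ytr = rng.randint(0, 2, 40)
feats = np.array(['f%d' % i for i in range(10)])
Xtr2, ytr2, Xte2, top = apply_feature_selection(Xtr, ytr, Xte, feats)
print([f for f, s in top])  # names of the 5 highest-scoring features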
def build_dict_feature_spd(double_features):
    sentences_pos = []
    ff = os.path.join(dataset_path_spd, 'rt-polarity_utf8.pos')
    with io.open(ff, 'r', encoding='UTF-8') as f:
        for line in tqdm.tqdm(f, desc="sentences pos"):
            # time.sleep(0.001)
            sentences_pos.append(line)

    sentences_neg = []
    ff = os.path.join(dataset_path_spd, 'rt-polarity_utf8.neg')
    with io.open(ff, 'r', encoding='UTF-8') as f:
        for line in tqdm.tqdm(f, desc="sentences neg"):
            # time.sleep(0.001)
            sentences_neg.append(line)

    sentences = sentences_pos + sentences_neg
    y = [1] * len(sentences_pos) + [0] * len(sentences_neg)
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.2, random_state=58)

    if model == "svm":
        X_train, vectorizer = build_dic_svm(sentences_train, double_features)
        X_test, _ = build_dic_svm(sentences_test, double_features, vectorizer)
    elif model == "cnn" or model == "lstm":
        X_train, tokenizer = build_dic_nn(sentences=sentences_train,
                                          double_features=double_features)
        X_test, _ = build_dic_nn(sentences=sentences_test,
                                 double_features=double_features,
                                 tokenizer=tokenizer)

    if feature_selection:
        print("Doing feature selection")
        # hashing_trick and negation both used k=9500; plain features use k=8500
        if hashing_trick or negation:
            fselect = SelectKBest(chi2, k=9500)
        else:
            fselect = SelectKBest(chi2, k=8500)
        X_train = fselect.fit_transform(X_train, y_train)
        X_test = fselect.transform(X_test)

    return X_train, X_test, y_train, y_test
print_step('Importing Data 3/13')
tfidf_train2, tfidf_test2 = load_cache('text_tfidf')

print_step('Importing Data 4/13')
tfidf_train3, tfidf_test3 = load_cache('text_char_tfidf')

print_step('Importing Data 5/13')
train = hstack((tfidf_train2, tfidf_train3)).tocsr()
print_step('Importing Data 6/13')
test = hstack((tfidf_test2, tfidf_test3)).tocsr()
print(train.shape)
print(test.shape)

print_step('SelectKBest 1/2')
fselect = SelectKBest(f_regression, k=100000)
train = fselect.fit_transform(train, target)
print_step('SelectKBest 2/2')
test = fselect.transform(test)
print(train.shape)
print(test.shape)

print_step('Importing Data 7/13')
train = hstack((tfidf_train, train)).tocsr()
print_step('Importing Data 8/13')
test = hstack((tfidf_test, test)).tocsr()
print(train.shape)
print(test.shape)

print_step('GC')
del tfidf_test
'loader__loader': 'bids-meg',
'loader__bids_win': '700',
'loader__task': 'reftep',
'loader__load_fx': 'reftep-iplv',
'fetch__subject_names': ['sub-1'],
'fetch__prepro': [Transformer()],
'prepro': ['sample_slicer', 'target_transformer'],
'target_transformer__fx': lambda x: np.log(x),
'balancer__attr': 'all',
'estimator': [('fsel', SelectKBest(k=50, score_func=f_regression)),
              ('clf', SVR(C=1, kernel='linear'))],
'cv': ShuffleSplit,
'cv__n_splits': 10,
# 'cv__test_size': 0.25,
'analysis__scoring': ['r2', 'explained_variance'],
'analysis': RoiRegression,
'analysis__n_jobs': -1,
'analysis__permutation': 0,
'analysis__verbose': 0,
for review in test['Reviews']:
    clean_test_reviews.append(" ".join(review_to_wordlist(review)))

# In[ ]:

vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=200000,
                             ngram_range=(1, 4), sublinear_tf=True)
vectorizer = vectorizer.fit(clean_train_reviews)
train_features = vectorizer.transform(clean_train_reviews)
test_features = vectorizer.transform(clean_test_reviews)

fselect = SelectKBest(chi2, k=10000)
train_features = fselect.fit_transform(train_features, train["Rating"])
test_features = fselect.transform(test_features)

# # Machine learning

# In[ ]:

classifiers = [
    ('RandomForestClassifierG', RandomForestClassifier(n_jobs=-1, criterion='gini')),
    ('RandomForestClassifierE', RandomForestClassifier(n_jobs=-1, criterion='entropy')),
    ('AdaBoostClassifier', AdaBoostClassifier()),
    ('ExtraTreesClassifier', ExtraTreesClassifier(n_jobs=-1)),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
def validate(params):
    transf_type = params['transf_type']

    if transf_type == 'drop':
        transf = FunctionTransformer(drop_transform, validate=False)
    elif transf_type == 'dr+inp+sc+pca':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PCA(n_components=params['n_pca_components']),
        )
    elif transf_type == 'dr+inp':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
        )
    elif transf_type == 'dr+inp+sc':
        transf = make_pipeline(drop_transform, SimpleImputer(), StandardScaler())
    elif transf_type == 'union':
        transf = create_union_transf(params)
    elif transf_type == 'poly_kbest':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PolynomialFeatures(degree=2, interaction_only=True),
            SelectKBest(f_regression, k=params['best_features']),
        )
    else:
        raise AttributeError(f'unknown transformer type: {transf_type}')

    est_type = params['est_type']
    if est_type == 'xgboost':
        est = create_xgb_est(params)
    elif est_type == 'gblinear':
        est = create_gblinear_est(params)
    elif est_type == 'exttree':
        est = ExtraTreesRegressor(n_estimators=params['n_estimators'], n_jobs=-1)
    elif est_type == 'gp':
        est = GaussianProcessRegressor()
    elif est_type == 'ridge':
        est = Ridge(alpha=params['alpha'])
    else:
        raise AttributeError(f'unknown estimator type: {est_type}')

    if params['bagging']:
        # Wrap the base estimator in a bagging ensemble
        est = BaggingRegressor(est, n_estimators=params['n_bag_estimators'],
                               max_features=1., max_samples=1.)

    pl = make_pipeline(transf, est)

    if params['per_group_regr']:
        pl = PerGroupRegressor(estimator=pl,
                               split_condition=['os', 'cpuFreq', 'memSize_MB'],
                               n_jobs=1, verbose=1)

    return cv_test(pl, n_folds=params['n_folds'])
    '_train.npz')
# filepaths.append(feature_set_path + 'bigramOnlyTfidfWordData' + tag + '_train.npz')
# filepaths.append(feature_set_path + 'trigramOnlyBinaryWordData' + tag + '_train.npz')
# filepaths.append(feature_set_path + 'trigramOnlyTfidfWordData' + tag + '_train.npz')

for file in filepaths:
    print file
    print tag
    Xn = csr_matrix(np.array((0, 0)))
    yn = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '_train.npy')
    print Counter(yn)
    Xn = load_sparse_csr(file)
    Xn = SelectKBest(score_func=chi2,
                     k=min(200000, int(Xn.shape[1] * (perc / 100.0)))).fit_transform(Xn, yn)
    if split:
        sss = StratifiedShuffleSplit(yn, 1, test_size=0.75)
        for train, test in sss:
            Xn, yn = Xn[train], yn[train]
    parameter_tuning(Xn, yn, scale=-1)

if sparse_2_tests:
    filepaths = list()
    # filepaths.append(feature_set_path + 'binaryCharacterData' + tag + '_train.npz')
    # filepaths.append(feature_set_path + 'tfidfCharacterData' + tag + '_train.npz')
    # filepaths.append(feature_set_path + 'binaryCharacterSkipgramData' + tag + '_train.npz')
    print('[{}] Train FM completed'.format(time.time() - start_time))
    predsFM = model.predict(sparse_merge_test)
    print('[{}] Predict FM completed'.format(time.time() - start_time))
else:
    for i in range(rounds):
        model.fit(sparse_merge_train, y_train)
        predsFM = model.predict(sparse_merge_test)
        print('[{}] Iteration {}/{} -- RMSLE: {}'.format(
            time.time() - start_time, i + 1, rounds, rmse(predsFM, y_test)))

del model
gc.collect()
if not SUBMIT_MODE:
    print("FM_FTRL dev RMSLE:", rmse(predsFM, y_test))

fselect = SelectKBest(f_regression, k=48000)
train_features = fselect.fit_transform(sparse_merge_train, y_train)
test_features = fselect.transform(sparse_merge_test)
print('[{}] Select best completed'.format(time.time() - start_time))

del sparse_merge_train
del sparse_merge_test
gc.collect()
print('[{}] Garbage collection'.format(time.time() - start_time))

tv = TfidfVectorizer(max_features=250000, ngram_range=(1, 3), stop_words=None)
X_name_train = tv.fit_transform(df_train['name'])
X_test = scaler.transform(X_test.toarray())

do_feature_elimination = False
if do_feature_elimination:
    estimator = RandomForestClassifier(n_estimators=2000, criterion='entropy',
                                       max_depth=None, min_samples_split=16,
                                       min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                       max_features='auto', max_leaf_nodes=None,
                                       bootstrap=False, oob_score=False, n_jobs=10,
                                       random_state=None, verbose=0,
                                       warm_start=False, class_weight=None)
    selector = RFECV(estimator, step=1, cv=5, scoring='log_loss')
    X_train = selector.fit_transform(X_train, train_labels)
    print 'after feature elimination', X_train.shape
    X_test = selector.transform(X_test)

do_feature_selection = False
if do_feature_selection:
    ch2 = SelectKBest(chi2, k=4000)
    X_train = ch2.fit_transform(X_train, train_labels)
    X_test = ch2.transform(X_test)

do_pca = False
if do_pca:
    k = 100
    add_pca_to_original = True
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    pca = PCA(n_components=k, copy=True, whiten=False)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    if add_pca_to_original:
        X_train = np.hstack((X_train, X_train_pca))
def run(n_jobs):
    path = "/media/robbis/Seagate_Pt1/data/working_memory/"
    conf_file = "%s/data/working_memory.conf" % (path)

    ### Load datasets ###
    iterator_kwargs = {
        "loader__img_pattern": [
            # 'power_parcel.mat',
            'power_normalized.mat',
            # 'connectivity_matrix.mat',
            'mpsi_normalized.mat'
        ],
        "fetch__prepro": [['none'], ['none']],
        "loader__task": ["POWER", "CONN"]
    }

    config_kwargs = {
        'loader': DataLoader,
        'loader__configuration_file': conf_file,
        'loader__loader': 'mat',
        'loader__task': 'POWER',
        # 'fetch__n_subjects': 57,
        "loader__data_path": "%s/data/" % (path),
        "loader__subjects": "%s/data/participants.csv" % (path),
    }

    iterator = AnalysisIterator(iterator_kwargs,
                                AnalysisConfigurator,
                                config_kwargs=config_kwargs,
                                kind='list')
    ds_list = [generate(configurator) for configurator in iterator]

    for i, ds in enumerate(ds_list):
        ds_ = ds.copy()
        if i == 0:
            k = np.arange(1, 88, 10)
            ds_ = DatasetFxNormalizer(ds_fx=np.mean).transform(ds_)
        else:
            k = np.arange(1, 400, 50)
            # ds_ = DatasetFxNormalizer(ds_fx=np.mean).transform(ds_)

        _default_options = {
            # 'sample_slicer__targets': [['0back', '2back'], ['0back', 'rest'], ['rest', '2back']],
            # 'kwargs__ds': ds_list,
            'sample_slicer__targets': [['0back'], ['2back']],
            'target_transformer__attr': [
                'accuracy_0back_both', 'accuracy_2back_both',
                'rt_0back_both', 'rt_2back_both'
            ],
            'sample_slicer__band': [['alpha'], ['beta'], ['theta'], ['gamma']],
            'estimator__fsel__k': k,
            'clf__C': [1, 10, 100],
            'clf__kernel': ['linear', 'rbf']
        }

        _default_config = {
            'prepro': ['sample_slicer', 'target_transformer'],
            'sample_slicer__band': ['gamma'],
            'sample_slicer__targets': ['0back', '2back'],
            'estimator': [('fsel', SelectKBest(score_func=f_regression, k=5)),
                          ('clf', SVR(C=10, kernel='linear'))],
            'estimator__clf__C': 1,
            'estimator__clf__kernel': 'linear',
            'cv': GroupShuffleSplit,
            'cv__n_splits': 75,
            'cv__test_size': 0.25,
            'analysis_scoring': ['r2', 'neg_mean_squared_error'],
            'analysis': RoiRegression,
            'analysis__n_jobs': n_jobs,
            'analysis__permutation': 0,
            'analysis__verbose': 0,
            'kwargs__roi': ['matrix_values'],
            'kwargs__cv_attr': 'subjects',
        }

        iterator = AnalysisIterator(_default_options,
                                    AnalysisConfigurator,
                                    config_kwargs=_default_config)
        for conf in iterator:
            kwargs = conf._get_kwargs()
            a = AnalysisPipeline(conf, name="triton+behavioural").fit(ds_, **kwargs)
            a.save()
            del a
])

ds = loader.fetch(prepro=prepro)

_default_options = {
    'sample_slicer__targets': [['0back', '2back']],
    'sample_slicer__band': [[c] for c in np.unique(ds.sa.band)],
    'estimator__fsel__k': np.arange(1, 1200, 50),
}

_default_config = {
    'prepro': ['sample_slicer'],
    # 'ds_normalizer__ds_fx': np.std,
    'sample_slicer__band': ['gamma'],
    'sample_slicer__targets': ['0back', '2back'],
    'estimator': [('fsel', SelectKBest(k=150)),
                  ('clf', SVC(C=1, kernel='linear'))],
    'estimator__clf__C': 1,
    'estimator__clf__kernel': 'linear',
    'cv': GroupShuffleSplit,
    'cv__n_splits': 75,
    'cv__test_size': 0.25,
    'scores': ['accuracy'],
    'analysis': RoiDecoding,
    'analysis__n_jobs': -1,
    'analysis__permutation': 0,
    'analysis__verbose': 0,
    'kwargs__roi': ['matrix_values'],
    'kwargs__cv_attr': 'subjects',
}
def dimensionality_reduction(train_vec, test_vec, y_train_data):
    print("Performing feature selection based on chi2 independence test")
    fselect = SelectKBest(chi2, k=4500)
    train_vec = fselect.fit_transform(train_vec, y_train_data)
    test_vec = fselect.transform(test_vec)
    return train_vec, test_vec
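# A minimal usage sketch; random non-negative sparse matrices with more than
# 4500 columns stand in for real tf-idf data here (an assumption, since k is
# hard-coded to 4500 above and chi2 requires non-negative inputs):
import numpy as np
import scipy.sparse as sp

rng = np.random.RandomState(0)
train = sp.random(100, 5000, density=0.01, random_state=rng)
test = sp.random(20, 5000, density=0.01, random_state=rng)
y = rng.randint(0, 2, 100)
tr, te = dimensionality_reduction(train, test, y)
print(tr.shape, te.shape)  # (100, 4500) (20, 4500)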
print "Vectorizing input texts" train_vec = count_vec.fit_transform(train_list) test_vec = count_vec.transform(test_list) # Dimemsion Reduction if dim_reduce == "SVD": print "Performing dimension reduction" svd = TruncatedSVD(n_components = num_dim) train_vec = svd.fit_transform(train_vec) test_vec = svd.transform(test_vec) print "Explained variance ratio =", svd.explained_variance_ratio_.sum() elif dim_reduce == "chi2": print "Performing feature selection based on chi2 independence test" fselect = SelectKBest(chi2, k=num_dim) train_vec = fselect.fit_transform(train_vec, train_data.sentiment) test_vec = fselect.transform(test_vec) # Transform into numpy arrays if "numpy.ndarray" not in str(type(train_vec)): train_vec = train_vec.toarray() test_vec = test_vec.toarray() # Feature Scaling if scaling != "no": if scaling == "standard": scaler = preprocessing.StandardScaler() else:
# print('Shape of Y:', Y.shape)
# print('first row: ', Y[0])

# SCORER
scorer = make_scorer(score_func=singleLabelScore, greater_is_better=False)

# PREPROCESSING
# SCALING
minMaxScaler = MinMaxScaler(feature_range=(0.0, 1.0))
# normalizer = skprep.Normalizer()
columnDeleter = fs.FeatureDeleter()

# FEATURE SELECTION
varianceThresholdSelector = VarianceThreshold(threshold=(0))
percentileSelector = SelectPercentile(score_func=f_classif, percentile=20)
kBestSelector = SelectKBest(f_classif, k=1000)

# FEATURE EXTRACTION
# rbmPipe = skpipe.Pipeline(steps=[('scaling', minMaxScaler), ('rbm', rbm)])
nmf = NMF(n_components=150)
pca = PCA(n_components=80)
sparse_pca = SparsePCA(n_components=700, max_iter=3, verbose=2)
kernel_pca = KernelPCA(n_components=150)  # costs huge amounts of RAM
randomized_pca = RandomizedPCA(n_components=500)

# REGRESSORS
random_forest_regressor = RandomForestRegressor(n_estimators=256)
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=60)
support_vector_regressor = svm.SVR()

# CLASSIFIERS
sss = StratifiedShuffleSplit(y, 1, test_size=0.40, random_state=42)
y_train = []
y_test = []
for train, test in sss:
    print train
    np.save('train_vect', train)
    np.save('test_vect', test)
    y_train = y[train]
    y_test = y[test]

processed_comment_list = extract_global_bag_of_words_processed(commentList)
train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
train_list = []
test_list = []
for v in train_v:
    train_list.append(processed_comment_list[v])
for v in test_v:
    test_list.append(processed_comment_list[v])

# train, test, terms = extract_words(CountVectorizer(analyzer=UnigramAnalyzer(), dtype=float), train_list, test_list)
train, test, terms = extract_words(
    CountVectorizer(analyzer=BigramAnalyzer(), dtype=float), train_list, test_list)

selector2 = SelectKBest(score_func=chi2, k=min(50, train.shape[1])).fit(train, y_train)
ind = [zero_based_index for zero_based_index in list(selector2.get_support(indices=True))]
print np.asarray(terms)[selector2.get_support()]
'motor_resp': ["P", "S"], 'evidence': [5] }, #'balancer__balancer': RandomUnderSampler(sampling_strategy={"P": 20, "S": 20}, return_indices=True), 'kwargs__roi_values': [('decision', [1]), ('decision', [2]), ('decision', [3]), ('decision', [4]), ('decision', [5]), ('motor+resp', [1]), ('motor+resp', [2]), ('motor+resp', [3]), ('motor+resp', [4]), ('motor+resp', [5])], } ] _default_config = { 'prepro': ['target_transformer', 'sample_slicer', 'balancer'], "balancer__attr": 'subject', 'estimator': [('fsel', SelectKBest(k=50)), ('clf', SVC(C=1, kernel='linear'))], 'estimator__clf__C': 1, 'estimator__clf__kernel': 'linear', 'cv': LeaveOneGroupOut, 'scores': ['accuracy'], 'analysis': RoiDecoding, 'analysis__n_jobs': -1, 'analysis__permutation': 0, 'analysis__verbose': 0, #'kwargs__roi': labels, #'kwargs__roi_values': [('image+type', [2])], #'kwargs__prepro': ['feature_normalizer', 'sample_normalizer'], 'kwargs__cv_attr': 'subject' }
def main():
    os.chdir("/Users/[email protected]/Desktop/workspace/sentiment.analysis")

    ##################### Initialization #####################

    write_to_csv = False
    tune_parameter = False
    Mix = True

    # term_vector_type = {"TFIDF", "Binary", "Int", "Word2vec", "Word2vec_pretrained"}
    # {"TFIDF", "Int", "Binary"}: bag-of-words model with {tf-idf, word counts, presence/absence} representation
    # {"Word2vec", "Word2vec_pretrained"}: Google word2vec representation {without, with} pre-trained models
    # Specify model_name if there's a pre-trained model to be loaded
    # vector_type = "TFIDF"
    vector_type = 'Word2vec_pretrained'

    # model_name = "selftrainBad.bin"
    model_name = "wiki.fr.vec"

    # model_type = {"bin", "reg"}: specify whether the pre-trained word2vec model is binary
    # model_type = "bin"

    # Parameters for word2vec
    # num_features needs to be identical with the pre-trained model
    num_features = 300    # word vector dimensionality
    min_word_count = 5    # minimum word count to be included for training
    num_workers = 4       # number of threads to run in parallel
    context = 4           # context window size
    downsampling = 1e-3   # downsample setting for frequent words

    # training_model = {"RF", "NB", "SVM", "BT", "no"}
    training_model = "SVM"

    # feature scaling = {"standard", "signed", "unsigned", "no"}
    # Note: scaling is needed for SVM
    scaling = "no"

    # dimension reduction = {"SVD", "chi2", "no"}
    # Note: for NB models we cannot perform truncated SVD, as it makes the input negative
    # chi2 is feature selection based on the chi2 independence test
    dim_reduce = "no"
    num_dim = 200

    ##################### End of Initialization #####################

    print('parameter settings: ')
    print('vector_type: ' + vector_type)
    print('training_model: ' + training_model)
    print('scaling: ' + scaling)
    print('dim_reduce: ' + dim_reduce)

    ########################### Main Program ###########################

    train_list = []
    test_list_t = []
    test_list_h = []
    test_list_c = []
    word2vec_input = []
    train_list2 = []
    pred = []
    language = 'french'
    train_language = 'german'
    test_language = 'french'

    trainFile = train_language + 'TrainData_100k.csv'
    trainFile2 = test_language + 'TrainData_100k.csv'
    testFile_t = test_language + 'TestData_cftwt.csv'
    testFile_h = test_language + 'TestData_cfdata.csv'
    testFile_c = test_language + 'TestData_deft.csv'
    # unlabFile = 'frenchUnlab.csv'

    train_data = pd.read_csv("data/" + trainFile, header=0,
                             delimiter=",", quoting=0)  # , encoding='utf-8')
    if Mix == True:
        train_data2 = pd.read_csv("data/" + trainFile2, header=0,
                                  delimiter=",", quoting=0)
    test_data_t = pd.read_csv("data/" + testFile_t, header=0, delimiter=",", quoting=0)
    test_data_h = pd.read_csv("data/" + testFile_h, header=0, delimiter=",", quoting=0)
    test_data_c = pd.read_csv("data/" + testFile_c, header=0, delimiter=",", quoting=0)
    # unlab_train_data = pd.read_csv("data/" + unlabFile, header=0, delimiter=",", quoting=0)
    if vector_type == "Word2vec":
        unlab_train_data = pd.read_csv("data/frenchUnlabeledTrainData.csv",
                                       header=0, delimiter=",", quoting=0)
        tokenizer = nltk.data.load('tokenizers/punkt/' + language + '.pickle')

    logging.basicConfig(format='%(asctime)s: %(message)s', level=logging.INFO)

    ground_truth_t = test_data_t.sentiment
    ground_truth_h = test_data_h.sentiment
    ground_truth_c = test_data_c.sentiment

    # Extract words from reviews
    # xrange is faster when iterating
    if vector_type == "Word2vec" or vector_type == "Word2vec_pretrained":
        for i in xrange(0, len(train_data.review)):
            if vector_type == "Word2vec":
"Word2vec": # Decode utf-8 coding first word2vec_input.extend(review_to_doublelist(train_data.review[i].decode("utf-8"), language, tokenizer )) # print train_data.id[i] train_list.append(clean_review(train_data.review[i], language, output_format="list" )) #if i%1000 == 0: #print "Cleaning training review", i if Mix == True: for i in xrange(0, len(train_data2.review)): # print train_data.id[i] train_list2.append(clean_review(train_data2.review[i], language, output_format="list" )) #if i%1000 == 0: #print "Cleaning training review", i if vector_type == "Word2vec": for i in xrange(0, len(unlab_train_data.review)): #print unlab_train_data.review[i] word2vec_input.extend(review_to_doublelist(unlab_train_data.review[i].decode("utf-8"), language, tokenizer)) #if i%1000 == 0: #print "Cleaning unlabeled training review", i for i in xrange(0, len(test_data_t.review)): test_list_t.append(clean_review(test_data_t.review[i], language, output_format="list")) #if i%1000 == 0: #print "Cleaning test review", i for i in xrange(0, len(test_data_h.review)): test_list_h.append(clean_review(test_data_h.review[i], language, output_format="list")) #if i%1000 == 0: #print "Cleaning test review", i for i in xrange(0, len(test_data_c.review)): test_list_c.append(clean_review(test_data_c.review[i], language, output_format="list")) #if i%1000 == 0: #print "Cleaning test review", i elif vector_type != "no": for i in xrange(0, len(train_data.review)): # Append raw texts rather than lists as Count/TFIDF vectorizers take raw texts as inputs train_list.append(clean_review(train_data.review[i], language) ) #if i%1000 == 0: # print "Cleaning training review", i for i in xrange(0, len(test_data.review)): # Append raw texts rather than lists as Count/TFIDF vectorizers take raw texts as inputs test_list.append(clean_review(test_data.review[i], language)) #if i%1000 == 0: # print "Cleaning test review", i # Generate vectors from words if vector_type == "Word2vec_pretrained" or vector_type == "Word2vec": if vector_type == "Word2vec_pretrained": print "Loading the pre-trained model" if model_name.endswith == ".bin": #model = word2vec.Word2Vec.load_word2vec_format(model_name, binary=True) model = gensim.models.KeyedVectors.load_word2vec_format(model_name, binary=True , unicode_errors='ignore') else: #model = gensim.models.KeyedVectors.load_word2vec_format(model_name, binary=False , unicode_errors='ignore') train_model = gensim.models.KeyedVectors.load_word2vec_format('wiki.multi.'+ train_language +'.vec', binary=False , unicode_errors='ignore') test_model = gensim.models.KeyedVectors.load_word2vec_format('wiki.multi.'+ test_language +'.vec', binary=False , unicode_errors='ignore') if vector_type == "Word2vec": print "Training word2vec word vectors" model = word2vec.Word2Vec(word2vec_input, workers=num_workers, \ size=num_features, min_count = min_word_count, \ window = context, sample = downsampling) # If no further training and only query is needed, this trims unnecessary memory model.init_sims(replace=True) # Save the model for later use word_vectors = model.wv model.save(model_name) print "Vectorizing training review" train_vec = gen_review_vecs(train_list, train_model, num_features) if Mix == True: train_vec2 = gen_review_vecs(train_list2, test_model, num_features) train_vec = np.append(train_vec , train_vec2 , axis = 0) #train_vec = np.concatenate((train_vec, train_vec2) , axis = 0) print "Vectorizing test review" test_vec_c = gen_review_vecs(test_list_c,test_model, num_features) test_vec_h = 
        test_vec_t = gen_review_vecs(test_list_t, test_model, num_features)

    elif vector_type != "no":
        if vector_type == "TFIDF":
            # Unit of gram is "word", only the top 5000/10000 words are extracted
            count_vec = TfidfVectorizer(analyzer="word", max_features=10000,
                                        ngram_range=(1, 2), sublinear_tf=True)
        elif vector_type == "Binary" or vector_type == "Int":
            count_vec = CountVectorizer(analyzer="word", max_features=10000,
                                        binary=(vector_type == "Binary"),
                                        ngram_range=(1, 2))

        # Return a scipy sparse term-document matrix
        print "Vectorizing input texts"
        train_vec = count_vec.fit_transform(train_list)
        test_vec_h = count_vec.transform(test_list_h)
        test_vec_t = count_vec.transform(test_list_t)
        test_vec_c = count_vec.transform(test_list_c)

    # Dimension Reduction
    if dim_reduce == "SVD":
        print "Performing dimension reduction"
        svd = TruncatedSVD(n_components=num_dim)
        train_vec = svd.fit_transform(train_vec)
        test_vec_h = svd.transform(test_vec_h)
        test_vec_t = svd.transform(test_vec_t)
        test_vec_c = svd.transform(test_vec_c)
        print "Explained variance ratio =", svd.explained_variance_ratio_.sum()
    elif dim_reduce == "chi2":
        print "Performing feature selection based on chi2 independence test"
        fselect = SelectKBest(chi2, k=num_dim)
        train_vec = fselect.fit_transform(train_vec, train_data.sentiment)
        # Each test set is reduced with the selector fitted on the training set
        test_vec_h = fselect.transform(test_vec_h)
        test_vec_t = fselect.transform(test_vec_t)
        test_vec_c = fselect.transform(test_vec_c)

    # Transform into numpy arrays
    if "numpy.ndarray" not in str(type(train_vec)):
        train_vec = train_vec.toarray()
        test_vec_h = test_vec_h.toarray()
        test_vec_t = test_vec_t.toarray()
        test_vec_c = test_vec_c.toarray()

    # Feature Scaling
    if scaling != "no":
        if scaling == "standard":
            scaler = preprocessing.StandardScaler()
        else:
            if scaling == "unsigned":
                scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
            elif scaling == "signed":
                scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))

        print "Scaling vectors"
        train_vec = scaler.fit_transform(train_vec)
        test_vec_h = scaler.transform(test_vec_h)
        test_vec_t = scaler.transform(test_vec_t)
        test_vec_c = scaler.transform(test_vec_c)

    # Model training
    if training_model == "RF" or training_model == "BT":
        # Initialize the Random Forest or bagged tree based on the model chosen
        rfc = RFC(n_estimators=100, oob_score=True,
                  max_features=(None if training_model == "BT" else "auto"))
        print "Training %s" % ("Random Forest" if training_model == "RF" else "bagged tree")
        rfc = rfc.fit(train_vec, train_data.sentiment)
        print "OOB Score =", rfc.oob_score_
        pred = rfc.predict(test_vec)
    elif training_model == "NB":
        nb = naive_bayes.MultinomialNB()
        cv_score = cross_val_score(nb, train_vec, train_data.sentiment, cv=10)
        print "Training Naive Bayes"
        print "CV Score = ", cv_score.mean()
        nb = nb.fit(train_vec, train_data.sentiment)
        pred = nb.predict(test_vec)
    elif training_model == "SVM":
        svc = svm.LinearSVC()
        # svc = svm.SVC(kernel='linear', probability=True)  # seems to take a long time to train
        print 'complete 0'
        param = {'C': [1e15, 1e13, 1e11, 1e9, 1e7, 1e5, 1e3, 1e1, 1e-1, 1e-3, 1e-5]}
        print "Training SVM"
        if tune_parameter == True:
            svc = GridSearchCV(estimator=svc, param_grid=param, cv=10)
        # The next line enables probability estimates
        svc = CalibratedClassifierCV(svc)
        # print 'complete 1'

        sentiment_array = []
        for sent in train_data.sentiment:
            sentiment_array.append(sent)
        if Mix == True:
            for sent in train_data2.sentiment:
                sentiment_array.append(sent)

        svc = svc.fit(train_vec, sentiment_array)
        # svc = svc.fit(train_vec, train_data.sentiment)
        print 'complete 2'

        # pred_t = svc.predict(test_vec_t)
        # pred_h = svc.predict(test_vec_h)
        # pred_c = svc.predict(test_vec_c)
        # pred_proba_t = svc.predict_proba(test_vec_t)
        # pred1 = svc.predict_proba(test_vec)
        # print(pred1)
        # print(pred_proba_t)

        print('Accuracy on "cftwt.csv" dataset:')
        evaluate_on_testdata(test_vec_t, svc, ground_truth_t)
        print('Accuracy on "cfdata.csv" dataset:')
        evaluate_on_testdata(test_vec_h, svc, ground_truth_h)
        print('Accuracy on "deft.csv" dataset:')
        evaluate_on_testdata(test_vec_c, svc, ground_truth_c)
        print('training dataset is: ')
        if Mix:
            print "used mixed datasets"
            print trainFile

        if tune_parameter == True:
            # Print the best parameters found by GridSearchCV
            print "Optimized parameters:", svc.best_estimator_
            print "Best CV score:", svc.best_score_

        # filename = vector_type + 'finalized_model.pkl'
        # s = pickle.dump(svc, open(filename, 'wb'))

    # Output the results
    if write_to_csv:
        output = pd.DataFrame(data={"id": test_data.id, "sentiment": pred})
        output.to_csv("data/" + vector_type + "submission.csv", index=False)
print "Vectorizing..." vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=200000, ngram_range=(1, 3), sublinear_tf=True) vectorizer = vectorizer.fit(opinions) features = vectorizer.transform(opinions) features_test = vectorizer.transform(opinions_test) # In[13]: print "Reducing dimension..." from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif fselect = SelectKBest(chi2, k=10000) # In[14]: train_data_features = fselect.fit_transform(features, article["trend"]) test_data_features = fselect.transform(features_test) # # Train the model # In[128]: print "Training..." model1 = MultinomialNB(alpha=0.0005)
("svc_3",svm.SVC(gamma=.1, degree=3, kernel="rbf", C=10)),]), Pipeline([("rfe_Lsvc", RFE(estimator=svm.LinearSVC(), n_features_to_select=240,step=1)), ("svc_5",svm.SVC(C=1000, gamma=.1, degree=5, kernel="rbf")),]), Pipeline([("rfe_Lsvc", RFE(estimator=svm.LinearSVC(), n_features_to_select=282,step=1)), ("svc",svm.SVC()),]), Pipeline([("85_best",SelectKBest(k=100)), ("svc",svm.SVC(C=.01)),]), Pipeline([("normalize", StandardScaler()), ("grid_search_svm", GridSearchCV( svm.SVC(), { 'C': 10**np.arange(5), 'gamma': [0, 1e-5, 1e-3, 1e-1,], 'kernel': ['linear','rbf'], "degree":range(1,10), }, cv=5, scoring="roc_auc", n_jobs=-1))]), ] if __name__ == "__main__":
    idx_end = idx_start + N_test
    y_test[idx_start:idx_end] = cat
    idx_start += N_test

print X_train.shape, y_train.shape
print X_test.shape, y_test.shape

print "start classification"

# vectorization
vectorizer = TfidfVectorizer(strip_accents="unicode", ngram_range=(1, 1))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# feature reduction (k="all" keeps every feature; lower k to actually reduce)
ch2 = SelectKBest(chi2, k="all")
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

# training
clf = LinearSVC()
clf.fit(X_train, y_train)

if validation_mode == "train":
    X_test = X_train
    y_test = y_train

# predict categories
predicted = clf.predict(X_test)
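# Even with k="all", the fitted selector exposes per-feature chi2 scores; a
# hedged sketch for inspecting the 20 highest-scoring terms, assuming the
# fitted ch2 and vectorizer above (get_feature_names() is the older
# scikit-learn API matching this snippet's vintage):
import numpy as np
top_idx = np.argsort(ch2.scores_)[::-1][:20]
print np.asarray(vectorizer.get_feature_names())[top_idx]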
'RandomizedLasso': RandomizedLasso(),
'RandomizedLogisticRegression': RandomizedLogisticRegression(),
'RandomizedPCA': RandomizedPCA(),
'Ridge': Ridge(),
'RidgeCV': RidgeCV(),
'RidgeClassifier': RidgeClassifier(),
'RidgeClassifierCV': RidgeClassifierCV(),
'RobustScaler': RobustScaler(),
'SGDClassifier': SGDClassifier(),
'SGDRegressor': SGDRegressor(),
'SVC': SVC(),
'SVR': SVR(),
'SelectFdr': SelectFdr(),
'SelectFpr': SelectFpr(),
'SelectFwe': SelectFwe(),
'SelectKBest': SelectKBest(),
'SelectPercentile': SelectPercentile(),
'ShrunkCovariance': ShrunkCovariance(),
'SkewedChi2Sampler': SkewedChi2Sampler(),
'SparsePCA': SparsePCA(),
'SparseRandomProjection': SparseRandomProjection(),
'SpectralBiclustering': SpectralBiclustering(),
'SpectralClustering': SpectralClustering(),
'SpectralCoclustering': SpectralCoclustering(),
'SpectralEmbedding': SpectralEmbedding(),
'StandardScaler': StandardScaler(),
'TSNE': TSNE(),
'TheilSenRegressor': TheilSenRegressor(),
'VBGMM': VBGMM(),
'VarianceThreshold': VarianceThreshold(),
}
vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=200000,
                             ngram_range=(1, 4), sublinear_tf=True)
vectorizer = vectorizer.fit(clean_train_reviews + unlabeled_clean_train_reviews)
train_data_features = vectorizer.transform(clean_train_reviews)
test_data_features = vectorizer.transform(clean_test_reviews)

print "Reducing dimension..."
from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2, k=70000)
train_data_features = fselect.fit_transform(train_data_features, train["sentiment"])
test_data_features = fselect.transform(test_data_features)

print "Training..."
model1 = MultinomialNB(alpha=0.0005)
model1.fit(train_data_features, train["sentiment"])

model2 = SGDClassifier(loss='modified_huber', n_iter=5, random_state=0, shuffle=True)
model2.fit(train_data_features, train["sentiment"])
# Return a scipy sparse term-document matrix
print("Vectorizing input texts")
train_vec = count_vec.fit_transform(train_list)
test_vec = count_vec.transform(test_list)

# Dimension Reduction
if dim_reduce == "SVD":
    print("Performing dimension reduction")
    svd = TruncatedSVD(n_components=num_dim)
    train_vec = svd.fit_transform(train_vec)
    test_vec = svd.transform(test_vec)
    print("Explained variance ratio =", svd.explained_variance_ratio_.sum())
elif dim_reduce == "chi2":
    print("Performing feature selection based on chi2 independence test")
    fselect = SelectKBest(chi2, k=num_dim)
    train_vec = fselect.fit_transform(train_vec, train_data.sentiment)
    test_vec = fselect.transform(test_vec)

# Transform into numpy arrays
if "numpy.ndarray" not in str(type(train_vec)):
    train_vec = train_vec.toarray()
    test_vec = test_vec.toarray()

# Feature Scaling
if scaling != "no":
    if scaling == "standard":
        scaler = preprocessing.StandardScaler()
    else:
        if scaling == "unsigned":
            scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
for review in test['review']:
    clean_test_reviews.append(" ".join(
        KaggleWord2VecUtility.review_to_wordlist(review)))

print "Vectorizing..."
vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, max_features=200000,
                             ngram_range=(1, 4), sublinear_tf=True)
vectorizer = vectorizer.fit(clean_train_reviews + unlabeled_clean_train_reviews)
train_data_features = vectorizer.transform(clean_train_reviews)
test_data_features = vectorizer.transform(clean_test_reviews)

print "Reducing dimension..."
from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2, k=70000)
train_data_features = fselect.fit_transform(train_data_features, train["sentiment"])
test_data_features = fselect.transform(test_data_features)

print "Training..."
model1 = MultinomialNB(alpha=0.0005)
model1.fit(train_data_features, train["sentiment"])

model2 = SGDClassifier(loss='modified_huber', n_iter=5, random_state=0, shuffle=True)
model2.fit(train_data_features, train["sentiment"])

p1 = model1.predict_proba(test_data_features)[:, 1]
p2 = model2.predict_proba(test_data_features)[:, 1]

print "Writing results..."