def _sort_applicable_ngrams(self, list_of_ngrams, sentences, labels, spacy_nlp):
    """Order candidate ngrams by their usefulness for intent classification.

    Intents with fewer than ``min_intent_examples_for_ngram_classification``
    examples are discarded. The remaining sentences are converted into an
    ngram feature matrix by ``self._ngrams_in_sentences`` and every ngram is
    scored with a randomized logistic regression.

    Returns a numpy array holding the ngrams sorted from highest to lowest
    score, or an empty list when ``list_of_ngrams`` is empty.
    """
    if not list_of_ngrams:
        return []

    from sklearn import linear_model, preprocessing
    import numpy as np

    all_sentences = np.array(sentences)
    all_labels = np.array(labels)

    # Keep only the intents that have enough labeled examples.
    usable_labels = [
        candidate
        for candidate in np.unique(labels)
        if len(all_sentences[all_labels == candidate])
        >= min_intent_examples_for_ngram_classification
    ]
    mask = [label in usable_labels for label in labels]
    sentences = all_sentences[mask]
    labels = all_labels[mask]

    X = np.array(
        self._ngrams_in_sentences(sentences, spacy_nlp, list_of_ngrams))
    encoder = preprocessing.LabelEncoder()
    encoder.fit(labels)
    y = encoder.transform(labels)

    selector = linear_model.RandomizedLogisticRegression(C=1)
    selector.fit(X, y)
    scores = selector.scores_

    # Stable sort: ngrams with equal scores keep their original order.
    sort_idx = sorted(range(len(scores)), key=lambda pos: -1 * scores[pos])
    return np.array(list_of_ngrams)[sort_idx]
def RandomizedLogisticRegression(np_X, np_y):
    """Fit a randomized logistic regression and return its ``scores_``.

    The samples are shuffled with a fixed seed (``random_state=0``) before
    an estimator with 500 resamplings is fitted on a single job.
    """
    # The sparse copy is shuffled alongside the dense data exactly as in the
    # original code, although only the dense matrix is used for fitting.
    sparse_features = coo_matrix(np_X)
    features, sparse_features, targets = shuffle(
        np_X, sparse_features, np_y, random_state=0)

    selector = linear_model.RandomizedLogisticRegression(
        n_jobs=1, n_resampling=500)
    selector.fit(features, targets)
    return selector.scores_
def save_features(self, X, y): feats = dict() print "univariate feature selectors" selector_clf = SelectKBest(score_func=f_classif, k='all') selector_clf.fit(X, y) pvalues_clf = selector_clf.pvalues_ pvalues_clf[np.isnan(pvalues_clf)] = 1 #put feature vectors into dictionary feats['univ_sub01'] = (pvalues_clf < 0.1) feats['univ_sub005'] = (pvalues_clf < 0.05) feats['univ_clf_sub005'] = (pvalues_clf < 0.05) print "randomized logistic regression feature selector" sel_log = linear_model.RandomizedLogisticRegression(random_state=42, n_jobs=4).fit( X, y) #put rand_lasso feats into feature dict feats['rand_logreg'] = sel_log.get_support() print "l1-based feature selectors" X_sp = sparse.coo_matrix(X) sel_svc = svm.LinearSVC(C=0.1, penalty="l1", dual=False, random_state=42).fit(X, y) feats['LinearSVC'] = np.ravel(sel_svc.coef_ > 0) sel_log = linear_model.LogisticRegression(C=0.01, random_state=42).fit( X_sp, y) feats['LogReg'] = np.ravel(sel_log.coef_ > 0) tree_max_features = 20 print "ExtraTrees feature selectors (%s)" % tree_max_features feats['tree'] = np.zeros(len(feats['LogReg'])) tree = ExtraTreesClassifier(n_estimators=250, max_features=tree_max_features) tree.fit(X, y) feature_importance = tree.feature_importances_ feature_importance = 100.0 * (feature_importance / feature_importance.max()) sorted_idx = np.argsort(feature_importance)[::-1] for i in xrange(tree_max_features): feats['tree'][sorted_idx[i]] = 1 feat_sums = np.zeros(len(feats['LogReg'])) for key in feats: feat_sums += feats[key].astype(int) feats[ 'ensemble'] = feat_sums >= 4 #take features which get 5 or more votes joblib.dump(feats, 'features/feats.pkl', compress=3) return feats
def find_better_features(data, truth, regularization=1e5,
                         number_renor_models=200):
    """Fit a randomized logistic regression on ``data`` / ``truth``.

    The training data is resampled ``number_renor_models`` times and a
    logistic regression with ``C=regularization`` is fitted on each
    resampling. The fitted selector is returned.

    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html
    """
    selector = linear_model.RandomizedLogisticRegression(
        C=regularization, n_resampling=number_renor_models)
    return selector.fit(data, truth)
def _sort_applicable_ngrams(self, list_of_ngrams, sentences, labels, spacy_nlp):
    """Order candidate ngrams by their usefulness for intent classification.

    Intents with fewer than
    ``self.min_intent_examples_for_ngram_classification`` examples are
    ignored. The remaining examples are turned into an ngram feature matrix
    and every ngram is scored with a randomized logistic regression.

    Returns a numpy array with the ngrams sorted from highest to lowest
    score. An empty list is returned when there are no ngrams, when fewer
    than two intents have enough examples, or when the random resampling
    happens to pick a slice that contains a single class.
    """
    if not list_of_ngrams:
        return []

    from sklearn import linear_model, preprocessing
    import numpy as np

    all_sentences = np.array(sentences)
    all_labels = np.array(labels)
    threshold = self.min_intent_examples_for_ngram_classification

    # Only intents with enough labeled instances can be cross-validated.
    usable_labels = [
        candidate
        for candidate in np.unique(labels)
        if len(all_sentences[all_labels == candidate]) >= threshold
    ]
    mask = [label in usable_labels for label in labels]

    if not any(mask) or len(usable_labels) < 2:
        # There is no example we can use for the cross validation.
        return []

    try:
        sentences = all_sentences[mask]
        labels = all_labels[mask]

        X = np.array(
            self._ngrams_in_sentences(sentences, spacy_nlp, list_of_ngrams))
        encoder = preprocessing.LabelEncoder()
        encoder.fit(labels)
        y = encoder.transform(labels)

        selector = linear_model.RandomizedLogisticRegression(C=1)
        selector.fit(X, y)
        scores = selector.scores_

        # Stable sort: ngrams with equal scores keep their original order.
        sort_idx = sorted(range(len(scores)),
                          key=lambda pos: -1 * scores[pos])
        return np.array(list_of_ngrams)[sort_idx]
    except ValueError as err:
        if "needs samples of at least 2 classes" in str(err):
            # Unlucky random sampling: the selected slice only contains
            # one class, so nothing can be ranked.
            return []
        raise
def _rank_ngrams_using_cv(self, examples, labels, list_of_ngrams) -> list:
    """Return ``list_of_ngrams`` ordered by randomized-logistic-regression score.

    The examples are converted to an ngram feature matrix with
    ``self._ngrams_in_sentences`` and the labels are encoded with
    ``self.encode_labels``. The ngram with the highest score comes first;
    ngrams with equal scores keep their original relative order.
    """
    from sklearn import linear_model

    features = np.array(self._ngrams_in_sentences(examples, list_of_ngrams))
    targets = self.encode_labels(labels)

    selector = linear_model.RandomizedLogisticRegression(C=1)
    selector.fit(features, targets)

    # Rank the ngram positions by descending classification score.
    scores = selector.scores_
    ranking = sorted(range(len(scores)), key=lambda pos: -1 * scores[pos])
    return [list_of_ngrams[pos] for pos in ranking]
def run_regression(data_file, lead, lag):
    """Score the flattened features of ``data_file`` with randomized logistic regression.

    ``flatten_featureset.create_features`` writes an intermediate CSV whose
    first column is the label and whose remaining columns are the features
    (one header row). The CSV is loaded, removed again, and a
    ``RandomizedLogisticRegression`` is fitted on it.

    Args:
        data_file: input passed through to ``create_features``.
        lead: lead value passed through to ``create_features``.
        lag: lag value passed through to ``create_features``.

    Returns:
        The ``scores_`` attribute of the fitted selector.
    """
    intermediate_file = "prediction/data/tmp.csv"
    try:
        flatten_featureset.create_features(
            intermediate_file, data_file, lead, lag)
        train_data = np.genfromtxt(
            intermediate_file, delimiter=',', skip_header=1)
    finally:
        # Always clean up the intermediate file, even when feature creation
        # or parsing fails, so a stale tmp.csv is never left behind for the
        # next run to pick up.
        if os.path.exists(intermediate_file):
            os.remove(intermediate_file)

    # File format is [label, list_of_features].
    X_train = train_data[:, 1:]
    Y_train = train_data[:, 0]

    logreg = linear_model.RandomizedLogisticRegression()  # n_jobs=12
    logreg.fit(X_train, Y_train)
    return logreg.scores_
def trainModel( pairs, classes, train, drugFeatures, diseaseFeatures, drugFeatureNames, diseaseFeatureNames, model_type, model_fun, n_seed): clf= get_classification_model(model_type, model_fun, n_seed) classes = numpy.array(classes) pairs = numpy.array(pairs) pairs_train = pairs[train] classes_train = classes[train] X_train = createFeatureMat(pairs_train, classes_train, drugFeatures, diseaseFeatures, drugFeatureNames, diseaseFeatureNames, featureMatfile=None) #sel = VarianceThreshold() #print X_train.shape #X_train=sel.fit_transform(X_train) #print X_train.shape randomlr = linear_model.RandomizedLogisticRegression( C=1, random_state=n_seed, selection_threshold=0.1) #sfm = SelectFromModel(clf) randomlr.fit(X_train,classes_train) X_train = randomlr.transform(X_train) print "number of seleceted features",X_train.shape[1] joblib.dump(randomlr,"../data/models/randomlr.pkl") selectedFeatures=randomlr.get_support(indices=True) print selectedFeatures saveSelectedFeatures(drugFeatureNames, diseaseFeatureNames, selectedFeatures) y_train = numpy.array(classes_train) clf.fit(X_train, y_train) return clf
def main(): print "loading data.." traindata = (p.read_table('train.tsv')) tr_title, tr_body, tr_url = convert_text(traindata, 'boilerplate') testdata = p.read_table('test.tsv') ts_title, ts_body, ts_url = convert_text(testdata, 'boilerplate') y = np.array(p.read_table('train.tsv'))[:, -1] internetStopWords = [ 'http', 'www', 'online', 'com', 'jpg', 'static', 'link', 'terminal01', 'user', 'null', 'div', 'span', 'font', 'timestamp', 'content', 'blog' ] stopwords = ENGLISH_STOP_WORDS stopwords = list(stopwords) stopwords = stopwords + internetStopWords X_all = tr_body + ts_body + tr_title + ts_title #use for dummy variables urls = getURL(traindata['url']) #building the model tfv = TfidfVectorizer(min_df=3, stop_words=stopwords, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1) tfdif = wordTFIDF.fit_transform(corpus) Xt = tfdif[:lentrain] rnd = lm.RandomizedLogisticRegression() xrnd = rnd.fit_transform(Xt, y_train) # not working : #X_all = hstack( (xrnd,url) ) # tfv.build_analyzer() rd = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=None) lentrain = len(traindata) print "fitting pipeline" tfv.fit(X_all) print "transforming data" X_all = tfv.transform(X_all) X = X_all[:lentrain] X_test = X_all[lentrain:] print "20 Fold CV Score: ", np.mean( cross_validation.cross_val_score(rd, X, y, cv=20, scoring='roc_auc'))
# Load the bank-loan sample; the last column is the default flag.
file_path = './data/bankloan.xls'
bank_data = pd.read_excel(file_path)
# print(bank_data.head())
# Preview of the first rows (column names translated from Chinese):
#    age  education  years_employed  address  income  debt_ratio  credit_card_debt  other_debt  default
# 0   41          3              17       12     176         9.3         11.359392    5.008608        1
# 1   27          1              10        6      31        17.3          1.362202    4.000798        0
# 2   40          1              15       14      55         5.5          0.856075    2.168925        0
# 3   41          1              15       14     120         2.9          2.658720    0.821280        0
# 4   24          2               2        0      28        17.3          1.787436    3.056564        1

X = bank_data.iloc[:, :8]
y = bank_data.iloc[:, 8]

# Select variables with a randomized logistic regression.
# RandomizedLogisticRegression is scheduled for removal in scikit-learn 0.21.
rlr = linear_model.RandomizedLogisticRegression()
rlr.fit(X, y)
selected_columns = bank_data.columns[0:-1][rlr.get_support()]
print(u'有效特征:%s' % ','.join(selected_columns))

# Fit the final logistic regression on the selected columns only.
# Alternative settings that were considered: penalty='l2', n_jobs=-1, solver='sag'
lr = linear_model.LogisticRegression(solver='lbfgs')
X = bank_data[selected_columns]
lr.fit(X, y)

# Shape seen on the sample data: (700, 4)
print(X.shape)
# Mean accuracy seen on the sample data: 0.8142857142857143
print(u'模型的平均正确率:%s' % lr.score(X, y))
try: tokens[vocabulary[w]] += 1 except KeyError: pass samples.append(tokens) train_samples = samples[:len(samples) - 1000] train_labels = labels[:len(labels) - 1000] test_samples = samples[-1000:] test_labels = labels[-1000:] print len(labels), len(samples), len(samples[0]) if use_randomlogreg: logreg = linear_model.RandomizedLogisticRegression(n_resampling=150) logreg.fit(train_samples, train_labels) indices = logreg.get_support(indices=True) swapped_vocab = dict((v, k) for k, v in vocabulary.iteritems()) print ", ".join([swapped_vocab[i] for i in indices]) exit() else: logreg = linear_model.LogisticRegression() logreg.fit(train_samples, train_labels) print "Done" predictions = logreg.predict(test_samples)
cols = list(df_logit.columns.values)
# The identifier, the target and the age column are not predictors.
for excluded in ('SUBJECT', 'CLASS', 'AGE'):
    cols.remove(excluded)
X = df_logit[cols]
y = df_logit.CLASS

# In[67]:

import warnings

# sklearn uses a deprecated rand function here and the resulting warnings
# clutter the output, so they are silenced for this cell.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    resamplings = 2000
    rlogit = linear_model.RandomizedLogisticRegression(
        n_resampling=resamplings)
    rlogit.fit(X, y)
    print(
        "Features sorted by score, using {} resamplings: ".format(resamplings))
    # Pair every rounded score with its column name, best score first.
    feature_list = sorted(
        ((round(score, 4), name)
         for score, name in zip(rlogit.scores_, cols)),
        reverse=True)
    # Adjust the slice if the last feature printed still has a nonzero score.
    for score, name in feature_list[0:25]:
        print("{}:\t\t\t{:.2f}".format(name, score))

# ### Entire dataset, LASSO for age as interest variable.

# In[68]:

X, y = df[cols], df.AGE
pass print "Finished loading samples" l1 = int(len(samples) * test_size) # train_samples = samples[:l1] # train_labels = labels[:l1] # test_samples = samples[-l1:] # test_labels = labels[-l1:] print len(labels), len(samples), len(samples[0]) if use_randomlogreg: print "Running ranlogreg" logreg = linear_model.RandomizedLogisticRegression( n_resampling=200, selection_threshold=0.25) print "Fitting" logreg.fit(samples, labels) samples = None labels = None print "Swapping vocab" indices = logreg.get_support(indices=True) swapped_vocab = dict((v, k) for k, v in vocabulary.iteritems()) print ", ".join([swapped_vocab[i] for i in indices]) exit() else: logreg = linear_model.LogisticRegression() logreg.fit(train_samples, train_labels)
def bag_feature_selection(X, y, feature_names, CVobj, K_best=None, pct_best=None, percentiles=np.linspace(10,100,10),
                          Cs_l1=np.logspace(-4, -1, 7), C_l2=0.01, rand_L1_params=None, RF_params=None, GBM_params=None,
                          scaling=None, rand_seed=1234, plot=True, save_fig=False, fig_names=None, show=True):
    """
    Bag four different types of feature selection process.
        1. Univariate feature selection (f-score based ranking)
        2. Recursive feature elimination
        3. L1-based feature selection
        4. Tree-based feature selection
    For 1, 2 and 3 an L2 logistic regression is the base model (the linear SVM
    variant is currently disabled); 3 additionally ranks features with a
    randomized logistic regression.
    For 4, random forest and gradient tree boosting are used as base models.
    For each process, a plot of (cross-validation) accuracy vs. number of selected features
    (or regularization parameter for L1-based methods) is produced by the `fs` helpers.

    Input:
        X, y: feature matrix and target; X must expose `.shape[1]`
        feature_names: names passed through to the `fs` ranking helpers
        CVobj: cross validation object
        K_best: K best features to keep
        pct_best: Percent of features to keep
                  (If both K_best and pct_best are provided, pct_best will be suppressed;
                  if neither is provided, all features are kept.)
        percentiles: a list of percents of features to keep and search over
        Cs_l1: a list of C values (1/regularization parameters) to be searched over
        C_l2, RF_params, GBM_params: default parameters for base models (logistic, forest, boosting)
        rand_L1_params: keyword arguments for RandomizedLogisticRegression (defaults built from
                        Cs_l1 and rand_seed when None)
        scaling: None, 'standard' or 'minmax'; any other value leaves X unscaled
        fig_names: a dict where keys and items are figure category and the corresponding
                   filenames to save. Keys read: 'univar_rank', 'univar_CV', 'L1_rank',
                   'L1_path', 'L1_CV', 'feature_imp'. May be None when save_fig is False.

    Return a tuple (ranked_features, feature_ranks) of two dicts, each keyed by
    'univar', 'RFE', 'L1' and 'tree', holding the 'ranked_features' and 'ranks'
    entries reported by the corresponding `fs` helper.

    Raises ValueError when save_fig is True but fig_names is None.
    """
    if fig_names is None:
        if save_fig:
            raise ValueError("fig_names must be provided when save_fig is True")
        # Without file names nothing can be saved; hand None to every helper
        # instead of failing on the first subscript of a None default.
        fig_names = dict.fromkeys(('univar_rank', 'univar_CV', 'L1_rank',
                                   'L1_path', 'L1_CV', 'feature_imp'))

    p = X.shape[1]
    if K_best is not None:
        n_selected = K_best
    elif pct_best is not None:
        n_selected = int(pct_best*0.01*p)
    else:
        n_selected = p

    if scaling == 'standard':
        X = preprocessing.StandardScaler().fit_transform(X)
    elif scaling == 'minmax':
        X = preprocessing.MinMaxScaler().fit_transform(X)

    ranked_features = {}
    feature_ranks = {}

    # Note: The combination of penalty='l2' and loss='hinge' are not supported when dual=False,
    # which is why the linear SVM base model is left out for now.
    clfs = [linear_model.LogisticRegression(C=C_l2, penalty='l2')]
    clf_names = ['logistic regression']

    # ******************* Univariate feature selection ******************* #
    univar_scores = fs.univar_score(X, y, feature_names, K_best=n_selected, criterion='f_score',
                                    plot=plot, save_fig=save_fig, fig_name=fig_names['univar_rank'], show=show)
    ranked_features['univar'] = univar_scores['ranked_features']
    feature_ranks['univar'] = univar_scores['ranks']

    fs.univar_FS_cv(X, y, clfs, clf_names, CVobj, percentiles=percentiles, criterion='f_score',
                    plot=plot, save_fig=save_fig, fig_name=fig_names['univar_CV'], show=show)

    # ******************* Recursive feature elimination ******************* #
    RFE_scores = fs.RFE_rank(X, y, clfs, feature_names, K_best=n_selected)
    ranked_features['RFE'] = RFE_scores['ranked_features']
    feature_ranks['RFE'] = RFE_scores['ranks']
    # Cross-validating RFE (fs.RFE_FS_cv) is skipped on purpose: it takes a very long time to run.

    # ******************* L1-based feature selection ******************* #
    if rand_L1_params is None:
        rand_L1_params = dict(C=Cs_l1, scaling=0.5, sample_fraction=0.75, n_resampling=100,
                              selection_threshold=0.25, random_state=rand_seed, n_jobs=1)
    rand_L1 = linear_model.RandomizedLogisticRegression(**rand_L1_params)
    L1_scores = fs.L1_score(X, y, [rand_L1], ['randomized sparse model'], feature_names, K_best=n_selected,
                            criterion='mean', plot=plot, save_fig=save_fig, fig_name=fig_names['L1_rank'], show=show)
    ranked_features['L1'] = L1_scores['ranked_features']
    feature_ranks['L1'] = L1_scores['ranks']

    # NOTE: For L1 logistic regression, 'newton-cg', 'lbfgs' and 'sag' only handle L2 penalty,
    # so the base model is switched to liblinear before tracing the L1 path.
    clfs[0].set_params(**{'penalty': 'l1', 'solver': 'liblinear'})
    fs.plot_L1_path(X, y, clfs, clf_names, Cs_l1, save_fig=save_fig, fig_name=fig_names['L1_path'], show=show)

    fs.L1_FS_cv(X, y, clfs, clf_names, CVobj, Cs_l1,
                plot=plot, save_fig=save_fig, fig_name=fig_names['L1_CV'], show=show)

    # **************** Tree-based feature selection ************************ #
    # For tree-based methods, raw feature values could be used
    # NOTE(review): 'min_samples_split': 1 is rejected by newer scikit-learn
    # releases -- confirm against the version this project pins.
    if RF_params is None:
        RF_params = {'n_estimators': 1000, 'max_features': 'auto', 'min_samples_split': 1, 'bootstrap': True,
                     'oob_score': True, 'random_state': rand_seed, 'n_jobs': -1}
    if GBM_params is None:
        GBM_params = {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05, 'subsample': 1,
                      'max_features': 'auto', 'min_samples_leaf': 1, 'random_state': rand_seed}

    RF = ensemble.RandomForestClassifier(**RF_params)
    GBM = ensemble.GradientBoostingClassifier(**GBM_params)
    clfs = [RF, GBM]
    clf_names = ['Random Forest', 'Gradient Boosting Machine']

    tree_scores = fs.tree_score(X, y, clfs, clf_names, feature_names, K_best=n_selected, plot=plot,
                                save_fig=save_fig, fig_name=fig_names['feature_imp'], show=show)
    ranked_features['tree'] = tree_scores['ranked_features']
    feature_ranks['tree'] = tree_scores['ranks']
    # Do NOT cross-validate the tree-based selection here; see the docstring of fs.tree_FS_cv.

    return ranked_features, feature_ranks
# Names in sklearn.linear_model that are instantiated without arguments.
_PLAIN_LINEAR_MODELS = frozenset((
    'ARDRegression', 'BayesianRidge', 'ElasticNet', 'ElasticNetCV',
    'HuberRegressor', 'Lars', 'LarsCV', 'Lasso', 'LassoCV', 'LassoLars',
    'LassoLarsCV', 'LassoLarsIC', 'LinearRegression', 'MultiTaskLasso',
    'MultiTaskElasticNet', 'MultiTaskLassoCV', 'MultiTaskElasticNetCV',
    'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV',
    'PassiveAggressiveRegressor', 'RandomizedLasso',
    'RandomizedLogisticRegression', 'RANSACRegressor', 'Ridge', 'RidgeCV',
    'SGDRegressor', 'TheilSenRegressor', 'lars_path', 'lasso_path',
    'lasso_stability_path', 'orthogonal_mp', 'orthogonal_mp_gram',
))

# Names in sklearn.linear_model that are instantiated with class_weight.
_WEIGHTED_LINEAR_MODELS = frozenset((
    'LogisticRegression', 'LogisticRegressionCV',
    'PassiveAggressiveClassifier', 'Perceptron', 'RidgeClassifier',
    'RidgeClassifierCV', 'SGDClassifier', 'logistic_regression_path',
))

# Names taken from sklearn.svm.
_SVM_MODELS = frozenset(('LinearSVC', 'SVC'))


def _build_simple_model(model_type, class_weight):
    """Create the unfitted scikit-learn object registered under ``model_type``."""
    from sklearn import linear_model, svm

    if model_type in _PLAIN_LINEAR_MODELS:
        return getattr(linear_model, model_type)()
    if model_type in _WEIGHTED_LINEAR_MODELS:
        return getattr(linear_model, model_type)(class_weight=class_weight)
    if model_type == 'LinearSVC':
        return svm.LinearSVC(class_weight=class_weight)
    if model_type == 'SVC':
        return svm.SVC(class_weight=class_weight, degree=3)
    raise NotImplementedError('Model not implemented')


def run_simple_model(train_x, train_y, dev_x, dev_y, test_x, test_y, model_type, out_dir=None, class_weight=None):
    """Fit one scikit-learn model and report its train/test accuracy.

    Args:
        train_x, train_y: training features and targets.
        dev_x, dev_y: accepted for interface compatibility; not used.
        test_x, test_y: test features and targets.
        model_type: name of a ``sklearn.linear_model`` estimator, or
            ``'LinearSVC'`` / ``'SVC'``.
        out_dir: when given, the rounded test predictions are written to
            ``<out_dir>/preds/best_test_pred.txt``. When ``None`` nothing is
            written (previously this crashed after training had finished).
        class_weight: forwarded to the classifiers that accept it.

    Returns:
        Accuracy of the rounded predictions on the test set.

    Raises:
        NotImplementedError: if ``model_type`` is not a known name. This is
            checked before any training work starts.
    """
    if (model_type not in _PLAIN_LINEAR_MODELS
            and model_type not in _WEIGHTED_LINEAR_MODELS
            and model_type not in _SVM_MODELS):
        raise NotImplementedError('Model not implemented')

    startTrainTime = time()
    logger.info("Start training...")
    # NOTE(review): the *_path / orthogonal_mp* names are plain functions in
    # scikit-learn, and the Randomized* selectors are not predictors, so
    # these model types are expected to fail here or at predict() --
    # confirm whether they should stay in the supported list.
    model = _build_simple_model(model_type, class_weight).fit(train_x, train_y)
    logger.info("Finished training.")
    endTrainTime = time()
    trainTime = endTrainTime - startTrainTime
    logger.info("Training time : %d seconds" % trainTime)

    logger.info("Start predicting train set...")
    train_pred_y = model.predict(train_x)
    logger.info("Finished predicting train set.")
    logger.info("Start predicting test set...")
    test_pred_y = model.predict(test_x)
    logger.info("Finished predicting test set.")
    endTestTime = time()
    testTime = endTestTime - endTrainTime
    logger.info("Testing time : %d seconds" % testTime)

    # Regressors return continuous values; round them to class labels.
    train_pred_y = np.round(train_pred_y)
    test_pred_y = np.round(test_pred_y)

    if out_dir is not None:
        np.savetxt(out_dir + '/preds/best_test_pred' + '.txt', test_pred_y, fmt='%i')
    else:
        logger.info("No out_dir given; test predictions are not written to disk.")

    test_accuracy = accuracy_score(test_y, test_pred_y)
    logger.info('[TRAIN] Acc: %.3f' % (accuracy_score(train_y, train_pred_y)))
    logger.info('[TEST]  Acc: %.3f' % test_accuracy)

    return test_accuracy