def pre_data_flow(flag):
    """
    Pre-process the dataset after it has been split into flows.
    :param flag: which feature set to return
    :return: the selected feature set
    """
    dataset_train_b = np.load("feature_flow/train_black.npy", allow_pickle=True)
    dataset_train_w = np.load("feature_flow/train_white.npy", allow_pickle=True)
    dataset_test_b = np.load("feature_flow/test_black.npy", allow_pickle=True)
    dataset_test_w = np.load("feature_flow/test_white.npy", allow_pickle=True)
    dataset_train = np.vstack((dataset_train_b, dataset_train_w))
    dataset_test = np.vstack((dataset_test_b, dataset_test_w))
    dataset = np.vstack((dataset_train, dataset_test))  # first 6000 rows are the training set, last 4000 the test set
    ip = []
    subject = []
    issue = []
    cipher_version = []
    label = []
    matrix = []
    for key in dataset:
        if key[-5] != 0:
            if key[-2] == 'black':
                label.append(0)
            elif key[-2] == 'white':
                label.append(1)
            else:
                label.append(1)
            ip.append(key[3])
            max_cip_version = 0
            # for tem in key[-11]:
            #     try:
            #         if int(tem) > max_cip_version:
            #             max_cip_version = int(tem)
            #     except ValueError:
            #         max_cip_version = -1
            cipher_version.append(max_cip_version)
            subject.append(Find_first(key[53]))
            issue.append(Find_first(key[54]))
            # print(key[-3].reshape(1,-1))
            # print(key[-3].flatten())
            matrix.append(key[-9].flatten())
    ip_ans = oh_encoding(ip)
    subject_ans = oh_encoding(subject)
    issue_ans = oh_encoding(issue)
    dataset_flow = []
    mean_list = [8, 12, 16, 20, 23, 26, 29, 32]
    from sklearn.feature_selection import VarianceThreshold
    for i in range(len(dataset)):
        feature = []
        if dataset[i][-5] != 0:
            for j in range(0, 3):
                feature.append(float(dataset[i][j]))
            for j in range(4, 51):
                feature.append(float(dataset[i][j]))
            # for j in range(4, 6):
            #     feature.append(float(dataset[i][j]))
            # for j in mean_list:
            #     feature.append(float(dataset[i][j]))
            feature.append(int(find_min((dataset[i][52]))))  # certificate_time
            feature.append(find_self_signed((dataset[i][51])))  # self-signed
            dataset_flow.append(feature)
    from sklearn.preprocessing import MinMaxScaler
    select = VarianceThreshold(threshold=0)
    dataset_flow = select.fit_transform(dataset_flow)
    minMax = MinMaxScaler()
    dataset_flow = minMax.fit_transform(dataset_flow)
    dataset_mix = np.hstack((dataset_flow, subject_ans, issue_ans, matrix))
    # dataset_mix.append(list(dataset_flow[i]) + (list(issue_ans[i])) + list(subject_ans[i]))
    print("dataset is formed by {}".format(flag))
    dataset_mix = select.fit_transform(dataset_mix)
    num = len(dataset_train_b) + len(dataset_train_w)
    if flag == 'flow':
        return dataset_flow[:num], dataset_flow[num:], label[:num], label[num:]
    elif flag == 'subject':
        return subject_ans[:num], subject_ans[num:], label[:num], label[num:]
    elif flag == 'issue':
        return issue_ans[:num], issue_ans[num:], label[:num], label[num:]
    elif flag == 'matrix':
        return matrix[:num], matrix[num:], label[:num], label[num:]
    elif flag == 'mix':
        return dataset_mix[:num], dataset_mix[num:], label[:num], label[num:]
    else:
        print("select wrong")
cpca = (pca_c.fit(data[CELLS])) train2 = (cpca.transform(train_features[CELLS])) test2 = (cpca.transform(test_features[CELLS])) train_cpca = pd.DataFrame(train2, columns=[f'pca_C-{i}' for i in range(n_comp)]) test_cpca = pd.DataFrame(test2, columns=[f'pca_C-{i}' for i in range(n_comp)]) train_features = pd.concat((train_features, train_cpca), axis=1) test_features = pd.concat((test_features, test_cpca), axis=1) dump(cpca, open('cpca.pkl', 'wb')) print('pca done') from sklearn.feature_selection import VarianceThreshold var_thresh = VarianceThreshold(0.85) #<-- Update data = train_features.append(test_features) data_transformed = var_thresh.fit_transform(data.iloc[:, 4:]) train_features_transformed = data_transformed[:train_features.shape[0]] test_features_transformed = data_transformed[-test_features.shape[0]:] train_features = pd.DataFrame(train_features[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\ columns=['sig_id','cp_type','cp_time','cp_dose']) train_features = pd.concat( [train_features, pd.DataFrame(train_features_transformed)], axis=1) test_features = pd.DataFrame(test_features[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
S_pca = pca.fit_transform(features) ica = FastICA(n_components=3) S_ica = ica.fit_transform(features) rpg = random_projection.GaussianRandomProjection(n_components=3) g_rpg = rpg.fit_transform(features) spg = random_projection.SparseRandomProjection(n_components=3) s_rp = spg.fit_transform(features) threshold = [ .01, .02, .03, .04, .05, .1, .20, .25, .30, .4, .5, .6, .7, .8, .9, 1 ] lvf = VarianceThreshold() t_lvf = lvf.fit_transform(X_train) components = range(1, 31) model = LinearSVC() model.fit(X_train, y_train) baseline = metrics.accuracy_score(model.predict(X_calibrate), y_calibrate) acc = [] def lowV(): for thresh in threshold: lvf = VarianceThreshold(threshold=thresh) spD = lvf.fit_transform(X_train) model = LinearSVC() model.fit(spD, y_train)
    msg_to_poi = my_dataset[person]['from_this_person_to_poi']
    from_msg = my_dataset[person]['from_messages']
    if msg_to_poi != "NaN" and from_msg != "NaN":
        my_dataset[person]['msg_to_poi_ratio'] = msg_to_poi / float(from_msg)
    else:
        my_dataset[person]['msg_to_poi_ratio'] = 0
new_features_list = features_list + ['msg_to_poi_ratio', 'msg_from_poi_ratio']

## Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, new_features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

# Select the best features:
# Remove low-variance features: with threshold = .8 * (1 - .8), boolean features
# that take the same value in more than 80% of samples are dropped
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
features = sel.fit_transform(features)

# Keep only the k highest scoring features
from sklearn.feature_selection import f_classif
k = 7
selector = SelectKBest(f_classif, k=k)
selector.fit_transform(features, labels)
print("Best features:")
scores = zip(new_features_list[1:], selector.scores_)
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
print(sorted_scores)
optimized_features_list = poi_label + list(map(lambda x: x[0], sorted_scores))[0:k]
print(optimized_features_list)
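# A minimal illustrative sketch (not part of the original pipeline; synthetic data)
# of why threshold = .8 * (1 - .8) targets near-constant boolean features: a
# Bernoulli feature with probability p has variance p * (1 - p), so a column that
# repeats the same value in more than 80% of samples falls at or below
# 0.8 * 0.2 = 0.16 and is dropped.
import numpy as np
from sklearn.feature_selection import VarianceThreshold

X_demo = np.array([[0, 1, 0],
                   [0, 1, 1],
                   [0, 1, 0],
                   [0, 0, 1],
                   [0, 1, 0]])  # col 0 constant, col 1 is 1 in 80% of rows, col 2 balanced
demo_sel = VarianceThreshold(threshold=.8 * (1 - .8))
X_demo_reduced = demo_sel.fit_transform(X_demo)
print(demo_sel.variances_)   # [0.   0.16 0.24]
print(X_demo_reduced.shape)  # (5, 1): only the balanced column survives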
# create Random Forest regressors
print('creating a model...')
# create trees to select features
tree_cat = RandomForestRegressor(n_jobs=n_jobs, random_state=1, n_estimators=10,
                                 max_features='sqrt', max_depth=10)
tree_cont = RandomForestRegressor(n_jobs=n_jobs, random_state=1, n_estimators=10,
                                  max_features='sqrt', max_depth=10)

# some feature selection
print('selecting features...')
# use variance threshold to select features
# many of the features are in categories with few vars
selector_variance_cat = VarianceThreshold(threshold=0.1)
X_cat = selector_variance_cat.fit_transform(X_cat)
print('shape of X_cat after variance threshold')
print(X_cat.shape)

# fit a basic tree to rank continuous features by importance
print('fitting tree to continuous data...')
tree_cont.fit(X_cont, y)
feature_importances_cont = tree_cont.feature_importances_
feature_mapping_cont = {importance: idx for idx, importance in
                        enumerate(feature_importances_cont)}
sorted_features_cont = feature_importances_cont.argsort()
sorted_indices_cont = []
print(sorted_features_cont)
for x in sorted_features_cont[:num_features_cont]:
    sorted_indices_cont.insert(0, x)
def variance_threshold(X):
    # drop boolean features that take the same value in more than 80% of samples
    # (Bernoulli variance p * (1 - p) with p = 0.8)
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    return sel.fit_transform(X)
from sklearn.feature_selection import SelectFromModel, RFE from sklearn.ensemble import ExtraTreesClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import cross_val_predict from sklearn.metrics import accuracy_score, f1_score from tpot_metrics import balanced_accuracy_score from sklearn.pipeline import make_pipeline import itertools dataset = sys.argv[1] preprocessor_list = [Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(), PolynomialFeatures(), RobustScaler(), StandardScaler(), FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(), SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(), SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)), RFE(estimator=ExtraTreesClassifier(n_estimators=100))] # Read the data set into memory input_data = pd.read_csv(dataset, compression='gzip', sep='\t').sample(frac=1., replace=False, random_state=42) with warnings.catch_warnings(): warnings.simplefilter('ignore') for (preprocessor, n_estimators, min_weight_fraction_leaf, max_features, criterion) in itertools.product( preprocessor_list, [10, 50, 100, 500, 1000], np.arange(0., 0.51, 0.05), [0.1, 0.25, 0.5, 0.75, 'sqrt', 'log2', None],
def transform(self, X): data = X.copy() selector = VarianceThreshold(threshold=self._threshold) selector.fit(data) return data[data.columns[selector.get_support(indices=True)]]
def __init__(self, conf): UnsupervisedFeatureSelection.__init__(self, conf) self.projection = VarianceThreshold()
def variance_thresholding(self, dataset): selector = VarianceThreshold(threshold=.02) selector.fit(dataset) return selector.get_support(indices=True)
x_columns = X.columns
x_dtypes = X.dtypes
x_str = np.where(x_dtypes == "object")[0]

# convert any string columns to binary columns
X = pd.get_dummies(X, columns=x_columns[x_str])

# In[1]: Model the data

# set up cross validation for time series
tscv = TimeSeriesSplit(n_splits=5)

# set up a machine learning pipeline
pipeline = Pipeline([
    ('var', VarianceThreshold()),
    ('scale', MinMaxScaler()),
    ('model', RandomForestRegressor(n_estimators=50)),
])

# set up the grid search
parameters = {
    'model__max_depth': [6, 9, 12, 15],
    'model__min_samples_leaf': [1, 3, 5, 7],
}
# pass the TimeSeriesSplit object itself so the grid search uses time-ordered folds
grid_search = GridSearchCV(pipeline, parameters, cv=tscv, n_jobs=-1, verbose=0)
def __init__(self, data_set_file, lowercase=False, use_idf=False, developers_dict_file=None, developers_list_file=None): """Constructor The data in the data set file are loaded. The pre-processing techniques, the feature extraction techniques and the feature selection techniques to use are selected. :param data_set_file: The absolute path of the data set file. :type data_set_file: string. :param lowercase: To decide whether or not conversion to lower case should be applied. :type lowercase: boolean. :param use_idf: To decide whether or not the inverse document frequencies of the tf-idf formula should be used. :type use_idf: boolean. :param developers_dict_file: The absolute path of a JSON file containing a mapping of developers names to other strings. Not implemented yet: it should be done later if needed. :type developers_dict_file: string. :param developers_list_file: The absolute path of a JSON file allowing us to filter out the data set based on the names of the developers. Not implemented yet: it should be done later if needed. :type developers_list_file: string. """ super().__init__(developers_dict_file, developers_list_file) np.random.seed(0) # We set the seed self.lowercase = lowercase self.use_idf = use_idf self._pre_processing_steps = [("count", CountVectorizer( \ lowercase=lowercase, token_pattern=u"(?u)\S+")), \ ("tf_idf", TfidfTransformer(use_idf=use_idf, smooth_idf=False))] self._feature_selection_methods = [ ("var_threshold", VarianceThreshold()), ("chi2", SelectPercentile(chi2)), ("f_classif", SelectPercentile(f_classif)), ("mutual_info_classif", SelectPercentile(mutual_info_classif)) ] self._feature_selection_methods_params = [ dict(var_threshold__threshold=[(( (1 - 0)**2) / 12) * i for i in np.arange(0.01, 0.1, 0.02)]), dict(chi2__percentile=list(range(10, 100, 20))), dict(f_classif__percentile=list(range(10, 100, 20))), dict(mutual_info_classif__percentile=list(range(10, 100, 20))) ] self._classifiers_estimators = { \ "Linear SVM": [("clf", LinearSVC(random_state=0))], \ # "MultinomialNB": [("clf", MultinomialNB())], # "LogisticRegression": [("clf", LogisticRegression(random_state=0, class_weight="balanced", multi_class="multinomial", n_jobs=-1))] \ } # Below, there is a dictionary to store the names, the # classifiers used, the parameters sent to the constructor of # the classifiers and the fitted classifiers self._models_cv = {} # Below, there is a dictionary to store the names, the pipelines # used, the parameters sent to the constructor of the feature # selection techniques self._rfe_cv = { "RFECV SVM": [RFECV, { "estimator": LinearSVC(random_state=0), "step": 0.1, "cv": self._tscv, "scoring": accuracy_mrr_scoring_object, "verbose": 10, "n_jobs": -1 }, None], \ # "RFECV Naive Bayes": [RFECV, { # "estimator": MultinomialNB(), # "step": 0.1, # "cv": self._tscv, # "verbose": 1, # "n_jobs": -1 # }, None] } for key, classifier_estimator in self._classifiers_estimators.items(): for i, feature_selection_method in enumerate( self._feature_selection_methods): self._models_cv["GridSearch " + feature_selection_method[0] + " " + key] = [GridSearchCV, { \ "estimator": Pipeline(self._pre_processing_steps + [feature_selection_method] + classifier_estimator), \ "param_grid": self._feature_selection_methods_params[i], \ "n_jobs": -1, \ "iid": False, \ "cv": self._tscv, \ "verbose": 10, \ "error_score": np.array([-1, -1]), \ "scoring": accuracy_mrr_scoring_object }, None] cleaned_results_file_name = "cleaned_feature_selection_" + \ "experiment_results.json" 
self._cleaned_results_file_name = os.path.join( \ self._current_dir, cleaned_results_file_name) self._data_set_file = os.path.join(self._current_dir, \ data_set_file) log_file = os.path.join(self._current_dir, \ "feature_selection_experiment.log") logging.basicConfig(filename=log_file, filemode="w", \ level=logging.DEBUG) self._build_data_set()
def feature_selection(X, p): sel = VarianceThreshold(threshold=p * (1 - p)) print('before feature selection: {} features'.format(X.shape[1])) X_after_feature_selection = sel.fit_transform(X) print('after feature selection: {} features'.format(X_after_feature_selection.shape[1])) return X_after_feature_selection,sel.get_support(indices=True)
def pre_data_1():
    dataset_b, dataset_w = data_read()
    dataset_train = dataset_b[:3000] + dataset_w[:3000]
    dataset_test = dataset_b[3000:] + dataset_w[3000:]
    dataset = dataset_train + dataset_test  # first 6000 rows are the training set, last 4000 the test set
    ip = []
    subject = []
    issue = []
    cipher_version = []
    label = []
    matrix = []
    for key in dataset:
        if key[-2] == 'black':
            label.append(1)
        elif key[-2] == 'white':
            label.append(0)
        else:
            label.append(0)
        ip.append(key[3])
        max_cip_version = 0
        for tem in list_string(key[-11]):
            try:
                if int(tem) > max_cip_version:
                    max_cip_version = int(tem)
            except ValueError:
                max_cip_version = -1
        cipher_version.append(max_cip_version)
        subject.append(find_first(list_string(key[53])))
        issue.append(find_first(list_string(key[54])))
        print(list_string(key[-3]))
        matrix.append(list_string(key[-3]))
    ip_ans = oh_encoding(ip)
    subject_ans = oh_encoding(subject)
    issue_ans = oh_encoding(issue)
    dataset_flow = []
    mean_list = [8, 12, 16, 20, 23, 26, 29, 32]
    from sklearn.feature_selection import VarianceThreshold
    for i in range(len(dataset)):
        feature = []
        for j in range(0, 3):
            feature.append(float(dataset[i][j]))
        for j in range(4, 51):
            feature.append(float(dataset[i][j]))
        # for j in range(4, 6):
        #     feature.append(float(dataset[i][j]))
        # for j in mean_list:
        #     feature.append(float(dataset[i][j]))
        feature.append(int(find_min(list_string(dataset[i][52]))))  # certificate_time
        feature.append(find_self_signed(list_string(dataset[i][51])))  # self-signed
        dataset_flow.append(feature)
    from sklearn.preprocessing import MinMaxScaler
    select = VarianceThreshold(threshold=0)
    dataset_flow = select.fit_transform(dataset_flow)
    minMax = MinMaxScaler()
    dataset_flow = minMax.fit_transform(dataset_flow)
    dataset_mix = []
    for i in range(len(dataset)):
        dataset_mix.append(np.hstack((dataset_flow[i], subject_ans[i])))
        # dataset_mix.append(list(dataset_flow[i]) + (list(issue_ans[i])) + list(subject_ans[i]))
    print("dataset is formed by {}".format("mixed"))
    dataset_mix = select.fit_transform(dataset_mix)
x_train, x_test, y_train, y_test = train_test_split(data.drop("status", axis=1), data["status"],
                                                     random_state=2018, test_size=0.3)

# one-hot encode the non-numeric columns
x_train = x_train.to_dict(orient="records")
x_test = x_test.to_dict(orient="records")
trans = DictVectorizer()
x_train = trans.fit_transform(x_train)
x_test = trans.transform(x_test)

# standardize
trans = StandardScaler(with_mean=False)
x_train = trans.fit_transform(x_train)
x_test = trans.transform(x_test)

# filter out low-variance features
trans = VarianceThreshold(threshold=1)
x_train = trans.fit_transform(x_train)
x_test = trans.transform(x_test)
print(x_train.shape)

estimator = RandomForestClassifier(n_estimators=200, max_depth=80)
# estimator = GradientBoostingClassifier(random_state=10)
# estimator = KNeighborsClassifier(n_neighbors=50)
# estimator = LogisticRegression()
# estimator = XGBClassifier(learning_rate=0.01,
#                           n_estimators=200,        # number of trees
#                           max_depth=30,            # tree depth
#                           min_child_weight=1,      # minimum leaf node weight
#                           gamma=0.,                # penalty coefficient on the number of leaf nodes
#                           subsample=1,             # use all samples for each tree
#                           colsample_btree=1,       # use all features for each tree
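# A minimal sketch (illustrative name, assuming the same DictVectorizer /
# StandardScaler / VarianceThreshold settings as above) of the same preprocessing
# chain written as a single sklearn Pipeline, so the fit/transform steps cannot
# get out of sync between the train and test splits.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier

credit_pipeline = Pipeline([
    ("vectorize", DictVectorizer()),             # one-hot encode the dict records
    ("scale", StandardScaler(with_mean=False)),  # standardize without densifying the sparse matrix
    ("select", VarianceThreshold(threshold=1)),  # drop low-variance features
    ("model", RandomForestClassifier(n_estimators=200, max_depth=80)),
])
# usage sketch: credit_pipeline.fit(x_train, y_train); credit_pipeline.score(x_test, y_test)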
from sklearn.ensemble import GradientBoostingRegressor from sklearn.feature_selection import VarianceThreshold from sklearn.linear_model import RidgeCV from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline, make_union from tpot.builtins import StackingEstimator # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=None) # Average CV score on the training set was:0.9061619988129817 exported_pipeline = make_pipeline( StackingEstimator(estimator=RidgeCV()), VarianceThreshold(threshold=0.001), GradientBoostingRegressor(alpha=0.8, learning_rate=0.1, loss="huber", max_depth=4, max_features=1.0, min_samples_leaf=18, min_samples_split=3, n_estimators=100, subsample=0.6500000000000001)) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
                'optics': OPTICS()})
#methods.update({'dbscan_e{}'.format(i): DBSCAN(eps=i/10) for i in range(1, 10, 1)})

metrics = {'silhouette_score': silhouette_score,
           'davies_bouldin_score': davies_bouldin_score,
           'calinski_harabasz_score': calinski_harabasz_score}

clusters = {k: [] for k in methods.keys()}
metric_measures = pd.DataFrame(columns=list(methods.keys()), index=list(metrics.keys()))

data = pd.read_csv('player_processed.csv', index_col=0).dropna(how="all").fillna(0)
scaler = RobustScaler()
selector = VarianceThreshold(MIN_STD)
reductor = SparseRandomProjection(N_FEATURES, random_state=1)

scaled = scaler.fit_transform(data)
scaled[scaled > 10] = 10
scaled[scaled < -10] = -10
data_scaled = pd.DataFrame(scaled, columns=data.columns, index=data.index)
data_scaled.fillna(0, inplace=True)
data_selected = selector.fit_transform(data_scaled)
# recover the names of the retained columns via the selector's support mask
data_selected = pd.DataFrame(data_selected,
                             columns=data_scaled.columns[selector.get_support()],
                             index=data.index)

corr_matrix = data_selected.corr().abs()
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    VarianceThreshold(threshold=0.24),
    ExtraTreesClassifier(criterion="entropy", max_features=0.16, n_estimators=500))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
else: plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, labels[i], color = 'g', ha = 'center', va = 'center') wine = datasets.load_wine() X = wine.data y = wine.target #In general a good idea is to scale the data scaler = StandardScaler() scaler.fit(X) X=scaler.transform(X) pca = PCA() ica = FastICA() rp = GaussianRandomProjection(n_components=8) fs = VarianceThreshold(threshold=0.1) x_pca = pca.fit_transform(X) x_ica = ica.fit_transform(X) x_rp = rp.fit_transform(X) x_fs = fs.fit_transform(X) fig = plt.figure() # plt.xlim(-1,1) # plt.ylim(-1,1) plt.xlabel("PC{}".format(1)) plt.ylabel("PC{}".format(2)) plt.grid() #Call the function. Use only the 2 PCs. myplot(x_pca[:,0:2],np.transpose(pca.components_[0:2, :]))
def main(): result = {} for _sym in SYMBOLS: dataset = 'data/result/datasets/csv/{}.csv'.format(_sym) df = pd.read_csv(dataset, sep=',', encoding='utf-8', index_col='Date', parse_dates=True) df = df.replace([np.inf, -np.inf], np.nan).dropna() X = df[df.columns.difference(['target', 'target_pct', 'target_label'])] y = df['target'] #print("======"+_sym+"======") #print(X.info()) # Variance Threshold sel = VarianceThreshold() sel.fit_transform(X) sup = sel.get_support() X = X[[name for flag, name in zip(sup, X.columns) if flag]] ## SelectKBest sel = SelectKBest(chi2, k=30) sX = scale(X, scaler='minmax') sel.fit_transform(sX, y) sup = sel.get_support() sX = sX[[name for flag, name in zip(sup, sX.columns) if flag]] ## Recursive Feature Elimination # Create the RFE object and compute a cross-validated score. # The "accuracy" scoring is proportional to the number of correct # classifications # model = SVC(kernel="linear") # rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(2), scoring='accuracy', n_jobs=-1, verbose=1) # rfecv.fit(X, y) # X = X[[name for flag, name in zip(rfecv.support_, X.columns) if flag]] ### Genetic # estimator = MLPClassifier(**{ # 'hidden_layer_sizes': (10, 4), # 'solver': 'lbfgs', # 'learning_rate': 'constant', # 'learning_rate_init': 0.001, # 'activation': 'logistic' # }) estimator = LogisticRegression(solver="liblinear", multi_class="ovr") gscv = GeneticSelectionCV(estimator, cv=2, verbose=1, scoring="accuracy", max_features=30, n_population=50, crossover_proba=0.5, mutation_proba=0.2, n_generations=80, crossover_independent_proba=0.5, mutation_independent_proba=0.05, tournament_size=3, n_gen_no_change=10, caching=True, n_jobs=-1) gscv = gscv.fit(X, y) X = X[[name for flag, name in zip(gscv.support_, X.columns) if flag]] #print(X.columns) # print("[%s] Optimal number of features : %d Set: %s" % (_sym, rfecv.n_features_, ', '.join(X.columns))) # plt.figure() # plt.title(_sym + ' SVC RFECV K=2') # plt.xlabel("Number of features selected") # plt.ylabel("Cross validation score (nb of correct classifications)") # plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_) # plt.show() logger.info("{}: {}".format(_sym, X.columns)) result[_sym] = { 'dataset': dataset, 'columns_genetic_lr_30': [c for c in X.columns], 'columns_kbest_30': [c for c in sX.columns] } return result
                                                    pd_train_dataset_reduced['TARGET'],
                                                    test_size=0.3,
                                                    random_state=0)

print("\t*) TRAIN DATASET DIMENSION")
print(X_train.shape)
print("\t*) TEST DATASET DIMENSION")
print(X_test.shape)

### 3. Using variance threshold from sklearn
'''
Variance threshold from sklearn is a simple baseline approach to feature
selection. It removes all features whose variance doesn't meet some threshold.
By default, it removes all zero-variance features, i.e., features that have
the same value in all samples.
'''
sel = VarianceThreshold(threshold=0)
sel.fit(X_train)  # fit finds the features with zero variance

# get_support is a boolean vector that indicates which features are retained
# if we sum over get_support, we get the number of features that are not constant
print(
    "*) Number of features that are NOT CONSTANT using get_support() on VarianceThreshold"
)
print(sum(sel.get_support()))

# another way of finding non-constant features is like this
print(
    "*) Number of features that are NOT CONSTANT using get_support() on train columns dataset"
)
print(len(X_train.columns[sel.get_support()]))
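# A minimal follow-on sketch (reusing the X_train DataFrame and the fitted
# `sel` selector from above) that lists the constant columns VarianceThreshold
# would drop, by inverting the boolean get_support() mask.
constant_columns = X_train.columns[~sel.get_support()]
print("*) CONSTANT features that would be dropped")
print(list(constant_columns))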
def main(): # Get data X_train, y_train_log, X_test, id_test = get_input(debug) # Remove constant columns variance_checker = VarianceThreshold(threshold=0.0) xtrain = variance_checker.fit_transform(X_train) xtest = variance_checker.transform(X_test) # Remove duplicate columns unique_transformer = UniqueTransformer() unique_transformer.fit(X_train) xtrain = unique_transformer.transform(X_train) xtest = unique_transformer.transform(X_test) # Define feature union data_union = FeatureUnion([ ('pca', PCA(n_components=100)), ('ct-2', ClassifierTransformer(get_rfc(), n_classes=2, cv=5)), ('ct-3', ClassifierTransformer(get_rfc(), n_classes=3, cv=5)), ('ct-4', ClassifierTransformer(get_rfc(), n_classes=4, cv=5)), ('ct-5', ClassifierTransformer(get_rfc(), n_classes=5, cv=5)), ('st', StatsTransformer(verbose=2)) ]) # Transform data data_union.fit(X=xtrain, y=y_train_log) print '\nCreating processed training set...\n' train_data = data_union.transform(xtrain) print '\nCreating processed test set...\n' test_data = data_union.transform(xtest) # Scale data xdata = np.concatenate([train_data, test_data], axis=0) scaler = StandardScaler() xdata_scaled = scaler.fit_transform(X=xdata) train_scaled = xdata_scaled[:len(X_train), :] test_scaled = xdata_scaled[len(X_train):, :] # Load KLIEP importance weights if debug: cs_path = './covariate_shift/debug_cs_weights_v1/' else: cs_path = './covariate_shift/full_cs_weights_v1/' cs_temp = '0_width%s_numk%s.pickle'%(gw_val, num_kernels) weight_path = cs_path + cs_temp if os.path.exists(weight_path): kliep_set = load_pickle(weight_path) weights = np.array(kliep_set['weights']) # Train XGBoost Regressor # Custom objective function for modified ordinary least squares def kliep_objective(y_true, y_pred): # Get split indexes split_list = copy.deepcopy(kf_list) target_idx = split_list[cv_counter][0] # Calculate 1st and 2nd derivatives grad = np.multiply(weights[target_idx], np.subtract(y_pred, y_true)) hess = weights[target_idx] return grad, hess # Custom evaluation function for RMSLE def rmsle_eval(y_predicted, y_true): labels = y_true.get_label() pred = np.log1p(y_predicted) real = np.log1p(labels) err = np.subtract(pred, real) return 'rmsle', np.sqrt(np.mean(np.square(err))) # XGBoost regressor parameters xgb_params = {'n_estimators': 1000, 'objective': kliep_objective, 'booster': 'gbtree', 'learning_rate': 0.02, 'max_depth': 22, 'min_child_weight': 57, 'gamma' : 1.45, 'alpha': 0.0, # No regularization 'lambda': 0.0, # No regularization 'subsample': 0.67, 'colsample_bytree': 0.054, 'colsample_bylevel': 0.50, 'n_jobs': -1, 'random_state': 456} # Fitting XGB Regressor parameters fit_params = {'early_stopping_rounds': 15, 'eval_metric': rmsle_eval, 'verbose': False} # Define KFold split kf_split = KFold(n_splits=cv_val, shuffle=False, random_state=random_state).split(train_scaled, y_train_log) kf_list = list(kf_split) # Train xgboost regressor reg_kliep = XGBRegressorCV_KLIEP(xgb_params=xgb_params, fit_params=fit_params) reg_kliep.fit(X=train_scaled, y=y_train_log, kf_list=kf_list) # Get predictions y_pred_log = reg_kliep.predict(X=test_scaled) y_pred = np.expm1(y_pred_log) # Format submission submission_path = '../submissions/xgb_kliep_1v0_submit.csv' submission = pd.DataFrame() submission['ID'] = id_test submission['target'] = y_pred # Save submissions submission.to_csv(submission_path, index=False)
import numpy as np
from sklearn.datasets import load_iris

# load the iris dataset
data = load_iris()
# feature matrix and target vector
X = data['data']
y = data['target']

# ## code start
n, d = X.shape  # number of samples, number of features
means = np.mean(X, axis=0)  # per-feature means of X
stds = np.std(X, axis=0)  # per-feature standard deviations of X
# ## code end
print('Number of samples: ' + str(n) + '\nNumber of features: ' + str(d))
print('Per-feature means of the samples:\n')
print(means)
print('Per-feature variances of the samples:\n')
print(stds**2)

# feature selection based on variance
from sklearn.feature_selection import VarianceThreshold
# ## code start
sel = VarianceThreshold(threshold=0.6)  # use 0.6 as the variance threshold
X_new = sel.fit_transform(X)  # features kept after selection
# ## code end
stds_new = np.std(X_new, axis=0)
print('Per-feature variances after selection:\n')
print(stds_new**2)
pipeline_categorical = Pipeline([ ('selector_categorical', ColumnExtractor(columns=categorical_columns)), ('imputer_missing_values', SimpleImputer(missing_values=np.nan, strategy='most_frequent')), ('ClassicMultipleBinarizer', ClassicMultipleBinarizer()) ]) #%% union of pipelines pipeline_features = FeatureUnion([('pipeline_numerical', pipeline_numerical), ('pipeline_categorical', pipeline_categorical)]) #%% pipeline union pipeline_union = Pipeline([ ('preprocessed_data', pipeline_features), ('feature_selection', VarianceThreshold()), ('feature_extraction', PCA(n_components=20)), #11)), ('scaler', StandardScaler()) ]) data_procesada = pipeline_union.fit_transform(df) #%% use RandomizedSearchCV and select the best estimator param_dist_random = { "max_depth": [3, None], "max_features": sp_randint(1, 20), "min_samples_split": sp_randint(2, 11), "min_samples_leaf": sp_randint(1, 11), "bootstrap": [True, False], "criterion": ["gini", "entropy"],
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# load data
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
ids_tr = train.pop('id').values
ids_te = test.pop('id').values
magic_tr = train.pop('wheezy-copper-turtle-magic').values
magic_te = test.pop('wheezy-copper-turtle-magic').values
target = train.pop('target').values
train = train.values
test = test.values

# informative columns for each magic value
vt = VarianceThreshold(threshold=1.5)
infomative_cols = []
for i in range(512):
    vt.fit(train[magic_tr == i])
    infomative_cols.append(vt.get_support(indices=True))

### Step-1 ###
oof_all = []
pred_all = []
for n in range(1, MAX_COMPONENTS + 1):
    oof_n = np.zeros(len(train))
    pred_n = np.zeros(len(test))
    gmm0 = GaussianMixture(n_components=n, covariance_type='full', random_state=RANDOM_SEED)
    gmm1 = GaussianMixture(n_components=n,
total = total + float(tp + tn) / (tp + tn + fp + fn) * 100 return total / len(labels) # train_text,train_classfi_number,train_classfi,train_feature_name = getTargetData("Breast_train.data") # test_text,test_classfi_number,test_classfi,test_feature_name = getTargetData("Breast_test.data") # for i in range(len(train_text)): # for j in range(len(train_text[0])): # train_text[i][j] = float(train_text[i][j]) # print type(train_text[i][j] ) # selector = VarianceThreshold() # data = selector.fit_transform(train_text) # index = selector.get_support(True) # train = data # test = [] # df = pd.DataFrame(test_text) # for line in index: # test.append(df[line]) X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]] selector = VarianceThreshold() selector.fit_transform(X) print selector.get_support() # clf = DecisionTreeClassifier(max_depth=4) # clf = SVC(kernel='rbf', probability=True) # clf.fit(data, train_classfi) # result = clf.predict(test_text)
X_positive = X[:, [-2]]
#------------------------------------------------------------------------------

from minepy import MINE
m = MINE()
save_columns = []
for i in range(0, len(X[0])):
    m.compute_score(X[:, i], y)
    #print(i, m.mic())
    if m.mic() >= 0.1:
        save_columns.append(i)
X = X[:, save_columns]
df = pd.DataFrame(X)

from sklearn.feature_selection import VarianceThreshold
val_selection = VarianceThreshold(threshold=(0.1 * (1 - 0.1)))
X = val_selection.fit_transform(X)

X = np.hstack((X_positive, X))
#------------------------------------------------------------------------------

#----------------------------------Split the dataset---------------------------
X_old = X[:original_len, :]
X_old = np.delete(X_old, [220, 312], axis=0)
y_old = y[:original_len]
y_old = np.delete(y_old, [220, 312], axis=0)
X_new = X[original_len:, :]
#------------------------------------------------------------------------------

#----------------------------------Feature scaling-----------------------------
from sklearn.preprocessing import StandardScaler
def variance(self, X, threshold): from sklearn.feature_selection import VarianceThreshold sel = VarianceThreshold(threshold=(threshold * (1 - threshold))) sel_var = sel.fit_transform(X) X = self.X[X.columns[sel.get_support(indices=True)]] return X
print I.shape X_train = Data y_train = Targets # X_train, X_test, y_train, y_test = \ # train_test_split(Data, Targets, test_size=0.33, random_state=42) # cut.make_cut(X_test) # cut_test = cut.cut # # filter.calculate_prewitt(cut_test) # desc_test = filter.flatten(filter.transformed) pipe = Pipeline([('cut', CenterCutCubes(size_cubes=5)), ('scl', StandardScaler()), ('var', VarianceThreshold(100)), ('pca', PCA()), ('clf', SVR(kernel='linear'))]) param_range_svm = [0.01, 0.1, 1] param_range_cut_left = range(50, 110, 5) param_range_cut_right = [80, 100, 120, 150] param_range_size_cube = [1, 3, 5, 10] gs = GridSearchCV(estimator=pipe, param_grid=[{ 'cut__size_cubes': [5], 'cut__y1': [80], 'cut__x1': [50], 'cut__z1': [50], 'cut__x2': [120], 'cut__y2': [150], 'cut__z2': [100], 'clf__C': [0.1],
def pre_data(flag, type):
    if type == 'flow':
        dataset_b, dataset_w, dataset_t = data_read_flow()
    else:
        dataset_b, dataset_w, dataset_t = data_read()
    dataset = np.vstack((dataset_b, dataset_w, dataset_t))
    print(dataset.shape)
    # first 6000 rows are the training set, last 4000 the test set
    time = []  # 6-21
    payload = []  # 22-34
    tcp_flag = []  # 35-42
    cipher = []  # subject, issue, certificate_time, self_signed, cipher_num(58), cipher(61), cipher_content_ratio(63), cipher_version
    speed = []  # 43-50
    ip = []
    subject = []
    issue = []
    cipher_version = []
    label = []
    matrix = []
    flow = []  # 65
    bitFre = []
    entropy = []
    cipher_bifFre = []
    cipher_entropy = []
    label_e = []
    for key in dataset:
        flow_one = []
        for j in range(0, 3):
            flow_one.append(float(key[j]))
        for j in range(4, 51):
            flow_one.append(float(key[j]))
        certificate_time = int(find_min((key[52])))  # certificate_time
        self_signed = find_self_signed((key[51]))  # self-signed
        flow_one.append(certificate_time)
        flow_one.append(self_signed)
        flow.append(flow_one)
        cipher_one = []
        cipher_one.append(key[58])
        cipher_one.append(key[61])
        cipher_one.append(key[63])
        time.append(key[6:22])
        tcp_flag.append(key[35:43])
        payload.append(key[22:35])
        speed.append(key[43:51])
        if key[-2] == 'black':
            label.append(1)
        elif key[-2] == 'white':
            label.append(0)
        else:
            label.append(0)
        ip.append(key[3])
        max_cip_version = 0
        for tem in key[-11]:
            try:
                if int(tem) > max_cip_version:
                    max_cip_version = int(tem)
            except ValueError:
                max_cip_version = -1
        cipher_version.append(max_cip_version)
        subject_one = Find_first(key[53])
        issue_one = Find_first(key[54])
        cipher_one.append(max_cip_version)
        if key[63] != 0:
            bitFre.append(key[65])
            entropy.append(key[66:70])
            cipher_bifFre.append(key[71])
            cipher_entropy.append(key[73:76])
            if key[-2] == 'black':
                label_e.append(1)
            else:
                label_e.append(0)
        subject.append(subject_one)
        issue.append(issue_one)
        cipher.append(cipher_one)
    ip_ans = oh_encoding(ip)
    subject_ans = oh_encoding(subject)
    issue_ans = oh_encoding(issue)
    cipher = np.hstack((cipher, subject_ans, issue_ans))
    mean_list = [8, 12, 16, 20, 23, 26, 29, 32]
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.preprocessing import MinMaxScaler
    select = VarianceThreshold(threshold=0)
    dataset_flow = select.fit_transform(flow)
    minMax = MinMaxScaler()
    dataset_flow = minMax.fit_transform(dataset_flow)
    # dataset_mix = (np.hstack((flow, subject_ans, issue_ans, matrix)))
    # dataset_mix.append(list(dataset_flow[i]) + (list(issue_ans[i])) + list(subject_ans[i]))
    print("dataset is formed by {}".format(flag))
    ratio = len(dataset_b) + len(dataset_w)
    if flag == 'flow':
        return dataset_flow[:ratio], dataset_flow[ratio:], label[:ratio], label[ratio:]
    elif flag == 'subject':
        return subject_ans[:ratio], subject_ans[ratio:], label[:ratio], label[ratio:]
    elif flag == 'issue':
        return issue_ans[:ratio], issue_ans[ratio:], label[:ratio], label[ratio:]
    elif flag == 'matrix':
        return matrix[:ratio], matrix[ratio:], label[:ratio], label[ratio:]
    elif flag == 'payload':
        return payload[:ratio], payload[ratio:], label[:ratio], label[ratio:]
    elif flag == 'time':
        return time[:ratio], time[ratio:], label[:ratio], label[ratio:]
    elif flag == 'cipher':
        return cipher[:ratio], cipher[ratio:], label[:ratio], label[ratio:]
    elif flag == 'flag':
        return tcp_flag[:ratio], tcp_flag[ratio:], label[:ratio], label[ratio:]
    elif flag == 'speed':
        return speed[:ratio], speed[ratio:], label[:ratio], label[ratio:]
    elif flag == 'bitFre':
        return bitFre, label_e
    elif flag == 'entropy':
        return entropy, label_e
    elif flag == 'cipher_entropy':
        return cipher_entropy, label_e
    elif flag == 'cipher_bitFre':
        return cipher_bifFre, label_e
    else:
        print("select wrong")