Example 1
def pre_data_flow(flag):
    """
    将分割为流后的数据集进行一个数据预处理
    :param flag: 返回什么特征集合
    :return: 返回的特征集合
    """
    dataset_train_b = np.load("feature_flow/train_black.npy",
                              allow_pickle=True)
    dataset_train_w = np.load("feature_flow/train_white.npy",
                              allow_pickle=True)
    dataset_test_b = np.load("feature_flow/test_black.npy", allow_pickle=True)
    dataset_test_w = np.load("feature_flow/test_white.npy", allow_pickle=True)
    dataset_train = np.vstack((dataset_train_b, dataset_train_w))
    dataset_test = np.vstack((dataset_test_b, dataset_test_w))
    dataset = np.vstack((dataset_train, dataset_test))

    # first 6000 samples form the training set, the remaining 4000 the test set
    ip = []
    subject = []
    issue = []
    cipher_version = []
    label = []
    matrix = []
    for key in dataset:
        if (key[-5] != 0):
            if key[-2] == 'black':
                label.append(0)
            elif key[-2] == 'white':
                label.append(1)
            else:
                label.append(1)
            ip.append(key[3])
            max_cip_version = 0
            # for tem in key[-11]:
            #     try:
            #         if int(tem) > max_cip_version:
            #             max_cip_version = int(tem)
            #     except ValueError:
            #         max_cip_version = -1
            cipher_version.append(max_cip_version)
            subject.append(Find_first(key[53]))
            issue.append(Find_first(key[54]))
            # print(key[-3].reshape(1,-1))
            # print(key[-3].flatten())
            matrix.append(key[-9].flatten())
    ip_ans = oh_encoding(ip)
    subject_ans = oh_encoding(subject)
    issue_ans = oh_encoding(issue)
    dataset_flow = []
    mean_list = [8, 12, 16, 20, 23, 26, 29, 32]
    from sklearn.feature_selection import VarianceThreshold
    for i in range(len(dataset)):
        feature = []
        if dataset[i][-5] != 0:
            for j in range(0, 3):
                feature.append(float(dataset[i][j]))
            for j in range(4, 51):
                feature.append(float(dataset[i][j]))
            # for j in range(4, 6):
            #     feature.append(float(dataset[i][j]))
            # for j in mean_list:
            #     feature.append(float(dataset[i][j]))
            feature.append(int(find_min((dataset[i][52]))))
            # certificate_time
            feature.append(find_self_signed((dataset[i][51])))
            # self-signed certificate
            dataset_flow.append(feature)
    from sklearn.preprocessing import MinMaxScaler

    select = VarianceThreshold(threshold=0)
    dataset_flow = select.fit_transform(dataset_flow)
    minMax = MinMaxScaler()
    dataset_flow = minMax.fit_transform(dataset_flow)
    dataset_mix = (np.hstack((dataset_flow, subject_ans, issue_ans, matrix)))
    # dataset_mix.append(list(dataset_flow[i]) + (list(issue_ans[i])) + list(subject_ans[i]))
    print("dataset is formed by {}".format(flag))
    dataset_mix = select.fit_transform(dataset_mix)
    num = len(dataset_train_b) + len(dataset_train_w)
    if flag == 'flow':
        return dataset_flow[:num], dataset_flow[num:], label[:num], label[num:]
    elif flag == 'subject':
        return subject_ans[:num], subject_ans[num:], label[:num], label[num:]
    elif flag == 'issue':
        return issue_ans[:num], issue_ans[num:], label[:num], label[num:]
    elif flag == 'matrix':
        return matrix[:num], matrix[num:], label[:num], label[num:]
    elif flag == 'mix':
        return dataset_mix[:num], dataset_mix[num:], label[:num], label[num:]
    else:
        print("select wrong")
Example 2
cpca = (pca_c.fit(data[CELLS]))
train2 = (cpca.transform(train_features[CELLS]))
test2 = (cpca.transform(test_features[CELLS]))

train_cpca = pd.DataFrame(train2,
                          columns=[f'pca_C-{i}' for i in range(n_comp)])
test_cpca = pd.DataFrame(test2, columns=[f'pca_C-{i}' for i in range(n_comp)])

train_features = pd.concat((train_features, train_cpca), axis=1)
test_features = pd.concat((test_features, test_cpca), axis=1)
dump(cpca, open('cpca.pkl', 'wb'))
print('pca done')

from sklearn.feature_selection import VarianceThreshold

var_thresh = VarianceThreshold(0.85)  #<-- Update
data = pd.concat((train_features, test_features))  # DataFrame.append was removed in pandas 2.x
data_transformed = var_thresh.fit_transform(data.iloc[:, 4:])

train_features_transformed = data_transformed[:train_features.shape[0]]
test_features_transformed = data_transformed[-test_features.shape[0]:]


train_features = pd.DataFrame(train_features[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
                              columns=['sig_id','cp_type','cp_time','cp_dose'])

train_features = pd.concat(
    [train_features, pd.DataFrame(train_features_transformed)], axis=1)


test_features = pd.DataFrame(test_features[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
                             columns=['sig_id','cp_type','cp_time','cp_dose'])
Example 3
S_pca = pca.fit_transform(features)

ica = FastICA(n_components=3)
S_ica = ica.fit_transform(features)

rpg = random_projection.GaussianRandomProjection(n_components=3)
g_rpg = rpg.fit_transform(features)

spg = random_projection.SparseRandomProjection(n_components=3)
s_rp = spg.fit_transform(features)

threshold = [
    .01, .02, .03, .04, .05, .1, .20, .25, .30, .4, .5, .6, .7, .8, .9, 1
]

lvf = VarianceThreshold()
t_lvf = lvf.fit_transform(X_train)

components = range(1, 31)
model = LinearSVC()
model.fit(X_train, y_train)
baseline = metrics.accuracy_score(model.predict(X_calibrate), y_calibrate)
acc = []


def lowV():
    for thresh in threshold:
        lvf = VarianceThreshold(threshold=thresh)
        spD = lvf.fit_transform(X_train)
        model = LinearSVC()
        model.fit(spD, y_train)
    msg_to_poi = my_dataset[person]['from_this_person_to_poi']
    from_msg = my_dataset[person]['from_messages']
    if msg_to_poi != "NaN" and from_msg != "NaN":
        my_dataset[person]['msg_to_poi_ratio'] = msg_to_poi / float(from_msg)
    else:
        my_dataset[person]['msg_to_poi_ratio'] = 0
new_features_list = features_list + ['msg_to_poi_ratio', 'msg_from_poi_ratio']

## Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, new_features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

#Select the best features:
#Removes boolean features that keep the same value in more than 80% of the samples
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
features = sel.fit_transform(features)

#Removes all but the k highest scoring features
from sklearn.feature_selection import SelectKBest, f_classif
k = 7
selector = SelectKBest(f_classif, k=7)
selector.fit_transform(features, labels)
print("Best features:")
scores = zip(new_features_list[1:], selector.scores_)
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
print(sorted_scores)
optimized_features_list = poi_label + list(map(lambda x: x[0],
                                               sorted_scores))[0:k]
print(optimized_features_list)
Example 5
# create Random Forest regressors
print('creating a model...')
# create a tree to select features
tree_cat = RandomForestRegressor(n_jobs=n_jobs,
	random_state=1, n_estimators=10,
	max_features='sqrt', max_depth=10)
tree_cont = RandomForestRegressor(n_jobs=n_jobs,
	random_state=1, n_estimators=10,
	max_features='sqrt', max_depth=10)

# some feature selection
print('selecting features...')

# use variance threshold to select features
# many of the features are in categories with few vars
selector_variance_cat = VarianceThreshold(threshold=0.1)
X_cat = selector_variance_cat.fit_transform(X_cat)
print('shape of X_cat after variance threshold')
print(X_cat.shape)

# create a basic tree for continuous features
print('fitting tree to continuous data...')
tree_cont.fit(X_cont, y)
feature_importances_cont = tree_cont.feature_importances_
feature_mapping_cont = {importance:idx for idx, importance in \
	enumerate(feature_importances_cont)}
sorted_features_cont = feature_importances_cont.argsort()
sorted_indices_cont = []
print(sorted_features_cont)
for x in sorted_features_cont[:num_features_cont]:
	sorted_indices_cont.insert(0, x)
Example 6
def variance_three_shold(X):
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    return sel.fit_transform(X)
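# Usage sketch (adapted from the scikit-learn documentation, not part of the
# original snippet): with threshold .8 * (1 - .8), boolean features that keep
# the same value in more than 80% of the samples are removed.
from sklearn.feature_selection import VarianceThreshold

X_bool = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [1, 0, 1]]
# The first column is 0 in 5 of 6 samples, so its variance (5/6 * 1/6 ~ 0.14)
# falls below the 0.16 threshold and only the last two columns survive.
print(variance_three_shold(X_bool))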
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, f1_score
from tpot_metrics import balanced_accuracy_score
from sklearn.pipeline import make_pipeline
import itertools

dataset = sys.argv[1]

preprocessor_list = [Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(),
                     PolynomialFeatures(), RobustScaler(), StandardScaler(),
                     FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(),
                     SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(),
                     SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
                     RFE(estimator=ExtraTreesClassifier(n_estimators=100))]

# Read the data set into memory
input_data = pd.read_csv(dataset, compression='gzip', sep='\t').sample(frac=1., replace=False, random_state=42)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for (preprocessor, n_estimators, min_weight_fraction_leaf,
         max_features, criterion) in itertools.product(
                preprocessor_list,
                [10, 50, 100, 500, 1000],
                np.arange(0., 0.51, 0.05),
                [0.1, 0.25, 0.5, 0.75, 'sqrt', 'log2', None],
Example 8
 def transform(self, X):
     data = X.copy()
     selector = VarianceThreshold(threshold=self._threshold)
     selector.fit(data)
     return data[data.columns[selector.get_support(indices=True)]]
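# Minimal self-contained sketch of the same idea (the enclosing class is not
# shown above, so VarianceSelector and its default threshold are hypothetical):
import pandas as pd
from sklearn.feature_selection import VarianceThreshold


class VarianceSelector:
    def __init__(self, threshold=0.0):
        self._threshold = threshold

    def transform(self, X):
        data = X.copy()
        selector = VarianceThreshold(threshold=self._threshold)
        selector.fit(data)
        # keep the DataFrame column names instead of returning a bare array
        return data[data.columns[selector.get_support(indices=True)]]


df = pd.DataFrame({"constant": [1, 1, 1], "varying": [1, 2, 3]})
print(VarianceSelector().transform(df))  # the constant column is dropped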
Example 9
 def __init__(self, conf):
     UnsupervisedFeatureSelection.__init__(self, conf)
     self.projection = VarianceThreshold()
 def variance_thresholding(self, dataset):
     selector = VarianceThreshold(threshold=.02)
     selector.fit(dataset)
     return selector.get_support(indices=True)
Example 11
x_columns = X.columns
x_dtypes = X.dtypes
x_str = np.where(x_dtypes == "object")[0]

# convert any string columns to binary columns
X = pd.get_dummies(X, columns=x_columns[x_str])

# In[1]: Model the data

# set up cross validation for time series
tscv = TimeSeriesSplit(n_splits=5)
folds = tscv.get_n_splits(X)

# set up a machine learning pipeline
pipeline = Pipeline([
    ('var', VarianceThreshold()),
    ('scale', MinMaxScaler()),
    ('model', RandomForestRegressor(n_estimators=50)),
])

# set up the grid search
parameters = {
    'model__max_depth': [6, 9, 12, 15],
    'model__min_samples_leaf': [1, 3, 5, 7],
}
grid_search = GridSearchCV(pipeline,
                           parameters,
                           cv=tscv,  # use the TimeSeriesSplit object, not just the fold count
                           n_jobs=-1,
                           verbose=0)
Example 12
    def __init__(self,
                 data_set_file,
                 lowercase=False,
                 use_idf=False,
                 developers_dict_file=None,
                 developers_list_file=None):
        """Constructor
        
        The data in the data set file are loaded. The pre-processing 
        techniques, the feature extraction techniques and the feature 
        selection techniques to use are selected.
        
        :param data_set_file: The absolute path of the data set file.
        :type data_set_file: string.
        :param lowercase: To decide whether or not conversion to lower
        case should be applied.
        :type lowercase: boolean.
        :param use_idf: To decide whether or not the inverse document 
        frequencies of the tf-idf formula should be used.
        :type use_idf: boolean.
        :param developers_dict_file: The absolute path of a JSON file 
        containing a mapping of developers names to other strings. 
        Not implemented yet: it should be done later if needed.
        :type developers_dict_file: string.
        :param developers_list_file: The absolute path of a JSON file 
        allowing us to filter out the data set based on the names of 
        the developers. Not implemented yet: it should be done later 
        if needed.
        :type developers_list_file: string.
        """
        super().__init__(developers_dict_file, developers_list_file)
        np.random.seed(0)  # We set the seed
        self.lowercase = lowercase
        self.use_idf = use_idf

        self._pre_processing_steps = [("count", CountVectorizer( \
        lowercase=lowercase, token_pattern=r"(?u)\S+")), \
        ("tf_idf", TfidfTransformer(use_idf=use_idf, smooth_idf=False))]

        self._feature_selection_methods = [
            ("var_threshold", VarianceThreshold()),
            ("chi2", SelectPercentile(chi2)),
            ("f_classif", SelectPercentile(f_classif)),
            ("mutual_info_classif", SelectPercentile(mutual_info_classif))
        ]

        self._feature_selection_methods_params = [
            dict(var_threshold__threshold=[((
                (1 - 0)**2) / 12) * i for i in np.arange(0.01, 0.1, 0.02)]),
            dict(chi2__percentile=list(range(10, 100, 20))),
            dict(f_classif__percentile=list(range(10, 100, 20))),
            dict(mutual_info_classif__percentile=list(range(10, 100, 20)))
        ]

        self._classifiers_estimators = { \
            "Linear SVM": [("clf", LinearSVC(random_state=0))], \
            # "MultinomialNB": [("clf", MultinomialNB())],
            # "LogisticRegression": [("clf", LogisticRegression(random_state=0, class_weight="balanced", multi_class="multinomial", n_jobs=-1))] \
        }

        # Below, there is a dictionary to store the names, the
        # classifiers used, the parameters sent to the constructor of
        # the classifiers and the fitted classifiers
        self._models_cv = {}

        # Below, there is a dictionary to store the names, the pipelines
        # used, the parameters sent to the constructor of the feature
        # selection techniques
        self._rfe_cv = {
            "RFECV SVM": [RFECV, {
                "estimator": LinearSVC(random_state=0),
                "step": 0.1,
                "cv": self._tscv,
                "scoring": accuracy_mrr_scoring_object,
                "verbose": 10,
                "n_jobs": -1
            }, None], \
            # "RFECV Naive Bayes": [RFECV, {
            #     "estimator": MultinomialNB(),
            #     "step": 0.1,
            #     "cv": self._tscv,
            #     "verbose": 1,
            #     "n_jobs": -1
            # }, None]
        }

        for key, classifier_estimator in self._classifiers_estimators.items():
            for i, feature_selection_method in enumerate(
                    self._feature_selection_methods):
                self._models_cv["GridSearch " + feature_selection_method[0] + " " + key] = [GridSearchCV, { \
                    "estimator": Pipeline(self._pre_processing_steps + [feature_selection_method] + classifier_estimator), \
                    "param_grid": self._feature_selection_methods_params[i], \
                    "n_jobs": -1, \
                    "iid": False, \
                    "cv": self._tscv, \
                    "verbose": 10, \
                    "error_score": np.array([-1, -1]), \
                    "scoring": accuracy_mrr_scoring_object
                }, None]

        cleaned_results_file_name = "cleaned_feature_selection_" + \
        "experiment_results.json"
        self._cleaned_results_file_name = os.path.join( \
        self._current_dir, cleaned_results_file_name)

        self._data_set_file = os.path.join(self._current_dir, \
        data_set_file)

        log_file = os.path.join(self._current_dir, \
                                "feature_selection_experiment.log")
        logging.basicConfig(filename=log_file, filemode="w", \
                            level=logging.DEBUG)

        self._build_data_set()
Example 13
def feature_selection(X, p):
    sel = VarianceThreshold(threshold=p * (1 - p))
    print('before feature selection: {} features'.format(X.shape[1]))
    X_after_feature_selection =  sel.fit_transform(X)
    print('after feature selection: {} features'.format(X_after_feature_selection.shape[1]))
    return X_after_feature_selection,sel.get_support(indices=True)
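# Usage sketch (not part of the original snippet): with p = 0.8 the threshold
# p * (1 - p) = 0.16 equals the variance of a boolean feature that keeps one
# value in 80% of the samples.
import numpy as np
from sklearn.feature_selection import VarianceThreshold

X_demo = np.array([[0, 1, 0],
                   [0, 1, 1],
                   [0, 1, 0],
                   [0, 1, 1]])
X_sel, kept_idx = feature_selection(X_demo, 0.8)
print(kept_idx)  # only column 2 varies, so only index 2 is kept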
Example 14
def pre_data_1():
    dataset_b, dataset_w = data_read()
    dataset_train = dataset_b[:3000] + dataset_w[:3000]
    dataset_test = dataset_b[3000:] + dataset_w[3000:]
    dataset = dataset_train + dataset_test
    # first 6000 samples form the training set, the remaining 4000 the test set
    ip = []
    subject = []
    issue = []
    cipher_version = []
    label = []
    matrix = []
    for key in dataset:
        if key[-2] == 'black':
            label.append(1)
        elif key[-2] == 'white':
            label.append(0)
        else:
            label.append(0)
        ip.append(key[3])
        max_cip_version = 0
        for tem in list_string(key[-11]):
            try:
                if int(tem) > max_cip_version:
                    max_cip_version = int(tem)
            except ValueError:
                max_cip_version = -1
        cipher_version.append(max_cip_version)
        subject.append(find_first(list_string(key[53])))
        issue.append(find_first(list_string(key[54])))
        print(list_string(key[-3]))
        matrix.append(list_string(key[-3]))
    ip_ans = oh_encoding(ip)
    subject_ans = oh_encoding(subject)
    issue_ans = oh_encoding(issue)
    dataset_flow = []
    mean_list = [8, 12, 16, 20, 23, 26, 29, 32]
    from sklearn.feature_selection import VarianceThreshold
    for i in range(len(dataset)):
        feature = []
        for j in range(0, 3):
            feature.append(float(dataset[i][j]))
        for j in range(4, 51):
            feature.append(float(dataset[i][j]))
        # for j in range(4, 6):
        #     feature.append(float(dataset[i][j]))
        # for j in mean_list:
        #     feature.append(float(dataset[i][j]))
        feature.append(int(find_min(list_string(dataset[i][52]))))
        # certificate_time
        feature.append(find_self_signed(list_string(dataset[i][51])))
        # self-signed certificate
        dataset_flow.append(feature)
    from sklearn.preprocessing import MinMaxScaler

    select = VarianceThreshold(threshold=0)
    dataset_flow = select.fit_transform(dataset_flow)
    minMax = MinMaxScaler()
    dataset_flow = minMax.fit_transform(dataset_flow)
    dataset_mix = []
    for i in range(len(dataset)):
        dataset_mix.append(np.hstack((dataset_flow[i], subject_ans[i])))
        # dataset_mix.append(list(dataset_flow[i]) + (list(issue_ans[i])) + list(subject_ans[i]))
    print("dataset is formed by {}".format("mixed"))
    dataset_mix = select.fit_transform(dataset_mix)
Example 15
x_train,x_test,y_train,y_test = train_test_split(data.drop("status",axis=1),data["status"],random_state=2018,test_size=0.3)

# one-hot encode the non-numeric fields
x_train = x_train.to_dict(orient="records")
x_test = x_test.to_dict(orient="records")
trans = DictVectorizer()
x_train = trans.fit_transform(x_train)
x_test = trans.transform(x_test)

# standardization
trans = StandardScaler(with_mean=False)
x_train = trans.fit_transform(x_train)
x_test = trans.transform(x_test)

# filter out low-variance features
trans = VarianceThreshold(threshold=1)
x_train = trans.fit_transform(x_train)
x_test = trans.transform(x_test)
print(x_train.shape)

estimator = RandomForestClassifier(n_estimators=200,max_depth=80)
# estimator = GradientBoostingClassifier(random_state=10)
# estimator = KNeighborsClassifier(n_neighbors=50)
# estimator = LogisticRegression()
# estimator = XGBClassifier(learning_rate=0.01,
#                       n_estimators=200,           # number of trees
#                       max_depth=30,               # tree depth
#                       min_child_weight = 1,      # minimum leaf weight
#                       gamma=0.,                  # penalty on the number of leaves
#                       subsample=1,               # fraction of samples used per tree
#                       colsample_btree=1,         # fraction of features used per tree
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.9061619988129817
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=RidgeCV()), VarianceThreshold(threshold=0.001),
    GradientBoostingRegressor(alpha=0.8,
                              learning_rate=0.1,
                              loss="huber",
                              max_depth=4,
                              max_features=1.0,
                              min_samples_leaf=18,
                              min_samples_split=3,
                              n_estimators=100,
                              subsample=0.6500000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example 17
                'optics': OPTICS()})
#methods.update({'dbscan_e{}'.format(i): DBSCAN(eps=i/10) for i in range(1, 10, 1)})

metrics = {'silhouette_score': silhouette_score,
           'davies_bouldin_score': davies_bouldin_score,
           'calinski_harabasz_score': calinski_harabasz_score}

clusters = {k: [] for k in methods.keys()}

metric_measures = pd.DataFrame(columns=list(methods.keys()), index=list(metrics.keys()))

data = pd.read_csv('player_processed.csv', index_col=0).dropna(how="all").fillna(0)


scaler = RobustScaler()
selector = VarianceThreshold(MIN_STD)
reductor = SparseRandomProjection(N_FEATURES, random_state=1)

scaled = scaler.fit_transform(data)
scaled[scaled > 10] = 10
scaled[scaled < -10] = -10

data_scaled = pd.DataFrame(scaled,
                           columns=data.columns,
                           index=data.index)
data_scaled.fillna(0, inplace=True)
data_selected = selector.fit_transform(data_scaled)
data_selected = pd.DataFrame(data_selected,
                             columns=data_scaled.columns[selector.get_support()],
                             index=data.index)
corr_matrix = data_selected.corr().abs()
Example 18
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          sep='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    VarianceThreshold(threshold=0.24),
    ExtraTreesClassifier(criterion="entropy",
                         max_features=0.16,
                         n_estimators=500))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
Example 19
        else:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, labels[i], color = 'g', ha = 'center', va = 'center')


wine = datasets.load_wine()
X = wine.data
y = wine.target
#In general a good idea is to scale the data
scaler = StandardScaler()
scaler.fit(X)
X=scaler.transform(X)    

pca = PCA()
ica = FastICA()
rp = GaussianRandomProjection(n_components=8)
fs = VarianceThreshold(threshold=0.1)

x_pca = pca.fit_transform(X)
x_ica = ica.fit_transform(X)
x_rp = rp.fit_transform(X)
x_fs = fs.fit_transform(X)

fig = plt.figure()
# plt.xlim(-1,1)
# plt.ylim(-1,1)
plt.xlabel("PC{}".format(1))
plt.ylabel("PC{}".format(2))
plt.grid()

#Call the function. Use only the 2 PCs.
myplot(x_pca[:,0:2],np.transpose(pca.components_[0:2, :]))
Example 20
def main():
    result = {}
    for _sym in SYMBOLS:
        dataset = 'data/result/datasets/csv/{}.csv'.format(_sym)
        df = pd.read_csv(dataset,
                         sep=',',
                         encoding='utf-8',
                         index_col='Date',
                         parse_dates=True)
        df = df.replace([np.inf, -np.inf], np.nan).dropna()
        X = df[df.columns.difference(['target', 'target_pct', 'target_label'])]
        y = df['target']
        #print("======"+_sym+"======")
        #print(X.info())

        # Variance Threshold
        sel = VarianceThreshold()
        sel.fit_transform(X)
        sup = sel.get_support()
        X = X[[name for flag, name in zip(sup, X.columns) if flag]]
        ## SelectKBest
        sel = SelectKBest(chi2, k=30)
        sX = scale(X, scaler='minmax')
        sel.fit_transform(sX, y)
        sup = sel.get_support()
        sX = sX[[name for flag, name in zip(sup, sX.columns) if flag]]

        ## Recursive Feature Elimination
        # Create the RFE object and compute a cross-validated score.
        # The "accuracy" scoring is proportional to the number of correct
        # classifications
        # model = SVC(kernel="linear")
        # rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(2), scoring='accuracy', n_jobs=-1, verbose=1)
        # rfecv.fit(X, y)
        # X = X[[name for flag, name in zip(rfecv.support_, X.columns) if flag]]
        ### Genetic
        # estimator = MLPClassifier(**{
        #     'hidden_layer_sizes': (10, 4),
        #     'solver': 'lbfgs',
        #     'learning_rate': 'constant',
        #     'learning_rate_init': 0.001,
        #     'activation': 'logistic'
        # })
        estimator = LogisticRegression(solver="liblinear", multi_class="ovr")
        gscv = GeneticSelectionCV(estimator,
                                  cv=2,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=30,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=80,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
        gscv = gscv.fit(X, y)
        X = X[[name for flag, name in zip(gscv.support_, X.columns) if flag]]

        #print(X.columns)

        # print("[%s] Optimal number of features : %d Set: %s" % (_sym, rfecv.n_features_, ', '.join(X.columns)))
        # plt.figure()
        # plt.title(_sym + ' SVC RFECV K=2')
        # plt.xlabel("Number of features selected")
        # plt.ylabel("Cross validation score (nb of correct classifications)")
        # plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
        # plt.show()

        logger.info("{}: {}".format(_sym, X.columns))
        result[_sym] = {
            'dataset': dataset,
            'columns_genetic_lr_30': [c for c in X.columns],
            'columns_kbest_30': [c for c in sX.columns]
        }
    return result
Example 21
    pd_train_dataset_reduced['TARGET'],
    test_size=0.3,
    random_state=0)

print("\t*) TRAIN DATASET DIMENSION")
print(X_train.shape)
print("\t*) TEST DATASET DIMENSION")
print(X_test.shape)

### 3. Using variance threshold from sklearn
'''
Variance threshold from sklearn is a simple baseline approach to feature selection.
It removes all features whose variance doesn't meet some threshold. By default, it removes all
zero-variance features, i.e., features that have the same value in all samples.
'''
sel = VarianceThreshold(threshold=0)
sel.fit(X_train)  # fit finds the features with zero variance

# get_support is a boolean vector that indicates which features are retained
# if we sum over get_support, we get the number of features that are not constant
print(
    "*) Number of features that are NOT CONSTANT using get_support() on VarianceThreshold"
)
print(sum(sel.get_support()))

# another way of finding non-constant features is like this
print(
    "*) Number of features that are NOT CONSTANT using get_support() on train columns dataset"
)
print(len(X_train.columns[sel.get_support()]))
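# Follow-up sketch (not in the original snippet): actually drop the constant
# columns from both splits, keeping the DataFrame column names.
kept_columns = X_train.columns[sel.get_support()]
X_train_reduced = X_train[kept_columns]
X_test_reduced = X_test[kept_columns]
print(X_train_reduced.shape, X_test_reduced.shape)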
def main():
    # Get data
    X_train, y_train_log, X_test, id_test = get_input(debug)

    # Remove constant columns
    variance_checker = VarianceThreshold(threshold=0.0)
    xtrain = variance_checker.fit_transform(X_train)
    xtest = variance_checker.transform(X_test)

    # Remove duplicate columns
    unique_transformer = UniqueTransformer()
    unique_transformer.fit(xtrain)
    xtrain = unique_transformer.transform(xtrain)
    xtest = unique_transformer.transform(xtest)

    # Define feature union
    data_union = FeatureUnion([
        ('pca', PCA(n_components=100)),
        ('ct-2', ClassifierTransformer(get_rfc(), n_classes=2, cv=5)),
        ('ct-3', ClassifierTransformer(get_rfc(), n_classes=3, cv=5)),
        ('ct-4', ClassifierTransformer(get_rfc(), n_classes=4, cv=5)),
        ('ct-5', ClassifierTransformer(get_rfc(), n_classes=5, cv=5)),
        ('st', StatsTransformer(verbose=2))
    ])

    # Transform data
    data_union.fit(X=xtrain, y=y_train_log)
    print('\nCreating processed training set...\n')
    train_data = data_union.transform(xtrain)
    print('\nCreating processed test set...\n')
    test_data = data_union.transform(xtest)

    # Scale data
    xdata = np.concatenate([train_data, test_data], axis=0)
    scaler = StandardScaler()
    xdata_scaled = scaler.fit_transform(X=xdata)
    train_scaled = xdata_scaled[:len(X_train), :]
    test_scaled = xdata_scaled[len(X_train):, :]

    # Load KLIEP importance weights
    if debug:
        cs_path = './covariate_shift/debug_cs_weights_v1/'
    else:
        cs_path = './covariate_shift/full_cs_weights_v1/'
    cs_temp = '0_width%s_numk%s.pickle'%(gw_val, num_kernels)

    weight_path = cs_path + cs_temp
    if os.path.exists(weight_path):
        kliep_set = load_pickle(weight_path)
    weights = np.array(kliep_set['weights'])

    # Train XGBoost Regressor
    # Custom objective function for modified ordinary least squares
    def kliep_objective(y_true, y_pred):
        # Get split indexes
        split_list = copy.deepcopy(kf_list)
        target_idx = split_list[cv_counter][0]
        # Calculate 1st and 2nd derivatives
        grad = np.multiply(weights[target_idx], np.subtract(y_pred, y_true))
        hess = weights[target_idx]
        return grad, hess

    # Custom evaluation function for RMSLE
    def rmsle_eval(y_predicted, y_true):
        labels = y_true.get_label()
        pred = np.log1p(y_predicted)
        real = np.log1p(labels)
        err = np.subtract(pred, real)
        return 'rmsle', np.sqrt(np.mean(np.square(err)))

    # XGBoost regressor parameters
    xgb_params = {'n_estimators': 1000,
                  'objective': kliep_objective,
                  'booster': 'gbtree',
                  'learning_rate': 0.02,
                  'max_depth': 22,
                  'min_child_weight': 57,
                  'gamma' : 1.45,
                  'alpha': 0.0,  # No regularization
                  'lambda': 0.0,  # No regularization
                  'subsample': 0.67,
                  'colsample_bytree': 0.054,
                  'colsample_bylevel': 0.50,
                  'n_jobs': -1,
                  'random_state': 456}
    # Fitting XGB Regressor parameters
    fit_params = {'early_stopping_rounds': 15,
                  'eval_metric': rmsle_eval,
                  'verbose': False}

    # Define KFold split
    kf_split = KFold(n_splits=cv_val, shuffle=False, random_state=random_state).split(train_scaled, y_train_log)
    kf_list = list(kf_split)

    # Train xgboost regressor
    reg_kliep = XGBRegressorCV_KLIEP(xgb_params=xgb_params, fit_params=fit_params)
    reg_kliep.fit(X=train_scaled, y=y_train_log, kf_list=kf_list)
    # Get predictions
    y_pred_log = reg_kliep.predict(X=test_scaled)
    y_pred = np.expm1(y_pred_log)
    # Format submission
    submission_path = '../submissions/xgb_kliep_1v0_submit.csv'
    submission = pd.DataFrame()
    submission['ID'] = id_test
    submission['target'] = y_pred
    # Save submissions
    submission.to_csv(submission_path, index=False)
Example 23
import numpy as np
from sklearn.datasets import load_iris

# load the iris dataset
data = load_iris()
# features and targets
X = data['data']
y = data['target']

# ## code begins
n, d = X.shape  # number of samples, number of features
means = np.mean(X, axis=0)  # per-feature means of X
stds = np.std(X, axis=0)  # per-feature standard deviations of X
# ## code ends
print('Number of samples: ' + str(n) + '\nNumber of features: ' + str(d))
print('Per-feature means:\n')
print(means)
print('Per-feature variances:\n')
print(stds**2)
# feature selection based on variance
from sklearn.feature_selection import VarianceThreshold

# ## code begins
sel = VarianceThreshold(threshold=0.6)  # use 0.6 as the variance threshold
X_new = sel.fit_transform(X)  # features after selection
# ## code ends

stds_new = np.std(X_new, axis=0)
print('Per-feature variances after selection:\n')
print(stds_new**2)
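# Follow-up sketch (not in the original exercise): map the retained columns
# back to the iris feature names via get_support.
kept = sel.get_support(indices=True)
print('Retained features:')
print([data['feature_names'][i] for i in kept])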
Example 24
pipeline_categorical = Pipeline([
    ('selector_categorical', ColumnExtractor(columns=categorical_columns)),
    ('imputer_missing_values',
     SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('ClassicMultipleBinarizer', ClassicMultipleBinarizer())
])
#%% union of pipelines

pipeline_features = FeatureUnion([('pipeline_numerical', pipeline_numerical),
                                  ('pipeline_categorical',
                                   pipeline_categorical)])
#%% pipeline union

pipeline_union = Pipeline([
    ('preprocessed_data', pipeline_features),
    ('feature_selection', VarianceThreshold()),
    ('feature_extraction', PCA(n_components=20)),  #11)),
    ('scaler', StandardScaler())
])

data_procesada = pipeline_union.fit_transform(df)

#%% use  RandomizedSearchCV and select the best estimator

param_dist_random = {
    "max_depth": [3, None],
    "max_features": sp_randint(1, 20),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
Example 25
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# load data
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
ids_tr = train.pop('id').values
ids_te = test.pop('id').values
magic_tr = train.pop('wheezy-copper-turtle-magic').values
magic_te = test.pop('wheezy-copper-turtle-magic').values
target = train.pop('target').values
train = train.values
test = test.values

# informative columns for each magic value
vt = VarianceThreshold(threshold=1.5)
infomative_cols = []
for i in range(512):
    vt.fit(train[magic_tr == i])
    infomative_cols.append(vt.get_support(indices=True))

### Step-1 ###
oof_all = []
pred_all = []
for n in range(1, MAX_COMPONENTS + 1):
    oof_n = np.zeros(len(train))
    pred_n = np.zeros(len(test))
    gmm0 = GaussianMixture(n_components=n,
                           covariance_type='full',
                           random_state=RANDOM_SEED)
    gmm1 = GaussianMixture(n_components=n,
Example 26
        total = total + float(tp + tn) / (tp + tn + fp + fn) * 100
    return total / len(labels)


# train_text,train_classfi_number,train_classfi,train_feature_name = getTargetData("Breast_train.data")
# test_text,test_classfi_number,test_classfi,test_feature_name = getTargetData("Breast_test.data")

# for i in range(len(train_text)):
#         for j in range(len(train_text[0])):
#             train_text[i][j] = float(train_text[i][j])
#             print type(train_text[i][j] )

# selector = VarianceThreshold()
# data = selector.fit_transform(train_text)
# index = selector.get_support(True)

# train = data
# test = []
# df = pd.DataFrame(test_text)
# for line in index:
# 	test.append(df[line])

X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
selector = VarianceThreshold()
selector.fit_transform(X)
print(selector.get_support())
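# Columns 0 and 3 are constant, so the support mask above is
# [False, True, True, False] and fit_transform keeps only columns 1 and 2.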
# clf = DecisionTreeClassifier(max_depth=4)
# clf = SVC(kernel='rbf', probability=True)
# clf.fit(data, train_classfi)
# result = clf.predict(test_text)
Example 27
X_positive = X[:, [-2]]
#------------------------------------------------------------------------------

from minepy import MINE
m = MINE()
save_columns = []
for i in range(0, len(X[0])):
    m.compute_score(X[:, i], y)
    #print(i, m.mic())
    if m.mic() >= 0.1:
        save_columns.append(i)

X = X[:, save_columns]
df = pd.DataFrame(X)
from sklearn.feature_selection import VarianceThreshold
val_selection = VarianceThreshold(threshold=(0.1 * (1 - 0.1)))
X = val_selection.fit_transform(X)
X = np.hstack((X_positive, X))

#------------------------------------------------------------------------------

#----------------------------------- split the dataset ----------------------------------
X_old = X[:original_len, :]
X_old = np.delete(X_old, [220, 312], axis=0)
y_old = y[:original_len]
y_old = np.delete(y_old, [220, 312], axis=0)
X_new = X[original_len:, :]
#------------------------------------------------------------------------------

#------------------------------------ feature scaling -----------------------------------
from sklearn.preprocessing import StandardScaler
Example 28
 def variance(self, X, threshold):
     from sklearn.feature_selection import VarianceThreshold
     sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
     sel_var = sel.fit_transform(X)
     X = self.X[X.columns[sel.get_support(indices=True)]]
     return X
print(I.shape)

X_train = Data
y_train = Targets

# X_train, X_test, y_train, y_test = \
#     train_test_split(Data, Targets, test_size=0.33, random_state=42)

# cut.make_cut(X_test)
# cut_test = cut.cut
#
# filter.calculate_prewitt(cut_test)
# desc_test = filter.flatten(filter.transformed)

pipe = Pipeline([('cut', CenterCutCubes(size_cubes=5)),
                 ('scl', StandardScaler()), ('var', VarianceThreshold(100)),
                 ('pca', PCA()), ('clf', SVR(kernel='linear'))])
param_range_svm = [0.01, 0.1, 1]
param_range_cut_left = range(50, 110, 5)
param_range_cut_right = [80, 100, 120, 150]
param_range_size_cube = [1, 3, 5, 10]
gs = GridSearchCV(estimator=pipe,
                  param_grid=[{
                      'cut__size_cubes': [5],
                      'cut__y1': [80],
                      'cut__x1': [50],
                      'cut__z1': [50],
                      'cut__x2': [120],
                      'cut__y2': [150],
                      'cut__z2': [100],
                      'clf__C': [0.1],
Example 30
def pre_data(flag, type):
    if type == 'flow':
        dataset_b, dataset_w, dataset_t = data_read_flow()
    else:
        dataset_b, dataset_w, dataset_t = data_read()

    dataset = np.vstack((dataset_b, dataset_w, dataset_t))
    print(dataset.shape)
    # first 6000 samples form the training set, the remaining 4000 the test set
    time = []
    # 6-21
    payload = []
    # 22-34
    tcp_flag = []
    # 35-42
    cipher = []
    # subject,issue, certificate_time. self_signed, cipher_num(58), cipher(61) ,cipher_content_ratio(63) cipher_version
    speed = []
    # 43 - 50
    ip = []
    subject = []
    issue = []
    cipher_version = []
    label = []
    matrix = []
    flow = []

    # 65
    bitFre = []
    entropy = []
    cipher_bifFre = []
    cipher_entropy = []
    label_e = []
    for key in dataset:

        flow_one = []
        for j in range(0, 3):
            flow_one.append(float(key[j]))
        for j in range(4, 51):
            flow_one.append(float(key[j]))
        certificate_time = int(find_min((key[52])))
        # certificate_time
        self_signed = find_self_signed((key[51]))
        # self-signed certificate
        flow_one.append(certificate_time)
        flow_one.append(self_signed)
        flow.append(flow_one)

        cipher_one = []
        cipher_one.append(key[58])
        cipher_one.append(key[61])
        cipher_one.append(key[63])

        time.append(key[6:22])
        tcp_flag.append(key[35:43])
        payload.append(key[22:35])
        speed.append(key[43:51])
        if key[-2] == 'black':
            label.append(1)
        elif key[-2] == 'white':
            label.append(0)
        else:
            label.append(0)
        ip.append(key[3])
        max_cip_version = 0
        for tem in key[-11]:
            try:
                if int(tem) > max_cip_version:
                    max_cip_version = int(tem)
            except ValueError:
                max_cip_version = -1

        cipher_version.append(max_cip_version)
        subject_one = Find_first(key[53])
        issue_one = Find_first(key[54])
        cipher_one.append(max_cip_version)

        if key[63] != 0:
            bitFre.append(key[65])
            entropy.append(key[66:70])
            cipher_bifFre.append(key[71])
            cipher_entropy.append(key[73:76])
            if key[-2] == 'black':
                label_e.append(1)
            else:
                label_e.append(0)

        subject.append(subject_one)
        issue.append(issue_one)

        cipher.append(cipher_one)

    ip_ans = oh_encoding(ip)
    subject_ans = oh_encoding(subject)
    issue_ans = oh_encoding(issue)

    cipher = np.hstack((cipher, subject_ans, issue_ans))

    mean_list = [8, 12, 16, 20, 23, 26, 29, 32]
    from sklearn.feature_selection import VarianceThreshold

    from sklearn.preprocessing import MinMaxScaler

    select = VarianceThreshold(threshold=0)
    dataset_flow = select.fit_transform(flow)
    minMax = MinMaxScaler()
    dataset_flow = minMax.fit_transform(dataset_flow)

    # dataset_mix = (np.hstack((flow, subject_ans, issue_ans, matrix)))
    # dataset_mix.append(list(dataset_flow[i]) + (list(issue_ans[i])) + list(subject_ans[i]))
    print("dataset is formed by {}".format(flag))
    ratio = len(dataset_b) + len(dataset_w)
    if flag == 'flow':
        return dataset_flow[:ratio], dataset_flow[
            ratio:], label[:ratio], label[ratio:]
    elif flag == 'subject':
        return subject_ans[:ratio], subject_ans[ratio:], label[:ratio], label[
            ratio:]
    elif flag == 'issue':
        return issue_ans[:ratio], issue_ans[ratio:], label[:ratio], label[
            ratio:]
    elif flag == 'matrix':
        return matrix[:ratio], matrix[ratio:], label[:ratio], label[ratio:]
    elif flag == 'payload':
        return payload[:ratio], payload[ratio:], label[:ratio], label[ratio:]
    elif flag == 'time':
        return time[:ratio], time[ratio:], label[:ratio], label[ratio:]
    elif flag == 'cipher':
        return cipher[:ratio], cipher[ratio:], label[:ratio], label[ratio:]
    elif flag == 'flag':
        return tcp_flag[:ratio], tcp_flag[ratio:], label[:ratio], label[ratio:]
    elif flag == 'speed':
        return speed[:ratio], speed[ratio:], label[:ratio], label[ratio:]
    elif flag == 'bitFre':
        return bitFre, label_e
    elif flag == 'entropy':
        return entropy, label_e
    elif flag == 'cipher_entropy':
        return cipher_entropy, label_e
    elif flag == 'cipher_bitFre':
        return cipher_bifFre, label_e
    else:
        print("select wrong")