def test_scaler_without_centering():
    """Check Scaler(with_mean=False) and scale(with_mean=False) on dense data.

    The first feature is constant zero, so scaling must not produce NaN for
    it. The remaining features get unit standard deviation while their means
    are left uncentered.
    """
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = Scaler(with_mean=False)

    def check_scaled(X_scaled):
        # Scaled values: no NaN, uncentered means, unit std (0 for the
        # constant feature).
        assert not np.any(np.isnan(X_scaled))
        assert_array_almost_equal(
            X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
        assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
        # The scaled array must be a new object, not X itself
        assert X_scaled is not X

        # Round trip through the fitted scaler gives back the input values
        # in yet another array.
        X_scaled_back = scaler.inverse_transform(X_scaled)
        assert X_scaled_back is not X
        assert X_scaled_back is not X_scaled
        assert_array_almost_equal(X_scaled_back, X)

    # estimator API
    check_scaled(scaler.fit(X).transform(X, copy=True))
    # function API; inverted with the scaler fitted above
    check_scaled(scale(X, with_mean=False))
def test_regressors_train():
    """Smoke-test every regressor from all_estimators() on the Boston data.

    Each regressor must reject a target that is one sample short and, once
    fitted on the standardized data, reach a training score above 0.5.
    """
    regressors = []
    for name, E in all_estimators():
        if issubclass(E, RegressorMixin):
            regressors.append((name, E))

    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=0)
    # TODO: test with intercept
    # TODO: test with multiple responses
    X = Scaler().fit_transform(X)
    y = Scaler().fit_transform(y)

    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue

        # record (and thereby silence) deprecation warnings on construction
        with warnings.catch_warnings(record=True):
            reg = Reg()
        if hasattr(reg, 'alpha'):
            reg.set_params(alpha=0.01)

        # X and y with different numbers of samples must be rejected
        assert_raises(ValueError, reg.fit, X, y[:-1])

        reg.fit(X, y)
        reg.predict(X)
        assert_greater(reg.score(X, y), 0.5)
def run_svm(svc,X):
    """Return ``svc.predict`` on a standardized copy of ``X``.

    A fresh ``Scaler`` is fitted on the copy of ``X`` itself; the caller's
    array is not passed to the scaler.
    """
    standardized = Scaler().fit_transform(X.copy())
    return svc.predict(standardized)
def test_scaler_1d():
    """Check that 1D input (array or list) is scaled to zero mean, unit std."""

    def assert_standardized(values):
        assert_array_almost_equal(values.mean(axis=0), 0.0)
        assert_array_almost_equal(values.std(axis=0), 1.0)

    rng = np.random.RandomState(0)
    X = rng.randn(5)
    # copy=False is requested below, so keep a pristine copy to compare with
    X_orig_copy = X.copy()

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_standardized(X_scaled)

    # inverse transform must recover the original values
    assert_array_almost_equal(scaler.inverse_transform(X_scaled), X_orig_copy)

    # same checks starting from a plain 1D list
    X = [0., 1., 2, 0.4, 1.]
    assert_standardized(Scaler().fit(X).transform(X, copy=False))
    assert_standardized(scale(X))
def test_scaler_without_copy():
    """Check that fitting with copy=False leaves dense and CSR input unchanged."""
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sp.csr_matrix(X)

    # dense input
    dense_snapshot = X.copy()
    Scaler(copy=False).fit(X)
    assert_array_equal(X, dense_snapshot)

    # sparse input (centering disabled for the sparse case)
    sparse_snapshot = X_csr.copy()
    Scaler(with_mean=False, copy=False).fit(X_csr)
    assert_array_equal(X_csr.toarray(), sparse_snapshot.toarray())
def _pre_fit(self, X, y):
    """Prepare targets, kernel dictionary and sparsity level before fitting.

    Parameters
    ----------
    X : data handed to ``pairwise_kernels`` (and to ``select_components``
        when components have to be selected).
    y : targets; replaced by their ``Scaler``-transformed version when
        ``self.scale_y`` is set.

    Returns
    -------
    tuple ``(n_nonzero_coefs, K, y, norms)`` where ``K`` is the (optionally
    scaled) kernel dictionary and ``norms`` holds the Euclidean norm of each
    column of ``K``.

    Raises
    ------
    AttributeError
        If the resolved ``n_nonzero_coefs`` exceeds the number of components.

    Side effects: sets ``self.components_`` and, depending on the options,
    ``self.y_scaler_`` and ``self.scaler_``.
    """
    random_state = check_random_state(self.random_state)

    if self.scale_y:
        self.y_scaler_ = Scaler(copy=True).fit(y)
        y = self.y_scaler_.transform(y)

    precomputed = self.metric == "precomputed"
    if precomputed:
        self.components_ = None
        n_components = X.shape[1]
    else:
        if self.init_components is None:
            if self.verbose:
                print("Selecting components...")
            self.components_ = select_components(X, y,
                                                 self.n_components,
                                                 random_state=random_state)
        else:
            self.components_ = self.init_components
        n_components = self.components_.shape[0]

    # A value in (0, 1] is a fraction of the number of components.
    n_nonzero_coefs = self.n_nonzero_coefs
    if 0 < n_nonzero_coefs <= 1:
        n_nonzero_coefs = int(n_nonzero_coefs * n_components)
    n_nonzero_coefs = int(n_nonzero_coefs)

    if n_nonzero_coefs > n_components:
        raise AttributeError("n_nonzero_coefs cannot be bigger than "
                             "n_components.")

    # Every print below takes one pre-formatted string so the call is valid
    # and prints the same text on both Python 2 and Python 3.
    if self.verbose:
        print("Computing dictionary...")
    start = time.time()
    K = pairwise_kernels(X, self.components_, metric=self.metric,
                         filter_params=True, n_jobs=self.n_jobs,
                         **self._kernel_params())
    if self.verbose:
        print("Done in %s seconds" % (time.time() - start))

    if self.scale:
        if self.verbose:
            print("Scaling dictionary")
        start = time.time()
        # Copy only when the metric is "precomputed"; otherwise K is scaled
        # with copy=False.
        self.scaler_ = Scaler(copy=precomputed)
        K = self.scaler_.fit_transform(K)
        if self.verbose:
            print("Done in %s seconds" % (time.time() - start))

    # FIXME: this allocates a lot of intermediary memory
    norms = np.sqrt(np.sum(K ** 2, axis=0))

    return n_nonzero_coefs, K, y, norms
def test_scale_sparse_with_mean_raise_exception():
    """Check that centering (with_mean=True) is refused for sparse input."""
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X_csr = sp.csr_matrix(X)

    # direct calls on sparse data: function API and estimator fit
    assert_raises(ValueError, scale, X_csr, with_mean=True)
    assert_raises(ValueError, Scaler(with_mean=True).fit, X_csr)

    # a scaler fitted on dense data must still refuse sparse input in
    # transform and inverse_transform
    dense_fitted = Scaler(with_mean=True).fit(X)
    assert_raises(ValueError, dense_fitted.transform, X_csr)

    X_transformed_csr = sp.csr_matrix(dense_fitted.transform(X))
    assert_raises(ValueError, dense_fitted.inverse_transform,
                  X_transformed_csr)
def getPreparedData():
    """Load the data, encode categorical columns and scale everything.

    Each listed column is passed through its ``*_to_numeric`` converter,
    then the whole frame is transformed by ``Scaler(feature_range=(0, 1))``.

    Returns the scaled data as a NumPy array.
    """
    data = getData()
    attributes = data.columns

    # convert the categorical attributes to integer codes
    converters = [
        ('school', school_to_numeric),
        ('sex', sex_to_numeric),
        ('address', address_to_numeric),
        ('famsize', famsize_to_numeric),
        ('Pstatus', Pstatus_to_numeric),
        ('Mjob', job_to_numeric),
        ('Fjob', job_to_numeric),
        ('reason', reason_to_numeric),
        ('guardian', guardian_to_numeric),
        ('schoolsup', yesno_to_numeric),
        ('famsup', yesno_to_numeric),
        ('paid', yesno_to_numeric),
        ('activities', yesno_to_numeric),
        ('nursery', yesno_to_numeric),
        ('higher', yesno_to_numeric),
        ('internet', yesno_to_numeric),
        ('romantic', yesno_to_numeric),
    ]
    for column, to_numeric in converters:
        data[column] = data[column].apply(to_numeric)

    # scale all data into the interval [0, 1]
    scaler = Scaler(feature_range=(0, 1)).fit(data)
    scaled = pd.DataFrame(scaler.transform(data), columns=attributes)
    return np.array(scaled)
def test_fit_transform():
    """Check that fit(X).transform(X) equals fit_transform(X)."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))

    for Transformer in (Scaler, Normalizer, Binarizer):
        obj = Transformer()
        two_step = obj.fit(X).transform(X)
        one_step = obj.fit_transform(X)
        assert_array_equal(two_step, one_step)
def cluster(playlist):
    """Cluster the rows of ``'Total <playlist>.csv'`` with AffinityPropagation.

    Columns that are entirely NaN and rows containing any NaN are dropped
    first. The remaining feature columns are standardized and clustered, and
    the frame with a 1-based ``cluster`` column plus the restored ``id``,
    ``mode``, ``artist`` and ``duration_ms`` columns is written to
    ``clustered.csv``.

    Returns the number of distinct labels, not counting the label -1.
    """
    frame = pd.read_csv('Total ' + playlist + '.csv')
    frame = frame.dropna(axis=1, how='all').dropna(axis=0, how='any')

    # Remember the columns that are written back after clustering.
    ids = frame['id']
    modes = frame['mode']
    durations = frame['duration_ms']
    artists = frame['artist']

    frame = frame.drop(
        columns=['track', 'album_id', 'artist', 'id', 'mode'])
    features = Scaler().fit_transform(frame.values)

    clusterer = AffinityPropagation(random_state=None, preference=-500)
    labels = clusterer.fit_predict(features)

    distinct = set(labels)
    n_clusters = len(distinct) - (1 if -1 in labels else 0)

    # Stored as a single-column 2D array, shifted so labels start at 1
    # (a -1 label becomes 0).
    frame['cluster'] = labels.reshape(len(labels), 1) + 1
    frame['id'] = ids
    frame['mode'] = modes
    frame['artist'] = artists
    frame['duration_ms'] = durations
    frame.to_csv('clustered.csv', index=False)

    return n_clusters
def cluster(playlist):
    """Cluster ``'Total <playlist>.csv'`` with agglomerative clustering.

    Columns that are entirely NaN and rows containing any NaN are dropped.
    Identifier and categorical columns are set aside, the remaining features
    are standardized and clustered with ``distance_threshold=35``, and the
    frame with a 1-based ``cluster`` column plus the restored columns is
    written to ``clustered.csv``.

    Returns ``(n_clusters, score)`` where ``score`` is the silhouette score
    of the labelling on the standardized features.
    """
    frame = pd.read_csv('Total ' + playlist + '.csv')
    frame = frame.dropna(axis=1, how='all').dropna(axis=0, how='any')

    # Columns excluded from the feature matrix and written back afterwards.
    ids = frame['id']
    modes = frame['mode']
    durations = frame['duration_ms']
    artists = frame['artist']
    keys = frame['key']
    signatures = frame['time_signature']

    dropped = [
        'track', 'album_id', 'artist', 'artist_id', 'id', 'mode',
        'duration_ms', 'key', 'time_signature'
    ]
    frame = frame.drop(columns=dropped)
    features = Scaler().fit_transform(frame.values)

    clusterer = AgglomerativeClustering(n_clusters=None,
                                        distance_threshold=35)
    labels = clusterer.fit_predict(features)
    score = silhouette_score(features, labels)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

    # Stored as a single-column 2D array, shifted so labels start at 1.
    frame['cluster'] = labels.reshape(len(labels), 1) + 1
    frame['id'] = ids
    frame['mode'] = modes
    frame['artist'] = artists
    frame['duration_ms'] = durations
    frame['key'] = keys
    frame['time_signature'] = signatures
    frame.to_csv('clustered.csv', index=False)

    return n_clusters, score
def test_pipeline_methods_preprocessing_svm():
    """Test the various methods of the pipeline (preprocessing + svm).

    One pipeline is built per preprocessing step (Scaler, then whitened
    RandomizedPCA), and the output shapes of its prediction methods are
    checked on the iris data.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = Scaler()
    pca = RandomizedPCA(n_components=2, whiten=True)
    clf = SVC(probability=True)

    for preprocessing in [scaler, pca]:
        # Use the loop variable: hard-wiring `scaler` here would build the
        # same pipeline twice and never exercise the PCA step.
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples, ))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
def data_to_kernels(tr_data, te_data):
    """Build linear train and test kernels from normalized data.

    Both sets are standardized with a ``Scaler`` fitted on ``tr_data``, then
    passed through ``power_normalize(., 0.5)`` and ``L2_normalize``.

    Returns ``(tr_kernel, te_kernel)``: the train-vs-train and test-vs-train
    dot-product matrices.
    """
    scaler = Scaler(copy=False)
    # Keep the returned arrays rather than relying on copy=False to modify
    # the inputs in place: if the scaler cannot work in place on the given
    # input, the data would otherwise stay silently unscaled.
    tr_data = scaler.fit_transform(tr_data)
    tr_data = power_normalize(tr_data, 0.5)
    tr_data = L2_normalize(tr_data)

    te_data = scaler.transform(te_data)
    te_data = power_normalize(te_data, 0.5)
    te_data = L2_normalize(te_data)

    tr_kernel = np.dot(tr_data, tr_data.T)
    te_kernel = np.dot(te_data, tr_data.T)

    return tr_kernel, te_kernel
def dimension(fnames, fp='ECFP', alg='PCA', maximum=int(1e5), ref='GPCR'):
    """Project molecules from several files onto two dimensions.

    Parameters
    ----------
    fnames : iterable of table file paths; each must provide the columns
        ``Smiles``, ``VALID`` and ``DESIRE``.
    fp : 'similarity' for Tanimoto similarities against the desired
        molecules of the first file, 'ECFP' for ``Predictor.calc_ecfp``,
        anything else for ``Predictor.calc_physchem``.
    alg : 'PCA' selects PCA, anything else TSNE.
    maximum : files with more valid rows than this are randomly sampled
        down to it (``None`` disables sampling).
    ref : files whose name does not contain this substring are restricted
        to rows with ``DESIRE == True``.

    Returns
    -------
    ``(df, ratio)`` for PCA, where ``ratio`` holds the explained variance
    ratios of the two components; just ``df`` otherwise. ``df`` carries the
    per-file ``LABEL`` and the ``X``/``Y`` coordinates.
    """
    frames = []
    for i, fname in enumerate(fnames):
        sub = pd.read_table(fname).dropna(subset=['Smiles'])
        sub = sub[sub.VALID == True]
        if maximum is not None and len(sub) > maximum:
            sub = sub.sample(maximum)
        if ref not in fname:
            sub = sub[sub.DESIRE == True]
        sub = sub.drop_duplicates(subset='Smiles')
        sub['LABEL'] = i
        frames.append(sub)
    # Concatenate once: DataFrame.append re-copied the accumulated frame on
    # every iteration and no longer exists in pandas 2.x.
    df = pd.concat(frames) if frames else pd.DataFrame()

    if fp == 'similarity':
        ref_rows = df[(df.LABEL == 0) & (df.DESIRE == True)]
        refs = Predictor.calc_ecfp(ref_rows.Smiles)
        fps = Predictor.calc_ecfp(df.Smiles)
        from rdkit.Chem import DataStructs
        fps = np.array(
            [DataStructs.BulkTanimotoSimilarity(mol_fp, refs)
             for mol_fp in fps])
    else:
        fp_alg = Predictor.calc_ecfp if fp == 'ECFP' else Predictor.calc_physchem
        fps = fp_alg(df.Smiles)

    fps = Scaler().fit_transform(fps)
    pca = PCA(n_components=2) if alg == 'PCA' else TSNE(n_components=2)
    xy = pca.fit_transform(fps)
    df['X'], df['Y'] = xy[:, 0], xy[:, 1]

    if alg == 'PCA':
        ratio = pca.explained_variance_ratio_[:2]
        return df, ratio
    else:
        return df
def utl_scaleImpute(X_data, p_imputeColumns, p_scaleColumns, p_scalers = None):
    """Impute and scale selected columns of a copy of ``X_data``.

    Parameters
    ----------
    X_data : DataFrame; it is copied, the caller's frame is left untouched.
    p_imputeColumns : columns to mean-impute; names absent from ``X_data``
        are skipped. A column that is entirely NaN is set to -999.
    p_scaleColumns : columns to scale; names absent from ``X_data`` are
        skipped.
    p_scalers : optional dict mapping column name to a fitted scaler. When
        ``None``, one ``Scaler`` per column is fitted on that column's
        distinct non-null values.

    Returns
    -------
    ``(X_data, p_scalers)``: the processed copy and the scalers used.
    """
    X_data = X_data.copy()
    present = X_data.columns
    v_imputeColumns = [name for name in p_imputeColumns if name in present]
    v_scaleColumns = [name for name in p_scaleColumns if name in present]

    for column in v_imputeColumns:
        v_values = X_data[column].astype(float).values.reshape(-1, 1)
        if np.isnan(v_values).all():
            # nothing to average over: use the sentinel value
            X_data[column] = -999
            continue
        try:
            imputer = Imputer(strategy = 'mean', axis = 0)
            X_data[column] = imputer.fit_transform(v_values)
        except:
            # print the offending column and its distinct values, then
            # re-raise the original error
            values = [np.unique(-999 if np.isnan(ll).all() else ll)
                      for ll in v_values.reshape(1, -1)]
            print(column, values)
            raise

    if p_scalers is None:
        p_scalers = {}
        for column in v_scaleColumns:
            distinct = X_data[column].value_counts(dropna = True).index.tolist()
            p_scalers[column] = Scaler().fit(np.array(distinct).reshape(-1, 1))

    for column in v_scaleColumns:
        as_column = X_data[column].values.reshape(-1, 1)
        X_data[column] = p_scalers[column].transform(as_column)

    return X_data, p_scalers
def __init__(self, env):
    """Set up the scaler, the RBF featurizer and one SGD model per action.

    ``env`` must provide ``observation_space.sample()``, ``action_space.n``
    and ``reset()``.
    """
    self.env = env

    # Sample the observation space; the samples are used to fit both the
    # scaler and the featurizer.
    samples = []
    for _ in range(10000):
        samples.append(self.env.observation_space.sample())
    observation_examples = np.array(samples)

    # Feature preprocessing: the scaler is fitted on the sampled
    # observations.
    self.scaler = Scaler()
    self.scaler.fit(observation_examples)

    # Featurized state representation: RBF kernels with different gammas
    # to cover different parts of the space.
    rbf_gammas = (("rbf1", 5.0), ("rbf2", 2.0), ("rbf3", 1.0), ("rbf4", 0.5))
    self.featurizer = FeatureUnion(
        [(label, RBF(gamma=gamma, n_components=100))
         for label, gamma in rbf_gammas])
    self.featurizer.fit(self.scaler.transform(observation_examples))

    # One SGD regressor per action, each given an initial partial_fit on
    # the featurized reset state with target 0.
    self.action_models = []
    self.nA = self.env.action_space.n
    for _ in range(self.nA):
        model = SGD(learning_rate="constant")
        model.partial_fit([self.__featurize_state(self.env.reset())], [0])
        self.action_models.append(model)
def test_regressors_int():
    """Check that regressors give the same predictions for int and float y.

    Two identically configured instances of every regressor are fitted, one
    on integer labels and one on the same labels cast to float; their
    training-set predictions must agree to 2 decimals.
    """
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators
                  if issubclass(E, RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    X = Scaler().fit_transform(X)
    y = np.random.randint(2, size=X.shape[0])
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            reg1 = Reg()
            reg2 = Reg()
        if hasattr(reg1, 'alpha'):
            reg1.set_params(alpha=0.01)
            reg2.set_params(alpha=0.01)
        if hasattr(reg1, 'random_state'):
            reg1.set_params(random_state=0)
            reg2.set_params(random_state=0)

        # fit
        reg1.fit(X, y)
        pred1 = reg1.predict(X)
        # The builtin float is what the removed ``np.float`` alias pointed
        # to, so the cast is unchanged and works on current NumPy.
        reg2.fit(X, y.astype(float))
        pred2 = reg2.predict(X)
        assert_array_almost_equal(pred1, pred2, 2)
def load_data(path, label_encoder):
    """Load the ``close`` series of every CSV under train/val/test.

    For each split directory below ``path``, every ``.csv`` file contributes
    its ``close`` column, scaled by a ``Scaler`` fitted on that file alone,
    and a label looked up in ``label_encoder`` by the file's parent
    directory name.

    Returns a dict with keys ``<split>_i`` (inputs) and ``<split>_o``
    (labels), each converted with ``np.asarray``.
    """
    data = {
        "train_o": [],
        "train_i": [],
        "val_o": [],
        "val_i": [],
        "test_o": [],
        "test_i": []
    }

    for data_type in ("train", "val", "test"):
        split_dir = os.path.join(path, data_type)
        csv_files_path = list(paths.list_files(split_dir, ".csv"))

        for csv_path in tqdm(csv_files_path):
            closes = pd.read_csv(csv_path)["close"].values
            scaled = Scaler().fit_transform(closes.reshape(-1, 1)).flatten()

            # the label is the name of the directory holding the file
            label_name = os.path.normpath(csv_path).split(os.path.sep)[-2]

            data[f"{data_type}_o"].append(label_encoder[label_name])
            data[f"{data_type}_i"].append(scaled)

    for key, collected in data.items():
        data[key] = np.asarray(collected)

    return data
def test_classifiers_classes():
    """Check that classifiers cope with non-consecutive class labels.

    The iris targets are remapped with ``2 * y + 1``; each classifier must
    predict exactly that label set on the training data with a score above
    0.78.
    """
    classifiers = []
    for name, E in all_estimators():
        if issubclass(E, ClassifierMixin):
            classifiers.append((name, E))

    iris = load_iris()
    X, y = shuffle(iris.data, iris.target, random_state=7)
    X = Scaler().fit_transform(X)
    y = 2 * y + 1
    # TODO: also make this work with string labels

    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in (MultinomialNB, BernoulliNB):
            # TODO also test these!
            continue

        # record (and thereby silence) deprecation warnings on construction
        with warnings.catch_warnings(record=True):
            clf = Clf()

        clf.fit(X, y)
        y_pred = clf.predict(X)

        # training set performance
        assert_array_equal(np.unique(y), np.unique(y_pred))
        assert_greater(zero_one_score(y, y_pred), 0.78)
def reorder(data):
    """Reorder the rows of ``data`` along a tour found by ``PSO.run``.

    The feature columns (everything except ``cluster``, ``id``, ``artist``,
    ``duration_ms`` and ``time_signature``) are standardized, their pairwise
    distance matrix is handed to the optimizer, and the returned order is
    rotated to start at the position of the longest listed edge.

    Returns the rows sorted by the new order, without the helper column
    ``new_order`` and without an ``'Unnamed: 0'`` column if one exists. If
    applying the order fails, ``data`` is returned instead.

    Note: on the success path ``data`` itself gains a ``new_order`` column.
    """
    F = bm.TSPCF
    scaler = Scaler()
    c_data = data.drop(
        columns=['cluster', 'id', 'artist', 'duration_ms', 'time_signature'])
    c_data = scaler.fit_transform(c_data)

    dist = distance_matrix(c_data, c_data)
    # Fill the diagonal with twice the largest distance.
    M = 2 * np.max(dist)
    np.fill_diagonal(dist, M)

    L = len(c_data)
    limite = [L * [0], L * [(L - 1)]]
    p = L * [1]
    A, B, C, D = PSO.run(F, 1000, limite, 5, 10, dist, passo=p, permut=True)

    # NOTE(review): these lengths are read from `dist` by position
    # (i -> i + 1), not through the order in A — confirm this is intended.
    edge_lengths = [dist[-1][0]]
    edge_lengths.extend(dist[i][i + 1] for i in range(len(A) - 1))
    m_d = max(edge_lengths)
    M = edge_lengths.index(m_d)
    # Rotate the order so it starts at the position of the longest edge.
    A = np.append(A[M:], A[:M])

    try:
        order = list(A)
        N_order = [order.index(val) for val in range(len(order))]
        data['new_order'] = N_order
        data = data.sort_values(by='new_order')
        dataset = data.drop(columns=['new_order'])
        # Drop the leftover CSV index column from `dataset`, not `data`;
        # dropping it from `data` put `new_order` back into the result.
        if 'Unnamed: 0' in dataset.columns:
            dataset = dataset.drop(columns=['Unnamed: 0'])
        return dataset
    except Exception:
        # Best effort: fall back to the data when the order cannot be applied.
        return data
def process_data(self):
    """Build scaled / feature-reduced CV folds and test data, then pickle them.

    Reads ``train.csv`` (first column is the target) and ``test.csv``, fits
    an ``ExtraTreesClassifier`` on the training data for feature reduction,
    and creates a 2-fold stratified split. For every fold the scaler is
    re-fitted on that fold's training part.

    Sets ``self.cv_data``, ``self.cv_data_nonreduced``, ``self.testMat`` and
    ``self.testMat_nonreduced`` and writes all four to ``allData.pkl``.
    """
    # `.values` replaces `as_matrix()`, which newer pandas no longer has.
    testMat = pandas.read_csv("test.csv").values
    trainMat = pandas.read_csv("train.csv").values
    trainResult = trainMat[:, 0]
    trainMat = trainMat[:, 1:]

    cv = StratifiedKFold(trainResult, 2)

    reduceFeatures = ExtraTreesClassifier(
        compute_importances=True, random_state=1234, n_jobs=self.cpus,
        n_estimators=1000, criterion="gini"
    )
    reduceFeatures.fit(trainMat, trainResult)

    trainScaler = Scaler()
    self.cv_data = []
    self.cv_data_nonreduced = []
    for train_idx, test_idx in cv:
        X_train, X_test = trainMat[train_idx, :], trainMat[test_idx, :]
        # trainResult is 1-D, so it takes a single index; indexing it with
        # `[idx, :]` raises IndexError.
        Y_train, Y_test = trainResult[train_idx], trainResult[test_idx]

        X_train = trainScaler.fit_transform(X_train)
        X_test = trainScaler.transform(X_test)
        self.cv_data_nonreduced.append((X_train, X_test, Y_train, Y_test))

        X_train = reduceFeatures.transform(X_train)
        X_test = reduceFeatures.transform(X_test)
        self.cv_data.append((X_train, X_test, Y_train, Y_test))

    # NOTE(review): the scaler applied here was last fitted on the final
    # fold's training part only — confirm this is intended.
    testMat = trainScaler.transform(testMat)
    self.testMat_nonreduced = testMat
    self.testMat = reduceFeatures.transform(testMat)

    allData = self.testMat, self.cv_data, self.testMat_nonreduced, self.cv_data_nonreduced
    # Pickles are bytes: open in binary mode, and let `with` close the file
    # even if dumping fails.
    with open("allData.pkl", "wb") as data_handle:
        pickle.dump(allData, data_handle)
def get_sl_test_data(fileEvents,fileLabels,includedChannels,useMeans=False,parentIndices=None):
    """Return the standardized events of the included channels.

    The selected columns of ``fileEvents`` are copied and standardized with
    a freshly fitted ``Scaler``. With ``useMeans`` set to ``True`` the data
    is passed to ``get_mean_matrix`` together with ``fileLabels`` and the
    pair ``(clusterIds, X)`` is returned; otherwise only ``X`` is returned.

    ``parentIndices`` is accepted but not used.
    """
    selected = fileEvents[:,includedChannels].copy()
    X = Scaler().fit_transform(selected)

    if useMeans == True:
        clusterIds, meanMatrix = get_mean_matrix(X,fileLabels)
        return clusterIds, meanMatrix

    return X
def __init__(self, set_name='AWA1', batch_size=256, **kwargs):
    """Initialise the base class, then set content keys, parts and scaler.

    ``set_name``, ``batch_size`` and any extra keyword arguments are passed
    to the parent constructor unchanged. ``_init_scaler()`` is called last,
    after ``self.scaler`` exists.
    """
    super().__init__(set_name, batch_size, **kwargs)

    features_and_labels = ['feat', 'label', 'label_emb']
    class_entries = ['s_cls', 'u_cls', 'cls_emb', 's_cls_emb', 'u_cls_emb']
    self.content = features_and_labels + class_entries
    self.parts = ['training', 'seen', 'unseen']

    self.scaler = Scaler()
    self._init_scaler()
def Classifier():
    """Train a dense softmax network on ``clustered.csv`` and save predictions.

    Rows with NaN are dropped; ``id``, ``cluster`` and ``artist`` are
    removed from the features, which are then standardized. The model is
    trained to predict ``cluster`` with early stopping and checkpointing to
    ``best_model.h5``. The checkpoint is reloaded and used to predict on the
    training data; class probabilities go to ``Song_Probs.csv`` and the
    arg-max class with its probability to ``Song_preds.csv``.
    """
    early_stop = callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=500,
        verbose=0,
        mode="auto",
        baseline=None,
        restore_best_weights=True,
    )
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        'best_model.h5',
        monitor="val_accuracy",
        verbose=0,
        save_best_only=True,
        save_weights_only=False,
        mode="auto",
        save_freq="epoch",
    )

    data = pd.read_csv('clustered.csv').dropna()
    data = data.drop(columns=['id'])

    train_label = data['cluster'].fillna(0)
    train_data = data.drop(columns=['cluster', 'artist']).fillna(0)
    train_data = Scaler().fit_transform(train_data)

    # Input layer, two hidden Dense+Dropout blocks, softmax output with one
    # unit per label value from 0 to the largest label.
    hidden_blocks = 2
    model = Sequential()
    model.add(Dense(64, input_dim=len(train_data[0]), activation='relu'))
    for _ in range(hidden_blocks):
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.2))
    model.add(Dense((1 + np.max(train_label.values)), activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(train_data,
              train_label,
              epochs=1000,
              batch_size=50,
              verbose=2,
              callbacks=[early_stop, checkpoint],
              validation_split=0.2,
              shuffle=True)

    # Predict with the checkpointed weights.
    model.load_weights('best_model.h5')
    probs = model.predict(train_data)

    summary = pd.DataFrame(np.argmax(probs, axis=-1),
                           columns=['predicted_cluster'])
    summary['predicted_prob'] = np.max(probs, axis=-1)

    pd.DataFrame(probs).to_csv('Song_Probs.csv', index=False)
    summary.to_csv('Song_preds.csv', index=False)
def test_transformers():
    """Smoke-test every transformer from all_estimators() on a small iris subset.

    ``fit_transform`` must keep the number of samples, ``transform`` (when
    present) must agree with it to 2 decimals, and transposed input must be
    rejected with ValueError.
    """
    transformers = []
    for name, E in all_estimators():
        if issubclass(E, TransformerMixin):
            transformers.append((name, E))

    iris = load_iris()
    X, y = shuffle(iris.data, iris.target, random_state=0)
    X, y = X[:10], y[:10]
    n_samples = X.shape[0]
    X = Scaler().fit_transform(X)
    X -= X.min()  # shift so the smallest value is zero

    # these don't actually fit the data
    stateless = [AdditiveChi2Sampler, Binarizer, Normalizer]

    for name, Trans in transformers:
        if Trans in dont_test or Trans in meta_estimators:
            continue
        if Trans in stateless:
            continue

        # record (and thereby silence) deprecation warnings on construction
        with warnings.catch_warnings(record=True):
            trans = Trans()

        if hasattr(trans, 'compute_importances'):
            trans.compute_importances = True

        if Trans is SelectKBest:
            # SelectKBest has a default of k=10,
            # which is more features than we have.
            trans.k = 1

        trans.fit(X, y)
        X_pred = trans.fit_transform(X, y=y)
        assert_equal(X_pred.shape[0], n_samples)

        if hasattr(trans, 'transform'):
            X_pred2 = trans.transform(X)
            assert_array_almost_equal(X_pred, X_pred2, 2)
            # malformed (transposed) input must be rejected by transform
            assert_raises(ValueError, trans.transform, X.T)
def test_regressors_train():
    """Fit every regressor on the Boston data and report all failures at once.

    Each regressor must reject a target that is one sample short. The PLS
    family is fitted on a two-column target. A failure while fitting,
    predicting or scoring is printed and remembered so the remaining
    regressors still run; the test fails at the end if any regressor failed.
    """
    estimators = all_estimators()
    regressors = [(name, E) for name, E in estimators
                  if issubclass(E, RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    # TODO: test with intercept
    # TODO: test with multiple responses
    X = Scaler().fit_transform(X)
    y = Scaler().fit_transform(y)
    succeeded = True
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            reg = Reg()
        if hasattr(reg, 'alpha'):
            reg.set_params(alpha=0.01)

        # raises error on malformed input for fit
        assert_raises(ValueError, reg.fit, X, y[:-1])
        # fit
        try:
            if Reg in (_PLS, PLSCanonical, PLSRegression, CCA):
                # these are fitted on a second, derived response as well
                y_ = np.vstack([y, 2 * y + np.random.randint(2, size=len(y))])
                y_ = y_.T
            else:
                y_ = y
            reg.fit(X, y_)
            reg.predict(X)

            if Reg not in (PLSCanonical, CCA):
                # TODO: find out why
                assert_greater(reg.score(X, y_), 0.5)
        except Exception as e:
            # Single-argument print calls are valid and print the same text
            # on both Python 2 and Python 3.
            print(reg)
            print(e)
            print("")
            succeeded = False

    assert_true(succeeded)
def test_center_kernel():
    """Check that KernelCenterer matches mean-centering in feature space.

    The linear kernel of data centered with ``Scaler(with_std=False)`` must
    equal the ``KernelCenterer`` output on the uncentered linear kernel, at
    fit time and at predict time.
    """
    X_fit = np.random.random((5, 4))
    scaler = Scaler(with_std=False)
    scaler.fit(X_fit)
    X_fit_centered = scaler.transform(X_fit)

    # fit-time kernel
    centerer = KernelCenterer()
    expected_fit = np.dot(X_fit_centered, X_fit_centered.T)
    actual_fit = centerer.fit_transform(np.dot(X_fit, X_fit.T))
    assert_array_almost_equal(expected_fit, actual_fit)

    # predict-time kernel
    X_pred = np.random.random((2, 4))
    K_pred = np.dot(X_pred, X_fit.T)
    expected_pred = np.dot(scaler.transform(X_pred), X_fit_centered.T)
    actual_pred = centerer.transform(K_pred)
    assert_array_almost_equal(expected_pred, actual_pred)
def test_classifiers_train():
    """Smoke-test every classifier from all_estimators() on the iris data.

    Checks prediction shape and training score, rejection of transposed
    input, and that ``decision_function`` / ``predict_proba`` (when
    implemented) have the expected shape and agree with ``predict``.
    """
    classifiers = []
    for name, E in all_estimators():
        if issubclass(E, ClassifierMixin):
            classifiers.append((name, E))

    iris = load_iris()
    X, y = shuffle(iris.data, iris.target, random_state=7)
    n_samples = X.shape[0]
    n_labels = len(np.unique(y))
    X = Scaler().fit_transform(X)

    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in (MultinomialNB, BernoulliNB):
            # TODO also test these!
            continue

        # record (and thereby silence) deprecation warnings on construction
        with warnings.catch_warnings(record=True):
            clf = Clf()

        clf.fit(X, y)
        y_pred = clf.predict(X)
        assert_equal(y_pred.shape, (n_samples, ))
        # training set performance
        assert_greater(zero_one_score(y, y_pred), 0.78)

        # malformed (transposed) input must be rejected by predict
        assert_raises(ValueError, clf.predict, X.T)

        if hasattr(clf, "decision_function"):
            try:
                decision = clf.decision_function(X)
                assert_equal(decision.shape, (n_samples, n_labels))
                # 1on1 of LibSVM works differently, so only the others
                # are required to agree with predict
                if not isinstance(clf, BaseLibSVM):
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)
                # malformed input must be rejected as well
                assert_raises(ValueError, clf.decision_function, X.T)
            except NotImplementedError:
                pass

        if hasattr(clf, "predict_proba"):
            try:
                y_prob = clf.predict_proba(X)
                assert_equal(y_prob.shape, (n_samples, n_labels))
                # most probable class must be the predicted one
                assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                # malformed input must be rejected as well
                assert_raises(ValueError, clf.predict_proba, X.T)
            except NotImplementedError:
                pass
def single_task(feat, alg='RF', reg=False, is_extra=True):
    """Train and evaluate a single-target model from ``data/LIGAND_RAW.tsv``.

    Parameters
    ----------
    feat : value of the first ``pair`` column selecting the rows to use.
    alg : algorithm name; 'RF' additionally trains a model on all data via
        ``Train_RF``, any other value standardizes the features first.
    reg : regression if true; otherwise labels are binarized with the
        module-level threshold ``th``.
    is_extra : when true, add the qualitative (non-numeric) records with
        the fixed value 3.99.

    Side effects: prints the class counts and writes
    ``output/single/<alg>_<REG|CLS>_<feat>.cv.tsv`` and ``.ind.tsv``.

    Relies on the module-level names ``pair`` and ``th``.
    """
    df = pd.read_table('data/LIGAND_RAW.tsv').dropna(subset=pair[1:2])
    df = df[df[pair[0]] == feat]
    df = df[pair].set_index(pair[1])

    # Entries whose minimum of the last `pair` column is above 2015 form
    # the independent test set.
    year = df[pair[-1:]].groupby(pair[1]).min().dropna()
    test = year[year[pair[-1]] > 2015].index

    # Mean value per index entry.
    numery = df[pair[2]].groupby(pair[1]).mean().dropna()

    # Qualitative records, all given the fixed value 3.99 below.
    comments = df[(df.Comment.str.contains('Not Active') == True)]
    inhibits = df[(df.Standard_Type == 'Inhibition') &
                  df.Standard_Relation.isin(['<', '<='])]
    relations = df[df.Standard_Type.isin(['EC50', 'IC50', 'Kd', 'Ki']) &
                   df.Standard_Relation.isin(['>', '>='])]
    binary = pd.concat([comments, inhibits, relations], axis=0)
    binary = binary[~binary.index.isin(numery.index)]
    binary[pair[2]] = 3.99
    binary = binary[pair[2]].groupby(binary.index).first()

    # pd.concat replaces Series.append, which pandas 2.x no longer has.
    df = pd.concat([numery, binary]) if is_extra else numery
    if not reg:
        df = (df > th).astype(float)
    df = df.sample(len(df))  # shuffle
    print(feat, len(numery[numery >= th]), len(numery[numery < th]),
          len(binary))

    # .loc needs a list-like; a set is not accepted as an indexer by
    # newer pandas.
    test_ix = list(set(df.index).intersection(test))
    test = df.loc[test_ix].dropna()
    data = df.drop(test.index)

    # RDKit's parser is spelled MolFromSmiles.
    test_x = utils.Predictor.calc_fp(
        [Chem.MolFromSmiles(mol) for mol in test.index])
    data_x = utils.Predictor.calc_fp(
        [Chem.MolFromSmiles(mol) for mol in data.index])
    out = 'output/single/%s_%s_%s' % (alg, 'REG' if reg else 'CLS', feat)

    if alg != 'RF':
        scaler = Scaler()
        scaler.fit(data_x)
        test_x = scaler.transform(test_x)
        data_x = scaler.transform(data_x)
    else:
        X = np.concatenate([data_x, test_x], axis=0)
        y = np.concatenate([data.values, test.values], axis=0)
        # `data` and `test` are Series here, so y is normally 1-D and
        # `y[:, 0]` would raise; take the first column only for 2-D input.
        labels = y if y.ndim == 1 else y[:, 0]
        Train_RF(X, labels, out=out + '.pkg', reg=reg)

    data, test = data.to_frame(name='Label'), test.to_frame(name='Label')
    data['Score'], test['Score'] = cross_validation(
        data_x, data.values, test_x, test.values, alg, out, reg=reg)
    data.to_csv(out + '.cv.tsv', sep='\t')
    test.to_csv(out + '.ind.tsv', sep='\t')
def __init__(self):
    """Build the model pipeline.

    Steps: median imputation, a feature union of two scaled band pipelines
    (``self.band1``, ``self.band2``) plus the ``self.angle`` transformer,
    and an XGBoost classifier with 100 estimators.
    """
    band_1_pipe = Pipeline(steps=[
        ('band_1', FunctionTransformer(self.band1)),
        ('band_1_standard_scale', Scaler()),
    ])
    band_2_pipe = Pipeline(steps=[
        ('band_2', FunctionTransformer(self.band2)),
        ('band_2_standard_scale', Scaler()),
    ])
    feature_union = FeatureUnion(
        n_jobs=1,
        transformer_list=[
            ('band_1_pipe', band_1_pipe),
            ('band_2_pipe', band_2_pipe),
            ('inc_angle', FunctionTransformer(self.angle)),
        ])

    self.model = Pipeline(steps=[
        ('imputer', Imputer(strategy='median')),
        ('feature_union', feature_union),
        ('xgboost', XGBClassifier(n_estimators=100)),
    ])
def sparser(dataset):
    """Return a subset of ``dataset`` rows, sorted by ``probs`` descending.

    The feature columns (everything except ``id``, ``artist``, ``cluster``
    and ``probs``) are standardized and their upper-triangular pairwise
    distance matrix is computed. A row is kept when at least one of its
    distances exceeds a quarter of the maximum of the scaled distance
    matrix.
    """
    features = dataset.drop(columns=['id', 'artist', 'cluster', 'probs'])
    scaler = Scaler()
    scaled_features = scaler.fit_transform(features)

    dist = np.triu(spy.distance_matrix(scaled_features, scaled_features))
    # The same scaler object is re-fitted on the distance matrix; the
    # threshold comes from the scaled matrix but is compared to raw `dist`.
    threshold = .25 * np.max(scaler.fit_transform(dist))

    # Row indices of all entries above the threshold, de-duplicated.
    row_ids = np.argwhere(dist > threshold)[:, 0]
    keep = list(set(row_ids))

    return dataset.iloc[keep, :].sort_values('probs', ascending=False)
def test_pipeline_methods_scaler_svm():
    """Smoke-test the methods of a Scaler + SVC pipeline on the iris data."""
    iris = load_iris()
    X, y = iris.data, iris.target

    pipe = Pipeline([('scaler', Scaler()),
                     ('svc', SVC(probability=True))])
    pipe.fit(X, y)

    # each method only has to run without raising
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_regressors_int():
    """Instantiate every regressor twice with a fixed random seed.

    NOTE(review): this test currently only constructs and configures the
    estimators; it makes no assertions — presumably unfinished.
    """
    estimators = all_estimators()
    # The loop below iterates over regressors, so build that list here; it
    # was previously never defined inside the function.
    regressors = [(name, E) for name, E in estimators
                  if issubclass(E, RegressorMixin)]
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    # TODO: test with intercept
    # TODO: test with multiple responses
    # Only X is scaled; the former `X_m` line read a variable that was
    # never assigned and would fail before any estimator ran.
    X = Scaler().fit_transform(X)
    for name, Reg in regressors:
        if Reg in dont_test or Reg in meta_estimators:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            reg1 = Reg()
            reg2 = Reg()
        if hasattr(reg1, 'random_state'):
            reg1.set_params(random_state=0)
            reg2.set_params(random_state=0)
def test_scaler_2d_arrays():
    """Check standardization of 2D arrays, including a constant feature.

    Covers the estimator with and without copying, ``scale`` along axis 1,
    and a constant feature that is zero or non-zero.
    """

    def check_columns(X_scaled):
        # per-feature result: no NaN, zero mean, unit std (0 for the
        # constant first feature)
        assert_false(np.any(np.isnan(X_scaled)))
        assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
        assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    check_columns(X_scaled)
    # copy=True must return a new array
    assert_true(X_scaled is not X)

    # inverse transform recovers the input values in yet another array
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    # scaling along axis 1, centering only
    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])

    # scaling along axis 1, centering and unit variance
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # scale() must return a new array
    assert_true(X_scaled is not X)

    # copy=False must hand back the very same array object
    X_scaled = scaler.fit(X).transform(X, copy=False)
    check_columns(X_scaled)
    assert_true(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non-zero feature
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    check_columns(X_scaled)
    # copy=True must return a new array
    assert_true(X_scaled is not X)
def run_real_data_experiments(nr_samples, delta, verbose=0,
                              do_scatter_plot=False):
    """Compare true and approximated L2 norms on 'hollywood2' test samples.

    Parameters
    ----------
    nr_samples : requested number of test samples; capped at the number of
        available samples and raised to at least 1.
    delta : integer used in the dataset suffix ``.per_slice.delta_<delta>``.
    verbose : verbosity level; values above 2 also print loading progress.
    do_scatter_plot : when true, plot true against approximated values.

    A ``Scaler`` fitted on the training data is applied to every test
    sample before ``L2_approx`` is evaluated on it.
    """
    dataset = Dataset('hollywood2', suffix='.per_slice.delta_%d' % delta,
                      nr_clusters=256)
    samples, _ = dataset.get_data('test')
    nr_samples = np.minimum(len(samples), nr_samples)
    nr_samples = np.maximum(1, nr_samples)

    # Single-argument print calls are valid and print the same text on both
    # Python 2 and Python 3.
    if verbose > 2:
        print("Loading train data.")
    tr_data, _, _ = load_sample_data(dataset, 'train', pi_derivatives=True)
    scaler = Scaler()
    scaler.fit(tr_data)

    true_values, approx_values = [], []
    # range works on both Python versions (xrange is Python 2 only).
    for ii in range(nr_samples):
        if verbose > 2:
            # carriage return: progress overwrites itself on one line
            sys.stdout.write("%s\r" % samples[ii].movie)
        data, _, _ = load_sample_data(dataset, str(samples[ii]),
                                      pi_derivatives=True)
        data = scaler.transform(data)
        L2_norm_true, L2_norm_approx = L2_approx(data)
        true_values.append(L2_norm_true)
        approx_values.append(L2_norm_approx)

    if verbose:
        print("")
    print_info(true_values, approx_values, verbose)
    print("")

    if do_scatter_plot:
        scatter_plot(true_values, approx_values)
def processData(filename, split, trainingSet, testSet):
    """Load, clean, scale and randomly split the dataset.

    Parameters
    ----------
    filename : CSV file (no header row, nine columns) located in ``./``.
    split : probability that a row goes to ``trainingSet``.
    trainingSet, testSet : lists that receive the rows (lists of floats);
        they are appended to in place.

    Zeros in BMI, BloodPressure, Glucose, SkinThickness and Insulin are
    replaced by the column median. All nine columns, including ``Outcome``,
    are then passed through ``Scaler``. The number of rows is printed.
    """
    DATASET_PATH = './'
    data_path = os.path.join(DATASET_PATH, filename)
    dataset = pd.read_csv(data_path, header=None)
    dataset.columns = [
        "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
        "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"
    ]

    # Replace zeros with the column median (computed before replacement).
    for column in ('BMI', 'BloodPressure', 'Glucose', 'SkinThickness',
                   'Insulin'):
        median = dataset[column].median()
        dataset[column] = dataset[column].replace(to_replace=0, value=median)

    scaler = Scaler()
    scaler.fit(dataset)
    scaled_dataset = scaler.transform(dataset)

    datasetlst = pd.DataFrame(data=scaled_dataset).values.tolist()
    print(len(datasetlst))

    # Iterate over every row; the previous `range(0, len(...) - 1)` loop
    # left the last row out of both sets.
    for row in datasetlst:
        row = [float(value) for value in row]
        if random.random() < split:
            trainingSet.append(row)
        else:
            testSet.append(row)
def run_svm_validation(X1,y1,X2,y2,gammaRange=[0.5],cRange=[0.005],useLinear=False):
    """Grid-search an ``SVC`` over the two stacked, standardised data sets.

    ``X1``/``X2`` are stacked row-wise and ``y1``/``y2`` concatenated; the
    stacked features are standardised with a ``Scaler`` and a
    ``GridSearchCV`` with 2-fold stratified cross-validation is run over a
    fixed logarithmic grid of ``C`` and ``gamma``.

    ``gammaRange``, ``cRange`` and ``useLinear`` are accepted for interface
    compatibility but are not read by this function.

    Returns
    -------
    The ``best_estimator_`` found by the grid search.
    """
    features = Scaler().fit_transform(np.vstack((X1, X2)))
    targets = np.hstack((y1, y2))

    search_space = {
        'gamma': 10.0 ** np.arange(-5, 4),
        'C': 10.0 ** np.arange(-2, 9),
    }
    folds = StratifiedKFold(y=targets, k=2)
    # Class 1 is weighted 100x in the classifier.
    grid = GridSearchCV(SVC(class_weight={1: 100}),
                        param_grid=search_space,
                        cv=folds)
    grid.fit(features, targets)

    winner = grid.best_estimator_
    print("The best classifier is: ", winner)
    return winner
def SVM_fit(X_in, y_in, X_out, gamma, C):
    """Fit an RBF ``SVC`` with fixed ``gamma``/``C`` and predict ``X_out``.

    Roughly 10% of the input samples are set aside.  That held-out subset
    is what the ``Scaler`` is fitted on and what decides which features are
    dropped for having (near-)zero standard deviation.  The classifier is
    trained on the remaining samples.

    Parameters
    ----------
    X_in, y_in : sequences
        Labelled samples and their labels.
    X_out : sequence
        Samples to predict.  The scaler is created with ``copy=False``, so
        this may be modified in place.
    gamma, C : float
        Hyper-parameters handed to ``svm.SVC``.

    Returns
    -------
    Predictions of the fitted classifier for ``X_out``.
    """
    n_features = len(X_in[0])
    seed(time())

    # Set aside ~10% of the samples.
    n_held_out = int(round(0.1*len(X_in)))
    held_out_idx, fit_idx = split_indices(len(X_in), n_held_out)
    # NOTE(review): the return value is discarded; whether this reorders
    # anything depends on which ``shuffle`` is imported -- confirm.
    shuffle(X_in, y_in)
    X_test = [X_in[k] for k in held_out_idx]
    y_test = [y_in[k] for k in held_out_idx]
    X_in = [X_in[k] for k in fit_idx]
    y_in = [y_in[k] for k in fit_idx]

    # copy=False requests in-place scaling.  Mean and standard deviation
    # are taken from the held-out subset and re-used for every other set.
    scaler = Scaler(copy=False)
    scaler.fit(X_test, y_test)
    X_in = scaler.transform(X_in)
    X_test = scaler.transform(X_test)
    X_out = scaler.transform(X_out)

    # Keep only features that vary within the held-out subset.
    held_out_std = X_test.std(axis=0)
    kept = [col for col in range(n_features) if held_out_std[col] > 1e-7]

    def select_columns(rows):
        return [[row[col] for col in kept] for row in rows]

    X_in = select_columns(X_in)
    X_test = select_columns(X_test)
    X_out = select_columns(X_out)
    n_features = len(kept)

    classifier = svm.SVC(kernel='rbf', C=C, gamma=gamma, verbose=False,
                         cache_size=4092, tol=1e-5)
    classifier.fit(X_in, y_in)
    return classifier.predict(X_out)
def test_scaler_without_centering():
    """Check ``Scaler(with_mean=False)`` on dense and CSR input.

    The dense and sparse scalers must learn the same statistics, produce
    the same scaled moments, return new objects when ``copy=True`` and
    round-trip through ``inverse_transform``.
    """
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sp.csr_matrix(X)

    scaler = Scaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = Scaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    # Dense and sparse fits must agree on the learned statistics.
    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # copy=True: the outputs must be new objects, not the inputs.
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    # Verify the *sparse* round trip; this assertion previously re-checked
    # the dense ``X_scaled_back`` and so never exercised the CSR result.
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)
def test_scaler():
    """Test scaling of a dataset along all axes.

    Covers 1D arrays and lists, 2D data with a constant zero column, the
    ``scale`` function along ``axis=1``, in-place versus copying transforms,
    and 2D data with a constant non-zero column.
    """
    # Seeded generator so that a failure is reproducible.
    rng = np.random.RandomState(0)

    # First test with 1D data
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # Test with 2D data
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert not np.any(np.isnan(X_scaled))

    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # copy=True: the result must be a new object
    assert X_scaled is not X

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert X_scaled_back is not X
    assert X_scaled_back is not X_scaled
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    assert not np.any(np.isnan(X_scaled))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert not np.any(np.isnan(X_scaled))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    assert X_scaled is not X

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert not np.any(np.isnan(X_scaled))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # copy=False: X must have been scaled in place, not copied
    assert X_scaled is X

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = Scaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert not np.any(np.isnan(X_scaled))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # copy=True: the result must be a new object
    assert X_scaled is not X
def tree_train(X_in, y_in, X_out, min_meaningful_features_ratio=1., file_log=None):
    """Cross-validate ``max_features`` for an extra-trees model and predict.

    About 10% of the input samples are set aside.  That subset is used to
    fit the ``Scaler``, to decide which features have (near-)zero standard
    deviation and are dropped, and finally to report ``Etest``.  On the
    remaining samples a 10-fold stratified cross-validation picks the best
    ``max_features`` and a final classifier is trained with that value.

    Parameters
    ----------
    X_in, y_in : sequences
        Labelled samples and their labels.
    X_out : sequence
        Samples to predict.  The scaler is created with ``copy=False``, so
        this may be modified in place.
    min_meaningful_features_ratio : float
        Candidate ``max_features`` values run from
        ``floor(M * ratio)`` to ``M`` inclusive, where ``M`` is the number
        of features kept after the variance filter.
    file_log : file-like or None
        When truthy, progress and error rates are written to it.

    Returns
    -------
    Predictions of the final classifier for ``X_out``.
    """
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))

    M = len(X_in[0])  # Number of features
    seed(time())

    # Set aside ~10% of the data.
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1*len(X_in))))
    X_scaler = [X_in[i] for i in test_indices]
    y_scaler = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]

    # copy=False requests in-place scaling.  Mean and standard deviation
    # are computed on the held-out subset only and then applied everywhere.
    scaler = Scaler(copy=False)
    scaler.fit(X_scaler, y_scaler)
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)

    # Remove features with null variance on the held-out subset.
    std_test = X_scaler.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7]
    X_in = [[row[j] for j in f_indices] for row in X_in]
    X_scaler = [[row[j] for j in f_indices] for row in X_scaler]
    X_out = [[row[j] for j in f_indices] for row in X_out]
    M = len(f_indices)

    # Ten-fold cross validation over the number of features considered
    # at each split (``max_features``).
    best_cv_accuracy = 0.
    best_features_number = M
    for features_number in range(int(floor(M * min_meaningful_features_ratio)), M + 1):
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        svc = ExtraTreesClassifier(criterion='entropy', max_features=features_number)

        in_accuracy = 0.
        cv_accuracy = 0.
        for t_indices, cv_indices in kfold:
            X_train = array([X_in[i] for i in t_indices])
            y_train = [y_in[i] for i in t_indices]
            X_cv = array([X_in[i] for i in cv_indices])
            y_cv = [y_in[i] for i in cv_indices]

            svc.fit(X_train, y_train)
            in_accuracy += svc.score(X_train, y_train)
            cv_accuracy += svc.score(X_cv, y_cv)

        in_accuracy /= kfold.k
        cv_accuracy /= kfold.k

        if file_log:
            file_log.writelines('# of features: {}\n'.format(len(X_train[0])))
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))

        if cv_accuracy > best_cv_accuracy:
            best_features_number = features_number
            best_cv_accuracy = cv_accuracy

    # Now test the out-of-sample error.
    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, t={}\n'.format(1. - best_cv_accuracy, best_features_number))

    # Train the final model with the cross-validated setting.  This used to
    # pass the *last loop value* as ``n_estimators``, which discarded the
    # selection made above and tuned a different hyper-parameter.
    svc = ExtraTreesClassifier(criterion='entropy', max_features=best_features_number)
    svc.fit(X_in, y_in)
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - svc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - svc.score(X_scaler, y_scaler)))

    y_out = svc.predict(X_out)
    return y_out
def Logistic_train(X_in, y_in, X_out, cs, file_log=None):
    """Cross-validate ``C`` for a logistic regression and predict ``X_out``.

    About 10% of the input samples are set aside.  That subset is used to
    fit the ``Scaler``, to drop features with (near-)zero standard
    deviation, and to report ``Etest``.  On the remaining samples a 10-fold
    stratified cross-validation is run for every value in ``cs``; the value
    with the highest mean validation accuracy is used for the final model.

    Parameters
    ----------
    X_in, y_in : sequences
        Labelled samples and their labels.
    X_out : sequence
        Samples to predict.  The scaler is created with ``copy=False``, so
        this may be modified in place.
    cs : iterable
        Candidate values for the ``C`` parameter.
    file_log : file-like or None
        When truthy, progress and error rates are written to it.

    Returns
    -------
    Predictions of the final classifier for ``X_out``.
    """
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))

    n_features = len(X_in[0])
    seed(time())

    # Set aside ~10% of the samples.
    n_held_out = int(round(0.1*len(X_in)))
    held_out_idx, fit_idx = split_indices(len(X_in), n_held_out)
    X_scaler = [X_in[k] for k in held_out_idx]
    y_scaler = [y_in[k] for k in held_out_idx]
    X_in = [X_in[k] for k in fit_idx]
    y_in = [y_in[k] for k in fit_idx]

    # copy=False requests in-place scaling.  Statistics come from the
    # held-out subset and are re-used for the other two sets.
    scaler = Scaler(copy=False)
    scaler.fit(X_scaler, y_scaler)
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)

    # Keep only features that vary within the held-out subset.
    held_out_std = X_scaler.std(axis=0)
    kept = [col for col in range(n_features) if held_out_std[col] > 1e-7]
    X_in = [[row[col] for col in kept] for row in X_in]
    X_scaler = [[row[col] for col in kept] for row in X_scaler]
    X_out = [[row[col] for col in kept] for row in X_out]

    best_cv_accuracy = 0.
    best_c = 0.
    for c in cs:
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        lrc = LogisticRegression(C=c, tol=1e-5)

        fit_score_total = 0.
        val_score_total = 0.
        for fold_fit_idx, fold_val_idx in kfold:
            X_train = array([X_in[k] for k in fold_fit_idx])
            y_train = [y_in[k] for k in fold_fit_idx]
            X_cv = array([X_in[k] for k in fold_val_idx])
            y_cv = [y_in[k] for k in fold_val_idx]

            lrc.fit(X_train, y_train)
            fit_score_total += lrc.score(X_train, y_train)
            val_score_total += lrc.score(X_cv, y_cv)

        in_accuracy = fit_score_total / kfold.k
        cv_accuracy = val_score_total / kfold.k

        if file_log:
            file_log.writelines('C: {}\n'.format(c))
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))

        if cv_accuracy > best_cv_accuracy:
            best_c, best_cv_accuracy = c, cv_accuracy

    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, C={}\n'.format(1. - best_cv_accuracy, best_c))

    # Final model on all non-held-out samples; the held-out subset gives Etest.
    lrc = LogisticRegression(C=best_c, tol=1e-5)
    lrc.fit(X_in, y_in)
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - lrc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - lrc.score(X_scaler, y_scaler)))

    return lrc.predict(X_out)
def SVM_train(X_in, y_in, X_out, gammas, cs, file_log=None):
    """Cross-validate ``C``/``gamma`` for an RBF ``SVC`` and predict ``X_out``.

    About 10% of the input samples are set aside.  That subset is used to
    fit the ``Scaler``, to drop features with (near-)zero standard
    deviation, and to report ``Etest``.  On the remaining samples a 10-fold
    stratified cross-validation is run for every ``(c, gamma)`` pair; the
    pair with the highest mean validation accuracy is used for the final
    model.

    Parameters
    ----------
    X_in, y_in : sequences
        Labelled samples and their labels.
    X_out : sequence
        Samples to predict.  The scaler is created with ``copy=False``, so
        this may be modified in place.
    gammas, cs : iterables
        Candidate values for the ``gamma`` and ``C`` parameters.
    file_log : file-like or None
        When truthy, progress and error rates are written to it.

    Returns
    -------
    Predictions of the final classifier for ``X_out``.
    """
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))

    n_initial_features = len(X_in[0])
    seed(time())

    # Set aside ~10% of the samples as the scaling subset.
    n_held_out = int(round(0.1*len(X_in)))
    scale_idx, fit_idx = split_indices(len(X_in), n_held_out)
    X_scale = [X_in[k] for k in scale_idx]
    y_scale = [y_in[k] for k in scale_idx]
    X_in = [X_in[k] for k in fit_idx]
    y_in = [y_in[k] for k in fit_idx]

    # WARNING: copy=False => in place modification.  Statistics come from
    # the scaling subset and are re-used for the other two sets.
    scaler = Scaler(copy=False)
    scaler.fit(X_scale, y_scale)
    X_scale = scaler.transform(X_scale)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out)

    # Keep only features that vary within the scaling subset.
    scale_std = X_scale.std(axis=0)
    kept = [col for col in range(n_initial_features) if scale_std[col] > 1e-7]
    X_in = [[row[col] for col in kept] for row in X_in]
    X_scale = [[row[col] for col in kept] for row in X_scale]
    X_out = [[row[col] for col in kept] for row in X_out]

    if file_log:
        file_log.writelines('Initial features :{}, Features used: {}\n'.format(n_initial_features, len(X_in[0])))

    best_cv_accuracy = 0.
    best_gamma = 0.
    best_c = 0.
    for c in cs:
        for g in gammas:
            # Stratified folds keep the class ratio as constant as
            # possible across the k folds.
            kfold = cross_validation.StratifiedKFold(y_in, k=10)
            svc = svm.SVC(kernel='rbf', C=c, gamma=g, verbose=False,
                          cache_size=4092, tol=1e-5)

            fit_score_total = 0.
            val_score_total = 0.
            for fold_fit_idx, fold_val_idx in kfold:
                X_train = array([X_in[k] for k in fold_fit_idx])
                y_train = [y_in[k] for k in fold_fit_idx]
                X_cv = array([X_in[k] for k in fold_val_idx])
                y_cv = [y_in[k] for k in fold_val_idx]

                svc.fit(X_train, y_train)
                fit_score_total += svc.score(X_train, y_train)
                val_score_total += svc.score(X_cv, y_cv)

            in_accuracy = fit_score_total / kfold.k
            cv_accuracy = val_score_total / kfold.k

            if file_log:
                file_log.writelines('C:{}, gamma:{}\n'.format(c, g))
                file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
                file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))

            if cv_accuracy > best_cv_accuracy:
                best_gamma, best_c = g, c
                best_cv_accuracy = cv_accuracy

    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, C={}, gamma={}\n'.format(1. - best_cv_accuracy, best_c, best_gamma))

    # Final model on all non-held-out samples; the scaling subset gives Etest.
    svc = svm.SVC(kernel='rbf', C=best_c, gamma=best_gamma, verbose=False,
                  cache_size=4092, tol=1e-5)
    svc.fit(X_in, y_in)
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - svc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - svc.score(X_scale, y_scale)))

    return svc.predict(X_out)
class KMPBase(BaseEstimator):
    """Base class that fits sparse coefficients over a kernel dictionary.

    The dictionary ``K`` is the pairwise kernel between the input data and a
    set of components (basis functions).  Coefficients are produced by
    ``_run_iterator`` / ``FitIterator``; when validation data is supplied,
    the coefficients with the best validation score are kept.

    Parameters
    ----------
    n_nonzero_coefs : int or float
        Number of non-zero coefficients.  A value in ``(0, 1]`` is read as a
        fraction of the number of components.
    loss : str or None
        ``"squared"`` selects ``SquaredLoss``; anything else means no loss
        object.
    init_components, n_components, check_duplicates
        Control which components form the dictionary.
    scale, scale_y : bool
        Standardise the dictionary / the targets with a ``Scaler``.
    n_refit, estimator
        Back-fitting settings; ``estimator`` defaults to
        ``LinearRegression`` and always has ``fit_intercept`` set to False.
    metric, gamma, coef0, degree
        Kernel and its parameters, forwarded to ``pairwise_kernels``.
    X_val, y_val, n_validate, epsilon, score_func
        Validation data, validation frequency, convergence tolerance and
        scoring function (``"auc"``, a callable, or None).
    random_state, verbose, n_jobs
        Miscellaneous settings.
    """

    def __init__(self, n_nonzero_coefs=0.3, loss=None,
                 # components (basis functions)
                 init_components=None, n_components=None,
                 check_duplicates=False, scale=False, scale_y=False,
                 # back-fitting
                 n_refit=5, estimator=None,
                 # metric
                 metric="linear", gamma=0.1, coef0=1, degree=4,
                 # validation
                 X_val=None, y_val=None, n_validate=1, epsilon=0,
                 score_func=None,
                 # misc
                 random_state=None, verbose=0, n_jobs=1):
        if n_nonzero_coefs < 0:
            raise AttributeError("n_nonzero_coefs should be > 0.")

        self.n_nonzero_coefs = n_nonzero_coefs
        self.loss = loss
        self.init_components = init_components
        self.n_components = n_components
        self.check_duplicates = check_duplicates
        self.scale = scale
        self.scale_y = scale_y
        self.n_refit = n_refit
        self.estimator = estimator
        self.metric = metric
        self.gamma = gamma
        self.coef0 = coef0
        self.degree = degree
        self.X_val = X_val
        self.y_val = y_val
        self.n_validate = n_validate
        self.epsilon = epsilon
        self.score_func = score_func
        self.random_state = random_state
        self.verbose = verbose
        self.n_jobs = n_jobs

    def _kernel_params(self):
        """Return the kernel keyword arguments for ``pairwise_kernels``."""
        return {"gamma": self.gamma,
                "degree": self.degree,
                "coef0": self.coef0}

    def _get_estimator(self):
        """Return a fresh back-fitting estimator without intercept."""
        if self.estimator is None:
            estimator = LinearRegression()
        else:
            estimator = clone(self.estimator)
        estimator.fit_intercept = False
        return estimator

    def _get_loss(self):
        """Return the loss object for ``self.loss`` (None if not "squared")."""
        if self.loss == "squared":
            return SquaredLoss()
        return None

    def _pre_fit(self, X, y):
        """Prepare targets, components and the dictionary before fitting.

        Returns
        -------
        (n_nonzero_coefs, K, y, norms)
            The resolved integer number of coefficients, the (optionally
            scaled) dictionary, the (optionally scaled) targets and the
            column-wise Euclidean norms of ``K``.

        Raises
        ------
        AttributeError
            If more non-zero coefficients than components are requested.
        """
        random_state = check_random_state(self.random_state)

        if self.scale_y:
            self.y_scaler_ = Scaler(copy=True).fit(y)
            y = self.y_scaler_.transform(y)

        if self.metric == "precomputed":
            # X is already the dictionary: one component per column.
            self.components_ = None
            n_components = X.shape[1]
        else:
            if self.init_components is None:
                if self.verbose:
                    print("Selecting components...")
                self.components_ = select_components(
                    X, y, self.n_components, random_state=random_state)
            else:
                self.components_ = self.init_components
            n_components = self.components_.shape[0]

        # A value in (0, 1] is a fraction of the number of components.
        n_nonzero_coefs = self.n_nonzero_coefs
        if 0 < n_nonzero_coefs and n_nonzero_coefs <= 1:
            n_nonzero_coefs = int(n_nonzero_coefs * n_components)
        n_nonzero_coefs = int(n_nonzero_coefs)

        if n_nonzero_coefs > n_components:
            raise AttributeError("n_nonzero_coefs cannot be bigger than "
                                 "n_components.")

        if self.verbose:
            print("Computing dictionary...")
        start = time.time()
        K = pairwise_kernels(X, self.components_, metric=self.metric,
                             filter_params=True, n_jobs=self.n_jobs,
                             **self._kernel_params())
        if self.verbose:
            print("Done in %s seconds" % (time.time() - start))

        if self.scale:
            if self.verbose:
                print("Scaling dictionary")
            start = time.time()
            # Only copy when the metric is "precomputed"; otherwise K is
            # scaled in place.
            copy = self.metric == "precomputed"
            self.scaler_ = Scaler(copy=copy)
            K = self.scaler_.fit_transform(K)
            if self.verbose:
                print("Done in %s seconds" % (time.time() - start))

        # FIXME: this allocates a lot of intermediary memory
        norms = np.sqrt(np.sum(K ** 2, axis=0))

        return n_nonzero_coefs, K, y, norms

    def _fit_multi(self, K, y, Y, n_nonzero_coefs, norms):
        """Fit one coefficient vector per column of ``Y`` in parallel."""
        if self.verbose:
            print("Starting training...")
        start = time.time()
        coef = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(_run_iterator)(self._get_estimator(),
                                   self._get_loss(),
                                   K, Y[:, i], n_nonzero_coefs, norms,
                                   self.n_refit, self.check_duplicates)
            for i in xrange(Y.shape[1]))
        self.coef_ = np.array(coef)
        if self.verbose:
            print("Done in %s seconds" % (time.time() - start))

    def _score(self, y_true, y_pred):
        """Score predictions; larger is always better.

        Uses ``auc`` when ``score_func == "auc"``.  When a ``lb_``
        attribute is present, predictions are mapped back through it and
        scored with accuracy or ``score_func``.  Otherwise the negated mean
        squared error is returned.
        """
        if self.score_func == "auc":
            return auc(y_true, y_pred)

        if hasattr(self, "lb_"):
            y_pred = self.lb_.inverse_transform(y_pred, threshold=0.5)
            if self.score_func is None:
                return np.mean(y_true == y_pred)
            return self.score_func(y_true, y_pred)

        # FIXME: no need to ravel y_pred if y_true is 2d!
        return -np.mean((y_true - y_pred.ravel()) ** 2)

    def _fit_multi_with_validation(self, K, y, Y, n_nonzero_coefs, norms):
        """Fit iteratively and keep the coefficients that validate best.

        Sets ``coef_``, ``validation_scores_``, ``training_scores_``,
        ``iterations_`` and ``best_score_``.  Recorded scores are absolute
        values of what ``_score`` returns.
        """
        iterators = [FitIterator(self._get_estimator(), self._get_loss(),
                                 K, Y[:, i], n_nonzero_coefs, norms,
                                 self.n_refit, self.check_duplicates,
                                 self.verbose)
                     for i in xrange(Y.shape[1])]

        if self.verbose:
            print("Computing validation dictionary...")
        start = time.time()
        K_val = pairwise_kernels(self.X_val, self.components_,
                                 metric=self.metric, filter_params=True,
                                 n_jobs=self.n_jobs,
                                 **self._kernel_params())
        if self.verbose:
            print("Done in %s seconds" % (time.time() - start))

        if self.scale:
            K_val = self.scaler_.transform(K_val)

        y_val = self.y_val
        if self.scale_y:
            y_val = self.y_scaler_.transform(y_val)

        if self.verbose:
            print("Starting training...")
        start = time.time()

        # Signed score of the best validation so far.
        best_score = -np.inf
        validation_scores = []
        training_scores = []
        iterations = []

        for i in xrange(1, n_nonzero_coefs + 1):
            iterators = [it.next() for it in iterators]
            coef = np.array([it.coef_ for it in iterators])
            y_train_pred = np.array([it.y_train_ for it in iterators]).T

            if i % self.n_validate == 0:
                if self.verbose >= 2:
                    print("Validating %d/%d..." % (i, n_nonzero_coefs))

                y_val_pred = np.dot(K_val, coef.T)

                validation_score = self._score(y_val, y_val_pred)
                training_score = self._score(y, y_train_pred)

                if validation_score > best_score:
                    self.coef_ = coef.copy()
                    # Keep the signed value.  Storing the absolute value
                    # here made every later negative score (the negated-MSE
                    # path of _score) compare as worse than the first one,
                    # so the model was never updated again.
                    best_score = validation_score

                validation_scores.append(np.abs(validation_score))
                training_scores.append(np.abs(training_score))
                iterations.append(i)

                if len(iterations) > 2 and self.epsilon > 0:
                    # Relative change between the last two validations.
                    diff = (validation_scores[-1] - validation_scores[-2])
                    diff /= validation_scores[0]

                    if abs(diff) < self.epsilon:
                        if self.verbose:
                            print("Converged at iteration %s" % i)
                        break

        self.validation_scores_ = np.array(validation_scores)
        self.training_scores_ = np.array(training_scores)
        self.iterations_ = np.array(iterations)
        # Reported as a magnitude, like the recorded score arrays.  When no
        # validation ran, the initial -inf is kept unchanged.
        self.best_score_ = np.abs(best_score) if iterations else best_score

        if self.verbose:
            print("Done in %s seconds" % (time.time() - start))

    def _fit(self, K, y, Y, n_nonzero_coefs, norms):
        """Dispatch to the validating fit when validation data is set."""
        if self.X_val is not None and self.y_val is not None:
            meth = self._fit_multi_with_validation
        else:
            meth = self._fit_multi
        meth(K, y, Y, n_nonzero_coefs, norms)

    def _post_fit(self):
        """Drop components whose coefficient is zero for every output."""
        if self.metric != "precomputed":
            used_basis = np.sum(self.coef_ != 0, axis=0, dtype=bool)
            self.coef_ = self.coef_[:, used_basis]
            self.components_ = self.components_[used_basis]

    def decision_function(self, X):
        """Return ``K(X, components_) . coef_.T``, undoing target scaling.

        The kernel matrix is scaled with the fitted dictionary scaler when
        ``scale`` is set, and predictions are mapped back through the
        target scaler when ``scale_y`` is set.
        """
        K = pairwise_kernels(X, self.components_, metric=self.metric,
                             filter_params=True, n_jobs=self.n_jobs,
                             **self._kernel_params())
        if self.scale:
            K = self.scaler_.transform(K)

        pred = np.dot(K, self.coef_.T)
        if self.scale_y:
            pred = self.y_scaler_.inverse_transform(pred)
        return pred
def test_transformers():
    """Check that every transformer behaves sensibly on a small data set.

    For each estimator that is a ``TransformerMixin`` the test fits it,
    checks the number of rows returned by ``fit_transform``, compares
    ``fit_transform`` with ``transform`` and verifies that ``transform``
    raises ``ValueError`` on transposed input.  Fit failures are printed and
    collected so that all transformers are exercised before the final
    assertion.
    """
    estimators = all_estimators()
    transformers = [(name, E) for name, E in estimators
                    if issubclass(E, TransformerMixin)]

    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = Scaler().fit_transform(X)
    # Shift so that the smallest value is zero.
    X -= X.min()

    multi_output = (_PLS, PLSCanonical, PLSRegression, CCA, PLSSVD)

    succeeded = True

    for name, Trans in transformers:
        if Trans in dont_test or Trans in meta_estimators:
            continue
        # these don't actually fit the data:
        if Trans in [AdditiveChi2Sampler, Binarizer, Normalizer]:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            trans = Trans()

        if hasattr(trans, 'compute_importances'):
            trans.compute_importances = True

        if Trans is SelectKBest:
            # SelectKBest has a default of k=10
            # which is more feature than we have.
            trans.k = 1

        # fit
        if Trans in multi_output:
            # These estimators are given a two-column target.
            y_ = np.vstack([y, 2 * y + np.random.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        try:
            trans.fit(X, y_)
            X_pred = trans.fit_transform(X, y=y_)
            if isinstance(X_pred, tuple):
                for x_pred in X_pred:
                    assert_equal(x_pred.shape[0], n_samples)
            else:
                assert_equal(X_pred.shape[0], n_samples)
        except Exception as e:
            print(trans)
            print(e)
            print("")
            succeeded = False
            # Fitting failed: ``X_pred`` is unset or left over from a
            # previous transformer, so the transform checks below would
            # either crash or compare against unrelated data.
            continue

        if hasattr(trans, 'transform'):
            if Trans in multi_output:
                X_pred2 = trans.transform(X, y_)
            else:
                X_pred2 = trans.transform(X)
            if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
                for x_pred, x_pred2 in zip(X_pred, X_pred2):
                    assert_array_almost_equal(
                        x_pred, x_pred2, 2,
                        "fit_transform not correct in %s" % Trans)
            else:
                assert_array_almost_equal(
                    X_pred, X_pred2, 2,
                    "fit_transform not correct in %s" % Trans)

            # raises error on malformed input for transform
            assert_raises(ValueError, trans.transform, X.T)

    assert_true(succeeded)
coef[:n_relevant_features] = coef_min + rng.rand(n_relevant_features) # The correlation of our design: variables correlated by blocs of 3 corr = np.zeros((n_features, n_features)) for i in range(0, n_features, block_size): corr[i:i + block_size, i:i + block_size] = 1 - conditionning corr.flat[::n_features + 1] = 1 corr = linalg.cholesky(corr) # Our design X = rng.normal(size=(n_samples, n_features)) X = np.dot(X, corr) # Keep [Wainwright2006] (26c) constant X[:n_relevant_features] /= np.abs( linalg.svdvals(X[:n_relevant_features])).max() X = Scaler().fit_transform(X.copy()) # The output variable y = np.dot(X, coef) y /= np.std(y) # We scale the added noise as a function of the average correlation # between the design and the output variable y += noise_level * rng.normal(size=n_samples) mi = mutual_incoherence(X[:, :n_relevant_features], X[:, n_relevant_features:]) ########################################################################### # Plot stability selection path, using a high eps for early stopping # of the path, to save computation time alpha_grid, scores_path = lasso_stability_path(X, y, random_state=42, eps=0.05)
def normalize(self, data, n=N_COMPONENTS):
    """Return ``data`` as a float array standardised by a fresh ``Scaler``.

    ``n`` is accepted but not used by this method.
    """
    features = np.array(data, dtype='float')
    return Scaler().fit_transform(features)
# Choose the cross-validation iterator named by ``folding``.
if folding == "stratified":
    cv = StratifiedKFold(y, k=n_folds)
elif folding == "kfolding":
    cv = KFold(n=y.shape[0], k=n_folds)
elif folding == "leaveoneout":
    # Leave-one-out uses one fold per sample.
    n_folds[0] = y.shape[0]
    cv = LeaveOneOut(n=y.shape[0])
else:
    # NOTE(review): only a message is printed here; ``cv`` stays unassigned
    # on this path.
    print("unknown crossvalidation method!")

# -- classifier
clf = svm.SVC(kernel="linear", probability=True, C=svm_C)
# -- normalizer
scaler = Scaler()
# -- feature selection
fs = SelectPercentile(f_classif, percentile=fs_n)

print("INITIALIZE RESULTS")
# ``0 ** nan`` evaluates to nan, so ``np.zeros(shape) ** np.nan`` yields
# arrays that start out entirely NaN.
if compute_predict:
    predict = np.zeros([n_splits, n_samples, n_dims, n_dims_tg]) ** np.nan
    predictg = np.zeros([n_splits, n_samplesg, n_dimsg, n_dimsg_tg, n_folds]) ** np.nan
else:
    predict = []
    predictg = []
if compute_probas:
    probas = np.zeros([n_splits, n_samples, n_dims, n_dims_tg, n_classes]) ** np.nan
    probasg = np.zeros([n_splits, n_samplesg, n_dimsg, n_dimsg_tg, n_classes, n_folds]) ** np.nan
k = 10
# Column 0 of ``data`` holds the label; the remaining columns are features.
records = data[:,1:]
labels = data[:,0]

# The first n_train rows are used for training, the next n_val rows for
# validation.
n_train = 35000
#n_val = n - n_train
n_val = 7000
trainset = records[:n_train,:]
trainlabels = labels[:n_train]
#valset = records[n_train:,:]
#vallabels = labels[n_train:,:]
valset = records[n_train:n_train+n_val,:]
vallabels = labels[n_train:n_train+n_val]
n,dim = trainset.shape

# mean centering, stdev normalization and whitening
# Both transforms are fitted on the training set only and then applied
# unchanged to the validation set.
scaler = Scaler()
scaler.fit(trainset)
trainset = scaler.transform(trainset)
valset = scaler.transform(valset)
# n_components=dim keeps the number of columns unchanged.
pca = PCA(n_components=dim,whiten=True)
pca.fit(trainset)
trainset = pca.transform(trainset)
valset = pca.transform(valset)

# Training configuration.
config = Train_config()
config.iterations = 10
config.nonlinearity = 'tanh'
config.batchsize = 50
config.learning_rate = 0.2
config.momentum = 0.7

# NOTE(review): the log file is opened here; no close is visible in this
# part of the script.
log = open('log.txt','w')
def main():
    """Train an extra-trees classifier on stored features and dump it.

    Reads every record of ``Databases\\features.db``, builds a 20-value
    feature vector per record, turns the record score into a two-class
    label (1 below the mean score, 2 otherwise), standardises the features,
    evaluates the classifier on a 2-fold stratified split and writes the
    ``(scaler, clf)`` pair to ``AestheticModel\\aestheticModel.pkl``.
    """
    X =[]
    Y=[]
    featuresDB = Base(os.getcwd()+"\\Databases\\features.db")
    featuresDB.open()
    print "features open"

    # One feature vector per record; f2, f8 and f9 are not used.
    for rec in featuresDB:
        vec = []
        vec.append(rec.f1)
        vec.append(rec.f3)
        vec.append(rec.f4)
        vec.append(rec.f5)
        vec.append(rec.f6)
        vec.append(rec.f7)
        vec.append(rec.f10)
        vec.append(rec.f11)
        vec.append(rec.f12)
        vec.append(rec.f13)
        vec.append(rec.f14)
        vec.append(rec.f15)
        vec.append(rec.f16)
        vec.append(rec.f17)
        vec.append(rec.f18)
        vec.append(rec.f19)
        vec.append(rec.f20)
        vec.append(rec.f21)
        vec.append(rec.f22)
        vec.append(rec.f23)
        X.append(vec)
        Y.append(rec.score)
    print "building classifier"

    # Binarise the scores around their mean: 1 = below the mean, 2 = not
    # below.  ``ybar`` is computed once, before any entry is overwritten.
    Y = np.array(Y)
    ybar = Y.mean()
    for i in range(len(Y)):
        if Y[i]<ybar:
            Y[i]=1
        else:
            Y[i]=2

    # The scaler is fitted on all samples, before the split below.
    scaler = Scaler().fit(X)
    X = scaler.transform(X)
    X= np.array(X)
    Y=np.array(Y)

    # NOTE(review): the original indentation of this loop was not
    # recoverable with certainty; it is laid out here as a per-fold
    # evaluation -- confirm against the original file.
    skf = cross_validation.StratifiedKFold(Y,k=2)
    for train, test in skf:
        X_train, X_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        clf = ExtraTreesClassifier(n_estimators=8,max_depth=None,min_split=1,random_state=0,compute_importances=True)
        # Inner 5-fold cross-validation on the training part only.
        scores = cross_validation.cross_val_score(clf,X_train,y_train,cv=5)
        # The transformed output is discarded; the call is used to fit clf.
        clf.fit_transform(X_train,y_train)
        print "Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
        print clf.feature_importances_
        y_pred =clf.predict(X_test)
        print classification_report(y_test,y_pred)

    # Persist the scaler together with the most recently fitted classifier.
    model=(scaler,clf)
    joblib.dump(model,'AestheticModel\\aestheticModel.pkl')
    print "Done"
from datetime import datetime,timedelta
from errorcurves import ErrorCurves
import numpy as np
from sklearn import mixture
import pandas

# Load the training and test tables; the test ids are split off into ``ids``.
df = pandas.read_csv('TrainingDataset.csv')
df_test = pandas.read_csv('TestDataset.csv')
ids = df_test.pop('id')

outcomes = list()
train_sets = list()

# Columns whose name contains 'Q'.
quants = [i for i in df.columns if 'Q' in i]
df_quants = df[quants]

# Standardise those columns (missing values filled with 0) and assign each
# training row to a mixture component; the assignment is stored as a new
# 'clusters' column.
# NOTE(review): ``Scaler`` is not imported in the visible import lines.
scaler = Scaler()
scaled = scaler.fit_transform(df_quants.fillna(0))
dpgmm = mixture.DPGMM(n_components = 75)
dpgmm.fit(scaled)
clusters = dpgmm.predict(scaled)
df['clusters'] = clusters

# Reference date.
jan1 = datetime(2000,1,1)

# For each of the 12 monthly outcomes keep only the rows whose outcome is
# greater than 0 (rows where it is NaN fail the comparison and are dropped
# as well), and move that outcome column into ``outcomes``.
for i in range(1,13):
    df_i = df[df['Outcome_M'+str(i)]>0]
    outcomes.append(df_i.pop('Outcome_M'+str(i)))
    # Remove the remaining columns whose name contains 'Out'.  The list is
    # built only for its side effect; note that the comprehension variable
    # shares its name with the loop variable.
    [df_i.pop(i) for i in df_i.columns if 'Out' in i]
def load_kernels(
        dataset, tr_norms=['std', 'sqrt', 'L2'], te_norms=['std', 'sqrt', 'L2'],
        analytical_fim=False, pi_derivatives=False, sqrt_nr_descs=False,
        only_train=False, verbose=0, do_plot=False, outfile=None):
    """Build linear train (and test) kernels from normalised video data.

    The train data is loaded with ``load_video_data`` and passed through
    the normalisations listed in ``tr_norms``, in order.  Recognised names
    are ``'std'`` (a new ``Scaler`` is fitted and remembered), ``'sqrt'``,
    ``'sqrt_cnt'`` and ``'L2'``; other names are ignored.  The test data is
    normalised according to ``te_norms``, re-using the fitted scalers in the
    order they were created.

    Parameters
    ----------
    dataset
        Object providing ``get_data(split)``.
    tr_norms, te_norms : list of str
        Normalisation steps for the train and the test data.
    analytical_fim, pi_derivatives, sqrt_nr_descs, verbose
        Forwarded to ``load_video_data``; ``pi_derivatives`` is also passed
        to ``approximate_signed_sqrt`` and ``verbose > 0`` prints shapes.
    only_train : bool
        Return right after the train kernel has been computed.
    do_plot : bool
        Plot the first train vector before and after each step.
    outfile : str or None
        Format string with one ``%s`` slot, filled with ``"train"`` or
        ``"test"``.

    Returns
    -------
    ``(tr_kernel, tr_labels, scalers, tr_data)`` when ``only_train`` is
    set, otherwise ``(tr_kernel, tr_labels, te_kernel, te_labels)``.
    """
    if outfile is not None:
        tr_outfile = outfile % "train"
    else:
        tr_outfile = outfile

    # Load sufficient statistics.
    train_samples, _ = dataset.get_data('train')
    tr_data, tr_counts, tr_labels = load_video_data(
        dataset, train_samples, outfile=tr_outfile,
        analytical_fim=analytical_fim, pi_derivatives=pi_derivatives,
        sqrt_nr_descs=sqrt_nr_descs, verbose=verbose)

    if verbose > 0:
        print("Train data: %dx%d" % tr_data.shape)

    if do_plot:
        plot_fisher_vector(tr_data[0], 'before')

    scalers = []
    for step in tr_norms:
        if step == 'std':
            fitted = Scaler()
            tr_data = fitted.fit_transform(tr_data)
            scalers.append(fitted)
        elif step == 'sqrt':
            tr_data = power_normalize(tr_data, 0.5)
        elif step == 'sqrt_cnt':
            tr_data = approximate_signed_sqrt(
                tr_data, tr_counts, pi_derivatives=pi_derivatives)
        elif step == 'L2':
            tr_data = L2_normalize(tr_data)
        if do_plot:
            plot_fisher_vector(tr_data[0], 'after_%s' % step)

    tr_kernel = np.dot(tr_data, tr_data.T)

    if only_train:
        return tr_kernel, tr_labels, scalers, tr_data

    if outfile is not None:
        te_outfile = outfile % "test"
    else:
        te_outfile = outfile

    # Load sufficient statistics.
    test_samples, _ = dataset.get_data('test')
    te_data, te_counts, te_labels = load_video_data(
        dataset, test_samples, outfile=te_outfile,
        analytical_fim=analytical_fim, pi_derivatives=pi_derivatives,
        sqrt_nr_descs=sqrt_nr_descs, verbose=verbose)

    if verbose > 0:
        print("Test data: %dx%d" % te_data.shape)

    # Index of the next fitted scaler to re-use for a 'std' step.
    scaler_pos = 0
    for step in te_norms:
        if step == 'std':
            te_data = scalers[scaler_pos].transform(te_data)
            scaler_pos += 1
        elif step == 'sqrt':
            te_data = power_normalize(te_data, 0.5)
        elif step == 'sqrt_cnt':
            te_data = approximate_signed_sqrt(
                te_data, te_counts, pi_derivatives=pi_derivatives)
        elif step == 'L2':
            te_data = L2_normalize(te_data)

    te_kernel = np.dot(te_data, tr_data.T)

    return tr_kernel, tr_labels, te_kernel, te_labels
from sklearn.svm import SVC
from sklearn.preprocessing import Scaler
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

# Iris samples and their class labels.
iris_dataset = load_iris()
X = iris_dataset.data
Y = iris_dataset.target

# Scaling the features is usually a good idea before training an SVM.
# Note that this example cheats a little: the scaler is fitted on all of
# the data, rather than on the training set only and then merely applied
# to the test set.
scaler = Scaler()
X = scaler.fit_transform(X)

# A base-10 logarithmic grid is a helpful starting point for a first
# search. A base-2 grid allows finer tuning, at a much higher cost.
C_range = np.power(10., np.arange(-5, 5))
gamma_range = np.power(10., np.arange(-5, 5))
param_grid = {'gamma': gamma_range, 'C': C_range}

# Search the grid with a 5-fold stratified cross-validation.
grid = GridSearchCV(
    SVC(),
    param_grid=param_grid,
    cv=StratifiedKFold(y=Y, k=5),
)
grid.fit(X, Y)
#-- cross-validation scheme
if folding == 'stratified':
    cv = StratifiedKFold(y, k=n_folds)
elif folding == 'kfolding':
    cv = KFold(n=y.shape[0], k=n_folds)
elif folding == 'leaveoneout':
    # NOTE(review): item assignment implies n_folds is a mutable
    # sequence/array here, while it is used as a plain dimension below --
    # confirm the intended type against the caller.
    n_folds[0] = y.shape[0]
    cv = LeaveOneOut(n=y.shape[0])
else:
    # NOTE(review): this branch only reports the problem; it does not
    # assign `cv`.
    print("unknown crossvalidation method!")

#-- classifier
clf = svm.SVC(kernel='linear', probability=True, C=svm_C)
#-- normalizer
scaler = Scaler()
#-- feature selection
fs = SelectPercentile(f_classif, percentile=fs_n)
#-- grid search
#parameters = {'svm__C': (1e-6,1e-3, 1e-1, .4)}
#clf = GridSearchCV(svm, parameters,n_jobs=1)


#-- initialize results
def _nan_array(shape):
    """Return a float array of the given shape with every entry NaN."""
    filled = np.empty(shape)
    filled.fill(np.nan)
    return filled


# Result buffers start as NaN so entries that are never written stay
# recognizable.
predict = _nan_array([n_splits, n_samples, n_dims])
probas = _nan_array([n_splits, n_samples, n_dims, n_classes])
predictg = _nan_array([n_splits, n_samplesg, n_dimsg, n_folds])
probasg = _nan_array([n_splits, n_samplesg, n_dimsg, n_classes, n_folds])
# One coefficient row per pair of classes. Floor division keeps the
# dimension an integer under true division as well; the product of two
# consecutive integers is always even, so no value is lost.
coef = np.ones([n_splits, n_folds, n_dims,
                n_classes * (n_classes - 1) // 2, n_features])
all_folds = _nan_array([n_splits, n_folds, n_samples])
y_shfl = np.copy(y)
# Iris samples and their class labels.
iris = load_iris()
X, Y = iris.data, iris.target

# dataset for decision function visualization: the first two features of
# the samples whose label is > 0, with those labels shifted down by one.
X_2d = X[:, :2][Y > 0]
Y_2d = Y[Y > 0] - 1

# Scaling the features is usually a good idea before training an SVM.
# Note that this example cheats a little: the scaler is fitted on all of
# the data, rather than on the training set only and then merely applied
# to the test set.
# The same scaler object is re-fitted on the 2-d subset by the second call.
scaler = Scaler()
X = scaler.fit_transform(X)
X_2d = scaler.fit_transform(X_2d)

##############################################################################
# Train classifier
#
# A base-10 logarithmic grid is a helpful starting point for a first
# search. A base-2 grid allows finer tuning, at a much higher cost.

C_range = np.power(10.0, np.arange(-2, 9))
gamma_range = np.power(10.0, np.arange(-5, 4))
param_grid = {'gamma': gamma_range, 'C': C_range}