Example #1
 def best_rp_nba(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_nba_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
     X_train_transformed = rp.fit_transform(X_train_scl, y_train)
     X_test_transformed = rp.transform(X_test_scl)
     
     ## top 2
     kurt = kurtosis(X_train_transformed)
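     # order the projected components by descending kurtosis and keep the two
     # most non-Gaussian ones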
     i = kurt.argsort()[::-1]
     X_train_transformed_sorted = X_train_transformed[:, i]
     X_train_transformed = X_train_transformed_sorted[:,0:2]
     
     kurt = kurtosis(X_test_transformed)
     i = kurt.argsort()[::-1]
     X_test_transformed_sorted = X_test_transformed[:, i]
     X_test_transformed = X_test_transformed_sorted[:,0:2]
     
     # save
     filename = './' + self.save_dir + '/nba_rp_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #2
 def rp_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     
     ks = []
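     # each GaussianRandomProjection draws a fresh random matrix, so the
     # projection is repeated many times and the per-component kurtosis is
     # averaged below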
     for i in range(1000):
         ##
         ## Random Projection
         ##
         rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
         rp.fit(X_train_scl)
         X_train_rp = rp.transform(X_train_scl)
         
         ks.append(kurtosis(X_train_rp))
         
     mean_k = np.mean(ks, 0)
         
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     title = 'Kurtosis (Randomized Projection) for ' + data_set_name
     name = data_set_name.lower() + '_rp_kurt'
     filename = './' + self.out_dir + '/' + name + '.png'
     
     ph.plot_simple_bar(np.arange(1, len(mean_k)+1, 1),
                        mean_k,
                        np.arange(1, len(mean_k)+1, 1).astype('str'),
                        'Feature Index',
                        'Kurtosis',
                        title,
                        filename)
Example #3
    def ica_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
        scl = RobustScaler()
        X_train_scl = scl.fit_transform(X_train)
        X_test_scl = scl.transform(X_test)
        
        ##
        ## ICA
        ##
        ica = FastICA(n_components=X_train_scl.shape[1])
        X_ica = ica.fit_transform(X_train_scl)
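        # FastICA drives components toward maximal non-Gaussianity; the
        # kurtosis of each component (plotted below) quantifies this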
        
        ##
        ## Plots
        ##
        ph = plot_helper()

        kurt = kurtosis(X_ica)
        print(kurt)
        
        title = 'Kurtosis (FastICA) for ' + data_set_name
        name = data_set_name.lower() + '_ica_kurt'
        filename = './' + self.out_dir + '/' + name + '.png'
        
        ph.plot_simple_bar(np.arange(1, len(kurt)+1, 1),
                           kurt,
                           np.arange(1, len(kurt)+1, 1).astype('str'),
                           'Feature Index',
                           'Kurtosis',
                           title,
                           filename)
Example #4
 def best_ica_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ica = FastICA(n_components=X_train_scl.shape[1])
     X_train_transformed = ica.fit_transform(X_train_scl, y_train)
     X_test_transformed = ica.transform(X_test_scl)
     
     ## top 2
     kurt = kurtosis(X_train_transformed)
     i = kurt.argsort()[::-1]
     X_train_transformed_sorted = X_train_transformed[:, i]
     X_train_transformed = X_train_transformed_sorted[:,0:2]
     
     kurt = kurtosis(X_test_transformed)
     i = kurt.argsort()[::-1]
     X_test_transformed_sorted = X_test_transformed[:, i]
     X_test_transformed = X_test_transformed_sorted[:,0:2]
     
     # save
     filename = './' + self.save_dir + '/wine_ica_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_ica_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_ica_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_ica_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #5
def scale_feature_matrix(feature_M, linear=False, outliers=False):
    from sklearn.preprocessing import StandardScaler, RobustScaler
    import numpy as np
    
    binary_fields = [col for col in feature_M.columns if len(set(feature_M[col])) == 2]
            
    if outliers:
        # center on the median and scale by the IQR (robust to outliers)
        scaler_obj = RobustScaler()
        print('centering around median')
    else:
        # center on the mean and scale to unit variance
        scaler_obj = StandardScaler()
        print('centering around mean')

    print('found these binaries')
    print('-' * 10)
    print('\n'.join(binary_fields))

    X_scaled = scaler_obj.fit_transform(feature_M.drop(binary_fields, axis=1))
    X_scaled_w_cats = np.c_[X_scaled, feature_M[binary_fields].values]
    
    return X_scaled_w_cats, scaler_obj
Example #6
 def nn_wine_orig(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     self.part4.nn_analysis(X_train_scl, X_test_scl, y_train, y_test, 'Wine', 'Neural Network Original')
Example #7
def standardize_columns(data):
    """
    We decided to standardize the weather factor due to outliers.
    """
    columns_to_standardize = ['temp', 'atemp', 'humidity', 'windspeed']
    robust_scaler = RobustScaler()

    for column in columns_to_standardize:
        # RobustScaler expects a 2-D array, so pass a single-column frame
        data[column] = robust_scaler.fit_transform(data[[column]]).ravel()
    return data
Example #8
 def lda_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     scores = []
     train_scores = []
     rng = range(1, X_train_scl.shape[1]+1)
     for i in rng:
         lda = LinearDiscriminantAnalysis(n_components=i)
         cv = KFold(n_splits=3, shuffle=True)
         
         # cross validation
         cv_scores = []
         for (train, test) in cv.split(X_train_scl):
             lda.fit(X_train_scl[train], y_train[train])
             score = lda.score(X_train_scl[test], y_train[test])
             cv_scores.append(score)
         
         mean_score = np.mean(cv_scores)
         scores.append(mean_score)
         
         # train score
         lda = LinearDiscriminantAnalysis(n_components=i)
         lda.fit(X_train_scl, y_train)
         train_score = lda.score(X_train_scl, y_train)
         train_scores.append(train_score)
         
         print(i, mean_score)
         
     ##
     ## Score Plot
     ##
     title = 'Score Summary Plot (LDA) for ' + data_set_name
     name = data_set_name.lower() + '_lda_score'
     filename = './' + self.out_dir + '/' + name + '.png'
                 
     ph.plot_series(rng,
                    [scores, train_scores],
                    [None, None],
                    ['cross validation score', 'training score'],
                    cm.viridis(np.linspace(0, 1, 2)),
                    ['o', '*'],
                    title,
                    'n_components',
                    'Score',
                    filename)
Example #9
def dimensionReduction(numFeatures, cateFeatures):
    """
    Robustly scale the numeric features, reduce them to 5 principal
    components with PCA, and append the categorical features unchanged.

    :param numFeatures: numeric feature matrix
    :param cateFeatures: categorical feature matrix, concatenated after PCA
    :return: combined feature matrix
    """
    scaler = RobustScaler()
    scaledFeatures = scaler.fit_transform(numFeatures)
    pca = PCA(n_components=5)
    reducedFeatures = pca.fit_transform(scaledFeatures)
    allFeatures = np.concatenate((reducedFeatures,cateFeatures),axis=1)
    return allFeatures
Example #10
 def best_lda_cluster_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ##
     ## K-Means
     ##
     km = KMeans(n_clusters=4, algorithm='full')
     X_train_transformed = km.fit_transform(X_train_scl)
     X_test_transformed = km.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_kmeans_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
     
     ##
     ## GMM
     ##
     gmm = GaussianMixture(n_components=4, covariance_type='full')
     # GaussianMixture has no transform(); use the posterior cluster
     # probabilities as the cluster-space representation
     gmm.fit(X_train_scl)
     X_train_transformed = gmm.predict_proba(X_train_scl)
     X_test_transformed = gmm.predict_proba(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_gmm_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #11
def transform_dataframe(dataframe):

    """
    Function to read dataframe and standardize the dataframe with
    a mean 0 and unit variance on every column

    Parameters:
        dataframe : Input pandas dataframe
    Input types: pd.Dataframe
    Output types: pd.Dataframe

    """
    cols = [col for col in dataframe.columns]
    robust_scaler = RobustScaler()
    df = robust_scaler.fit_transform(dataframe[cols])
    dataframe.columns = df
    return dataframe
Example #12
    def scale(self,columns,categorical_cols,apply_list,target_column):
        from sklearn.preprocessing import RobustScaler
        scaler = RobustScaler()

        if apply_list:
            numerical_cols = columns
        else:
            numerical_cols = []
            for col in self.dataset.columns.values:
                if col not in categorical_cols:
                    numerical_cols.append(col)
                else:
                    pass
        # We don't want to scale the target variable, as it is already binary.
        # The target column uses the same value as target_value from Split Data section
        # in the settings popup.
        numerical_cols.remove(target_column)
        # Scale, fit and transform all the numerical columns
        scaled_data = scaler.fit_transform(self.dataset[numerical_cols])
        self.dataset[numerical_cols] = scaled_data
        return self.dataset
Example #13
def detect_bad_channels(inst, pick_types=None, threshold=.2):
    from sklearn.preprocessing import RobustScaler
    from sklearn.covariance import EmpiricalCovariance
    from jr.stats import median_abs_deviation
    if pick_types is None:
        pick_types = dict(meg='mag')
    inst = inst.pick_types(copy=True, **pick_types)
    cov = EmpiricalCovariance()
    cov.fit(inst._data.T)
    cov = cov.covariance_
    # center
    scaler = RobustScaler()
    cov = scaler.fit_transform(cov).T
    cov /= median_abs_deviation(cov)
    cov -= np.median(cov)
    # compute robust summary metrics
    mu = np.median(cov, axis=0)
    sigma = median_abs_deviation(cov, axis=0)
    mu /= median_abs_deviation(mu)
    sigma /= median_abs_deviation(sigma)
    distance = np.sqrt(mu ** 2 + sigma ** 2)
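    # channels whose combined robust statistic (distance) falls below the
    # threshold are flagged as bad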
    bad = np.where(distance < threshold)[0]
    bad = [inst.ch_names[ch] for ch in bad]
    return bad
Example #14
 def best_pca_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     pca = PCA(n_components=3)
     X_train_transformed = pca.fit_transform(X_train_scl, y_train)
     X_test_transformed = pca.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_pca_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_pca_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_pca_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_pca_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #15
 def best_lda_nba(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_nba_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     lda = LinearDiscriminantAnalysis(n_components=2)
     X_train_transformed = lda.fit_transform(X_train_scl, y_train)
     X_test_transformed = lda.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/nba_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #16
        if end_index > nr:
            end_index = nr
        if start_index > nr:
            end_index = nr+1

    test_size = 0.20
    if pf is True:
        test_size = 0.05
    #X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
    
    X_test = X[0:nr_test]
    Y_test = Y[0:nr_test]
    X_train = X[nr_test+1:len(X)]
    Y_train = Y[nr_test+1:len(X)]
     
    X_train = robust_scaler.fit_transform(X_train)
    
    # save standard scaler
    joblib.dump(robust_scaler, base_path + 'data/rs-' + algorithm + '-' + str(ps[psi]) + '.pkl')    
    
    X_test = robust_scaler.transform(X_test)
    
    if algorithm == 'kernel-approx':
        rbf_feature = RBFSampler(gamma=1, random_state=1)
        X_train = rbf_feature.fit_transform(X_train)
        X_test = rbf_feature.fit_transform(X_test)
    elif algorithm == 'mlp':
        n_output = len(set(Y))
        #n_output = 2460
        n_input = len(X_train[0]) + 1
        n_neurons = int(round(sqrt(n_input*n_output)))
Example #17
 def gmm_analysis(self, X_train, X_test, y_train, y_test, data_set_name, max_clusters, analysis_name='GMM'):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     em_bic = []
     em_aic = []
     em_completeness_score = []
     em_homogeneity_score = []
     em_measure_score = []
     em_adjusted_rand_score = []
     em_adjusted_mutual_info_score = []
     
     cluster_range = np.arange(2, max_clusters+1, 1)
     for k in cluster_range:
         print('K Clusters: ', k)
         
         ##
         ## Expectation Maximization
         ##
         em = GaussianMixture(n_components=k, covariance_type='full')
         em.fit(X_train_scl)
         em_pred = em.predict(X_train_scl)
         
         em_bic.append(em.bic(X_train_scl))
         em_aic.append(em.aic(X_train_scl))        
     
         # metrics
         y_train_score = y_train.reshape(y_train.shape[0],)
         
         em_homogeneity_score.append(homogeneity_score(y_train_score, em_pred))
         em_completeness_score.append(completeness_score(y_train_score, em_pred))
         em_measure_score.append(v_measure_score(y_train_score, em_pred))
         em_adjusted_rand_score.append(adjusted_rand_score(y_train_score, em_pred))
         em_adjusted_mutual_info_score.append(adjusted_mutual_info_score(y_train_score, em_pred))
         
     
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     ##
     ## BIC/AIC Plot
     ##
     title = 'Information Criterion Plot (' + analysis_name + ') for ' + data_set_name
     name = data_set_name.lower() + '_' + analysis_name.lower() + '_ic'
     filename = './' + self.out_dir + '/' + name + '.png'
     
     ph.plot_series(cluster_range,
                 [em_bic, em_aic],
                 [None, None],
                 ['bic', 'aic'],
                 cm.viridis(np.linspace(0, 1, 2)),
                 ['o', '*'],
                 title,
                 'Number of Clusters',
                 'Information Criterion',
                 filename)
     
     ##
     ## Score Plot
     ##
     title = 'Score Summary Plot (' + analysis_name + ') for ' + data_set_name
     name = data_set_name.lower() + '_' + analysis_name.lower() + '_score'
     filename = './' + self.out_dir + '/' + name + '.png'
                 
     ph.plot_series(cluster_range,
                 [em_homogeneity_score, em_completeness_score, em_measure_score, em_adjusted_rand_score, em_adjusted_mutual_info_score],
                 [None, None, None, None, None, None],
                 ['homogeneity', 'completeness', 'measure', 'adjusted_rand', 'adjusted_mutual_info'],
                 cm.viridis(np.linspace(0, 1, 5)),
                 ['o', '^', 'v', '>', '<', '1'],
                 title,
                 'Number of Clusters',
                 'Score',
                 filename)
Example #18
print(df['Amount'].cummin())



original_df = df
# print(df.head(5))

df = df.sample(frac=1, random_state=rand_state)





#SCALING DATA
rob_scaler = RobustScaler()
df['scaled_amount'] = rob_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df['scaled_time'] = rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))
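# each fit_transform call above refits the scaler on a single column, so
# Amount and Time are scaled independently of one another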

df.drop(['Time','Amount'], axis=1, inplace=True)


scaled_amount = df['scaled_amount']
scaled_time = df['scaled_time']

df.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)

#The two features Time and Amount are scaled

Example #19
train["SalePrice"] = np.log1p(train["SalePrice"])

numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

all_data = pandas.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())

#log transform skewed numeric features:
skewness = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
right_skewed_feats = skewness[skewness > 0.5].index
left_skewed_feats = skewness[skewness < -0.5].index
all_data[right_skewed_feats] = np.log1p(all_data[right_skewed_feats])
#all_data[left_skewed_feats] = np.exp(all_data[left_skewed_feats])

scaler = RobustScaler()
all_data[numeric_feats] = scaler.fit_transform(all_data[numeric_feats])

X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train['SalePrice']

linear_model = ElasticNet(alpha=0.001)
linear_model.fit(X_train, y)

svr_model = SVR(kernel='rbf', C=2, epsilon=0.05)
svr_model.fit(X_train, y)

test['SalePrice'] = np.expm1(
    (linear_model.predict(X_test) + svr_model.predict(X_test)) / 2.0)

test.to_csv('kaggle-houses-submission.csv',
Example #20
    def pca_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
        scl = RobustScaler()
        X_train_scl = scl.fit_transform(X_train)
        X_test_scl = scl.transform(X_test)
        
        ##
        ## PCA
        ##
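        # keep every principal component so the full explained-variance and
        # eigenvalue spectra can be plotted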
        pca = PCA(n_components=X_train_scl.shape[1], svd_solver='full')
        X_pca = pca.fit_transform(X_train_scl)
        
        ##
        ## Plots
        ##
        ph = plot_helper()
        
        ##
        ## Explained Variance Plot
        ##
        title = 'Explained Variance (PCA) for ' + data_set_name
        name = data_set_name.lower() + '_pca_evar_err'
        filename = './' + self.out_dir + '/' + name + '.png'        
        self.plot_explained_variance(pca, title, filename)

        ##
        ## Reconstruction Error
        ##
        all_mses, rng = self.reconstruction_error(X_train_scl, PCA)
        
        title = 'Reconstruction Error (PCA) for ' + data_set_name
        name = data_set_name.lower() + '_pca_rec_err'
        filename = './' + self.out_dir + '/' + name + '.png'
        ph.plot_series(rng,
                    [all_mses.mean(0)],
                    [all_mses.std(0)],
                    ['mse'],
                    ['red'],
                    ['o'],
                    title,
                    'Number of Features',
                    'Mean Squared Error',
                    filename)
        
        
        ##
        ## Manually compute eigenvalues
        ## 
        cov_mat = np.cov(X_train_scl.T)
        eigen_values, eigen_vectors = np.linalg.eig(cov_mat)
        print(eigen_values)
        sorted_eigen_values = sorted(eigen_values, reverse=True)

        title = 'Eigen Values (PCA) for ' + data_set_name
        name = data_set_name.lower() + '_pca_eigen'
        filename = './' + self.out_dir + '/' + name + '.png'
        
        ph.plot_simple_bar(np.arange(1, len(sorted_eigen_values)+1, 1),
                           sorted_eigen_values,
                           np.arange(1, len(sorted_eigen_values)+1, 1).astype('str'),
                           'Principal Components',
                           'Eigenvalue',
                           title,
                           filename)
        
        ## TODO Factor this out to new method
        ##
        ## Scatter
        ##
        '''
Example #21
def robust_scaler(data):
    scaler = RobustScaler()
    data = scaler.fit_transform(data)

    return data
Example #22
test['Date'] = test['Date'].astype('datetime64[D]')
add_datepart(test, 'Date')


# recombine tfidf values with other data
train_y = np.array(train['Label'].values)
test_y = np.array(test['Label'].values)

temp_train = train.drop(['Label', 'Text'], axis=1)
train_x = np.append(train_dense, temp_train.values, axis=1)
temp_test = test.drop(['Label', 'Text'], axis=1)
test_x = np.append(test_dense, temp_test.values, axis=1)
feature_names = list(temp_train.columns)
feature_names.extend(words)

# scale features and labels with a robust (median/IQR) scaler; note this does not make them Gaussian
scaler = RobustScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)
train_y = scaler.fit_transform(train_y.reshape(-1, 1))
test_y = scaler.transform(test_y.reshape(-1, 1))

final_train = pd.DataFrame(train_x, columns=feature_names)
final_train['##Label##'] = train_y
final_train = final_train.astype('float16')
final_train.to_hdf('Data/final_busfin_train_' + years + '.h5', key='train')
final_test = pd.DataFrame(test_x, columns=feature_names)
final_test['##Label##'] = test_y
final_test = final_test.astype('float16')
final_test.to_hdf('Data/final_busfin_test_' + years + '.h5', key='test')
Example #23
    overfit = list(overfit)
    return overfit


overfitted_features = overfit_reducer(X)

X.drop(overfitted_features, axis=1, inplace=True)
test.drop(overfitted_features, axis=1, inplace=True)
print('X.shape', X.shape)
print('test.shape', test.shape)

std_scaler = StandardScaler()
rbst_scaler = RobustScaler()
power_transformer = PowerTransformer()
X_std = std_scaler.fit_transform(X)
X_rbst = rbst_scaler.fit_transform(X)
X_pwr = power_transformer.fit_transform(X)

test_std = std_scaler.transform(test)
test_rbst = rbst_scaler.transform(test)
test_pwr = power_transformer.transform(test)

X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                    y,
                                                    test_size=0.002,
                                                    random_state=52)
print('X_train Shape :', X_train.shape)
print('X_test Shape :', X_test.shape)
print('y_train Shape :', y_train.shape)
print('y_test Shape :', y_test.shape)
Example #24
 def kmeans_analysis(self, X_train, X_test, y_train, y_test, data_set_name, max_clusters, analysis_name='K-Means'):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     km_inertias = []
     km_completeness_score = []
     km_homogeneity_score = []
     km_measure_score = []
     km_adjusted_rand_score = []
     km_adjusted_mutual_info_score = []
     
     cluster_range = np.arange(2, max_clusters+1, 1)
     for k in cluster_range:
         print('K Clusters: ', k)
         ##
         ## KMeans
         ##
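         # algorithm='full' runs the classic EM-style (Lloyd) iteration rather
         # than the Elkan variant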
         km = KMeans(n_clusters=k, algorithm='full', n_jobs=-1)
         km.fit(X_train_scl)
         
         # inertia is the sum of distances from each point to its center   
         km_inertias.append(km.inertia_)
         
         # metrics
         y_train_score = y_train.reshape(y_train.shape[0],)
         
         km_homogeneity_score.append(homogeneity_score(y_train_score, km.labels_))
         km_completeness_score.append(completeness_score(y_train_score, km.labels_))
         km_measure_score.append(v_measure_score(y_train_score, km.labels_))
         km_adjusted_rand_score.append(adjusted_rand_score(y_train_score, km.labels_))
         km_adjusted_mutual_info_score.append(adjusted_mutual_info_score(y_train_score, km.labels_))
         
         ##
         ## Silhouette Plot
         ##
         title = 'Silhouette Plot (' + analysis_name + ', k=' + str(k) + ') for ' + data_set_name
         name = data_set_name.lower() + '_' + analysis_name.lower() + '_silhouette_' + str(k)
         filename = './' + self.out_dir + '/' + name + '.png'
         
         self.silhouette_plot(X_train_scl, km.labels_, title, filename)
         
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     ##
     ## Elbow Plot
     ##
     title = 'Elbow Plot (' + analysis_name + ') for ' + data_set_name
     name = data_set_name.lower() + '_' + analysis_name.lower() + '_elbow'
     filename = './' + self.out_dir + '/' + name + '.png'
     
     # line to help visualize the elbow
     lin = ph.extended_line_from_first_two_points(km_inertias, 0, 2)
     
     ph.plot_series(cluster_range,
                 [km_inertias, lin],
                 [None, None],
                 ['inertia', 'projected'],
                 cm.viridis(np.linspace(0, 1, 2)),
                 ['o', ''],
                 title,
                 'Number of Clusters',
                 'Inertia',
                 filename)
     
     ##
     ## Score Plot
     ##
     title = 'Score Summary Plot (' + analysis_name + ') for ' + data_set_name
     name = data_set_name.lower() + '_' + analysis_name.lower() + '_score'
     filename = './' + self.out_dir + '/' + name + '.png'
                 
     ph.plot_series(cluster_range,
                 [km_homogeneity_score, km_completeness_score, km_measure_score, km_adjusted_rand_score, km_adjusted_mutual_info_score],
                 [None, None, None, None, None, None],
                 ['homogeneity', 'completeness', 'measure', 'adjusted_rand', 'adjusted_mutual_info'],
                 cm.viridis(np.linspace(0, 1, 5)),
                 ['o', '^', 'v', '>', '<', '1'],
                 title,
                 'Number of Clusters',
                 'Score',
                 filename)
Example #25
def least_square_reference(
    inst, empty_room=None, max_times_samples=2000, bad_channels=None, scaler=None, mrk=None, elp=None, hsp=None
):
    """
    Fits and applies Least Square projection of the reference channels
    (potentially from an empty room) and removes the corresponding component
    from the recordings of a subject.

    Parameters
    ----------
        inst : Raw | str
            Raw instance or path to raw data.
        empty_room : str | None
            Path to raw data acquired in empty room.
        max_times_samples : int
            Number of time sample to use for pinv. Defautls to 2000
        bad_channels : list | array, shape (n_chans) of strings
            Lists bad channels
        scaler : function | None
            Scaler functions to normalize data. Defaults to
            sklearn.preprocessing.RobustScaler.

    Returns
    -------
        inst : Raw

    adapted from Adeen Flinker 6/2013 (<*****@*****.**>) LSdenoise.m

    Main EHN
        - Automatically detects channel types.
        - Allows flexible scaler; Robust by default.
        - The data is projected back in Tesla.
        - Allows memory control.
    TODO:
        - Allow other kind of MNE-Python inst
        - Allow baseline selection (pre-stim instead of empty room)
        - Clean up memory
        - Allow fancy solver (l1, etc)
    """
    from scipy.linalg import pinv
    from mne.io import read_raw_kit
    from mne.io import _BaseRaw

    # Least square can be fitted on empty room or on subject's data
    if empty_room is None:
        if not isinstance(inst, _BaseRaw):
            raw = read_raw_kit(inst, preload=True)
        else:
            raw = inst
    else:
        if not isinstance(empty_room, _BaseRaw):
            raw = read_raw_kit(empty_room, preload=True)
        else:
            raw = empty_room

    # Parameters
    n_chans, n_times = raw._data.shape
    chan_info = raw.info["chs"]

    # KIT: axial gradiometers (equiv to mag)
    ch_mag = np.where([ch["coil_type"] == 6001 for ch in chan_info])[0]
    # KIT: ref magnetometer
    ch_ref = np.where([ch["coil_type"] == 6002 for ch in chan_info])[0]
    # Other channels
    ch_misc = np.where([ch["coil_type"] not in [6001, 6002] for ch in chan_info])[0]
    # Bad channel
    ch_bad = np.empty(0)
    if (bad_channels is not None) and len(bad_channels):
        if np.all([isinstance(ch, int) for ch in bad_channels]):
            bad_channels = np.array(bad_channels)
        elif np.all([isinstance(ch, str) for ch in bad_channels]):
            bad_channels = [ii for ii, ch in enumerate(raw.ch_names) if ch in bad_channels]
        else:
            raise ValueError("bad_channels needs array of int or array of str")
    else:
        bad_channels = []
    default_bad_channels = [ii for ii, ch in enumerate(raw.ch_names) if ch in raw.info["bads"]]
    bad_channels = np.array(default_bad_channels + bad_channels, int)

    print("bad channels:", [raw.ch_names[bad] for bad in bad_channels])
    # To avoid memory error, let's subsample across time
    sel_times = slice(0, n_times, int(np.ceil(n_times // max_times_samples)))

    # Whiten data
    if scaler is None:
        from sklearn.preprocessing import RobustScaler

        scaler = RobustScaler()
    data_bsl = scaler.fit_transform(raw._data.T)

    # Fit Least Square coefficients on baseline data
    empty_sensors = data_bsl[:, ch_mag]
    if len(ch_bad):
        empty_sensors[:, ch_bad] = 0  # remove bad channels
    coefs = np.dot(pinv(data_bsl[sel_times, ch_ref]), empty_sensors[sel_times, :])
    empty_sensors, data_bsl = None, None  # clear memory

    # Apply correction on subject data
    if empty_room is not None:
        del raw
        raw = read_raw_kit(inst, preload=True)

    data_subject = scaler.transform(raw._data.T)
    subject_sensors = data_subject[:, ch_mag] - np.dot(data_subject[:, ch_ref], coefs)

    # Remove bad channels
    if len(ch_bad):
        subject_sensors[:, ch_bad] = 0

    # Reproject baseline
    new_ref = np.dot(subject_sensors, pinv(coefs))

    # Un-whiten data to get physical units back
    data = np.concatenate((subject_sensors, new_ref, raw._data[ch_misc, :].T), axis=1)
    data = scaler.inverse_transform(data)

    # Output
    raw._data = data.T
    return raw
Example #26
from sklearn.svm import SVR
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler


path = "/Users/xiaofeifei/I/Kaggle/Benz/"
train = pd.read_csv(path+'train_start.csv')
# test = pd.read_csv(path+'test_start.csv')

y = train["y"]

train = train.drop(["y"], axis = 1)

# # poly
svm = SVR(kernel='rbf', C=1.0, epsilon=0.05)

a = RobustScaler()
train = a.fit_transform(train, y)
kr = GridSearchCV(SVR(kernel='rbf', C=1.0, epsilon=0.05), cv=5, n_jobs = 6,verbose=1,scoring='r2',
                  param_grid={"C": [20,30],
                              "epsilon": [0.02,0.03,0.05,0.07]})

kr.fit(train, y)
print(kr.best_params_)
print(kr.best_score_)
print(kr.best_estimator_)

# {'epsilon': 0.01, 'C': 30}
# 0.536811148843
Example #27
X1 = np.random.multivariate_normal(mean=mu1, cov=Cov, size=n_datapoints)
X2 = np.random.multivariate_normal(mean=mu2, cov=Cov, size=n_datapoints)
Y_test = np.hstack([[-1]*n_datapoints, [1]*n_datapoints])
X_test = np.vstack([X1, X2])

X_train[0, 0] = -1000  # a fairly large outlier


# Scale data
standard_scaler = StandardScaler()
Xtr_s = standard_scaler.fit_transform(X_train)
Xte_s = standard_scaler.transform(X_test)

robust_scaler = RobustScaler()
Xtr_r = robust_scaler.fit_transform(X_train)
Xte_r = robust_scaler.transform(X_test)  # use the scaler fitted on the training data


# Plot data
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(X_train[:, 0], X_train[:, 1],
              color=np.where(Y_train > 0, 'r', 'b'))
ax[1].scatter(Xtr_s[:, 0], Xtr_s[:, 1], color=np.where(Y_train > 0, 'r', 'b'))
ax[2].scatter(Xtr_r[:, 0], Xtr_r[:, 1], color=np.where(Y_train > 0, 'r', 'b'))
ax[0].set_title("Unscaled data")
ax[1].set_title("After standard scaling (zoomed in)")
ax[2].set_title("After robust scaling (zoomed in)")
# for the scaled data, we zoom in to the data center (outlier can't be seen!)
for a in ax[1:]:
    a.set_xlim(-3, 3)
Example #28
def choose_geometries(list_of_molecules,
                      features='fingerprint',
                      maximum_number_of_seeds=8):
    if len(list_of_molecules) < 2:
        cluster_logger.info(
            "    Not enough data to cluster (only %d), returning original" %
            len(list_of_molecules))
        return list_of_molecules

    if len(list_of_molecules) <= maximum_number_of_seeds:
        cluster_logger.info('    Not enough data for clustering. '
                            '    Removing similar geometries from the list')
        return remove_similar(list_of_molecules)

    cluster_logger.info('Clustering on {} geometries'.format(
        len(list_of_molecules)))

    if features == 'fingerprint':
        dt = [
            pyar.representations.fingerprint(i.atoms_list, i.coordinates)
            for i in list_of_molecules
        ]
    elif features == 'scm':
        dt = [
            pyar.representations.sorted_coulomb_matrix(
                pyar.representations.coulomb_matrix(i.atoms_list,
                                                    i.coordinates))
            for i in list_of_molecules
        ]
    elif features == 'moi':
        dt = [
            pyar.property.get_principal_axes(i.moments_of_inertia_tensor)
            for i in list_of_molecules
        ]
    elif features == 'rsmd':
        dt = [
            pyar.representations.get_rsmd(i.moments_of_inertia_tensor)
            for i in list_of_molecules
        ]
    else:
        cluster_logger.error('This feature is not implemented')
        return list_of_molecules

    dt = np.around(dt, decimals=5)

    df = pd.DataFrame(dt)
    df.to_csv("features.csv")

    scale_it = RobustScaler()
    dt = scale_it.fit_transform(dt)

    try:
        labels = generate_labels(dt)
    except Exception as e:
        cluster_logger.exception("All Clustering algorithms failed")
        cluster_logger.exception(e)
        return list_of_molecules

    best_from_each_cluster = select_best_from_each_cluster(
        labels, list_of_molecules)

    if len(best_from_each_cluster) == 1:
        return best_from_each_cluster
    else:
        cluster_logger.info("    Removing similar molecules after clustering.")
        reduced_best_from_each_cluster = remove_similar(best_from_each_cluster)

    if len(reduced_best_from_each_cluster) > maximum_number_of_seeds:
        return choose_geometries(
            reduced_best_from_each_cluster,
            maximum_number_of_seeds=maximum_number_of_seeds)
    else:
        return reduced_best_from_each_cluster
Example #29
devtest='./exp/ivectors_semeval_devtest_NGMM_2048_W_2_DIM_200/feats.txt'
dev='./exp/ivectors_semeval_dev_NGMM_2048_W_2_DIM_200/feats.txt'
train='./exp/ivectors_semeval_train_NGMM_2048_W_2_DIM_200/feats.txt'



trainy,trainx=imdb_bag_of_word_libs.loadFeatsText(train)
trainy=imdb_bag_of_word_libs.kaldiID_2_LB(trainy)
evaly,evalx=imdb_bag_of_word_libs.loadFeatsText(dev)
evaly=imdb_bag_of_word_libs.kaldiID_2_LB(evaly)

evaly2,evalx2=imdb_bag_of_word_libs.loadFeatsText(devtest)
evaly2=imdb_bag_of_word_libs.kaldiID_2_LB(evaly2)


robust_scaler = RobustScaler()
trainx=robust_scaler.fit_transform(trainx)
evalx=robust_scaler.transform(evalx)

clf= LinearDiscriminantAnalysis() #
clf.fit(trainx,trainy)
predictValue=clf.predict(evalx)

print(semeval2016_libs.scoreSameOrder(predictValue, configure.SCORE_REF_DEV))

evalx2=robust_scaler.transform(evalx2)
predictValue=clf.predict(evalx2)


print(semeval2016_libs.scoreSameOrder(predictValue, configure.SCORE_REF_DEVTEST))
Example #30
plt.subplot(3, 2, 2)
scaler1 = MinMaxScaler()
X_new = scaler1.fit_transform(X)
plt.scatter(X_new[:, 0], X_new[:, 1])

plt.subplot(3, 2, 3)
scaler2 = MaxAbsScaler()
X_new2 = scaler2.fit_transform(X)
plt.scatter(X_new2[:, 0], X_new2[:, 1])

plt.subplot(3, 2, 4)
scaler3 = StandardScaler()
X_new3 = scaler3.fit_transform(X)
plt.scatter(X_new3[:, 0], X_new3[:, 1])
plt.xlim(-2, 2)
plt.ylim(-2, 2)

plt.subplot(3, 2, 5)
scaler4 = RobustScaler()
X_new4 = scaler4.fit_transform(X)
plt.scatter(X_new4[:, 0], X_new4[:, 1])
plt.xlim(-1, 1)
plt.ylim(-1, 1)

plt.subplot(3, 2, 6)
scaler5 = Normalizer()
X_new5 = scaler5.fit_transform(X)
plt.scatter(X_new5[:, 0], X_new5[:, 1])
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.show()
Example #31
    with open('Metro_Interstate_Traffic_Volume.csv.gz', mode='wb') as file:
        file.write(req.content)

x_scaler = StandardScaler()
y_scaler = RobustScaler(quantile_range=(.1, .9))
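# note: RobustScaler's quantile_range is given in percentiles (0-100), so
# (.1, .9) scales by the spread between the 0.1th and 0.9th percentiles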

df = pd.read_csv('Metro_Interstate_Traffic_Volume.csv.gz',
                 compression='gzip',
                 parse_dates=['date_time'])
df = pd.concat((df, parse_date(df['date_time'])), axis=1)

x_train = x_scaler.fit_transform(df[[
    'year', 'month_x', 'month_y', 'weekday_x', 'weekday_y', 'day_x', 'day_y',
    'hour_x', 'hour_y'
]])
y_train = y_scaler.fit_transform(df[['traffic_volume']])
x_train, x_test, y_train, y_test = train_test_split(x_train,
                                                    y_train,
                                                    test_size=.1,
                                                    shuffle=False)

x_train = torch.tensor(x_train).float().to(device=device)
x_test = torch.tensor(x_test).float().to(device=device)
y_train = torch.tensor(y_train).float().to(device=device)
y_train = y_train.view((-1, 1))

net = MDN(x_train.shape[1]).to(device=device)
net.zero_grad()

learning_rate = 1e-3
epochs = 5000
Example #32
colors = ["red","green"]
mapper = CategoricalColorMapper(factors = factors,palette = colors)
p.circle('suicides_no', 'population', size=4, source=source,
    legend='sex', fill_alpha=0.2, color = {"field":"sex","transform":mapper})

show(p)'''
#https://pythonspot.com/3d-scatterplot/
#https://matplotlib.org/gallery/mplot3d/scatter3d.html
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from sklearn.preprocessing import RobustScaler
#https://www.kaggle.com/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets
rob_scaler = RobustScaler()

wh['scaled_population'] = rob_scaler.fit_transform(
    wh['population'].values.reshape(-1, 1))
wh['scaled_suicides_no'] = rob_scaler.fit_transform(
    wh['suicides_no'].values.reshape(-1, 1))

# Create plot
fig = plt.figure()
#ax = fig.add_subplot(1, 1, 1)
#ax = fig.gca(projection='3d')
ax = fig.add_subplot(111, projection='3d')
#ax.set_xscale('log')
wh = wh[wh.suicides_no > 0]
ax.scatter(wh[wh.sex == 'female'].scaled_population,
           wh[wh.sex == 'female'].year,
           wh[wh.sex == 'female'].scaled_suicides_no,
           alpha=0.2,
           c="red",
Example #33
    plt.show()

    # Scale data using a Robust Scaler
    fig, ax = plt.subplots(2, 2, figsize=(8, 8))

    ax[0, 0].scatter(data[:, 0], data[:, 1])
    ax[0, 0].set_xlim([-10, 10])
    ax[0, 0].set_ylim([-10, 10])
    ax[0, 0].grid()
    ax[0, 0].set_xlabel('X')
    ax[0, 0].set_ylabel('Y')
    ax[0, 0].set_title('Raw data')

    rs = RobustScaler(quantile_range=(15, 85))
    scaled_data = rs.fit_transform(data)

    ax[0, 1].scatter(scaled_data[:, 0], scaled_data[:, 1])
    ax[0, 1].set_xlim([-10, 10])
    ax[0, 1].set_ylim([-10, 10])
    ax[0, 1].grid()
    ax[0, 1].set_xlabel('X')
    ax[0, 1].set_ylabel('Y')
    ax[0, 1].set_title('Scaled data (15% - 85%)')

    rs1 = RobustScaler(quantile_range=(25, 75))
    scaled_data1 = rs1.fit_transform(data)

    ax[1, 0].scatter(scaled_data1[:, 0], scaled_data1[:, 1])
    ax[1, 0].set_xlim([-10, 10])
    ax[1, 0].set_ylim([-10, 10])
        "hematocrit": row["hematocrit_apache"],
        "wbc": row["wbc_apache"],
    }
    return np.sum([calculate_single_scores(v,k) for k,v in cols.items()])


df["apacheScore"] = df.apply(getAPACHEScore , axis=1)

df["apacheScore"].describe()

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer

rs = RobustScaler()
pt = PowerTransformer()
df.loc[:,numeric_cols] = rs.fit_transform(df.loc[:,numeric_cols])
df.loc[:,numeric_cols] = pt.fit_transform(df.loc[:,numeric_cols])

ndf = df.copy()

cat_cols_minus = [c for c in cat_cols if c not in ["clusterId","hospital_death", "encounter_id" , "hospital_id" , "patient_id"]]
cat_cols_minus_useless = [c for c in cat_cols if c not in ["clusterId", "encounter_id" , "hospital_id" , "patient_id" , "icu_id" ]]
#df
#pcadf = pcadf.join(df[cat_cols_minus_useless])

#ndf = pcadf

cols_to_dummy = [c for c in cat_cols_minus_useless if c != "hospital_death"]
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False , handle_unknown='ignore')
endcodedNdf = ohe.fit_transform(ndf.loc[:,cols_to_dummy])
Example #35
# Feature scaling to get more accurate representation and better learning performance
'''
Most machine learning algorithms take into account only the magnitude of the measurements, not the units of those measurements.
The feature with a very high magnitude (number) may affect the prediction a lot more than an equally important feature.
e.g. the AGE (within certain fixed range) and the PAY_AMTn (monetary) features have very different ranges of values

RobustScaler:
The Robust Scaler uses statistics that are robust to outliers.
This usage of interquartiles means that they focus on the parts where the bulk of the data is.
This makes them very suitable for working with outliers.
Notice that after Robust scaling, the distributions are brought into the same scale and overlap, but the outliers remain outside of bulk of the new distributions.
'''
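# A minimal illustration (assumed demo values, not part of the original pipeline):
# RobustScaler centers on the median and scales by the IQR, so a single large
# outlier barely affects how the bulk of the data is scaled.
_demo = [[1.0], [2.0], [3.0], [4.0], [1000.0]]
print(RobustScaler().fit_transform(_demo).ravel())  # bulk maps near 0; the outlier stays extreme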
x = df.drop('default', axis=1)
robust_scaler = RobustScaler()
x = robust_scaler.fit_transform(x)  # rescale all the features to a same range
y = df['default']
# stratify parameter makes data split in a stratified fashion meaning the proportion of values in the sample produced will be the same as the proportion of values provided to parameter stratify
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=123,
                                                    stratify=y)

# In[ ]:


def c_matrix(CM, labels=['pay', 'default']):
    df = pd.DataFrame(data=CM, index=labels, columns=labels)
    df.index.name = 'TRUE'
    df.columns.name = 'PREDICTION'
Example #36
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()
######################



########KNN
# splitting the dataset into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xnegativo, ynegativo, test_size = 0.30)

#Feature Scaling
from sklearn.preprocessing import RobustScaler
sc = RobustScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=11)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# accuracy
from sklearn.metrics import accuracy_score
Example #37
    def go(self, all_data, cols, colsP):
        train = all_data.loc[(all_data.SalePrice > 0),
                             cols].reset_index(drop=True, inplace=False)
        y_train = all_data.SalePrice[all_data.SalePrice > 0].reset_index(
            drop=True, inplace=False)
        test = all_data.loc[(all_data.SalePrice == 0),
                            cols].reset_index(drop=True, inplace=False)
        # Main script here
        scale = RobustScaler()
        df = pd.DataFrame(scale.fit_transform(train[cols]), columns=cols)
        #select features based on P values
        ln_model = sm.OLS(y_train, df)
        result = ln_model.fit()
        print(result.summary2())

        pv_cols = cols.values
        SL = 0.051
        pv_cols, LR = self.backwardElimination(df, y_train, SL, pv_cols)

        pred = LR.predict(df[pv_cols])

        y_pred = pred.apply(lambda x: 1 if x > 0.5 else 0)

        print('Fvalue: {:.6f}'.format(LR.fvalue))
        print('MSE total on the train data: {:.4f}'.format(LR.mse_total))

        ls = Lasso(alpha=0.0005,
                   max_iter=161,
                   selection='cyclic',
                   tol=0.002,
                   random_state=101)
        rfecv = RFECV(estimator=ls,
                      n_jobs=-1,
                      step=1,
                      scoring='neg_mean_squared_error',
                      cv=5)
        rfecv.fit(df, y_train)

        select_features_rfecv = rfecv.get_support()
        RFEcv = cols[select_features_rfecv]
        print('{:d} Features Select by RFEcv:\n{:}'.format(
            rfecv.n_features_, RFEcv.values))

        score = r2_score
        ls = Lasso(alpha=0.0005,
                   max_iter=161,
                   selection='cyclic',
                   tol=0.002,
                   random_state=101)
        sbs = SequentialFeatureSelection(ls, k_features=1, scoring=score)
        sbs.fit(df, y_train)

        print('Best Score: {:2.2%}\n'.format(max(sbs.scores_)))
        print('Best score with:{0:2d}.\n'.\
           format(len(list(df.columns[sbs.subsets_[np.argmax(sbs.scores_)]]))))
        SBS = list(df.columns[list(sbs.subsets_[max(
            np.arange(0,
                      len(sbs.scores_))[(sbs.scores_ == max(sbs.scores_))])])])
        print('\nBest score with {0:2d} features:\n{1:}'.format(len(SBS), SBS))

        skb = SelectKBest(score_func=f_regression, k=80)
        skb.fit(df, y_train)
        select_features_kbest = skb.get_support()
        kbest_FR = cols[select_features_kbest]
        scores = skb.scores_[select_features_kbest]

        skb = SelectKBest(score_func=mutual_info_regression, k=80)
        skb.fit(df, y_train)
        select_features_kbest = skb.get_support()
        kbest_MIR = cols[select_features_kbest]
        scores = skb.scores_[select_features_kbest]

        X_train, X_test, y, y_test = train_test_split(df,
                                                      y_train,
                                                      test_size=0.30,
                                                      random_state=101)

        # fit model on all training data
        #importance_type='gain'
        model = XGBRegressor(base_score=0.5,
                             colsample_bylevel=1,
                             colsample_bytree=1,
                             gamma=0,
                             max_delta_step=0,
                             random_state=101,
                             min_child_weight=1,
                             missing=None,
                             n_jobs=4,
                             scale_pos_weight=1,
                             seed=None,
                             silent=True,
                             subsample=1)

        model.fit(X_train, y)

        # Using each unique importance as a threshold
        thresholds = np.sort(np.unique(model.feature_importances_))
        best = 1e36
        colsbest = 31
        my_model = model
        threshold = 0

        for thresh in thresholds:
            # select features using threshold
            selection = SelectFromModel(model, threshold=thresh, prefit=True)
            select_X_train = selection.transform(X_train)
            # train model
            selection_model = XGBRegressor(base_score=0.5,
                                           colsample_bylevel=1,
                                           colsample_bytree=1,
                                           gamma=0,
                                           max_delta_step=0,
                                           random_state=101,
                                           min_child_weight=1,
                                           missing=None,
                                           n_jobs=4,
                                           scale_pos_weight=1,
                                           seed=None,
                                           silent=True,
                                           subsample=1)
            selection_model.fit(select_X_train, y)
            # eval model
            select_X_test = selection.transform(X_test)
            y_pred = selection_model.predict(select_X_test)
            predictions = [round(value) for value in y_pred]
            r2 = r2_score(y_test, predictions)
            mse = mean_squared_error(y_test, predictions)
            print(
                "Thresh={:1.3f}, n={:d}, R2: {:2.2%} with MSE: {:.4f}".format(
                    thresh, select_X_train.shape[1], r2, mse))
            if (best >= mse):
                best = mse
                colsbest = select_X_train.shape[1]
                my_model = selection_model
                threshold = thresh

        feature_importances = [
            (score, feature)
            for score, feature in zip(model.feature_importances_, cols)
        ]
        XGBest = pd.DataFrame(sorted(
            sorted(feature_importances, reverse=True)[:colsbest]),
                              columns=['Score', 'Feature'])
        XGBestCols = XGBest.iloc[:, 1].tolist()

        bcols = set(pv_cols).union(set(RFEcv)).union(set(kbest_FR)).union(
            set(kbest_MIR)).union(set(XGBestCols)).union(set(SBS))
        intersection = set(SBS).intersection(set(kbest_MIR)).intersection(
            set(RFEcv)).intersection(set(pv_cols)).intersection(
                set(kbest_FR)).intersection(set(XGBestCols))
        print(intersection, '\n')
        print('_' * 75, '\nUnion All Features Selected:')
        print('Total number of features selected:', len(bcols))
        print('\n{0:2d} features removed if use the union of selections: {1:}'.
              format(len(cols.difference(bcols)), cols.difference(bcols)))

        totalCols = list(bcols.union(set(colsP)))
        #self.trainingData = self.trainingData.loc[list(totalCols)].reset_index(drop=True, inplace=False)
        #self.testingData = self.testingData.loc[list(totalCols)].reset_index(drop=True, inplace=False)
        #self.combinedData = [self.trainingData, self.testingData]

        return DataObject(self.trainingData, self.testingData,
                          self.combinedData), totalCols, RFEcv, XGBestCols
Example #38
y_feature = 'deferral_payments'

for point, poi in zip(features, labels):
    x = point[features_list.index(x_feature) - 1]
    y = point[features_list.index(y_feature) - 1]
    color = 'red' if poi else 'blue'
    matplotlib.pyplot.scatter(x, y, c=color)

matplotlib.pyplot.xlabel(x_feature)
matplotlib.pyplot.ylabel(y_feature)
matplotlib.pyplot.savefig(x_feature + '_' + y_feature + '.png')

from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
features = scaler.fit_transform(features)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [5, 10, None],
    'min_samples_split': [1, 2, 5, 10]
}
Example #39
    test = test.join(one_hot2)
df.columns

features = list(num_col) + list(catOneHot_col)

# try xgboost with cross-validation
x_train = df[list(features)].values
y_train = df["SPEED_AVG"].values

# take the features and the target variable from the test dataset
x_test = test[list(features)].values
y_test = test["SPEED_AVG"].values

#scaling
scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

gb = XGBRegressor(learning_rate=0.1,
                  n_estimators=2000,
                  max_depth=5,
                  min_child_weight=1,
                  gamma=0,
                  subsample=0.8,
                  colsample_bytree=0.8,
                  objective='reg:gamma',
                  nthread=8,
                  scale_pos_weight=1,
                  seed=27)

# In case you want to use cross-validation
Example #40
df_non_clicks = df.loc[df['clicks'] == 0][:number_of_clicks]
df_balanced = pd.concat([df_clicks, df_non_clicks])
#%%
#Encoding categorical data using the "hashing trick"

vectorizer = FeatureHasher(n_features=2**25, input_type='string')
invent_src = vectorizer.fit_transform(df_balanced.inventory_source)
#geo_zip = vectorizer.fit_transform(df_balanced.geo_zip.apply(str))
screen_size = vectorizer.fit_transform(df_balanced.platform_device_screen_size)
carrier = vectorizer.fit_transform(df_balanced.platform_carrier)
bandwidth = vectorizer.fit_transform(df_balanced.platform_bandwidth)
maker = vectorizer.fit_transform(df_balanced.platform_device_make)
model = vectorizer.fit_transform(df_balanced.platform_device_model)
day_of_week = vectorizer.fit_transform(df_balanced.day_of_week)
scaler = RobustScaler()#StandardScaler()
bid_floor = np.transpose(csr_matrix(scaler.fit_transform([df_balanced.bid_floor.values])))
#spend = np.transpose(csr_matrix(scaler.fit_transform([df_balanced.spend.values])))

#%%
y = df_balanced['clicks']
X = hstack([invent_src, screen_size, carrier, bandwidth, maker, model, day_of_week, bid_floor])
#%%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model = LogisticRegression(solver='saga',n_jobs=8, penalty='l2', verbose=5,C=0.01)
model.fit(X_train, y_train)
mm.model_report_card(model, X_train, y_train, X_test, y_test)




Example #41
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import numpy as np

datadir = '~/Desktop/my package/machine_learning-and-deep_learning/Data/houseprice/'
X_train = pd.read_csv(datadir + 'X2.csv').drop('SalePrice', axis=1)
y_train = pd.read_csv(datadir + 'X2.csv')['SalePrice']
X_test = pd.read_csv(datadir + 'test_X2.csv')

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X = scaler.fit_transform(X_train)
test_X = scaler.transform(X_test)  # reuse the fit from the training data instead of refitting
Id = list(range(1461, 1461 + 1459))

print(X.shape, test_X.shape)

#Base line croos validation error
lr = LinearRegression()
lr_val_score = cross_val_score(lr,
                               X,
                               y_train,
                               scoring='neg_mean_absolute_error',
                               cv=10,
                               n_jobs=-1)
print('Baseline log CV score:', np.log(np.mean(-lr_val_score)))
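
# Hedged follow-up (an added sketch, not the author's code): wrapping the scaler and
# the regressor in a Pipeline re-fits the RobustScaler inside every CV fold, so the
# baseline score is computed without leaking whole-training-set statistics.
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(RobustScaler(), LinearRegression())
pipe_val_score = cross_val_score(pipe, X_train, y_train,
                                 scoring='neg_mean_absolute_error',
                                 cv=10, n_jobs=-1)
print('Pipeline log CV score:', np.log(np.mean(-pipe_val_score)))
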
Ejemplo n.º 42
0
def pca_sets(sample_length=1000, num_samples=1000, random_state=69):
    real_set_1_ = load_n_samples(real=True,
                                 num_samples=num_samples,
                                 sample_length=sample_length,
                                 random_state=random_state)
    real_set_2_ = load_n_samples(real=True,
                                 num_samples=num_samples,
                                 sample_length=sample_length,
                                 random_state=2 * random_state)
    fake_set_1_ = load_n_samples(real=False,
                                 num_samples=num_samples,
                                 sample_length=sample_length,
                                 random_state=random_state)
    fake_set_2_ = load_n_samples(real=False,
                                 num_samples=num_samples,
                                 sample_length=sample_length,
                                 random_state=2 * random_state)
    real_set_3_ = load_n_samples(real=True,
                                 num_samples=num_samples,
                                 sample_length=sample_length,
                                 random_state=3 * random_state)
    real_set_4_ = load_n_samples(real=True,
                                 num_samples=num_samples,
                                 sample_length=sample_length,
                                 random_state=4 * random_state)

    r_scaler = RobustScaler()

    for sample in range(num_samples):
        real_set_1_[sample] = r_scaler.fit_transform(real_set_1_[sample])
        real_set_2_[sample] = r_scaler.fit_transform(real_set_2_[sample])
        fake_set_1_[sample] = r_scaler.fit_transform(fake_set_1_[sample])
        fake_set_2_[sample] = r_scaler.fit_transform(fake_set_2_[sample])
        real_set_3_[sample] = r_scaler.fit_transform(real_set_3_[sample])
        real_set_4_[sample] = r_scaler.fit_transform(real_set_4_[sample])

    real_set_1 = np.zeros((num_samples, N_COLS, N_COLS))
    real_set_2 = np.zeros((num_samples, N_COLS, N_COLS))
    fake_set_1 = np.zeros((num_samples, N_COLS, N_COLS))
    fake_set_2 = np.zeros((num_samples, N_COLS, N_COLS))
    real_set_3 = np.zeros((num_samples, N_COLS, N_COLS))
    real_set_4 = np.zeros((num_samples, N_COLS, N_COLS))

    pca = PCA()
    for x in range(num_samples):
        pca.fit(real_set_1_[x])
        real_set_1[x] = pca.components_

        pca.fit(real_set_2_[x])
        real_set_2[x] = pca.components_

        pca.fit(fake_set_1_[x])
        fake_set_1[x] = pca.components_

        pca.fit(fake_set_2_[x])
        fake_set_2[x] = pca.components_

        pca.fit(real_set_3_[x])
        real_set_3[x] = pca.components_

        pca.fit(real_set_4_[x])
        real_set_4[x] = pca.components_
    return real_set_1, real_set_2, real_set_3, real_set_4, fake_set_1, fake_set_2
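
# Minimal usage sketch (hypothetical call; load_n_samples and N_COLS come from the
# surrounding project and are not shown here). Each returned array holds one matrix
# of PCA components per sample.
if __name__ == '__main__':
    r1, r2, r3, r4, f1, f2 = pca_sets(sample_length=1000, num_samples=100)
    print(r1.shape, f1.shape)  # (100, N_COLS, N_COLS) each
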
Ejemplo n.º 43
0
for i in range(np.size(portret, 0)):
    for j in range(np.size(portret, 1)):
        if portret[i, j] == -99.99:
            portret[i, j] = np.nan
#%% Price the cross-section
dates = pd.DataFrame({'Date': Date})
df2 = df.merge(dates, how='inner', on='Date')
df3 = df2.merge(ff3, how='inner', on='Date')
# Define feature
riskfac = df3.Close_vix.values - df3.Close_vix3m.values

rf = df3.RF.values
m = np.zeros(np.size(portret, 1))

X = np.vstack((robust_scaler.fit_transform(riskfac.reshape(-1, 1)).T,
               robust_scaler.fit_transform(df3.MKTRF.values.reshape(-1, 1)).T,
               robust_scaler.fit_transform(df3.SMB.values.reshape(-1, 1)).T,
               robust_scaler.fit_transform(df3.HML.values.reshape(-1, 1)).T)).T

numFac = np.size(X, 1)
b = np.zeros((np.size(portret, 1), numFac))

X = sm.add_constant(X)
# Obtain betas from first-pass time-series regressions
for i in range(np.size(portret, 1)):
    y = portret[:, i] - rf  # LHS variable is excess returns
    m[i] = y.mean()  # store expected excess returns for the cross-sectional regressions
    model = sm.OLS(y, X, missing='drop').fit()
    b[i, :] = model.params[1:]  # assumed next step: store the factor betas (constant excluded)
predictions_df['predictions'] = np.nan

mae_cv = np.zeros((n_folds, 1))

# --------------------------------------------------------------------------
for i_fold, (train_idx, test_idx) in enumerate(kf.split(x, y)):
    x_train, x_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    print('CV iteration: %d' % (i_fold + 1))

    # --------------------------------------------------------------------------
    # Normalization/Scaling/Standardization
    scaler = RobustScaler()

    x_train_norm = scaler.fit_transform(x_train)
    x_test_norm = scaler.transform(x_test)

    # --------------------------------------------------------------------------
    # Model
    gpr = GaussianProcessRegressor()

    # --------------------------------------------------------------------------
    # Model selection
    # Search space
    param_grid = [
        {
            'kernel': [RBF(), DotProduct()],
            'alpha': [1e0, 1e-1, 1.5e-1, 1e-2, 1.5e-2]
        },
    ]
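
    # Hedged continuation (a sketch of one plausible next step, not the author's
    # code): grid-search the GPR within this fold and record the fold MAE.
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import mean_absolute_error

    search = GridSearchCV(gpr, param_grid, scoring='neg_mean_absolute_error', cv=3)
    search.fit(x_train_norm, y_train)
    y_pred = search.predict(x_test_norm)
    mae_cv[i_fold] = mean_absolute_error(y_test, y_pred)
    predictions_df.loc[test_idx, 'predictions'] = y_pred
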
Ejemplo n.º 45
0
def transform_data(X, y=None, test=False):
    """
    Preparing final dataset with all features.

    Arguments
    ---
    X - dataframe with preprocessed features
    y - target variable (only used when test is False)
    test - boolean; if False, X is the training set,
           if True, X is the test set

    """
    config = load_yaml("./config.yaml")

    columns = list(X.columns)

    log_cols = config["transform"]["log_cols"]
    log1p_cols = config["transform"]["log1p_cols"]
    boxcox1p_cols = config["transform"]["boxcox1p_cols"]
    onehot_cols = config["transform"]["onehot_cols"]
    targetencode_cols = config["transform"]["targetencode_cols"]
    log_target = config["transform"]["log_target"]

    # generate time features (only relevant for time series)
    # TODO: make datetime column identifiable from config file
    if "timestamp" in columns:
        X.timestamp = pd.to_datetime(X.timestamp, format="%Y-%m-%d %H:%M:%S")
        # adjust the desirable format accordingly
        X["hour"] = X.timestamp.dt.hour
        X["weekday"] = X.timestamp.dt.weekday

        if not test:
            X.sort_values("timestamp", inplace=True)
            X.reset_index(drop=True, inplace=True)

    # TODO: make cols identified from config file

    if log_cols:
        for col in log_cols:
            # this will replace the columns with their log values
            X[col] = np.log(X[col])

    if log1p_cols:
        for col in log1p_cols:
            # this will replace the columns with their log1p values
            X[col] = np.log1p(X[col])

    if boxcox1p_cols:
        for col in boxcox1p_cols:
            if col in columns:
                print("taking the log of " + str(col))
                # this will replace the columns with their boxcox1p values
                X[col] = boxcox1p(X[col], 0.15)

    # robust scaler
    numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
    if not test:
        global robust_scaler
        robust_scaler = RobustScaler()
        X[numeric_cols] = robust_scaler.fit_transform(X[numeric_cols])
    else:
        X[numeric_cols] = robust_scaler.transform(X[numeric_cols])

    # transforming target
    if log_target and not test:
        y = np.log1p(y)

    # target encoding
    if targetencode_cols:
        if not test:
            global target_encoder
            target_encoder = ce.TargetEncoder(cols=targetencode_cols)
            X = target_encoder.fit_transform(X, y)
        else:
            X = target_encoder.transform(X)

    if test:
        return X
    else:
        return X, y
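
# Hedged usage sketch (assumed file names and target column; neither appears in the
# original snippet): transform the training set first so the module-level scaler and
# encoder are fitted, then reuse them on the test set.
if __name__ == "__main__":
    train_df = pd.read_csv("train.csv")   # hypothetical path
    test_df = pd.read_csv("test.csv")     # hypothetical path
    X_train, y_train = transform_data(train_df.drop("target", axis=1),
                                      train_df["target"], test=False)
    X_test = transform_data(test_df, test=True)
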
Ejemplo n.º 46
0
def robust_modified(df):
    robust_scaler = RobustScaler()
    new_column = [x + '_robust' for x in zone_columns]
    robust_df = pd.DataFrame(robust_scaler.fit_transform(df[zone_columns]),
                             columns=new_column, index=df.index)
    df[new_column] = robust_df
    return df, new_column
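
# Minimal usage sketch on toy data (hypothetical column names; `zone_columns` is a
# module-level list that the helper above reads).
import pandas as pd
zone_columns = ['zone_a', 'zone_b']
toy = pd.DataFrame({'zone_a': [1.0, 2.0, 100.0], 'zone_b': [3.0, 4.0, 5.0]})
toy, robust_cols = robust_modified(toy)
print(robust_cols)  # ['zone_a_robust', 'zone_b_robust']
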
Ejemplo n.º 47
0
def main(args):
    out_file_name = "results.log"

    if args.classify:
        # Cast to list to keep it all in memory
        train = list(csv.reader(open(args.train_file, 'r')))
        test = list(csv.reader(open(args.test_file, 'r')))

        x_train = np.array(train[1:], dtype=float)
        
        x_test = np.array(test[1:], dtype=float)
        
        train_labels_file = open(args.train_labels)
        y_train = np.array([int(x.strip()) for x in train_labels_file.readlines()])

        test_labels_file = open(args.test_labels)
        y_test = np.array([int(x.strip()) for x in test_labels_file.readlines()])
        train_labels_file.close()
        test_labels_file.close()

        if args.sampling_technique:
            print "Attempting to use sampling technique: " + args.sampling_technique
            if np.isnan(args.sampling_ratio):
                print "Unable to use sampling technique. Ratio is NaN."
            else:
                x_train, y_train = __get_sample_transformed_examples(args.sampling_technique,
                                                                     x_train, y_train,
                                                                     args.sampling_ratio)

        if args.scale:
            scaler = RobustScaler()
            x_train = scaler.fit_transform(x_train)
            x_test = scaler.transform(x_test)
        for classifier in args.classifiers:
            model = __get_classifier_model(classifier, args)
            print "Using classifier " + classifier
            print "Fitting data to model"
            if args.grid_search:
                print "Applying parameter tuning to model"
                if classifier == LOG_REG:
                    parameters = {'loss':('log','hinge'), 'penalty':('l2', 'l1'), 'shuffle':[True], 'n_iter':[5], 'n_jobs':[-1], 'random_state':[179]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == SVM:
                    parameters = {'kernel':('rbf', 'poly'), 'cache_size':[8096], 'random_state':[17]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == ADA_BOOST:
                    parameters = {'n_estimators':[300], 'random_state':[13]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == RF:
                    parameters = {'criterion':('gini', 'entropy'), 'n_jobs':[-1], 'n_estimators':[300], 'random_state':[17]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == GRADIENT_BOOST:
                    parameters = {'n_estimators':[300], 'random_state':[17]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == EXTRA_TREES:
                    parameters = {'n_estimators':[300], 'random_state':[17], 'n_jobs':[-1], 'criterion':('gini', 'entropy'), 'max_features':['log2', 40, 0.4], 'bootstrap':[True, False], 'bootstrap_features':[True, False]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == BAGGING:
                    parameters = {'n_estimators':[300], 'random_state':[17], 'max_samples': [.4, 30],'max_features':[40, 0.4], 'bootstrap':[True, False], 'bootstrap_features':[True, False], 'n_jobs':[-1]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                print "Best params: " + str(model.best_params_)
                    
            clf = model.fit(x_train, y_train)
            print "Parameters used in model:"
            #print clf.get_params(deep=False)
            if args.select_best:
                # Unable to use BaggingClassifier with SelectFromModel
                if classifier != BAGGING:
                    print "Selecting best features"
                    sfm = SelectFromModel(clf, prefit=True)
                    x_train = sfm.transform(x_train)
                    x_test = sfm.transform(x_test)
                    clf = model.fit(x_train, y_train)
            __print_and_log_results(clf, classifier, x_train, x_test, y_test,
                                    out_file_name, args)

    elif args.cross_validate:
        # Cast to list to keep it all in memory
        labels_file = open(args.labels)
        labels = np.array([int(x.strip()) for x in labels_file.readlines()])
        labels_file.close()
        data_file = open(args.data_file, 'r')
        data = list(csv.reader(data_file))
        data_file.close()
        examples = np.array(data[1:], dtype=float)
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(examples, labels, test_size=0.1)

        if args.sampling_technique:
            print "Attempting to use sampling technique: " + args.sampling_technique
            if np.isnan(args.sampling_ratio):
                print "Unable to use sampling technique. Ratio is NaN."
            else:
                X_train, y_train = __get_sample_transformed_examples(args.sampling_technique,
                                                                     X_train, y_train,
                                                                     args.sampling_ratio)
        if args.scale:
            scaler = StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
        for classifier in args.classifiers:
            print "Using classifier " + classifier
            model = __get_classifier_model(classifier, args)
            print "Fitting model"
            if args.grid_search:
                print "Applying parameter tuning to model"
                if classifier == LOG_REG:
                    parameters = {'loss':('log','hinge'), 'penalty':('l2', 'l1'), 'shuffle':[True], 'n_iter':[5], 'n_jobs':[-1], 'random_state':[179]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == SVM:
                    parameters = {'kernel':('rbf', 'poly'), 'cache_size':[8096], 'random_state':[17]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == ADA_BOOST:
                    parameters = {'n_estimators':[300], 'random_state':[13]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == RF:
                    parameters = {'criterion':('gini', 'entropy'), 'n_jobs':[-1], 'n_estimators':[300], 'random_state':[17]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == GRADIENT_BOOST:
                    parameters = {'n_estimators':[300], 'random_state':[17]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == EXTRA_TREES:
                    parameters = {'n_estimators':[300], 'random_state':[17], 'n_jobs':[-1], 'criterion':('gini', 'entropy'), 'max_features':['log2', 40, 0.4], 'bootstrap':[True, False], 'bootstrap_features':[True, False]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == BAGGING:
                    #parameters = {'n_estimators' : [400], 'random_state' : [17],
                    #              'max_samples' : np.arange(0.5, 0.9, 0.1),
                    #              'max_features' : np.arange(0.5, 0.9, 0.1),
                    #              'bootstrap':[False], 'bootstrap_features':[False], 'n_jobs':[-1]}
                    parameters = {"base_estimator__criterion" : ["gini", "entropy"],
                                  "base_estimator__splitter" : ["best", "random"],
                                  "base_estimator__max_depth" : [10, 15, 20, 25], 
                                  "base_estimator__class_weight" : ['balanced'],
                                  "base_estimator__max_features" : ['auto', 'log2']
                                  }
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
            clf = model.fit(X_train, y_train)
            if args.grid_search:
                print "Best params: " + str(model.best_params_)
            if args.select_best:
                if classifier != BAGGING:
                    print "Selecting best features"
                    sfm = SelectFromModel(clf, prefit = True)
                    X_train = sfm.transform(X_train)
                    X_test = sfm.transform(X_test)
                    clf = model.fit(X_train, y_train)
            print "Evaluating results"
            __print_and_log_results(clf, classifier, X_train, X_test, y_test,
                                    out_file_name, args)
    elif args.kfold:
        # Cast to list to keep it all in memory
        data_file = open(args.data_file, 'r')
        data = list(csv.reader(data_file))
        data_file.close()
        labels_file = open(args.labels)
        labels = np.array([int(x.strip()) for x in labels_file.readlines()])
        labels_file.close()
        X = np.array(data[1:], dtype=float)
        kf = KFold(len(X), n_folds=10, shuffle=True, random_state=42)
        for train, test in kf:
            print "kfold loop iterate"
            X_train, X_test, y_train, y_test = X[train], X[test], labels[train], labels[test]

            if args.sampling_technique:
                print "Attempting to use sampling technique: " + args.sampling_technique
                if np.isnan(args.sampling_ratio):
                    print "Unable to use sampling technique. Ratio is NaN."
                else:
                    X_train, y_train = __get_sample_transformed_examples(args.sampling_technique,
                                                                     X_train, y_train,
                                                                     args.sampling_ratio)
            if args.scale:
                scaler = StandardScaler().fit(X_train)
                X_train = scaler.transform(X_train)
                X_test = scaler.transform(X_test)

            for classifier in args.classifiers:
                print "Using classifier " + classifier
                model = __get_classifier_model(classifier, args)
                print "Fitting model"
                clf = model.fit(X_train, y_train)
                if args.select_best:
                    if classifier != BAGGING:
                        sfm = SelectFromModel(clf, prefit = True)
                        X_train = sfm.transform(X_train)
                        X_test = sfm.transform(X_test)
                        clf = model.fit(X_train, y_train)
                print "Evaluating results"
                __print_and_log_results(clf, classifier, X_train, X_test, y_test,
                                        out_file_name, args)
        print "kfold loop done"
Ejemplo n.º 48
0
"""### 데이터 스케일링


"""

import pandas as pd

dataframe = pd.DataFrame(train_dataset)
dataframe.to_csv("meas_train_dataset.csv", header=False, index=False)

print(train_dataset)

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

scaled_train_data = scaler.fit_transform(train_dataset)
scaled_test_data = scaler.transform(test_dataset)

print(scaled_train_data)

"""## 모델"""

from kerastuner.tuners import RandomSearch

def build_model(hp):
    model = keras.Sequential()
    for i in range(hp.Int('num_layers', 2, 20)):
        model.add(layers.Dense(units=hp.Int('units_' + str(i),
                                            min_value=32,
                                            #max_value=512,
                                            max_value=64,
Ejemplo n.º 49
0
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler 
from sklearn import metrics
from sklearn.model_selection import train_test_split

rscaler = RobustScaler()
air_frame = pd.read_csv('airfoil_self_noise.dat',sep='\t')
column_names = ['Frequency','Attack Angle','Chord Length','Free Velocity','Suction Side','Scaled Sound']
air_frame.columns = column_names
scaled_data = rscaler.fit_transform(air_frame.values)
X = scaled_data[:,:5]
Y = scaled_data[:,5]

train_data,test_data,train_regressor,test_regressor = train_test_split(X,Y,test_size=0.3)
rf = RandomForestRegressor()
grad = GradientBoostingRegressor()
bag = BaggingRegressor()
ada = AdaBoostRegressor()
bayes = BayesianRidge()
svr = SVR()
lin_reg = LinearRegression()

regressors_names = ['Random Forests','Gradient Boost','Bagging','Ada Boost','Bayesian Ridge','SVR','Linear Reg']
regressors = [rf,grad,bag,ada,bayes,svr,lin_reg]
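
# Hedged sketch (an assumed next step, not shown in the original snippet): fit each
# regressor on the training split and report its test-set mean squared error.
for name, reg in zip(regressors_names, regressors):
    reg.fit(train_data, train_regressor)
    preds = reg.predict(test_data)
    print(name, 'MSE: %.4f' % metrics.mean_squared_error(test_regressor, preds))
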
Ejemplo n.º 50
0
df_neg = pd.read_csv('NegativeYH.csv', header=None)

df_neg['Status'] = 0
df_pos['Status'] = 1
df_neg = df_neg.sample(n=len(df_pos))

df = pd.concat([df_pos, df_neg])
df = df.reset_index()
df = df.sample(frac=1)
df = df.iloc[:, 1:]

X = df.iloc[:, 0:1986].values
y = df.iloc[:, 1986:].values

scaler = RobustScaler()
X = scaler.fit_transform(X)
kf = StratifiedKFold(n_splits=5)

accuracy = []
specificity = []
sensitivity = []
precision = []
recall = []
m_coef = []

auc_list = []
Rf_fpr_list = []
Rf_tpr_list = []
o = 0
max_accuracy = float("-inf")
Rf_fpr = None
             'y_SN_2', 'log_y_err_SN_2']

feat_SN_3 = ['g_SN_3', 'log_g_err_SN_3', 'r_SN_3', 'log_r_err_SN_3',
             'i_SN_3', 'log_i_err_SN_3', 'z_SN_3', 'log_z_err_SN_3',
             'y_SN_3', 'log_y_err_SN_3']

feat_SN_4 = ['g_SN_4', 'log_g_err_SN_4', 'r_SN_4', 'log_r_err_SN_4',
             'i_SN_4', 'log_i_err_SN_4', 'z_SN_4', 'log_z_err_SN_4',
             'y_SN_4', 'log_y_err_SN_4']

feat_SN_5 = ['g_SN_5', 'log_g_err_SN_5', 'r_SN_5', 'log_r_err_SN_5',
             'i_SN_5', 'log_i_err_SN_5', 'z_SN_5', 'log_z_err_SN_5',
             'y_SN_5', 'log_y_err_SN_5']

###  training features with robust scaler ###
X_train = RS.fit_transform(df_train[feat_train])

### validation features in different noise levels ###
X_valid_SN_1 = RS.transform(df_valid[feat_SN_1])
X_valid_SN_2 = RS.transform(df_valid[feat_SN_2])
X_valid_SN_3 = RS.transform(df_valid[feat_SN_3])
X_valid_SN_4 = RS.transform(df_valid[feat_SN_4])
X_valid_SN_5 = RS.transform(df_valid[feat_SN_5])

### The targets that we wish to learn ###
Y_train = df_train['redshift']
Y_valid = df_valid['redshift']

### Some scaling of the target between 0 and 1 ###
### so we can model it with a beta function ###
### given that Beta function is not defined ###
        elif ei > 4:
            exps = [expinds[ei - 4]]

        it = 0
        while it < itmax:
            df_all = pd.DataFrame()
            for exp in exps:
                print(exp)
                fname = fold + 'damage_' + exp + '_s25.txt'
                print('input file: ', fname)

                df_sm = pd.read_csv(fname, delim_whitespace=True)
                df = df_sm.dropna().copy()
                trans = RobustScaler()

                df[features] = trans.fit_transform(df[features].values)
                df_all = pd.concat([df_all, df], ignore_index=True)
                #df_all = df.copy()

            # random split train/test
            inds = np.random.uniform(0, 1, len(df_all)) <= .80

            df_all['is_train'] = inds
            train = df_all[df_all['is_train'] == True]
            test = df_all[df_all['is_train'] == False]

            x_train = train[features]
            y_train = train[pred_str]

            x_test = test[features]
            y_test = test[pred_str]
# In[11]:

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=42)

# In[12]:

from sklearn.preprocessing import RobustScaler

# In[13]:

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# # Training using various models

# In[14]:

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=0.5)
log = model.fit(X_train, y_train)

# In[15]:

np.set_printoptions(precision=5)
pred = log.predict_proba(X_test)
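
# Hedged sketch (not in the original snippet): score the predicted probabilities,
# assuming the positive class is in the second column of `pred`.
from sklearn.metrics import roc_auc_score, accuracy_score

print('ROC AUC:', roc_auc_score(y_test, pred[:, 1]))
print('Accuracy:', accuracy_score(y_test, log.predict(X_test)))
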
Ejemplo n.º 54
0
        #Print the elapsed time and celebrate that a trained network has been made!
        elapsed = time.time() - start
        print("Elapsed Time: {:.3f}".format(elapsed))
        print('Finished Training')

        #Save the trained network
        torch.save(self.net, self.runName + "_Future_Model.pth")


if __name__ == '__main__':

    #Command Line Argument Parser
    parser = argparse.ArgumentParser()
    #parser.add_argument('-td', '--trainData')
    parser.add_argument('-rn', '--runName')
    args = parser.parse_args()

    allData = jb.load('lstm_Data.joblib')

    scaler = RobustScaler()
    y = scaler.fit_transform(allData[1])

    #Initialize Runner obj and run training cycle
    #needs: trainingData - dataframe of all training data
    futureNet = Runner(allData[0], y, runName=args.runName)
    futureNet.train()

#Save log, disabled temporarily until review is finished
#saveLog(log_Path, iden, experiment['datapath'], net.arch_Name, finalEpoch, true, pred, net.seed, elapsed, str(args.runName), net)
Ejemplo n.º 55
0
#aa = X.groupby('VisitNumber').groups   
#X_new = pd.DataFrame(columns = X.keys())    
#for key in aa.keys():
#    X_new = X_new.append(X.iloc[aa[key],:].mean(),ignore_index=True)    
#%%    


#%%
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, RobustScaler


standard_scaler = StandardScaler()
robust_scaler = RobustScaler()

X_train = robust_scaler.fit_transform(aa)
X_train1 = standard_scaler.fit_transform(aa)


#%% for the test data

X_test = testData

for col in colName:
    X_test[col] = abs((X_test[col].apply(hash))%2**(16))
    
#%%    
print ("handle missing data")
X_test.fillna(X_test.mean(),inplace=True)

Ejemplo n.º 56
0
default.drop('education', axis=1, inplace=True)

default['male'] = (default['sex']==1).astype('int')
default.drop('sex', axis=1, inplace=True)

#default['married'] = (default['marraige'] == 1).astype('int')
#default.drop('marraige', axis=1, inplace=True)
# for the pay features, a value <= 0 means the payment was not delayed
pay_features = ['pay_0','pay_2','pay_3','pay_4','pay_5','pay_6']
for p in pay_features:
     default.loc[default[p]<=0, p] = 0
default.rename(columns={'default payment next month':'default'}, inplace=True)
target_name= 'default'
X = default.drop('default' , axis=1)
robust_scaler = RobustScaler()
x = robust_scaler.fit_transform(X)
y= default[target_name]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=123, stratify=y)

def CMatrix(CM, labels=['pay','default']):
    df = pd.DataFrame(data=CM, index=labels, columns=labels)
    df.index.name='TRUE'
    df.columns.name='PREDICTION'
    df.loc['Total'] = df.sum()
    df['Total'] = df.sum(axis=1)
    return df
metrics= pd.DataFrame(index=['accuracy','precision', 'recall'],
                      columns=['NULL', 'LogisticReg','ClassTree', 'NaiveBayes'])
y_pred_test= np.repeat(y_train.value_counts().idxmax(), y_test.size)
metrics.loc['accuracy','NULL'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['precision','NULL'] = precision_score(y_pred=y_pred_test, y_true=y_test)
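
# Hedged continuation (a sketch, not the author's code): fit the logistic regression
# column of the metrics table and record its scores next to the null model.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred_test = logreg.predict(x_test)
metrics.loc['accuracy', 'LogisticReg'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['precision', 'LogisticReg'] = precision_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['recall', 'LogisticReg'] = recall_score(y_pred=y_pred_test, y_true=y_test)
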
Ejemplo n.º 57
0
del result['unique_id']
#%% handle missing value
print ("handle missing data")
result.fillna(result.mean(),inplace=True)



#%% data preprocessing
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, RobustScaler


standard_scaler = StandardScaler()
robust_scaler = RobustScaler()

X_train = robust_scaler.fit_transform(result)
X_train1 = standard_scaler.fit_transform(result)
#%% performace 


def performence(clf,train,label,clfName):
    re = cross_validation.ShuffleSplit(train.shape[0],n_iter=10,test_size =0.25,random_state =43)
    
    aucList = []
    accuracyList = []
    for train_index, test_index in re:
        clf.fit(train.iloc[train_index,:],y.iloc[train_index])
        pre_y = clf.predict_proba(train.iloc[test_index,:])  # probability to get the AUC
        aucList.append(roc_auc_score(y.iloc[test_index],pre_y[:,1]))
        y_pred = clf.predict(train.iloc[test_index,:]) # get the accuracy of model 
        accuracyList.append(accuracy_score(y.iloc[test_index],y_pred))  
Ejemplo n.º 58
0
                                                            test_size=0.1,
                                                            random_state=6)
ca_x, ca_x_test, ca_y, ca_y_test = train_test_split(ca_x,
                                                    ca_y,
                                                    test_size=0.1,
                                                    random_state=6)
na_x, na_x_test, na_y, na_y_test = train_test_split(na_x,
                                                    na_y,
                                                    test_size=0.1,
                                                    random_state=6)

# scalling
scaler = RobustScaler()
# scaler = MinMaxScaler()

hhb_x = scaler.fit_transform(hhb_x)
hhb_x_test = scaler.transform(hhb_x_test)
x_pred_hhb = scaler.transform(x_pred_hhb)

hbo2_x = scaler.fit_transform(hbo2_x)
hbo2_x_test = scaler.transform(hbo2_x_test)
x_pred_hbo2 = scaler.transform(x_pred_hbo2)

ca_x = scaler.fit_transform(ca_x)
ca_x_test = scaler.transform(ca_x_test)
x_pred_ca = scaler.transform(x_pred_ca)

na_x = scaler.fit_transform(na_x)
na_x_test = scaler.transform(na_x_test)
x_pred_na = scaler.transform(x_pred_na)
# # Min-Max Scaler $\frac{x_i - min(x)}{max(x) - min(x)}$

# In[5]:

mms = MinMaxScaler()
views['minmax'] = mms.fit_transform(views[['views']])
views


# In[6]:

(vw[0] - np.min(vw)) / (np.max(vw) - np.min(vw))


# # Robust Scaler $\frac{x_i - median(x)}{IQR_{(1,3)}(x)}$

# In[7]:

rs = RobustScaler()
views['robust'] = rs.fit_transform(views[['views']])
views


# In[8]:

quartiles = np.percentile(vw, (25., 75.))
iqr = quartiles[1] - quartiles[0]
(vw[0] - np.median(vw)) / iqr
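
# Added sanity check (a small sketch): the manual value above should match the first
# entry produced by the fitted RobustScaler.
print(np.isclose((vw[0] - np.median(vw)) / iqr, views['robust'].iloc[0]))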

Ejemplo n.º 60
0
class Learned(Model):

    def __init__(self, *args, scale=False, center=False, **kwargs):
        """
        A machine learned model.  Beyond :class:`revscoring.Model`, this
        "Learned" models implement
        :func:`~revscoring.scoring.models.Learned.fit` and
        :func:`~revscoring.scoring.models.Learned.cross_validate`.
        """
        super().__init__(*args, **kwargs)
        self.trained = None
        if scale or center:
            self.scaler = RobustScaler(with_centering=center,
                                       with_scaling=scale)
        else:
            self.scaler = None

        self.params.update({
            'scale': scale,
            'center': center
        })

    def train(self, values_labels):
        """
        Fits the model using labeled data by learning its shape.

        :Parameters:
            values_labels : [( `<feature_values>`, `<label>` )]
                an iterable of labeled data, where <feature_values> is an ordered
                collection of predictive values that correspond to the
                :class:`revscoring.Feature` s provided to the constructor
        """
        raise NotImplementedError()

    def fit_scaler_and_transform(self, fv_vectors):
        """
        Fits the internal scaler to labeled data.

        :Parameters:
            fv_vectors : `iterable` (( `<feature_values>`, `<label>` ))
                an iterable of labeled data, where <feature_values> is an ordered
                collection of predictive values that correspond to the
                `Feature` s provided to the constructor

        :Returns:
            The scaled feature vectors, or the input unchanged when no scaler is configured.
        """
        if self.scaler is not None:
            return self.scaler.fit_transform(fv_vectors)
        else:
            return fv_vectors

    def apply_scaling(self, fv_vector):
        if self.scaler is not None:
            if not hasattr(self.scaler, "center_") and \
               not hasattr(self.scaler, "scale_"):
                raise RuntimeError("Cannot scale a vector before " +
                                   "training the scaler")
            fv_vector = self.scaler.transform([fv_vector])[0]

        return fv_vector

    def _clean_copy(self):
        raise NotImplementedError()

    def cross_validate(self, values_labels, folds=10, processes=1):
        """
        Trains and tests the model against folds of labeled data.

        :Parameters:
            values_labels : [( `<feature_values>`, `<label>` )]
                an iterable of labeled data, where <feature_values> is an ordered
                collection of predictive values that correspond to the
                `Feature` s provided to the constructor
            folds : `int`
                the number of folds to split the labeled data into
            processes : `int`
                When set to 1, cross-validation will run in the parent thread.
                When set to 2 or greater, a :class:`multiprocessing.Pool` will
                be created.
        """
        folds_i = KFold(n_splits=folds, shuffle=True,
                        random_state=0)
        if processes == 1:
            mapper = map
        else:
            pool = Pool(processes=processes or cpu_count())
            mapper = pool.map
        results = mapper(self._cross_score,
                         ((i, [values_labels[i] for i in train_i],
                           [values_labels[i] for i in test_i])
                          for i, (train_i, test_i) in enumerate(
                              folds_i.split(values_labels))))
        agg_score_labels = []
        for score_labels in results:
            agg_score_labels.extend(score_labels)

        self.info['statistics'].fit(agg_score_labels)

        return self.info['statistics']

    def _cross_score(self, i_train_test):
        i, train_set, test_set = i_train_test
        logger.info("Performing cross-validation {0}...".format(i + 1))
        model = self._clean_copy()
        logger.debug("Training cross-validation for {0}...".format(i + 1))
        model.train(train_set)
        logger.debug("Scoring cross-validation for {0}...".format(i + 1))
        feature_values, labels = map(list, zip(*test_set))
        docs = model.score_many(feature_values)
        return list(zip(docs, labels))