def process_marg(npos, i, pos, ftarget, fm, fv, fi, fd, fref, fref_trunc,
                 ftarget_trunc, lock):
    """Append one line of Pearson correlations for the top-20 marginals.

    Sorts ``ftarget`` (and, for "nat" data, ``ftarget_trunc``) to find the
    20 largest entries, correlates the target against each model's
    marginals at those indices, and appends the results to
    ``output_dir + "/r20_<npos>"``.  ``lock`` serializes the console and
    file output across workers.  ``pos`` is accepted but not used here.
    """
    top20d = np.argsort(ftarget)[-20:]
    use_trunc = "nat" in synth_nat
    if use_trunc:
        # nat has a different (truncated) target, so rank by that instead
        top20d_trunc = np.argsort(ftarget_trunc)[-20:]

    with lock:
        print("npos {:3d},  set {:3d},   n_unique={:10d}".format(
            npos, i, ftarget.shape[0]))
        # correlations of the target against each model at the top-20 sites
        cols = [pr(ftarget[top20d], fm[top20d]),   # target vs mi3
                pr(ftarget[top20d], fv[top20d]),   # target vs vae
                pr(ftarget[top20d], fi[top20d]),   # target vs indep
                pr(ftarget[top20d], fd[top20d])]   # target vs deep
        if use_trunc:
            # only nat uses target_trunc and ref_trunc for the black line
            cols.append(pr(ftarget_trunc[top20d_trunc],
                           fref_trunc[top20d_trunc]))
        else:
            # both synthetics, 10K and 1M, use normal target and ref
            cols.append(pr(ftarget[top20d], fref[top20d]))
        with open(output_dir + "/r20_{}".format(npos), "at") as f:
            print(*cols, file=f)
    def fit(self, X, y, var_smoothing=1e-9):
        """Fit per-class Gaussian parameters and build one dependency tree
        per class from pairwise feature mutual information.

        Parameters
        ----------
        X : 2-D array of samples x features.
        y : 1-D array of class labels, same length as X.
        var_smoothing : small constant added to every variance for
            numerical stability (avoids zero variance).
        """
        # class prior probabilities P(y = c)
        self.priors = {}
        # per-class Gaussian parameters (mean / variance per feature)
        self.params = {}
        # per-class pairwise mutual information between features
        self.mu_info = {}
        # pairwise Pearson correlations; keys are shared across classes, so
        # after the loop this holds the values of the LAST class (same final
        # state as the original, which re-created the dict every iteration)
        self.corr = {}

        # the set of classes in the target variable y
        self.classes = list(sorted(set(y)))

        for c in self.classes:
            # condition the data on the label
            x = X[y == c]

            self.params[c] = {
                'mean': x.mean(axis=0),
                'variance': x.var(axis=0) + var_smoothing
            }

            self.mu_info[c] = {}
            self.priors[c] = len(y[y == c]) / len(y)

            for i in range(X.shape[1]):
                for j in range(i + 1, X.shape[1]):
                    # compute the correlation once per (i, j) pair
                    r = pr(x[:, i], x[:, j])[0]
                    # BUG FIX: Gaussian mutual information is
                    # -0.5*log(1 - r**2); the original used (1 - r), which
                    # misestimates MI and goes negative for r < 0.
                    self.mu_info[c][str(i) + ',' + str(j)] = \
                        -0.5 * np.log(1 - r ** 2)
                    self.corr[str(i) + ',' + str(j)] = r

        # replace nan values (e.g. from constant feature columns) with 0
        self.corr = {k: 0 if np.isnan(v) else v for k, v in self.corr.items()}
        self.mu_info = {k1: {k2: 0 if np.isnan(v2) else v2
                             for k2, v2 in v1.items()}
                        for k1, v1 in self.mu_info.items()}

        # having mutual info, construct len(classes) trees with Kruskal's
        # algorithm
        self.Trees = []
        for c in self.classes:
            # create as many nodes as there are features in the training set
            g = Graph(X.shape[1])

            # create the tree associated with label c
            for k, w in self.mu_info[c].items():
                # recover the feature indices from the "i,j" key so the
                # edge can be handed to add_edge
                u, v = (int(s) for s in k.split(","))
                g.add_edge(u, v, w)

            # append the tree structure into Trees
            self.Trees.append(g.kruskal_algo())

        self.fitted = True
def compute_pearson_and_spearman_r(A, B, n_pool, n_test):
    """Correlate two (n_pool + n_test)-square matrices in two regions.

    Compares the pool part of the main diagonals and the flattened
    pool-rows x test-columns blocks of ``A`` and ``B``, returning
    (pearson_diag, pearson_offdiag, spearman_diag, spearman_offdiag)
    followed by the four matching p-values.
    """
    assert A.shape[0] == n_pool + n_test

    # pool entries of the main diagonals
    diag_a = list(np.diag(A)[:n_pool])
    diag_b = list(np.diag(B)[:n_pool])

    # flattened pool-rows x test-columns blocks
    off_a = list(A[:n_pool, n_pool:].reshape(-1))
    off_b = list(B[:n_pool, n_pool:].reshape(-1))

    pearson_diag = pr(diag_a, diag_b)
    pearson_off = pr(off_a, off_b)
    spearman_diag = spr(diag_a, diag_b)
    spearman_off = spr(off_a, off_b)

    return (pearson_diag[0], pearson_off[0],
            spearman_diag[0], spearman_off[0],
            pearson_diag[1], pearson_off[1],
            spearman_diag[1], spearman_off[1])
# Esempio n. 4
# 0
    def run_model(self):
        """Cross-validated gradient boosting on linear-regression residuals.

        Fits a LinearRegression baseline on ``self.X_lin``, then trains a
        boosting model (lightgbm / catboost / xgboost, per
        ``self.package``) on the residuals ``self.y - self.y_lin`` for each
        CV fold.  Fold-wise RMSE, r^2 and Pearson-r statistics (and their
        CT_RT counterparts when ``self.CT_RT`` is given) are accumulated
        and summarised as mean/std attributes on the instance.
        """
        # optionally run a hyper-parameter search first and adopt the winner
        if self.grid_search:
            self.run_grid_search()
            self.parameters = self.best_params
        
        '''
        Xtr, Xts, ytr, yts, CT_RT_tr, CT_RT_ts = train_test_split(self.X, 
                self.y, self.CT_RT, test_size=self.test_size)
        '''
        # choose the cross-validation splitter: leave-one-out or k-fold
        if self.cv == 'loo':
            cv = LeaveOneOut()
        else:
            cv = KFold(n_splits=self.cv, shuffle=True)

        # per-fold metric accumulators
        self.rmse_cv_train = []
        self.r2_cv_train = []
        self.rmse_cv_test = []
        self.r2_cv_test = []
        self.pr_cv_train = []
        self.pr_cv_test = []
        if self.CT_RT is not None:
            # rupture-time metrics are only tracked when CT_RT is supplied
            self.rmse_CT_RT_cv_train = []
            self.r2_CT_RT_cv_train = []
            self.rmse_CT_RT_cv_test = []
            self.r2_CT_RT_cv_test = []
            self.pr_CT_RT_cv_train = []
            self.pr_CT_RT_cv_test = []
        
        # supported boosting back-ends
        est = {'lightgbm': lgb.LGBMRegressor,
               'catboost': catboost.CatBoostRegressor,
               'xgboost': xgboost.XGBRegressor}
        
        self.model = []
        # linear baseline; the booster is trained on its residuals
        self.lin_model = LinearRegression().fit(self.X_lin, self.y)
        self.y_lin = self.lin_model.predict(self.X_lin)
        self.y_res = self.y - self.y_lin
        model = est[self.package](**self.parameters)
        for n, (tr_id, ts_id) in enumerate(cv.split(self.y)):
            print('Running Validation {} of {}'.format(n, self.cv))
            # fit with early stopping; supported keywords differ per package
            if self.package == 'lightgbm':
                self.model.append(model.fit(self.X[tr_id], self.y_res[tr_id],
                    eval_set=[(self.X[ts_id], self.y_res[ts_id])],
                    eval_metric='rmse', early_stopping_rounds=20,
                    feature_name=self.feature_names))
            elif self.package == 'xgboost':
                self.model.append(model.fit(self.X[tr_id], self.y_res[tr_id],
                    eval_set=[(self.X[ts_id], self.y_res[ts_id])],
                    eval_metric='rmse', early_stopping_rounds=20))
            else:
                self.model.append(model.fit(self.X[tr_id], self.y_res[tr_id],
                    eval_set=[(self.X[ts_id], self.y_res[ts_id])],
                    early_stopping_rounds=20))
            # prediction = linear baseline + boosted residual correction
            if self.package == 'lightgbm':
                self.y_cv_tr_pred = self.y_lin[tr_id] + self.model[-1].predict(
                        self.X[tr_id], num_iteration=self.model[-1].best_iteration_)
                self.y_cv_ts_pred = self.y_lin[ts_id] + self.model[-1].predict(
                        self.X[ts_id], num_iteration=self.model[-1].best_iteration_)
                if self.model_scheme == 'LMP':
                    # LMP scheme: convert the predicted parameter back to a
                    # rupture time (presumably Larson-Miller — confirm)
                    self.CT_RT_cv_tr_pred = np.exp((
                        self.y_cv_tr_pred*1000/self.CT_Temp[tr_id]) 
                        - self.C[tr_id])
                    self.CT_RT_cv_ts_pred = np.exp((
                        self.y_cv_ts_pred*1000/self.CT_Temp[ts_id]) 
                        - self.C[ts_id])
            else:
                self.y_cv_tr_pred = self.y_lin[tr_id] + self.model[-1].predict(
                        self.X[tr_id])
                self.y_cv_ts_pred = self.y_lin[ts_id] + self.model[-1].predict(
                        self.X[ts_id])
                if self.model_scheme == 'LMP':
                    self.CT_RT_cv_tr_pred = np.exp((
                        self.y_cv_tr_pred*1000/self.CT_Temp[tr_id]) 
                        - self.C[tr_id])
                    self.CT_RT_cv_ts_pred = np.exp((
                        self.y_cv_ts_pred*1000/self.CT_Temp[ts_id]) 
                        - self.C[ts_id])
            self.y_cv_tr = self.y[tr_id]
            self.y_cv_ts = self.y[ts_id]
            if self.CT_RT is not None:
                self.CT_RT_cv_tr = self.CT_RT[tr_id]
                self.CT_RT_cv_ts = self.CT_RT[ts_id]
            # fold metrics on the target scale
            self.rmse_cv_train.append(np.sqrt(mean_squared_error(
                self.y_cv_tr_pred, self.y[tr_id])))
            self.rmse_cv_test.append(np.sqrt(mean_squared_error(
                self.y_cv_ts_pred, self.y[ts_id])))
            self.r2_cv_train.append(linregress(self.y_cv_tr_pred, 
                self.y[tr_id])[2]**2)
            self.r2_cv_test.append(linregress(self.y_cv_ts_pred, 
                self.y[ts_id])[2]**2)
            self.pr_cv_train.append(pr(self.y_cv_tr_pred, self.y[tr_id]))
            self.pr_cv_test.append(pr(self.y_cv_ts_pred, self.y[ts_id]))
            if self.CT_RT is not None:
                # same metrics on the rupture-time scale
                self.rmse_CT_RT_cv_train.append(np.sqrt(mean_squared_error(
                    self.CT_RT_cv_tr_pred, self.CT_RT[tr_id])))
                self.rmse_CT_RT_cv_test.append(np.sqrt(mean_squared_error(
                    self.CT_RT_cv_ts_pred, self.CT_RT[ts_id])))
                self.r2_CT_RT_cv_train.append(linregress(self.CT_RT_cv_tr_pred,
                    self.CT_RT[tr_id])[2]**2)
                self.r2_CT_RT_cv_test.append(linregress(self.CT_RT_cv_ts_pred, 
                    self.CT_RT[ts_id])[2]**2)
                self.pr_CT_RT_cv_train.append(pr(self.CT_RT_cv_tr_pred, 
                    self.CT_RT[tr_id]))
                self.pr_CT_RT_cv_test.append(pr(self.CT_RT_cv_ts_pred, 
                    self.CT_RT[ts_id]))
        
        # summary statistics across folds
        self.N_dp = len(self.y)
        self.rmse_mean_train = np.mean(self.rmse_cv_train)
        self.rmse_std_train = np.std(self.rmse_cv_train)
        self.rmse_mean_test = np.mean(self.rmse_cv_test)
        self.rmse_std_test = np.std(self.rmse_cv_test)
        self.r2_mean_train = np.mean(self.r2_cv_train)
        self.r2_std_train = np.std(self.r2_cv_train)
        self.r2_mean_test = np.mean(self.r2_cv_test)
        self.r2_std_test = np.std(self.r2_cv_test)
        # each pr() result is a pair; only element 0 (the correlation)
        # enters the summaries
        self.pr_mean_train = np.mean([i[0] for i in self.pr_cv_train])
        self.pr_std_train = np.std([i[0] for i in self.pr_cv_train])
        self.pr_mean_test = np.mean([i[0] for i in self.pr_cv_test])
        self.pr_std_test = np.std([i[0] for i in self.pr_cv_test])
        if self.CT_RT is not None:
            self.rmse_CT_RT_mean_train = np.mean(self.rmse_CT_RT_cv_train)
            self.rmse_CT_RT_std_train = np.std(self.rmse_CT_RT_cv_train)
            self.rmse_CT_RT_mean_test = np.mean(self.rmse_CT_RT_cv_test)
            self.rmse_CT_RT_std_test = np.std(self.rmse_CT_RT_cv_test)
            self.r2_CT_RT_mean_train = np.mean(self.r2_CT_RT_cv_train)
            self.r2_CT_RT_std_train = np.std(self.r2_CT_RT_cv_train)
            self.r2_CT_RT_mean_test = np.mean(self.r2_CT_RT_cv_test)
            self.r2_CT_RT_std_test = np.std(self.r2_CT_RT_cv_test)
            self.pr_CT_RT_mean_train = np.mean([i[0] 
                for i in self.pr_CT_RT_cv_train])
            self.pr_CT_RT_std_train = np.std([i[0] 
                for i in self.pr_CT_RT_cv_train])
            self.pr_CT_RT_mean_test = np.mean([i[0] 
                for i in self.pr_CT_RT_cv_test])
            self.pr_CT_RT_std_test = np.std([i[0] 
                for i in self.pr_CT_RT_cv_test])

        '''
ax.set_xlabel('Time (s)'), ax.set_ylabel('Real part'), ax.set_zlabel(
    'Imag part')
ax.set_title('Complex sine wave in all its 3D glory')
plt.show()

# two vectors
v1 = [1, 7, 5, 1, 4, 0, 6, 8, 1, 8]
v2 = [10, 85, 35, 15, 55, 5, 72, 81, 13, 92]

# compute the dot product
dp = sum(np.multiply(v1, v2)) / 1000

print('The dot product is', dp)

#pearson correlation
corr, dp1 = pr(v2, v2)
corr

# dot products of sine waves

# general simulation parameters
srate = 500
# sampling rate in Hz
time = np.arange(0., 2., 1. / srate)  # time in seconds

# sine wave parameters
freq1 = 5
# frequency in Hz
freq2 = 5
# frequency in Hz
# Esempio n. 6
# 0
    def split_on_best_feature(self, dataX, dataY):
        """Recursively build a decision tree, splitting on the feature with
        the highest absolute Pearson correlation with dataY.

        Rules (Primary Resource No. 2):
          1. The splitting value is the median of the chosen feature.
          2. If the best feature cannot split the data, try the next best.
             If no feature can split the data, return a leaf.

        Params:
          dataX : numpy ndarray -> x values at this node
          dataY : numpy 1d array -> y values at this node

        Returns:
          numpy ndarray; each row is
          [feature index (-1 for a leaf), split value / leaf value,
           left-child offset, right-child offset].
        """
        # base case: few enough samples -> leaf with the majority y value
        if dataX.shape[0] <= self.leaf_size:
            return np.array([-1, cntr(dataY).most_common(1)[0][0],
                             np.nan, np.nan])

        num_features = dataX.shape[1]
        remaining_features = list(range(num_features))

        # (feature index, |pearson correlation with dataY|) per feature;
        # NaN correlations (e.g. constant columns) are treated as 0.0
        feature_correlations = []
        for ftr in range(num_features):
            corr = abs(pr(dataX[:, ftr], dataY)[0])
            if np.isnan(corr):
                corr = 0.0
            feature_correlations.append((ftr, corr))

        # BUG FIX: the original sorted this list while it was still EMPTY
        # (before the loop above filled it), so features were tried in
        # index order instead of by correlation strength.  Sort descending
        # so the highest |correlation| is tried first, as intended.
        feature_correlations.sort(key=lambda fc: fc[1], reverse=True)

        # if there are no features at all, return a leaf
        if len(remaining_features) == 0:
            return np.array([-1, cntr(dataY).most_common(1)[0][0],
                             np.nan, np.nan])

        # try features from best to worst until one actually splits the data
        rank = 0
        while len(remaining_features) - 1 >= 0:
            best_feature_itr = feature_correlations[rank][0]
            y = best_feature_itr

            # split at the median of the chosen feature
            split_val = np.median(dataX[:, y])
            left_index = dataX[:, y] <= split_val
            right_index = dataX[:, y] > split_val

            # a usable split puts samples on both sides of the threshold
            if len(np.unique(left_index)) != 1:
                break
            # this feature cannot split the data; drop it and try the next
            remaining_features.remove(y)
            rank = rank + 1

        # ran out of features that can split the data -> return a leaf
        if len(remaining_features) == 0:
            return np.array([-1, cntr(dataY).most_common(1)[0][0],
                             np.nan, np.nan])

        # build the left branch first
        lefttree = self.split_on_best_feature(dataX[left_index],
                                              dataY[left_index])

        # starting row of the right subtree relative to the current root
        if lefttree.ndim == 1:
            righttree_start = 2
        else:
            righttree_start = lefttree.shape[0] + 1
        root = np.array([best_feature_itr, split_val, 1, righttree_start])

        return np.vstack((root, lefttree,
                          self.split_on_best_feature(dataX[right_index],
                                                     dataY[right_index])))
# Esempio n. 7
# 0
def calc_pearson(pred, true):
    """Pearson correlation between pred and true; -1.0 when scipy's
    pearsonr raises ValueError (e.g. mismatched or too-short inputs)."""
    try:
        corr = pr(np.asarray(pred), np.asarray(true))[0]
    except ValueError:
        corr = -1.0
    return corr
#!/usr/bin/env python
# coding: utf-8

# In[173]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr as pr

# Load the Kaggle Titanic training data.
data = pd.read_csv('../input/train.csv')
# Pearson correlation between fare and passenger class
# (result is computed but not captured or printed).
pr(data.Fare, data.Pclass)
plt.style.use('bmh')
plt.xlabel('Age')
plt.ylabel('Survived')
plt.title('Age vs Survival')
# Age histogram for all passengers with a known age.
plt.hist(data.Age[(np.isnan(data.Age) == False)],
         bins=15,
         alpha=0.4,
         color='r',
         label='Before')
# Overlay: age histogram restricted to survivors.
plt.hist(data.Age[(np.isnan(data.Age) == False) & (data.Survived == 1)],
         bins=15,
         alpha=0.4,
         color='b',
         label='After')
#plt.hist(data.Age[data.Age != np.NaN])
plt.legend(loc='upper right')
plt.show()

# In[181]:
# Esempio n. 9
# 0
    def run_model(self):
        """Cross-validated boosting with autoencoder-generated augmentation.

        For each CV fold an AutoEncoder is trained on the fold's training
        rows of ``self.X`` (the last column is sliced off as the target),
        synthetic samples are generated and validated, and a boosting model
        (per ``self.package``) is fit on original + generated rows.
        Fold-wise RMSE / r^2 / Pearson-r metrics are accumulated and
        summarised as mean/std attributes.
        """
        # choose the cross-validation splitter: leave-one-out or k-fold
        if self.cv == 'loo':
            cv = LeaveOneOut()
        else:
            cv = KFold(n_splits=self.cv, shuffle=True)

        # per-fold metric accumulators
        self.rmse_cv_train = []
        self.r2_cv_train = []
        self.rmse_cv_test = []
        self.r2_cv_test = []
        self.pr_cv_train = []
        self.pr_cv_test = []
        
        # supported boosting back-ends
        est = {'lightgbm': lgb.LGBMRegressor,
               'catboost': catboost.CatBoostRegressor,
               'xgboost': xgboost.XGBRegressor}
        
        self.model = []
        self.gen_sample = []
        model = est[self.package](**self.parameters)
        for n, (tr_id, ts_id) in enumerate(cv.split(self.X)):
            print('Running Validation {} of {}'.format(n, self.cv))
            # train the autoencoder on this fold's training rows only
            ae = AutoEncoder(arch=self.vae_arch, X=self.X[tr_id], 
                loss='xent', epochs=2000)

            ae.build_model()
            # keep sampling until a generated batch passes validate_xgen
            # NOTE(review): loops forever if both gen_n and
            # gen_per_direction are None — confirm callers always set one.
            X_gen = None
            while X_gen is None:
                if self.gen_n is not None:
                    X_gen = self.validate_xgen(generated_X=
                            ae.get_random_alloy(n_samples=self.gen_n))
                if self.gen_per_direction is not None:
                    X_gen = self.validate_xgen(generated_X=
                            ae.get_linspace_alloy(n_range=(-3, 3),
                                n_sample_per_direction=self.gen_per_direction))
            # store the generated sample in original (unscaled) units
            self.gen_sample.append(self.scale.inverse_transform(X_gen))
            # targets are the last column of the unscaled data
            y_gen = self.scale.inverse_transform(X_gen)[:, -1]
            y_orig = self.scale.inverse_transform(self.X[tr_id])[:, -1]
            y_ts = self.scale.inverse_transform(self.X[ts_id])[:, -1]
            # features are all columns except the last
            X_gen = X_gen[:, :-1]
            X_orig = self.X[tr_id][:, :-1]
            # augmented training set = original + generated rows
            X_tr = np.vstack([X_orig, X_gen])
            y_tr = np.concatenate([y_orig, y_gen])
            if self.package == 'lightgbm':
                self.model.append(model.fit(X_tr, y_tr,
                    eval_set=[(self.X[ts_id][:, :-1], y_ts)],
                    eval_metric='rmse', early_stopping_rounds=20,
                    feature_name=self.feature_names))
            elif self.package == 'xgboost':
                self.model.append(model.fit(X_tr, y_tr,
                    eval_set=[(self.X[ts_id][:, :-1], y_ts)],
                    eval_metric='rmse', early_stopping_rounds=20))
            else:
                self.model.append(model.fit(X_tr, y_tr,
                    eval_set=[(self.X[ts_id][:, :-1], y_ts)],
                    early_stopping_rounds=20))
            if self.package == 'lightgbm':
                self.y_cv_tr_pred = self.model[-1].predict(X_tr,
                    num_iteration=self.model[-1].best_iteration_)
                self.y_cv_ts_pred = self.model[-1].predict(
                        self.X[ts_id][:, :-1], num_iteration=
                        self.model[-1].best_iteration_)
            else:
                # NOTE(review): this branch predicts on the ORIGINAL
                # training rows only, while lightgbm above predicts on the
                # augmented X_tr — confirm the asymmetry is intended.
                self.y_cv_tr_pred = self.model[-1].predict(
                        self.X[tr_id][:, :-1])
                self.y_cv_ts_pred = self.model[-1].predict(
                        self.X[ts_id][:, :-1])
            self.y_cv_tr = y_orig
            self.y_cv_ts = y_ts
            self.rmse_cv_train.append(np.sqrt(mean_squared_error(
                self.y_cv_tr_pred, self.y_cv_tr)))
            self.rmse_cv_test.append(np.sqrt(mean_squared_error(
                self.y_cv_ts_pred, self.y_cv_ts)))
            self.r2_cv_train.append(linregress(self.y_cv_tr_pred, 
                self.y_cv_tr)[2]**2)
            self.r2_cv_test.append(linregress(self.y_cv_ts_pred, 
                self.y_cv_ts)[2]**2)
            self.pr_cv_train.append(pr(self.y_cv_tr_pred, self.y_cv_tr))
            self.pr_cv_test.append(pr(self.y_cv_ts_pred, self.y_cv_ts))
        
            # NOTE(review): indented inside the fold loop, so this is
            # re-assigned (to the same value) every fold.
            self.N_dp = len(self.X[:, -1])
        # summary statistics across folds
        self.rmse_mean_train = np.mean(self.rmse_cv_train)
        self.rmse_std_train = np.std(self.rmse_cv_train)
        self.rmse_mean_test = np.mean(self.rmse_cv_test)
        self.rmse_std_test = np.std(self.rmse_cv_test)
        self.r2_mean_train = np.mean(self.r2_cv_train)
        self.r2_std_train = np.std(self.r2_cv_train)
        self.r2_mean_test = np.mean(self.r2_cv_test)
        self.r2_std_test = np.std(self.r2_cv_test)
        # each pr() result is a pair; only element 0 enters the summaries
        self.pr_mean_train = np.mean([i[0] for i in self.pr_cv_train])
        self.pr_std_train = np.std([i[0] for i in self.pr_cv_train])
        self.pr_mean_test = np.mean([i[0] for i in self.pr_cv_test])
        self.pr_std_test = np.std([i[0] for i in self.pr_cv_test])
    def run_reg(self):
        """Fit a scikit-learn regressor on the full data, then validate it.

        The estimator is resolved from ``self.estimator`` ('MLP', 'LR' or
        'RF') by dynamic import, fitted once on all of (X, y) as
        ``self.model``, then re-fitted per validation split (leave-one-out
        or k-fold from ``self.validation``).  Per-split RMSE / MAE / r^2 /
        Pearson-r metrics — and CT_RT-scale counterparts when
        ``self.CT_RT`` is given — are accumulated and summarised as
        mean/std attributes.
        """
        # resolve the estimator class by name via dynamic import
        if self.estimator == 'MLP':
            est = import_module('sklearn.neural_network')
            estimator = getattr(est, 'MLPRegressor')
        if self.estimator == 'LR':
            est = import_module('sklearn.linear_model')
            estimator = getattr(est, 'LinearRegression')
        if self.estimator == 'RF':
            est = import_module('sklearn.ensemble')
            estimator = getattr(est, 'RandomForestRegressor')

        # instantiate with user parameters when provided
        if not self.estimator_param:
            estimator = estimator()
        else:
            estimator = estimator(**self.estimator_param)

        print('Fitting the master model. Hang tight!')
        self.model = estimator.fit(self.X, self.y)
        #Model Validation
        print('Initializing validation.')
        # validation splitter: leave-one-out, or "k-..." style k-fold
        if self.validation == 'leave_one_out':
            val = getattr(import_module('sklearn.model_selection'),
                          'LeaveOneOut')()
        else:
            val = getattr(import_module('sklearn.model_selection'),
                          'KFold')(n_splits=int(self.validation.split('-')[0]))

        # per-split metric accumulators
        self.rmse_train = []
        self.rmse_test = []
        self.mae_train = []
        self.mae_test = []
        self.r2_train = []
        self.r2_test = []
        self.pr_train = []
        self.pr_test = []
        self.y_true_train = []
        self.y_pred_train = []
        self.y_true_test = []
        self.y_pred_test = []
        if self.CT_RT is not None:
            # rupture-time metrics are only tracked when CT_RT is supplied
            self.rmse_CT_RT_train = []
            self.rmse_CT_RT_test = []
            self.mae_CT_RT_train = []
            self.mae_CT_RT_test = []
            self.r2_CT_RT_train = []
            self.r2_CT_RT_test = []
            self.pr_CT_RT_train = []
            self.pr_CT_RT_test = []

        for n, (tr_id, ts_id) in enumerate(val.split(self.y)):
            print('Running validation model no. {}'.format(n + 1))
            XTR, XTS, YTR = self.X[tr_id], self.X[ts_id], self.y[tr_id]
            # re-fit the estimator on this split's training rows
            temp_model = estimator.fit(XTR, YTR)
            y_true = self.y[ts_id]
            y_pred = temp_model.predict(XTS)
            y_pred_train = temp_model.predict(XTR)
            self.y_true_train.extend(YTR)
            self.y_pred_train.extend(y_pred_train)
            self.y_true_test.extend(y_true)
            self.y_pred_test.extend(y_pred)
            # split metrics on the target scale
            self.rmse_train.append(
                np.sqrt(mean_squared_error(y_pred_train, YTR)))
            self.rmse_test.append(np.sqrt(mean_squared_error(y_pred, y_true)))
            self.mae_train.append(mean_absolute_error(y_pred_train, YTR))
            self.mae_test.append(mean_absolute_error(y_pred, y_true))
            self.r2_train.append(linregress(y_pred_train, YTR)[2]**2)
            self.r2_test.append(linregress(y_pred, y_true)[2]**2)
            self.pr_train.append(pr(y_pred_train.reshape(-1), YTR.reshape(-1)))
            self.pr_test.append(pr(y_pred.reshape(-1), y_true.reshape(-1)))

            if self.CT_RT is not None:
                # convert predictions back to rupture times
                # (presumably a Larson-Miller-style transform — confirm)
                CT_RT_train_pred = np.exp((y_pred_train * 1000 /
                                           self.CT_Temp[tr_id]) -
                                          self.C[tr_id])
                CT_RT_train_true = self.CT_RT[tr_id]
                CT_RT_test_pred = np.exp((y_pred * 1000 /
                                          self.CT_Temp[ts_id]) - self.C[ts_id])
                CT_RT_test_true = self.CT_RT[ts_id]
                self.rmse_CT_RT_train.append(
                    np.sqrt(
                        mean_squared_error(CT_RT_train_pred,
                                           CT_RT_train_true)))
                self.rmse_CT_RT_test.append(
                    np.sqrt(
                        mean_squared_error(CT_RT_test_pred, CT_RT_test_true)))
                self.mae_CT_RT_train.append(
                    mean_absolute_error(CT_RT_train_pred, CT_RT_train_true))
                self.mae_CT_RT_test.append(
                    mean_absolute_error(CT_RT_test_pred, CT_RT_test_true))
                self.r2_CT_RT_train.append(
                    linregress(CT_RT_train_pred, CT_RT_train_true)[2]**2)
                self.r2_CT_RT_test.append(
                    linregress(CT_RT_test_pred, CT_RT_test_true)[2]**2)
                self.pr_CT_RT_train.append(
                    pr(CT_RT_train_pred.reshape(-1),
                       CT_RT_train_true.reshape(-1)))
                self.pr_CT_RT_test.append(
                    pr(CT_RT_test_pred.reshape(-1),
                       CT_RT_test_true.reshape(-1)))

        # summary statistics across splits (training side)
        self.rmse_train_mean = np.mean(self.rmse_train)
        self.rmse_train_std = np.std(self.rmse_train)
        self.mae_train_mean = np.mean(self.mae_train)
        self.mae_train_std = np.std(self.mae_train)
        self.r2_train_mean = np.mean(self.r2_train)
        self.r2_train_std = np.std(self.r2_train)
        # NOTE(review): pr_train holds (r, p) pairs; np.mean/std here
        # average over BOTH elements, unlike run_model which keeps [0] only
        # — confirm this is intended.
        self.pr_train_mean = np.mean(self.pr_train)
        self.pr_train_std = np.std(self.pr_train)
        if self.CT_RT is not None:
            self.rmse_CT_RT_train_mean = np.mean(self.rmse_CT_RT_train)
            self.rmse_CT_RT_train_std = np.std(self.rmse_CT_RT_train)
            self.mae_CT_RT_train_mean = np.mean(self.mae_CT_RT_train)
            self.mae_CT_RT_train_std = np.std(self.mae_CT_RT_train)
            self.r2_CT_RT_train_mean = np.mean(self.r2_CT_RT_train)
            self.r2_CT_RT_train_std = np.std(self.r2_CT_RT_train)
            self.pr_CT_RT_train_mean = np.mean(self.pr_CT_RT_train)
            self.pr_CT_RT_train_std = np.std(self.pr_CT_RT_train)

        # summary statistics across splits (test side)
        self.rmse_test_mean = np.mean(self.rmse_test)
        self.rmse_test_std = np.std(self.rmse_test)
        self.mae_test_mean = np.mean(self.mae_test)
        self.mae_test_std = np.std(self.mae_test)
        self.r2_test_mean = np.mean(self.r2_test)
        self.r2_test_std = np.std(self.r2_test)
        self.pr_test_mean = np.mean(self.pr_test)
        self.pr_test_std = np.std(self.pr_test)
        if self.CT_RT is not None:
            self.rmse_CT_RT_test_mean = np.mean(self.rmse_CT_RT_test)
            self.rmse_CT_RT_test_std = np.std(self.rmse_CT_RT_test)
            self.mae_CT_RT_test_mean = np.mean(self.mae_CT_RT_test)
            self.mae_CT_RT_test_std = np.std(self.mae_CT_RT_test)
            self.r2_CT_RT_test_mean = np.mean(self.r2_CT_RT_test)
            self.r2_CT_RT_test_std = np.std(self.r2_CT_RT_test)
            self.pr_CT_RT_test_mean = np.mean(self.pr_CT_RT_test)
            self.pr_CT_RT_test_std = np.std(self.pr_CT_RT_test)
# Esempio n. 11
# 0
import csv
import math
from scipy.stats import pearsonr as pr


def _read_xyz_and_magnitude(path):
    """Read a 3-column CSV and return (x, y, z, magnitude) float lists.

    The magnitude of each row is sqrt(x**2 + y**2 + z**2).  Extracted to a
    helper because the original script duplicated this parsing block
    verbatim for both input files.
    """
    xs, ys, zs, mags = [], [], [], []
    with open(path) as csf:
        for row in csv.reader(csf, delimiter=','):
            fx, fy, fz = float(row[0]), float(row[1]), float(row[2])
            xs.append(fx)
            ys.append(fy)
            zs.append(fz)
            mags.append(math.sqrt(fx ** 2 + fy ** 2 + fz ** 2))
    return xs, ys, zs, mags


x, y, z, total = _read_xyz_and_magnitude('prasanna to last cub alvina 5.csv')
x1, y1, z1, total1 = _read_xyz_and_magnitude('prasanna to last cub alvina 3.csv')

# per-axis and magnitude Pearson correlations between the two recordings
print(pr(x, x1))
print(pr(y, y1))
print(pr(z, z1))
print(pr(total, total1))
import statistics as stcs
from scipy.stats import pearsonr as pr
import pandas as pd
# S&P 500 daily data, indexed by date.
stock = pd.read_csv("^GSPC.csv", index_col="Date")
# Basic distribution statistics of the closing price.
print(stock["Close"].mean())
print(stock["Close"].std())
print(stock["Close"].skew())
# Pearson correlation (r, p) between closing price and volume.
print(pr(stock["Close"], stock["Volume"]))
Esempio n. 13
0
        alldata['TM'].append(tmdic[key])
        alldata['QA'].append(qadic[key])

# Assemble the collected records into a single table.
df = pd.DataFrame(alldata)

# 3x6 grid of subplots with shared axes and fixed tick spacing / limits.
fig, axs = plt.subplots(3, 6, sharex=True, sharey=True)
plt.xticks(np.arange(0, 1.2, 0.2))
plt.yticks(np.arange(0, 1.1, 0.1))
plt.xlim(0, 1)
plt.ylim(0, 0.7)

# One subplot per target: scatter of QA vs TM with a least-squares fit line;
# the subplot title carries the Pearson correlation coefficient.
row = col = 0
for target in targets:
    tdf = df.loc[df['TARGET'] == target]
    m, b = np.polyfit(list(tdf['TM']), list(tdf['QA']), 1)
    pcc = pr(list(tdf['TM']), list(tdf['QA']))
    x = np.arange(0, 1, 0.01)
    ax = axs[row][col]
    ax.plot(x, m * x + b)
    sb.scatterplot(x='TM', y='QA', data=tdf, s=5, ax=ax)
    ax.set_ylabel('')
    ax.set_xlabel('')
    # pcc is an (r, p-value) pair; only r goes into the title.
    ax.set_title('{t} - PCC:{p}'.format(t=target, p=round(pcc[0], 3)),
                 fontsize=8)
    # Advance the subplot cursor left-to-right, wrapping to the next row.
    if col == 5:
        col = 0
        row += 1
    else:
        col += 1

fig.text(0.5, 0.04, 'TM score', ha='center', fontsize=12)
Esempio n. 14
0
    def run_model(self):
        """Repeatedly split, fit and score a gradient-boosted regressor.

        Runs ``self.nrun`` independent random train/test splits.  On each
        split, fits the estimator selected by ``self.package`` ('lightgbm',
        'catboost' or 'xgboost') with ``self.parameters`` and early stopping
        on the held-out set, then records RMSE, R^2 (squared correlation from
        ``linregress``) and Pearson correlation for both train and test in
        the ``*_cv_train`` / ``*_cv_test`` lists.  Afterwards, summarizes
        those lists into ``*_mean_*`` / ``*_std_*`` attributes.

        When ``self.CT_RT`` is not None, creep rupture time arrays are split
        alongside X/y and — if ``self.model_scheme == 'LMP'`` — predictions
        are mapped back to rupture-time space and scored there as well.
        """
        # Map package name -> regressor class.
        est = {
            'lightgbm': lgb.LGBMRegressor,
            'catboost': catboost.CatBoostRegressor,
            'xgboost': xgboost.XGBRegressor
        }

        # Per-run metric accumulators (one entry per CV repetition).
        self.rmse_cv_train = []
        self.r2_cv_train = []
        self.rmse_cv_test = []
        self.r2_cv_test = []
        self.pr_cv_train = []
        self.pr_cv_test = []
        if self.CT_RT is not None:
            # Parallel accumulators for metrics in rupture-time space.
            self.rmse_CT_RT_cv_train = []
            self.r2_CT_RT_cv_train = []
            self.rmse_CT_RT_cv_test = []
            self.r2_CT_RT_cv_test = []
            self.pr_CT_RT_cv_train = []
            self.pr_CT_RT_cv_test = []

        for i in np.arange(self.nrun):
            if self.CT_RT is not None:
                # Split CT_RT / CT_Temp / C together with X and y so each
                # run keeps rows aligned across all five arrays.
                data = train_test_split(self.X,
                                        self.y,
                                        self.CT_RT,
                                        self.CT_Temp,
                                        self.C,
                                        test_size=self.test_size)
                Xtr, Xts, ytr, yts = data[0], data[1], data[2], data[3]
                CT_RTtr, CT_RTts, CT_Temptr = data[4], data[5], data[6]
                CT_Tempts, Ctr, Cts = data[7], data[8], data[9]
                del data
            else:
                Xtr, Xts, ytr, yts = train_test_split(self.X,
                                                      self.y,
                                                      test_size=self.test_size)
            # Fresh estimator per run so runs are independent.
            model = est[self.package](**self.parameters)

            # Package-specific fit calls: lightgbm also takes feature names,
            # catboost uses its default eval metric.
            if self.package == 'lightgbm':
                model.fit(Xtr,
                          ytr,
                          eval_set=[(Xts, yts)],
                          eval_metric='rmse',
                          early_stopping_rounds=20,
                          feature_name=self.feature_names)
            elif self.package == 'xgboost':
                model.fit(Xtr,
                          ytr,
                          eval_set=[(Xts, yts)],
                          eval_metric='rmse',
                          early_stopping_rounds=20)
            else:
                model.fit(Xtr,
                          ytr,
                          eval_set=[(Xts, yts)],
                          early_stopping_rounds=20)
            # NOTE: the predictions below are overwritten on every run; only
            # the last run's arrays survive on self after the loop ends.
            if self.package == 'lightgbm':
                # lightgbm needs the best iteration passed explicitly to
                # predict with the early-stopped model.
                self.y_cv_tr_pred = model.predict(
                    Xtr, num_iteration=model.best_iteration_)
                self.y_cv_ts_pred = model.predict(
                    Xts, num_iteration=model.best_iteration_)
                if self.model_scheme == 'LMP':
                    # Back-transform to rupture time — presumably the inverse
                    # Larson-Miller relation; note np.exp implies a natural
                    # log convention — TODO confirm against the forward
                    # transform used to build self.y.
                    self.CT_RT_cv_tr_pred = np.exp((self.y_cv_tr_pred * 1000 /
                                                    CT_Temptr) - Ctr)
                    self.CT_RT_cv_ts_pred = np.exp((self.y_cv_ts_pred * 1000 /
                                                    CT_Tempts) - Cts)
            else:
                self.y_cv_tr_pred = model.predict(Xtr)
                self.y_cv_ts_pred = model.predict(Xts)
                if self.model_scheme == 'LMP':
                    # Same back-transform as the lightgbm branch above.
                    self.CT_RT_cv_tr_pred = np.exp((self.y_cv_tr_pred * 1000 /
                                                    CT_Temptr) - Ctr)
                    self.CT_RT_cv_ts_pred = np.exp((self.y_cv_ts_pred * 1000 /
                                                    CT_Tempts) - Cts)
            # Keep the last run's actual targets alongside its predictions.
            self.y_cv_tr = ytr
            self.y_cv_ts = yts
            if self.CT_RT is not None:
                self.CT_RT_cv_tr = CT_RTtr
                self.CT_RT_cv_ts = CT_RTts
            # Score this run: RMSE, R^2 (= squared r from linregress), and
            # the full (r, p-value) pair from pearsonr.
            self.rmse_cv_train.append(
                np.sqrt(mean_squared_error(self.y_cv_tr_pred, ytr)))
            self.rmse_cv_test.append(
                np.sqrt(mean_squared_error(self.y_cv_ts_pred, yts)))
            self.r2_cv_train.append(linregress(self.y_cv_tr_pred, ytr)[2]**2)
            self.r2_cv_test.append(linregress(self.y_cv_ts_pred, yts)[2]**2)
            self.pr_cv_train.append(pr(self.y_cv_tr_pred, ytr))
            self.pr_cv_test.append(pr(self.y_cv_ts_pred, yts))
            if self.CT_RT is not None:
                # Same metrics, computed in rupture-time space.
                self.rmse_CT_RT_cv_train.append(
                    np.sqrt(mean_squared_error(self.CT_RT_cv_tr_pred,
                                               CT_RTtr)))
                self.rmse_CT_RT_cv_test.append(
                    np.sqrt(mean_squared_error(self.CT_RT_cv_ts_pred,
                                               CT_RTts)))
                self.r2_CT_RT_cv_train.append(
                    linregress(self.CT_RT_cv_tr_pred, CT_RTtr)[2]**2)
                self.r2_CT_RT_cv_test.append(
                    linregress(self.CT_RT_cv_ts_pred, CT_RTts)[2]**2)
                self.pr_CT_RT_cv_train.append(
                    pr(self.CT_RT_cv_tr_pred, CT_RTtr))
                self.pr_CT_RT_cv_test.append(pr(self.CT_RT_cv_ts_pred,
                                                CT_RTts))

        # Dataset sizes; train/test counts reflect the last split only.
        self.N_dp = len(self.y)
        self.N_dp_train = len(ytr)
        self.N_dp_test = len(yts)
        # Summaries over the nrun repetitions.  pr(...) returned (r, p)
        # tuples, so only element [0] (the correlation) is aggregated.
        self.rmse_mean_train = np.mean(self.rmse_cv_train)
        self.rmse_std_train = np.std(self.rmse_cv_train)
        self.rmse_mean_test = np.mean(self.rmse_cv_test)
        self.rmse_std_test = np.std(self.rmse_cv_test)
        self.r2_mean_train = np.mean(self.r2_cv_train)
        self.r2_std_train = np.std(self.r2_cv_train)
        self.r2_mean_test = np.mean(self.r2_cv_test)
        self.r2_std_test = np.std(self.r2_cv_test)
        self.pr_mean_train = np.mean([i[0] for i in self.pr_cv_train])
        self.pr_std_train = np.std([i[0] for i in self.pr_cv_train])
        self.pr_mean_test = np.mean([i[0] for i in self.pr_cv_test])
        self.pr_std_test = np.std([i[0] for i in self.pr_cv_test])
        if self.CT_RT is not None:
            self.rmse_CT_RT_mean_train = np.mean(self.rmse_CT_RT_cv_train)
            self.rmse_CT_RT_std_train = np.std(self.rmse_CT_RT_cv_train)
            self.rmse_CT_RT_mean_test = np.mean(self.rmse_CT_RT_cv_test)
            self.rmse_CT_RT_std_test = np.std(self.rmse_CT_RT_cv_test)
            self.r2_CT_RT_mean_train = np.mean(self.r2_CT_RT_cv_train)
            self.r2_CT_RT_std_train = np.std(self.r2_CT_RT_cv_train)
            self.r2_CT_RT_mean_test = np.mean(self.r2_CT_RT_cv_test)
            self.r2_CT_RT_std_test = np.std(self.r2_CT_RT_cv_test)
            self.pr_CT_RT_mean_train = np.mean(
                [i[0] for i in self.pr_CT_RT_cv_train])
            self.pr_CT_RT_std_train = np.std(
                [i[0] for i in self.pr_CT_RT_cv_train])
            self.pr_CT_RT_mean_test = np.mean(
                [i[0] for i in self.pr_CT_RT_cv_test])
            self.pr_CT_RT_std_test = np.std(
                [i[0] for i in self.pr_CT_RT_cv_test])
Esempio n. 15
0
        alldata['PQA'].append(pqadic[key])
        alldata['GQA'].append(gqadic[key])

# Assemble the collected per-key records into one table.
df = pd.DataFrame(alldata)

# 3x6 grid of subplots with shared axes and fixed tick spacing / limits.
fig, axs = plt.subplots(3, 6, sharex=True, sharey=True)
plt.xticks(np.arange(0, 1.2, 0.2))
plt.yticks(np.arange(0, 1.1, 0.1))
plt.xlim(0, 1)
plt.ylim(0, 0.7)

# Subplot cursor, advanced per target in the loop below.
row = col = 0
for target in targets:
    tdf = df.loc[df['TARGET'] == target]
    m, b = np.polyfit(list(tdf['TM']), list(tdf['PQA']), 1)
    pcc = pr(list(tdf['TM']), list(tdf['PQA']))
    x = np.arange(0, 1, 0.01)
    axs[row][col].plot(x, m * x + b)
    sb.scatterplot(x='TM', y='PQA', data=tdf, s=5, ax=axs[row][col])

    m, b = np.polyfit(list(tdf['TM']), list(tdf['GQA']), 1)
    pcc = pr(list(tdf['TM']), list(tdf['GQA']))
    x = np.arange(0, 1, 0.01)
    axs[row][col].plot(x, m * x + b)
    sb.scatterplot(x='TM', y='GQA', data=tdf, s=5, ax=axs[row][col])

    axs[row][col].set_ylabel('')
    axs[row][col].set_xlabel('')
    axs[row][col].set_title('{t} - PCC:{p}'.format(t=target,
                                                   p=round(pcc[0], 3)),
                            fontsize=8)